#!/usr/bin/env python3
"""
Cadastre Viewer — Local listings proxy
python3 listings-proxy.py   →   http://localhost:3001

/listings?url=URL   → GeoJSON FeatureCollection
/structure?url=URL  → debug info
/?url=URL           → raw passthrough / health check
"""

import gzip
import html as html_module
import json
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
import zlib
from http.server import BaseHTTPRequestHandler, HTTPServer

# Port the local proxy listens on (loopback only; see __main__).
PORT   = 3001
# Domains the proxy will fetch from; every ?url= is checked against this
# list by substring match before any request goes out.
ALLOWED = ['zonaprop.com.ar', 'argenprop.com', 'nominatim.openstreetmap.org']

# Browser-like request headers sent on every fetch; individual calls can
# override/extend them via fetch(extra_headers=...).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'es-AR,es;q=0.9,en;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Cache-Control': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
}


# ── HTTP fetch ────────────────────────────────────────────────────────────────

def fetch(url, extra_headers=None):
    """GET *url* and return ``(text, content_type)``.

    Sends the browser-like HEADERS (optionally overridden/extended by
    *extra_headers*), decompresses the body if it is gzip- or
    deflate-compressed, and decodes it as UTF-8 with replacement of
    invalid bytes.

    Raises urllib.error.HTTPError / URLError on failures; callers handle
    those themselves.
    """
    h = dict(HEADERS)
    if extra_headers:
        h.update(extra_headers)
    req = urllib.request.Request(url, headers=h)
    with urllib.request.urlopen(req, timeout=25) as r:
        raw = r.read()
        ct  = r.headers.get('Content-Type', 'text/html')
        enc = (r.headers.get('Content-Encoding') or '').lower()
    if raw[:2] == b'\x1f\x8b':
        # gzip magic bytes — trust the payload over the header.
        raw = gzip.decompress(raw)
    elif 'deflate' in enc:
        # HEADERS advertises deflate, so handle it too: try zlib-wrapped
        # first, then raw deflate (servers send either).
        try:
            raw = zlib.decompress(raw)
        except zlib.error:
            raw = zlib.decompress(raw, -zlib.MAX_WBITS)
    return raw.decode('utf-8', errors='replace'), ct


# ── Data extraction strategies ────────────────────────────────────────────────

def strategy_next_data(html):
    """Classic Next.js pages: page data lives in a __NEXT_DATA__ script tag.

    Returns the decoded JSON object, or None when the tag is absent,
    unparseable, or is only the App Router stub ({'namedChunks': ...}).
    """
    tag = re.search(r'<script[^>]+id=["\']__NEXT_DATA__["\'][^>]*>(.*?)</script>',
                    html, re.DOTALL | re.IGNORECASE)
    if tag is None:
        return None
    try:
        payload = json.loads(tag.group(1))
        if list(payload.keys()) == ['namedChunks']:
            # App Router pages ship a minimal stub here — not useful.
            print('  __NEXT_DATA__ is App Router stub (namedChunks only), skipping')
            return None
        print('  Found classic __NEXT_DATA__')
        return payload
    except Exception as exc:
        print(f'  __NEXT_DATA__ parse error: {exc}')
        return None


def strategy_next_f(html):
    """Next.js App Router: self.__next_f.push([1, "rsc_payload"]) chunks.

    Collects every RSC payload string pushed via __next_f, joins them, and
    tries three extraction methods of decreasing precision:
      A) a known listing key followed directly by a JSON array of objects,
      B) per-line "<id>:<json>" RSC records, parsed and walked,
      C) aggressive scan for any array whose first element has an id key.
    Returns {'_direct': [listing, ...]} or None.
    """
    chunks = []
    # Collect all RSC chunk strings
    for m in re.finditer(r'self\.__next_f\.push\(\[(?:1|2),("(?:[^"\\]|\\.)*")\]\)',
                         html, re.DOTALL):
        try:
            text = json.loads(m.group(1))   # decode the JSON-encoded string
            chunks.append(text)
        except Exception:
            pass

    if not chunks:
        print('  No __next_f chunks found')
        return None

    print(f'  Found {len(chunks)} __next_f RSC chunks')
    full = '\n'.join(chunks)

    # Method A: look for "listPostings":[...] directly in the RSC text.
    # BUGFIX: the capture group was unterminated ("(\[{.*?},"), so
    # re.search raised re.error whenever any chunk existed.
    for key in ('listPostings', 'postings', 'results', 'listings', 'items'):
        m = re.search(rf'"{key}"\s*:\s*(\[{{.*?}},)', full, re.DOTALL)
        if m:
            # group 1 starts at the '[' — bracket-match to grab the full array
            start = m.start(1)
            depth, i = 0, start
            while i < len(full):
                if full[i] == '[': depth += 1
                elif full[i] == ']':
                    depth -= 1
                    if depth == 0:
                        try:
                            arr = json.loads(full[start:i+1])
                            if arr and is_listing(arr[0]):
                                print(f'  Found "{key}" array in RSC chunks: {len(arr)} items')
                                return {'_direct': arr}
                        except Exception:
                            pass
                        break
                i += 1

    # Method B: parse each RSC line as JSON and walk for listings
    for line in full.split('\n'):
        m = re.match(r'^\d+:([\[{].*)', line)
        if not m:
            continue
        try:
            val = json.loads(m.group(1))
            lst = walk(val)
            if lst:
                print(f'  Found listing array in RSC line: {len(lst)} items')
                return {'_direct': lst}
        except Exception:
            pass

    # Method C: aggressive JSON extraction from RSC text — find any JSON
    # array whose first element looks like a listing.
    # BUGFIX: this capture group was also unterminated and raised re.error.
    for m in re.finditer(r'(\[{"(?:id|postingId)":)', full):
        start = m.start(1)
        depth, i = 0, start
        while i < min(len(full), start + 500000):   # bound the scan window
            if full[i] == '[': depth += 1
            elif full[i] == ']':
                depth -= 1
                if depth == 0:
                    try:
                        arr = json.loads(full[start:i+1])
                        if arr and is_listing(arr[0]):
                            print(f'  Found listing array via aggressive scan: {len(arr)} items')
                            return {'_direct': arr}
                    except Exception:
                        pass
                    break
            i += 1

    print('  RSC chunks found but no listing array extracted')
    return None


def strategy_escaped_json(html):
    """
    ZonaProp stores data as a doubly-encoded JSON string in a JS variable.
    Raw HTML contains:  listPostings\":[{\"postingId\":\"123\"...}]
    We find listPostings, bracket-match the array treating single backslash
    as an escape prefix, then unescape and parse.
    Returns {'_direct': [listing, ...]} or None.
    """
    key_pos = html.find('listPostings')
    if key_pos == -1:
        return None

    open_pos = html.find('[', key_pos)
    if open_pos == -1 or open_pos - key_pos > 30:
        # No nearby array — the key is not a direct assignment here.
        return None

    # Scan forward balancing brackets; a backslash escapes the next char.
    depth = 0
    pos = open_pos
    end = min(len(html), open_pos + 8_000_000)
    while pos < end:
        c = html[pos]
        if c == '\\':
            pos += 2
            continue
        if c == '[':
            depth += 1
        elif c == ']':
            depth -= 1
            if depth == 0:
                raw = html[open_pos:pos + 1]
                # Unescape \x -> x (covers \" -> " and \/ -> /) …
                unescaped = re.sub(r'\\(.)', r'\1', raw)
                # … plus a gentler variant and the raw text as fallbacks.
                simple = raw.replace('\\"', '"').replace('\\/', '/')
                for candidate in (unescaped, simple, raw):
                    try:
                        arr = json.loads(candidate)
                    except Exception:
                        continue
                    if isinstance(arr, list) and arr and is_listing(arr[0]):
                        print(f'  Found {len(arr)} listings via escaped-JSON strategy')
                        return {'_direct': arr}
                break
        pos += 1
    return None

def strategy_direct_scan(html):

    """
    Direct bracket-matching: find any known listing key in raw HTML
    and extract the full JSON array by counting brackets.
    Works regardless of how the data is embedded (var, window., etc.)

    Returns {'_direct': [listing, ...]} or None.  NOTE: the bracket
    counter below does not skip over string contents, so a ']' inside a
    string value can end the scan early — json.loads then fails and the
    key is simply skipped.
    """
    # Unquoted keys get quotes added at find() time; pre-quoted entries
    # (e.g. '"listings"') are searched for as-is.
    keys_to_try = [
        'listPostings', 'postings', '"listings"', '"results"',
        '"avisos"', '"publicaciones"', '"propiedades"',
    ]
    for key in keys_to_try:
        idx = html.find(f'"{key}"' if not key.startswith('"') else key)
        if idx == -1:
            continue
        # Find the '[' that opens the array after this key
        bracket_pos = html.find('[', idx)
        close_pos   = html.find(']', idx)  # might be closing something else
        if bracket_pos == -1 or (close_pos != -1 and close_pos < bracket_pos):
            continue
        if bracket_pos - idx > 200:
            continue  # too far away, not a direct assignment

        # Bracket-match to find the full array
        depth, i = 0, bracket_pos
        limit = min(len(html), bracket_pos + 5_000_000)  # 5MB max scan
        while i < limit:
            c = html[i]
            if c == '[':   depth += 1
            elif c == ']':
                depth -= 1
                if depth == 0:
                    arr_str = html[bracket_pos:i+1]
                    try:
                        arr = json.loads(arr_str)
                        if isinstance(arr, list) and arr and is_listing(arr[0]):
                            print(f'  Found "{key}" via direct scan: {len(arr)} items')
                            return {'_direct': arr}
                    except json.JSONDecodeError:
                        pass
                    break
            i += 1

    # Also try: find the enclosing object around "listPostings"
    idx = html.find('"listPostings"')
    if idx != -1:
        # Walk back to find the opening '{' of the parent object
        # Look in a window of 10KB before the key
        window = html[max(0, idx-10000):idx+500000]
        # Try to parse as JSON object starting from various '{'
        # (json.loads tolerates trailing garbage only by failing, so most
        # candidates raise and are skipped silently).
        for start_m in re.finditer(r'\{', window[:10000]):
            try:
                candidate = window[start_m.start():]
                # Quick check before full parse
                if '"listPostings"' not in candidate[:100000]:
                    continue
                obj = json.loads(candidate)
                lst = walk(obj)
                if lst:
                    print(f'  Found listings in parent object of listPostings: {len(lst)} items')
                    return {'_direct': lst}
            except Exception:
                pass

    return None


def strategy_zonaprop_api(url, html):
    """
    Last-resort strategy: try ZonaProp's internal search API.

    First scans the page JS for search-API-looking URLs (logged for
    debugging only — they are not called), then hits the known
    /api/v2/search endpoint with the operation type inferred from the
    listing URL slug.

    Returns the decoded JSON response, or None on any failure.
    """
    # Look for API URL pattern in the page HTML/JS (debug aid only).
    api_patterns = [
        r'["\'](/(?:api|nfs|rest|search)[^"\']*search[^"\']*)["\']',
        r'fetch\(["\']([^"\']+search[^"\']+)["\']',
        r'axios\.[a-z]+\(["\']([^"\']+search[^"\']+)["\']',
    ]
    for pat in api_patterns:
        for m in re.finditer(pat, html):
            candidate = m.group(1)
            if 'search' in candidate.lower() and len(candidate) < 200:
                print(f'  Candidate API path: {candidate}')

    # Infer the operation ("venta"/"alquiler") from the URL slug.
    slug = re.sub(r'https?://[^/]+', '', url).strip('/')
    slug = re.sub(r'\.html$', '', slug)
    operation = 'venta' if 'venta' in slug else 'alquiler'

    # Known ZonaProp search API (may still work)
    api_url = 'https://www.zonaprop.com.ar/api/v2/search'
    params = urllib.parse.urlencode({
        'country': 'ar',
        'searchType': '1',
        'operation': operation,
    })
    try:
        resp, _ = fetch(f'{api_url}?{params}', extra_headers={
            'Accept': 'application/json',
            'Referer': url,
            'X-Requested-With': 'XMLHttpRequest',
        })
        data = json.loads(resp)
        if data:
            print('  ZonaProp API responded')
            return data
    except Exception as e:
        print(f'  ZonaProp API attempt failed: {e}')

    return None


def strategy_inline_json(html):
    """Scan all <script> tags for large JSON objects containing listings.

    Two sub-cases per script: the whole body is JSON, or a JS variable is
    assigned a large JSON object.  Returns {'_direct': [...]} or None.
    """
    scripts = re.finditer(r'<script(?:[^>]*)>(.*?)</script>', html, re.DOTALL)
    for idx, script in enumerate(scripts):
        body = script.group(1).strip()
        if len(body) < 300:
            continue  # too small to hold a listings payload

        # Case 1: the entire script body is one JSON document.
        if body[:1] in ('{', '['):
            try:
                hits = walk(json.loads(body))
            except Exception:
                hits = None
            if hits:
                print(f'  Found listings in inline script {idx}')
                return {'_direct': hits}

        # Case 2: a variable assignment whose value is a large JSON object.
        for assign in re.finditer(r'=\s*(\{[^=]{50,}\})\s*[;,\n]', body, re.DOTALL):
            try:
                hits = walk(json.loads(assign.group(1)))
            except Exception:
                continue
            if hits:
                print(f'  Found listings in variable assignment, script {idx}')
                return {'_direct': hits}

    return None


# ── JSON utilities ────────────────────────────────────────────────────────────

def is_listing(o):
    """Heuristic: does dict *o* look like a property listing?

    Joins every key name into one lowercase string and substring-checks it
    for an id-like token plus either a price-like or a location-like token.
    """
    if not isinstance(o, dict):
        return False
    key_blob = ' '.join(str(k).lower() for k in o)
    id_tokens    = ('id', 'postingid', 'propertyid')
    price_tokens = ('price', 'precio', 'operation', 'priceoper')
    place_tokens = ('address', 'location', 'geo', 'city', 'ciudad')
    if not any(t in key_blob for t in id_tokens):
        return False
    return (any(t in key_blob for t in price_tokens)
            or any(t in key_blob for t in place_tokens))


def walk(obj, depth=0):
    """Depth-first search for the first list of listing-like dicts.

    Returns the list, or None if nothing matches within 10 nesting levels.
    """
    if obj is None or depth > 10:
        return None

    if isinstance(obj, list):
        if obj and is_listing(obj[0]):
            return obj
        # Only peek into the first few elements to bound the search.
        for element in obj[:5]:
            hit = walk(element, depth + 1)
            if hit:
                return hit
        return None

    if isinstance(obj, dict):
        # Check well-known container keys before recursing blindly.
        preferred = ('listPostings', 'postings', 'results', 'listings', 'items',
                     'data', 'properties', 'list', 'avisos', 'publicaciones',
                     'searchResults', 'posting', 'realEstate')
        for key in preferred:
            candidate = obj.get(key)
            if isinstance(candidate, list) and candidate and is_listing(candidate[0]):
                return candidate
        for value in obj.values():
            hit = walk(value, depth + 1)
            if hit:
                return hit

    return None


# ── Extractor ─────────────────────────────────────────────────────────────────

def get(obj, *keys):
    """Drill into nested dicts/lists by successive keys/indices.

    Returns the value at the end of the path, or None as soon as any step
    fails (missing key, bad index, non-subscriptable value).
    """
    current = obj
    for key in keys:
        try:
            current = current[key]
        except (KeyError, IndexError, TypeError):
            return None
    return current


def extract(l: dict, source_domain: str) -> dict:
    """Normalize one raw listing dict into the flat record used by to_geojson.

    *source_domain* is checked for the substring 'zonaprop' to pick the
    base URL for relative links and the 'source' label.  Every field access
    goes through get()/dict.get with fallbacks, so missing keys yield
    None/'' rather than raising.
    """
    is_zona = 'zonaprop' in source_domain

    # ── ZonaProp-specific paths (confirmed from /sample) ─────────────────────
    # Coordinates: postingLocation.postingGeolocation.geolocation.latitude/longitude
    # NOTE(review): the `or` chains treat a legitimate 0/0.0 coordinate as
    # missing and fall through to the next candidate — confirm acceptable.
    lat = (get(l,'postingLocation','postingGeolocation','geolocation','latitude')
           or get(l,'geo','lat') or get(l,'geo','latitude')
           or get(l,'address','geoLocation','lat')
           or get(l,'geolocation','lat')
           or l.get('latitude') or l.get('lat'))
    lon = (get(l,'postingLocation','postingGeolocation','geolocation','longitude')
           or get(l,'geo','lon') or get(l,'geo','lng') or get(l,'geo','longitude')
           or get(l,'address','geoLocation','lon') or get(l,'address','geoLocation','lng')
           or get(l,'geolocation','lng') or get(l,'geolocation','lon')
           or l.get('longitude') or l.get('lng') or l.get('lon'))

    # Address: house.address.name has the full readable string
    address = (get(l,'house','address','name')
               or get(l,'postingLocation','address','name')
               or '')
    if address:
        # Clean up the ZonaProp "Type Venta X Zone, City, Province" prefix format
        # e.g. "Casas Venta Barrio ..., La Concepción, Luján, GBA Oeste"
        # → keep as-is, it's readable enough
        pass
    else:
        addr = l.get('address') or l.get('location') or {}
        if isinstance(addr, str):
            address = addr
        else:
            # Build from location hierarchy (city + parent region names)
            loc = get(l,'postingLocation','location') or {}
            city   = loc.get('name','')
            parent = loc.get('parent') or {}
            ciudad = parent.get('name','')
            address = ', '.join(filter(None,[city, ciudad]))

    # Price: try each known layout in turn, first non-None amount wins.
    # The last lambda ignores its argument and closes over `l` directly —
    # it only accepts a scalar top-level 'price'.
    price, currency = None, 'USD'
    for fn in [
        lambda x: (get(x,'priceOperationTypes',0,'prices',0,'amount'),
                   get(x,'priceOperationTypes',0,'prices',0,'currency') or 'USD'),
        lambda x: (get(x,'operations',0,'prices',0,'price') or get(x,'operations',0,'prices',0,'amount'),
                   get(x,'operations',0,'prices',0,'currency') or 'USD'),
        lambda x: (get(x,'price','amount'), get(x,'price','currency') or 'USD'),
        lambda x: (l.get('price'), 'USD') if not isinstance(l.get('price'),dict) else (None,'USD'),
    ]:
        try:
            p, c = fn(l)
            if p is not None: price, currency = p, c; break
        except Exception: pass

    # Area: mainFeatures.CFT100 = superficie total, CFT101 = cubierta
    area = None
    mf = l.get('mainFeatures') or {}
    for feat_id in ('CFT100','CFT101'):
        v = mf.get(feat_id,{}).get('value')
        if v:
            try: area = float(v); break
            except Exception: pass
    if not area:
        # Generic fallbacks: value may be a scalar or a [{'value': ...}] list.
        for f in ('surfaceTotals','totalSurface','area','surface','totalArea','coveredArea'):
            v = l.get(f)
            if v is not None:
                area = v[0].get('value') if isinstance(v,list) and v else (v if not isinstance(v,list) else None)
                if area: break

    # Rooms: mainFeatures.CFT1, then generic keys.
    rooms = None
    if mf.get('CFT1',{}).get('value'):
        try: rooms = int(mf['CFT1']['value'])
        except Exception: pass
    if not rooms:
        rooms = l.get('rooms') or l.get('roomsAmount') or l.get('bedrooms')

    # Image: visiblePictures.pictures[0].url730x532, then smaller/other keys.
    img = (get(l,'visiblePictures','pictures',0,'url730x532')
           or get(l,'visiblePictures','pictures',0,'url360x266')
           or get(l,'house','image'))
    if not img:
        photos = l.get('photos') or []
        if photos and isinstance(photos[0],dict):
            img = photos[0].get('image') or photos[0].get('url')

    # Listing URL: make relative paths absolute against the source site.
    url = l.get('url') or l.get('slug') or l.get('link') or ''
    base = 'https://www.zonaprop.com.ar' if is_zona else 'https://www.argenprop.com'
    if url and not url.startswith('http'):
        url = base + url

    # Title: explicit title fields, else "<type> en <first address part>".
    title = (l.get('title') or l.get('generatedTitle') or l.get('developmentName') or
             (get(l,'realEstateType','name') or '') + (' en ' + address.split(',')[0] if address else ''))

    prop_type = get(l,'realEstateType','name') or ''

    return {
        'id':       str(l.get('postingId') or l.get('id') or l.get('propertyId') or ''),
        'title':    str(title or '').strip(),
        'price':    price,
        'currency': currency,
        'address':  address,
        'area':     area,
        'rooms':    rooms,
        'type':     prop_type,
        'lat':      float(lat) if lat is not None else None,
        'lng':      float(lon) if lon is not None else None,
        'url':      url,
        'img':      img,
        'source':   'ZonaProp' if is_zona else 'ArgenProp',
    }


def geocode(address):
    """Forward-geocode *address* via Nominatim (scoped to Buenos Aires, AR).

    Returns (lat, lon) as floats, or (None, None) on any failure — every
    exception is deliberately swallowed (best-effort lookup).
    """
    try:
        query = urllib.parse.quote(address + ', Buenos Aires, Argentina')
        body, _ = fetch(f'https://nominatim.openstreetmap.org/search?format=json&limit=1&q={query}',
                        extra_headers={'User-Agent': 'CadastreViewer/1.0'})
        hits = json.loads(body)
        if hits:
            return float(hits[0]['lat']), float(hits[0]['lon'])
    except Exception:
        pass
    return None, None


# ── Main parse entry ──────────────────────────────────────────────────────────

def parse_listings(html, url):
    """Run the extraction strategies over *html*; return normalized listings.

    Strategy order: __NEXT_DATA__, __next_f RSC chunks, direct bracket
    scan, inline <script> JSON, then the ZonaProp API probe as a last
    resort.  (NOTE(review): strategy_escaped_json is only wired into the
    /sample endpoint, not into this chain — confirm whether intentional.)
    Listings still missing coordinates are geocoded via Nominatim.
    """
    print(f'  Page size: {len(html)//1024} KB | '
          f'has __NEXT_DATA__: {"__NEXT_DATA__" in html} | '
          f'has __next_f: {"__next_f" in html} | '
          f'has listPostings: {"listPostings" in html}')

    data = None
    for strategy in [strategy_next_data, strategy_next_f, strategy_direct_scan, strategy_inline_json]:
        data = strategy(html)
        if data:
            break
    if not data:
        data = strategy_zonaprop_api(url, html)

    if not data:
        print('  All strategies failed — no listing data found')
        return []

    # Get the raw list: strategies return either {'_direct': [...]} or a
    # nested structure that walk() searches.
    if isinstance(data, dict) and '_direct' in data:
        raw_list = data['_direct'] if isinstance(data['_direct'], list) else walk(data['_direct'])
    else:
        raw_list = walk(data)

    if not raw_list:
        print('  Data found but no listing array identified')
        return []

    print(f'  Extracting {len(raw_list)} items...')
    # Log first item keys and coords fields for debugging
    if raw_list:
        sample = raw_list[0]
        print(f'  Sample listing keys: {list(sample.keys())}')
        # Print any key that might hold coordinates
        for k,v in sample.items():
            if isinstance(v, dict) and any(c in str(v).lower() for c in ('lat','lon','lng','coord','geo')):
                print(f'  Coord-like field [{k}]: {str(v)[:150]}')
        # Print address field
        for k in ('address','location','geo','postingLocation','mapLocation','geoLocation'):
            if k in sample:
                print(f'  [{k}]: {str(sample[k])[:150]}')
    domain = 'zonaprop' if 'zonaprop' in url else 'argenprop'
    listings = []
    for item in raw_list:
        try:
            lst = extract(item, domain)
            if lst.get('title') or lst.get('address'):
                listings.append(lst)
        except Exception:
            pass

    # Geocode every listing that lacks coordinates (no cap is applied;
    # each lookup is followed by a 0.4 s pause to be polite to Nominatim)
    missing = [l for l in listings if not l.get('lat') and l.get('address')]
    if missing:
        print(f'  Geocoding {len(missing)} listings without coordinates...')
        for i, l in enumerate(missing):
            lat, lon = geocode(l['address'])
            if lat:
                l['lat'] = lat
                l['lng'] = lon
                print(f'    geocoded [{i+1}/{len(missing)}]: {l["address"][:50]} -> {lat:.4f},{lon:.4f}')
            time.sleep(0.4)

    mapped   = sum(1 for l in listings if l.get('lat'))
    print(f'  Result: {len(listings)} listings, {mapped} with coordinates')
    return listings


def to_geojson(listings):
    """Convert normalized listings into a GeoJSON FeatureCollection.

    Listings with both coordinates become Point features (GeoJSON order:
    [lng, lat]); the rest are only counted in meta.no_coords.  The lat/lng
    keys are folded into the geometry and dropped from properties.
    """
    features = []
    for l in listings:
        # Compare against None (not truthiness) so a legitimate 0.0
        # coordinate is not silently dropped.
        if l.get('lat') is not None and l.get('lng') is not None:
            features.append({
                'type': 'Feature',
                'geometry': {'type': 'Point', 'coordinates': [l['lng'], l['lat']]},
                'properties': {k: v for k, v in l.items() if k not in ('lat', 'lng')},
            })
    return {
        'type': 'FeatureCollection',
        'features': features,
        'meta': {
            'total':     len(listings),
            'mapped':    len(features),
            'no_coords': len(listings) - len(features),
        },
    }


# ── HTTP handler ──────────────────────────────────────────────────────────────

class Handler(BaseHTTPRequestHandler):
    """Request handler exposing the proxy endpoints (GET + OPTIONS only).

    Routes:
      /listings?url=URL&pages=N  → paginated scrape, GeoJSON out
      /structure?url=URL         → page-structure debug info
      /sample?url=URL            → first 2 raw listing objects
      /?url=URL                  → raw passthrough; no url = health check
    Every response carries permissive CORS headers (see _cors).
    """

    def do_OPTIONS(self):
        # CORS preflight: empty 200 plus the CORS headers.
        self.send_response(200)
        self._cors()
        self.end_headers()

    def do_GET(self):
        parsed = urllib.parse.urlparse(self.path)
        params = urllib.parse.parse_qs(parsed.query)
        path   = parsed.path

        # /listings?url=...&pages=N
        if path == '/listings':
            url = params.get('url', [''])[0]
            if not url:
                return self._json({'error': 'Missing ?url='}, 400)
            if not any(a in url for a in ALLOWED):
                return self._json({'error': 'Domain not allowed'}, 403)
            # NOTE(review): a non-numeric ?pages= raises ValueError here,
            # outside the try below — unhandled 500 rather than a 400.
            max_pages  = min(int(params.get('pages', ['5'])[0]), 50)
            is_zona    = 'zonaprop.com.ar' in url
            # Strip -map suffix, query string, and existing -pagina-N
            url = re.sub(r'-map\.html$', '.html', url)
            url = re.sub(r'[?#].*$', '', url)
            url = re.sub(r'-pagina-\d+\.html$', '.html', url)
            print(f'\n→ /listings  {url}  (max {max_pages} pages)')
            try:
                all_listings = []
                seen_ids     = set()
                for page in range(1, max_pages + 1):
                    if page == 1:
                        page_url = url
                    elif is_zona:
                        # ZonaProp: foo.html → foo-pagina-N.html
                        page_url = url.replace('.html', f'-pagina-{page}.html')
                    else:
                        # ArgenProp: append --pagina-N
                        page_url = url.rstrip('/') + f'--pagina-{page}'
                    print(f'  Page {page}: {page_url.split("/")[-1]}')
                    try:
                        html, _ = fetch(page_url)
                    except urllib.error.HTTPError as e:
                        print(f'  HTTP {e.code} on page {page} — stopping')
                        break
                    listings = parse_listings(html, url)
                    if not listings:
                        print(f'  No listings on page {page} — stopping')
                        break
                    # Deduplicate by id
                    new = [l for l in listings if l.get('id') not in seen_ids]
                    seen_ids.update(l.get('id','') for l in new)
                    all_listings.extend(new)
                    print(f'  Page {page}: {len(new)} new  (total: {len(all_listings)})')
                    # Heuristic: a short page is assumed to be the last one.
                    if len(listings) < 20:
                        print(f'  Last page reached ({len(listings)} < 20)')
                        break
                    time.sleep(0.5)  # be polite

                gj = to_geojson(all_listings)
                print(f'  → {gj["meta"]["mapped"]} mapped / {gj["meta"]["total"]} total')
                return self._json(gj)
            except Exception as e:
                import traceback; traceback.print_exc()
                return self._json({'error': str(e)}, 502)

        # /structure?url=...  — debug
        if path == '/structure':
            url = params.get('url', [''])[0]
            if not url:
                return self._json({'error': 'Missing ?url='}, 400)
            try:
                html, _ = fetch(url)
                info = {
                    'page_kb':          len(html) // 1024,
                    'has___NEXT_DATA__': '__NEXT_DATA__' in html,
                    'has___next_f':     '__next_f' in html,
                    'has_listPostings': 'listPostings' in html,
                    'has_postings':     '"postings"' in html,
                    'has_results':      '"results"' in html,
                    'script_count':     len(re.findall(r'<script', html)),
                    'next_f_chunks':    len(re.findall(r'self\.__next_f\.push', html)),
                }
                # Try to get __NEXT_DATA__ keys
                m = re.search(r'<script[^>]+id=["\']__NEXT_DATA__["\'][^>]*>(.*?)</script>',
                              html, re.DOTALL)
                if m:
                    try:
                        nd = json.loads(m.group(1))
                        info['__NEXT_DATA__keys'] = list(nd.keys())[:20]
                    except Exception:
                        info['__NEXT_DATA__'] = 'parse error'

                # Show context around "listPostings" if present
                lp_idx = html.find('"listPostings"')
                if lp_idx != -1:
                    info['listPostings_context'] = html[max(0,lp_idx-80):lp_idx+200]

                return self._json(info)
            except Exception as e:
                return self._json({'error': str(e)}, 502)

        # /sample?url=...  — return first 2 raw listing objects for debugging
        if path == '/sample':
            url = params.get('url', [''])[0]
            if not url: return self._json({'error': 'Missing ?url='}, 400)
            url = re.sub(r'-map\.html$', '.html', url)
            url = re.sub(r'[?#].*$', '', url)
            try:
                html_s, _ = fetch(url)
                raw_list = None
                # NOTE: this chain includes strategy_escaped_json, which the
                # main parse_listings() chain does not.
                for strategy in [strategy_next_data, strategy_next_f,
                                  strategy_escaped_json, strategy_direct_scan]:
                    data = strategy(html_s)
                    if data:
                        raw_list = data.get('_direct') or walk(data)
                        if raw_list: break
                if not raw_list:
                    return self._json({'error': 'no listings found', 'has_listPostings': 'listPostings' in html_s})
                # Return first 2 items so we can inspect structure
                sample = raw_list[:2]
                return self._json({'count': len(raw_list), 'sample': sample})
            except Exception as e:
                return self._json({'error': str(e)}, 502)

        # / or /?url=...  — health check / raw proxy
        url = params.get('url', [''])[0]
        if not url:
            return self._json({'status': 'running', 'port': PORT,
                               'usage': '/listings?url=URL or /structure?url=URL'})
        if not any(a in url for a in ALLOWED):
            return self._json({'error': 'Domain not allowed'}, 403)
        print(f'\n→ /proxy  {url[:90]}')
        try:
            html, ct = fetch(url)
            body = html.encode('utf-8')
            self.send_response(200)
            self._cors()
            self.send_header('Content-Type', ct.split(';')[0] + '; charset=utf-8')
            self.send_header('Content-Length', str(len(body)))
            self.end_headers()
            self.wfile.write(body)
            print(f'  OK  {len(body)//1024} KB')
        except Exception as e:
            self._json({'error': str(e)}, 502)

    def _cors(self):
        # Permissive CORS so a locally-served frontend can call us freely.
        self.send_header('Access-Control-Allow-Origin',  '*')
        self.send_header('Access-Control-Allow-Methods', 'GET, OPTIONS')
        self.send_header('Access-Control-Allow-Headers', '*')

    def _json(self, data, status=200):
        # Serialize and send *data* as a JSON response with CORS headers.
        # default=str stringifies anything json can't encode natively.
        body = json.dumps(data, ensure_ascii=False, default=str).encode('utf-8')
        self.send_response(status)
        self._cors()
        self.send_header('Content-Type', 'application/json; charset=utf-8')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, *a):
        # Silence the default per-request stderr logging; we print our own.
        pass


if __name__ == '__main__':
    # Bind to loopback only — this proxy is meant for local use.
    server = HTTPServer(('127.0.0.1', PORT), Handler)
    print('=' * 52)
    print(f'  Cadastre proxy      http://localhost:{PORT}')
    print('=' * 52)
    print('  /listings?url=URL   → GeoJSON listings')
    print('  /structure?url=URL  → debug page info')
    print()
    print('  Press Ctrl+C to stop')
    print()
    try:
        # Blocks forever; Ctrl+C raises KeyboardInterrupt for a clean exit.
        server.serve_forever()
    except KeyboardInterrupt:
        print('\nStopped.')
        sys.exit(0)
