#!/usr/bin/env python3
"""
listings-to-geojson.py
──────────────────────
Downloads ALL pages of a ZonaProp or ArgenProp search and saves a GeoJSON
file you can load directly into the Cadastre Viewer.

Usage:
    python3 listings-to-geojson.py "URL" [max_pages]

ZonaProp examples:
    python3 listings-to-geojson.py "https://www.zonaprop.com.ar/casas-terrenos-venta-tigre-mas-1000-m2.html"
    python3 listings-to-geojson.py "https://www.zonaprop.com.ar/casas-campos-terrenos-venta-escobar-tigre.html" 50

ArgenProp examples:
    python3 listings-to-geojson.py "https://www.argenprop.com/casa--en-venta--en-tigre"
    python3 listings-to-geojson.py "https://www.argenprop.com/terreno--en-venta--en-escobar--tigre--san-fernando" 30

Output: saves a .geojson file in the current directory, ready to load in Cadastre Viewer.
No dependencies beyond Python stdlib.
"""

import sys, json, re, time, gzip, os
import urllib.request, urllib.parse, urllib.error

# Request headers attached to every page download made by fetch().
# The values imitate a desktop Chrome browser (see the User-Agent string) and
# state a preference for Argentine-Spanish content.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'es-AR,es;q=0.9,en;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Cache-Control': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
}


def fetch(url):
    """
    Download *url* and return the response body as text.

    The request is sent with the module-level HEADERS and a 25 second
    timeout. Because those headers advertise both gzip and deflate, both
    encodings are undone here:

    * gzip is detected from the body's magic bytes (0x1f 0x8b);
    * deflate is detected from the Content-Encoding response header and is
      tried first as a zlib stream, then as a raw deflate stream.

    If a body labelled "deflate" cannot be inflated it is left untouched
    rather than raising. The bytes are decoded as UTF-8 with undecodable
    sequences replaced.

    Exceptions raised by urllib.request.urlopen (for example
    urllib.error.HTTPError) and by gzip.decompress propagate to the caller.
    """
    import zlib  # function-scope import: nothing else in the file needs it

    req = urllib.request.Request(url, headers=HEADERS)
    with urllib.request.urlopen(req, timeout=25) as r:
        raw = r.read()
        content_encoding = (r.headers.get('Content-Encoding') or '').strip().lower()

    if len(raw) > 2 and raw[:2] == b'\x1f\x8b':
        raw = gzip.decompress(raw)
    elif content_encoding == 'deflate':
        # Positive wbits expects a zlib header, negative wbits a raw stream.
        for wbits in (zlib.MAX_WBITS, -zlib.MAX_WBITS):
            try:
                raw = zlib.decompress(raw, wbits)
                break
            except zlib.error:
                continue
    return raw.decode('utf-8', errors='replace')


def get(obj, *keys):
    """
    Follow *keys* one after another into nested containers.

    Each key is applied with ``[]`` to the result of the previous step, so
    dict keys and list indices can be mixed freely. Returns the value that
    is reached, or None as soon as a step fails with KeyError, IndexError
    or TypeError. With no keys, *obj* itself is returned.
    """
    current = obj
    try:
        for step in keys:
            current = current[step]
    except (KeyError, IndexError, TypeError):
        return None
    return current


# ── ZonaProp extraction ───────────────────────────────────────────────────────

def _decode_zonaprop_array(raw):
    """Parse a bracket-delimited slice as a JSON list of postings, or return None."""
    # The text is tried as-is FIRST: if the page embeds plain JSON, removing
    # backslashes would corrupt escapes such as \n or \u00e9 inside strings.
    # Only when that fails is the slice treated as JSON nested in a string.
    decoders = (
        lambda s: s,                                          # plain JSON
        lambda s: json.loads('"' + s + '"'),                  # exact JSON-string unescape
        lambda s: re.sub(r'\\(.)', r'\1', s),                 # drop every backslash
        lambda s: s.replace('\\"', '"').replace('\\/', '/'),  # minimal unescape
    )
    for decode in decoders:
        try:
            arr = json.loads(decode(raw))
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError; move on to the next decoder.
            continue
        if isinstance(arr, list) and arr and 'postingId' in str(arr[0]):
            return arr
    return None


def _zonaprop_array_at(html, idx):
    """Extract the postings array that starts within 30 chars after *idx*, or None."""
    bracket = html.find('[', idx)
    if bracket == -1 or bracket - idx > 30:
        return None
    depth, i = 0, bracket
    limit = min(len(html), bracket + 10_000_000)
    while i < limit:
        ch = html[i]
        if ch == '\\':
            # Skip the escaped character so an escaped bracket is not counted.
            i += 2
            continue
        if ch == '[':
            depth += 1
        elif ch == ']':
            depth -= 1
            if depth == 0:
                return _decode_zonaprop_array(html[bracket:i + 1])
        i += 1
    return None


def find_zonaprop_listings(html):
    """
    Find the ``listPostings`` array in raw page HTML.

    Every occurrence of the text ``listPostings`` is examined in order. For
    each one, the ``[`` that follows within 30 characters is bracket-matched
    (backslash-escaped characters are skipped) and the resulting slice is
    parsed, first as plain JSON and then as JSON that was itself embedded in
    a string (double-encoded).

    Returns the first decoded non-empty list whose first element mentions
    ``postingId``, or None when no occurrence yields one.

    NOTE(review): brackets inside string values are still counted by the
    matcher, so a posting whose text contains an unbalanced ``[`` or ``]``
    may defeat the extraction.
    """
    key = 'listPostings'
    idx = html.find(key)
    while idx != -1:
        arr = _zonaprop_array_at(html, idx)
        if arr is not None:
            return arr
        # An earlier mention that is not the data array must not hide a later one.
        idx = html.find(key, idx + len(key))
    return None


def extract_zonaprop(l):
    """
    Flatten one raw ZonaProp posting dict into the common listing record.

    The result is a dict with the keys id, title, type, price, currency,
    address, area_m2, rooms, lat, lng, url, img and source (always
    'ZonaProp'). Missing values come back as None or an empty string, and
    currency defaults to 'USD'.
    """
    # Coordinates share one nested prefix.
    geo_path = ('postingLocation', 'postingGeolocation', 'geolocation')
    lat = get(l, *geo_path, 'latitude')
    lon = get(l, *geo_path, 'longitude')

    # Address: the 'house' entry wins over 'postingLocation'.
    address = get(l, 'house', 'address', 'name')
    if not address:
        address = get(l, 'postingLocation', 'address', 'name') or ''

    # Price: two known layouts, each given as (root key, amount field names).
    price, currency = None, 'USD'
    price_layouts = (
        ('priceOperationTypes', ('amount',)),
        ('operations', ('price', 'amount')),
    )
    for root, amount_fields in price_layouts:
        try:
            amount = None
            for field in amount_fields:
                amount = get(l, root, 0, 'prices', 0, field)
                if amount:
                    break
            unit = get(l, root, 0, 'prices', 0, 'currency') or 'USD'
        except Exception:
            continue
        if amount:
            price, currency = amount, unit
            break

    # Area: first of the two feature ids whose value converts to float.
    features = l.get('mainFeatures') or {}
    area = None
    for feature_id in ('CFT100', 'CFT101'):
        raw_area = get(features, feature_id, 'value')
        if not raw_area:
            continue
        try:
            area = float(raw_area)
        except Exception:
            continue
        break

    # Rooms: left as None when the value is missing or not an integer.
    rooms = None
    raw_rooms = get(features, 'CFT1', 'value')
    if raw_rooms:
        try:
            rooms = int(raw_rooms)
        except Exception:
            pass

    # Image: larger rendition first, then the smaller one, then house.image.
    img = get(l, 'visiblePictures', 'pictures', 0, 'url730x532')
    if not img:
        img = get(l, 'visiblePictures', 'pictures', 0, 'url360x266')
    if not img:
        img = get(l, 'house', 'image')

    # Site-relative links are made absolute.
    url = l.get('url', '')
    if url and not url.startswith('http'):
        url = 'https://www.zonaprop.com.ar' + url

    prop_type = get(l, 'realEstateType', 'name') or ''

    # Title: explicit title, generated title, else "<type> en <first address part>".
    title = l.get('title') or l.get('generatedTitle')
    if not title:
        suffix = ' en ' + address.split(',')[0] if address else ''
        title = prop_type + suffix

    record = {}
    record['id'] = str(l.get('postingId') or l.get('id', ''))
    record['title'] = str(title).strip()
    record['type'] = prop_type
    record['price'] = price
    record['currency'] = currency
    record['address'] = address
    record['area_m2'] = area
    record['rooms'] = rooms
    record['lat'] = float(lat) if lat else None
    record['lng'] = float(lon) if lon else None
    record['url'] = url
    record['img'] = img
    record['source'] = 'ZonaProp'
    return record


# ── ArgenProp extraction ──────────────────────────────────────────────────────

def find_argenprop_listings(html):
    """
    Locate the array of listing objects embedded in an ArgenProp page.

    Three strategies are attempted in order; the first one that produces a
    result wins:

    1. the JSON payload of the ``<script id="__NEXT_DATA__">`` tag,
    2. any ``<script>`` body of at least 300 characters that is itself a
       JSON object or array,
    3. bracket-matching the array that follows one of a few known key names.

    Returns ``(listings, label)`` where *label* names the strategy that
    succeeded, or ``(None, None)`` when nothing was found.
    """
    # 1) __NEXT_DATA__ script tag
    next_data = re.search(
        r'<script[^>]+id=["\']__NEXT_DATA__["\'][^>]*>(.*?)</script>',
        html, re.DOTALL | re.IGNORECASE)
    if next_data:
        try:
            found = _walk(json.loads(next_data.group(1)))
            if found:
                return found, 'next_data'
        except Exception:
            pass

    # 2) any sizeable <script> whose whole body is JSON
    for block in re.finditer(r'<script(?:[^>]*)>(.*?)</script>', html, re.DOTALL):
        body = block.group(1).strip()
        if len(body) < 300 or not body.startswith(('{', '[')):
            continue
        try:
            found = _walk(json.loads(body))
            if found:
                return found, 'script_block'
        except Exception:
            pass

    # 3) bracket-match the array right after a known key
    for key in ('"listings"', '"results"', '"properties"', '"items"', '"avisos"'):
        key_at = html.find(key)
        if key_at == -1:
            continue
        start = html.find('[', key_at)
        if start == -1 or start - key_at > 20:
            continue
        stop = min(len(html), start + 5_000_000)
        nesting = 0
        pos = start
        while pos < stop:
            char = html[pos]
            if char == '\\':
                # Jump over the escaped character.
                pos += 2
                continue
            if char == '[':
                nesting += 1
            elif char == ']':
                nesting -= 1
                if nesting == 0:
                    chunk = html[start:pos + 1]
                    # Backslash-stripped form first, then the text as found.
                    for text in (re.sub(r'\\(.)', r'\1', chunk), chunk):
                        try:
                            parsed = json.loads(text)
                            if isinstance(parsed, list) and parsed and _is_listing(parsed[0]):
                                return parsed, f'bracket_{key}'
                        except Exception:
                            pass
                    break
            pos += 1

    return None, None


def _is_listing(o):
    """
    Heuristically decide whether *o* looks like a single property listing.

    Only dicts qualify. The lower-cased key names are joined into one string
    and searched for fragments, so matching is by substring: the dict needs
    an id-like fragment plus either a price-like or a place-like fragment.
    """
    if not isinstance(o, dict):
        return False

    haystack = ' '.join(str(name).lower() for name in o)

    def mentions(*fragments):
        return any(fragment in haystack for fragment in fragments)

    if not mentions('id', 'postingid', 'propertyid', 'codigo'):
        return False
    return (mentions('price', 'precio', 'operation', 'valor')
            or mentions('address', 'location', 'geo', 'ciudad', 'barrio'))


def _walk(obj, depth=0):
    """
    Search nested JSON data depth-first for the first list of listings.

    A list qualifies when its first element passes _is_listing(). For a
    list that does not qualify, only its first three items are searched.
    For a dict, a handful of well-known key names are checked directly
    before its values are searched in order. Recursion stops below depth 10.

    Returns the qualifying list, or None.
    """
    if obj is None or depth > 10:
        return None

    if isinstance(obj, list):
        if obj and _is_listing(obj[0]):
            return obj
        children = obj[:3]
    elif isinstance(obj, dict):
        for name in ('listings', 'results', 'properties', 'list', 'items',
                     'data', 'avisos', 'inmuebles', 'postings', 'listPostings'):
            candidate = obj.get(name)
            if isinstance(candidate, list) and candidate and _is_listing(candidate[0]):
                return candidate
        children = obj.values()
    else:
        # Scalars cannot contain a listing array.
        return None

    for child in children:
        hit = _walk(child, depth + 1)
        if hit:
            return hit
    return None


def extract_argenprop(l):
    """
    Flatten one raw ArgenProp listing dict into the common listing record.

    Several alternative key names (English and Spanish) are probed for each
    field. The result is a dict with the keys id, title, type, price,
    currency, address, area_m2, rooms, lat, lng, url, img and source
    (always 'ArgenProp').

    Optional numeric fields (area_m2, rooms, lat, lng) become None when the
    raw value is missing or not numeric, so one malformed field does not
    make the whole listing fail. Site-relative links are made absolute
    whether or not they start with a slash.
    """
    def _num(value, cast):
        """Return cast(value), or None when value is empty or not convertible."""
        if not value:
            return None
        try:
            return cast(value)
        except (TypeError, ValueError, OverflowError):
            return None

    # Coordinates: a nested geo object first, then flat keys on the listing.
    geo = (l.get('geolocation') or l.get('geo') or
           l.get('coordinates') or l.get('location') or {})
    if isinstance(geo, dict):
        lat = geo.get('lat') or geo.get('latitude') or geo.get('y')
        lon = geo.get('lng') or geo.get('longitude') or geo.get('lon') or geo.get('x')
    else:
        lat = lon = None
    if not lat:
        lat = l.get('lat') or l.get('latitude')
    if not lon:
        lon = l.get('lng') or l.get('longitude')

    # Address: a ready-made string, or assembled from the parts of a dict.
    loc = l.get('address') or l.get('location') or l.get('ubicacion') or {}
    if isinstance(loc, str):
        address = loc
    elif isinstance(loc, dict):
        address = (loc.get('name') or loc.get('display') or
                   ', '.join(filter(None, [
                       str(loc.get('street', '') or ''),
                       str(loc.get('streetNumber', '') or ''),
                       str(loc.get('neighborhood', '') or loc.get('barrio', '') or ''),
                       str(loc.get('city', '') or loc.get('ciudad', '') or loc.get('cityName', '') or ''),
                   ])))
    else:
        address = ''

    # Price: a dict or bare number under price/precio, else the nested layouts.
    price, currency = None, 'USD'
    p = l.get('price') or l.get('precio')
    if isinstance(p, dict):
        price = p.get('amount') or p.get('value') or p.get('monto')
        currency = p.get('currency') or p.get('moneda') or 'USD'
    elif isinstance(p, (int, float)):
        price = p
    if not price:
        # get() already turns failed lookups into None, so no try is needed.
        price = (get(l, 'operations', 0, 'prices', 0, 'price') or
                 get(l, 'priceOperationTypes', 0, 'prices', 0, 'amount'))
        currency = (get(l, 'operations', 0, 'prices', 0, 'currency') or
                    get(l, 'priceOperationTypes', 0, 'prices', 0, 'currency') or 'USD')

    # Area: converted to float only when the record is built.
    area = (l.get('totalArea') or l.get('area_total') or l.get('area') or
            l.get('surface') or l.get('superficie') or l.get('coveredArea'))
    if not area:
        area = get(l, 'mainFeatures', 'CFT100', 'value') or get(l, 'features', 'totalArea')

    rooms = (l.get('rooms') or l.get('ambientes') or l.get('bedrooms') or
             l.get('roomsAmount') or l.get('habitaciones'))

    # Property type: may be a plain value or a dict carrying a name.
    prop_type = (l.get('propertyType') or l.get('tipo') or
                 get(l, 'realEstateType', 'name') or
                 l.get('type') or l.get('tipoInmueble') or '')
    if isinstance(prop_type, dict):
        prop_type = prop_type.get('name') or prop_type.get('nombre') or ''

    # Image: photos may be a list of dicts, a list of URLs, or a single URL.
    photos = l.get('photos') or l.get('fotos') or l.get('images') or []
    img = None
    if isinstance(photos, str):
        img = photos
    elif isinstance(photos, (list, tuple)) and photos:
        first = photos[0]
        if isinstance(first, dict):
            img = (first.get('url') or first.get('image') or
                   first.get('src') or first.get('thumb'))
        elif isinstance(first, str):
            img = first
    if not img:
        img = l.get('mainPhoto') or l.get('thumbnail') or l.get('image')

    # URL: exactly one slash between host and path, with or without a leading '/'.
    url = l.get('url') or l.get('slug') or l.get('link') or l.get('permalink') or ''
    if not isinstance(url, str):
        url = ''
    elif url and not url.startswith('http'):
        url = 'https://www.argenprop.com/' + url.lstrip('/')

    title = (l.get('title') or l.get('titulo') or l.get('name') or
             str(prop_type) + (' en ' + str(address).split(',')[0] if address else ''))

    return {
        'id':       str(l.get('id') or l.get('propertyId') or l.get('codigo') or ''),
        'title':    str(title).strip(),
        'type':     str(prop_type).strip(),
        'price':    price,
        'currency': str(currency).strip() if currency else 'USD',
        'address':  str(address).strip(),
        'area_m2':  _num(area, float),
        'rooms':    _num(rooms, int),
        'lat':      _num(lat, float),
        'lng':      _num(lon, float),
        'url':      url,
        'img':      img,
        'source':   'ArgenProp',
    }


# ── Total count ───────────────────────────────────────────────────────────────

def total_from_html(html, is_zona):
    """
    Guess the total number of search results advertised in a page.

    A fixed list of patterns is tried in order (JSON-style counters first,
    then visible text such as "1.234 propiedades"). For each pattern only
    its first match is considered; '.' and ',' separators are removed
    before conversion. The first value strictly between 1 and 500000 is
    returned, otherwise None. *is_zona* is accepted but not used.
    """
    for pattern in (
        r'"totalCount"\s*:\s*(\d+)',
        r'"total"\s*:\s*(\d+)',
        r'"totalResults"\s*:\s*(\d+)',
        r'"count"\s*:\s*(\d+)',
        r'(\d[\d.]+)\s*(?:propiedades|resultado|aviso|inmueble)',
    ):
        hit = re.search(pattern, html, re.IGNORECASE)
        if hit is None:
            continue
        digits = hit.group(1).replace('.', '').replace(',', '')
        count = int(digits)
        if 1 < count < 500000:
            return count
    return None


# ── Pagination URL builder ────────────────────────────────────────────────────

def page_url(base_url, page, is_zona):
    """
    Build the URL of result page *page* (1-based) for a search.

    Page 1 is always *base_url* itself. For ZonaProp the marker
    ``-pagina-N`` is inserted before the trailing ``.html`` only; a
    ``.html`` occurring elsewhere in the URL is left alone. For ArgenProp
    ``--pagina-N`` is appended after removing any trailing slash.

    NOTE(review): a ZonaProp URL without the ``.html`` suffix gets
    ``-pagina-N.html`` appended so that each page yields a distinct URL
    instead of silently re-requesting page 1 — confirm the site accepts
    this form.
    """
    if page == 1:
        return base_url
    if is_zona:
        suffix = '.html'
        if base_url.endswith(suffix):
            stem = base_url[:-len(suffix)]
        else:
            stem = base_url.rstrip('/')
        return f'{stem}-pagina-{page}{suffix}'
    # ArgenProp pattern: URL--pagina-N
    return base_url.rstrip('/') + f'--pagina-{page}'


# ── Main scraper ──────────────────────────────────────────────────────────────

def scrape(url, max_pages=200):
    """
    Download up to *max_pages* result pages for a search URL.

    The site (ZonaProp or ArgenProp) is chosen from the URL. Each page is
    fetched, its listings are extracted into flat records, and records are
    de-duplicated by id (falling back to a hash of the URL). Progress is
    printed as the scrape runs.

    Paging stops at the first of: an HTTP or network error, a page with no
    recognisable listings, a page that adds no new listings (the site is
    repeating results), a page with fewer than 10 raw items, or *max_pages*.

    Returns the list of listing dicts collected so far. Exits the process
    with status 1 when the URL belongs to neither supported site.
    """
    is_zona  = 'zonaprop.com.ar' in url
    is_argen = 'argenprop.com'   in url

    if not is_zona and not is_argen:
        print('ERROR: URL must be from zonaprop.com.ar or argenprop.com')
        sys.exit(1)

    # Clean URL. The query string and fragment go first, so the '$'-anchored
    # rules below still match when the user pasted e.g. '...-map.html?x=1'.
    url = re.sub(r'[?#].*$', '', url)
    url = re.sub(r'-map\.html$', '.html', url)
    if is_zona:
        url = re.sub(r'-pagina-\d+\.html$', '.html', url)
    else:
        url = re.sub(r'--pagina-\d+$', '', url)

    source      = 'ZonaProp' if is_zona else 'ArgenProp'
    find_fn     = find_zonaprop_listings if is_zona else lambda h: find_argenprop_listings(h)[0]
    extract_fn  = extract_zonaprop       if is_zona else extract_argenprop

    all_listings = []
    seen_ids     = set()

    print(f'\nSource : {source}')
    print(f'URL    : {url}')
    print(f'Max    : {max_pages} pages (~{max_pages*30} listings)\n')

    for page in range(1, max_pages + 1):
        purl = page_url(url, page, is_zona)
        print(f'  Page {page:3d}  {purl.split("/")[-1][:65]}', end='  ', flush=True)

        try:
            html = fetch(purl)
        except urllib.error.HTTPError as e:
            if e.code == 404 and page > 1:
                print('404 — end of results')
                break
            print(f'HTTP {e.code} — stop')
            break
        except Exception as e:
            print(f'Error: {e} — stop')
            break

        # Show total on first page
        if page == 1:
            total = total_from_html(html, is_zona)
            if total:
                pages_needed = -(-total // 30)   # ceiling division, 30 per page
                print()
                print(f'  Total results: {total:,}  (~{pages_needed} pages to get all)')
                if max_pages < pages_needed:
                    print(f'  Fetching {max_pages} of {pages_needed} pages '
                          f'(~{max_pages*30} of {total:,} listings)')
                print()
                # Re-print the page prefix: the summary above broke the line.
                print(f'  Page {page:3d}  {purl.split("/")[-1][:65]}', end='  ', flush=True)

        raw_list = find_fn(html)
        if not raw_list:
            print('no listings found — stop')
            if page == 1:
                print(f'\n  DEBUG: has __NEXT_DATA__: {"__NEXT_DATA__" in html}')
                print(f'  DEBUG: has listPostings: {"listPostings" in html}')
                print(f'  DEBUG: has "listings": {chr(34)+"listings"+chr(34) in html}')
                print(f'  DEBUG: page size: {len(html)//1024} KB')
                print('  TIP: Try opening the URL in your browser and check if listings load.')
                print('       ArgenProp may require different URL format — see examples in script header.')
            break

        new = []
        skipped = 0
        for item in raw_list:
            try:
                lst = extract_fn(item)
            except Exception:
                # Best effort: one malformed item must not abort the page,
                # but it is counted so the loss shows up in the output.
                skipped += 1
                continue
            lid = lst.get('id') or str(hash(lst.get('url', '')))
            if lid not in seen_ids:
                seen_ids.add(lid)
                new.append(lst)

        all_listings.extend(new)
        with_coords = sum(1 for l in new if l.get('lat'))
        skipped_note = f'  [{skipped} unparseable skipped]' if skipped else ''
        print(f'+{len(new):3d} listings  ({with_coords} with coords)  '
              f'total={len(all_listings)}{skipped_note}')

        if not new:
            # Every item was already seen (or unusable): the site is serving a
            # repeated page, so further requests would only waste time.
            print('  No new listings on this page — done')
            break

        if len(raw_list) < 10:
            print(f'  Last page ({len(raw_list)} items) — done')
            break

        time.sleep(0.6)

    return all_listings


# ── Output ────────────────────────────────────────────────────────────────────

def to_geojson(listings):
    """
    Convert listing records into a GeoJSON FeatureCollection dict.

    Only listings with a truthy 'lat' and 'lng' become Point features;
    coordinates are written as [lng, lat] and every other key goes into
    the feature's properties. A 'meta' member reports how many listings
    were given, mapped, and left out for lack of coordinates.
    """
    def as_feature(record):
        properties = {key: value for key, value in record.items()
                      if key not in ('lat', 'lng')}
        point = {'type': 'Point', 'coordinates': [record['lng'], record['lat']]}
        return {'type': 'Feature', 'geometry': point, 'properties': properties}

    features = [as_feature(record) for record in listings
                if record.get('lat') and record.get('lng')]

    total = len(listings)
    mapped = len(features)
    return {
        'type': 'FeatureCollection',
        'features': features,
        'meta': {
            'total':     total,
            'mapped':    mapped,
            'no_coords': total - mapped,
        },
    }


def make_filename(url, default='listings'):
    """
    Derive an output file name ending in '.geojson' from a search URL.

    The scheme and host are removed, a trailing '.html' is dropped, the rest
    is lower-cased, every character outside [a-z0-9-] becomes '-', runs of
    '-' are collapsed, and the slug is cut to 70 characters.

    A dash left at the end by the cut is removed. When nothing usable
    remains (for example the URL is only a host name), *default* is used so
    the result is never the hidden file '.geojson'.
    """
    slug = re.sub(r'https?://[^/]+/?', '', url)
    slug = re.sub(r'\.html$', '', slug)
    slug = re.sub(r'[^a-z0-9-]', '-', slug.lower())
    slug = re.sub(r'-+', '-', slug).strip('-')
    slug = slug[:70].rstrip('-')
    return (slug or default) + '.geojson'


if __name__ == '__main__':
    # Command-line entry point: argv[1] is the search URL and the optional
    # argv[2] caps the number of pages (default 200).
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)

    url = sys.argv[1]

    # Validate the page cap here so a typo gives a clear message instead of
    # a traceback or a misleading "No listings found".
    try:
        max_pages = int(sys.argv[2]) if len(sys.argv) > 2 else 200
    except ValueError:
        print(f'ERROR: max_pages must be a whole number, got {sys.argv[2]!r}')
        sys.exit(1)
    if max_pages < 1:
        print('ERROR: max_pages must be at least 1')
        sys.exit(1)

    listings = scrape(url, max_pages)

    with_coords = sum(1 for l in listings if l.get('lat'))
    no_coords   = len(listings) - with_coords

    print(f'\n{"="*52}')
    print(f'  Total scraped : {len(listings):,}')
    print(f'  With coords   : {with_coords:,}')
    print(f'  Without coords: {no_coords:,}')

    if not listings:
        print('\n  No listings found. Tips:')
        print('  - Check URL is a search results page (not a single listing)')
        print('  - For ArgenProp, URL format: argenprop.com/TIPO--en-OPERACION--en-ZONA')
        print('    Example: argenprop.com/casa--en-venta--en-tigre')
        sys.exit(1)

    gj      = to_geojson(listings)
    outfile = make_filename(url)

    with open(outfile, 'w', encoding='utf-8') as f:
        json.dump(gj, f, ensure_ascii=False)

    size_kb = os.path.getsize(outfile) // 1024
    print(f'\n  Saved → {outfile}  ({size_kb} KB)')
    print('  Load in Cadastre Viewer → GeoJSON layers → Add GeoJSON file(s)')
    print('='*52)
