#!/usr/bin/env python3
"""
argenprop-to-geojson.py
───────────────────────
Descarga TODAS las páginas de una búsqueda de ArgenProp y guarda un GeoJSON.

Uso:
    python3 argenprop-to-geojson.py "URL" [max_paginas]

Ejemplos:
    python3 argenprop-to-geojson.py "https://www.argenprop.com/galpones/alquiler/partido-de-escobar-o-partido-de-malvinas-argentinas-o-partido-de-pilar-o-partido-de-san-miguel-o-partido-de-tigre"
    python3 argenprop-to-geojson.py "https://www.argenprop.com/galpones/venta/partido-de-tigre" 50

Sin dependencias externas — solo Python stdlib.
"""

import sys, json, re, time, gzip, os
import urllib.request, urllib.error

# Browser-like request headers sent with every page download (see fetch()).
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
    ),
    'Accept':          'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'es-AR,es;q=0.9,en;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Cache-Control':   'no-cache',
}

# Maps the `idmoneda` attribute read from a listing card to a currency code.
# NOTE(review): code '3' also maps to ARS — confirm what that code stands for.
MONEDA = {'1': 'ARS', '2': 'USD', '3': 'ARS'}


def fetch(url):
    """Download *url* and return its body as text.

    The request is sent with the module-level ``HEADERS`` and a 30 s timeout.
    Because those headers advertise ``Accept-Encoding: gzip, deflate``, the
    body is transparently decompressed for both encodings before being
    decoded as UTF-8 (undecodable bytes are replaced, never raised).

    Args:
        url: Absolute URL to download.

    Returns:
        The response body as ``str``.

    Raises:
        urllib.error.HTTPError / urllib.error.URLError: propagated from
            ``urlopen`` on HTTP or connection failures.
        zlib.error / OSError: if a compressed body is corrupt.
    """
    import zlib  # local import: only the deflate branch needs it

    req = urllib.request.Request(url, headers=HEADERS)
    with urllib.request.urlopen(req, timeout=30) as r:
        raw = r.read()
        content_encoding = (r.headers.get('Content-Encoding') or '').strip().lower()

    if len(raw) > 2 and raw[:2] == b'\x1f\x8b':
        # Gzip is detected by its magic number, so it works even when the
        # server omits or mislabels the Content-Encoding header.
        raw = gzip.decompress(raw)
    elif content_encoding == 'deflate' and raw:
        # "deflate" is officially a zlib-wrapped stream, but some servers
        # send a bare DEFLATE stream instead; accept both.
        try:
            raw = zlib.decompress(raw)
        except zlib.error:
            raw = zlib.decompress(raw, -zlib.MAX_WBITS)

    return raw.decode('utf-8', errors='replace')


def get_attr(html, name):
    """Return the value of the first double-quoted ``name="..."`` in *html*.

    Matching is case-insensitive. The attribute name must not be the tail of
    a longer attribute name: asking for ``href`` will not return the value of
    ``data-href`` or ``xlink:href``.

    Args:
        html: HTML text to search (a tag's attribute string or a larger fragment).
        name: Exact attribute name, e.g. ``'href'`` or ``'data-lat'``.

    Returns:
        The raw attribute value (entities are not decoded), or ``''`` when
        the attribute is not present.
    """
    # A plain \b would treat '-' and ':' as word boundaries and therefore
    # match inside "data-href" / "xlink:href"; the lookbehind rules that out.
    pattern = rf'(?<![\w:-]){re.escape(name)}="([^"]*)"'
    m = re.search(pattern, html, re.IGNORECASE)
    return m.group(1) if m else ''


def inner_text(html_fragment):
    """Return the visible text of an HTML fragment as a single clean line.

    Tags are replaced by spaces, character references are decoded exactly
    once, and every run of whitespace (including non-breaking spaces) is
    collapsed to a single space.

    Args:
        html_fragment: HTML markup, possibly containing tags and entities.

    Returns:
        The stripped plain-text content; ``''`` for empty input.
    """
    # Imported locally so this block stays self-contained.
    from html import unescape

    text = re.sub(r'<[^>]+>', ' ', html_fragment)
    # unescape() decodes named and numeric references in one pass, so
    # "&amp;lt;" correctly yields "&lt;", and an out-of-range reference such
    # as "&#99999999;" becomes U+FFFD instead of raising.
    text = unescape(text)
    return re.sub(r'\s+', ' ', text).strip()


def _spanish_number(text):
    """Parse a number written with '.' thousands and ',' decimals; None if malformed."""
    try:
        return float(text.replace('.', '').replace(',', '.'))
    except ValueError:
        return None


def _first_float_attr(fragment, attr_names):
    """Return the first attribute among *attr_names* whose value parses as a float."""
    for attr_name in attr_names:
        raw = get_attr(fragment, attr_name)
        if not raw:
            continue
        try:
            return float(raw)
        except ValueError:
            continue
    return None


def extract_block(block):
    """
    Turn the HTML of one ``listing__item`` card into a flat property dict.

    Where each field is read from:
      id        → digits of the id attribute following class="listing__item…"
      url       → href of the <a class="card…"> anchor (falling back to the
                  first href in the first 300 characters), made absolute
      price     → montonormalizado, else montooperacion, on the card anchor;
                  0, missing or non-numeric values become None
      currency  → idmoneda on the card anchor, looked up in MONEDA
                  (a missing attribute is treated as '2', unknown codes as USD)
      title     → text of <h2 class="card__title">
      address   → text after data-card-direccion, joined with the text of
                  class="card__title--primary" unless already contained in it
      area_m2   → first <span> inside card__main-features mentioning m²/m2,
                  else a "superficie …: NNN m2" phrase in the card__info text
      type      → leading word of the relative href slug, default 'Industrial'
      img       → a src/data-src value from the first <img> carrying one
                  inside the card__photos carousel
      lat, lng  → first parseable data-lat*/data-l(o)ng* attribute, if any

    Returns a dict with keys id, title, type, price, currency, address,
    area_m2, rooms (always None), lat, lng, url, img and source.
    """
    text_flags = re.DOTALL | re.IGNORECASE

    def first_text(pattern):
        # Plain text of the first capture of `pattern` in the block, or ''.
        found = re.search(pattern, block, text_flags)
        return inner_text(found.group(1)) if found else ''

    # Listing id
    id_match = re.search(r'class="listing__item[^"]*"\s+id="(\d+)"', block)
    listing_id = id_match.group(1) if id_match else ''

    # Attributes of the card anchor
    anchor = re.search(r'<a\b([^>]*class="card[^"]*"[^>]*)>', block, re.IGNORECASE)
    card_tag = anchor.group(1) if anchor else ''

    href = get_attr(card_tag, 'href') or get_attr(block[:300], 'href')
    if href and not href.startswith('http'):
        url = 'https://www.argenprop.com' + href
    else:
        url = href

    currency = MONEDA.get(get_attr(card_tag, 'idmoneda') or '2', 'USD')

    raw_price = get_attr(card_tag, 'montonormalizado') or get_attr(card_tag, 'montooperacion')
    price = None
    if raw_price:
        try:
            price = float(raw_price)
        except ValueError:
            price = None
        if price == 0:
            # A zero amount means "Consultar precio".
            price = None

    # Headline, street and neighbourhood texts
    title = first_text(r'<h2[^>]*class="card__title"[^>]*>(.*?)</h2>')
    address = first_text(r'data-card-direccion[^>]*>(.*?)</p>')
    location = first_text(r'class="card__title--primary"[^>]*>(.*?)</p>')

    # Street plus location, without repeating the location
    if not location or location in address:
        full_address = address
    elif address:
        full_address = f'{address}, {location}'
    else:
        full_address = location

    # Surface: main-features list first, description text as fallback
    area = None
    features = re.search(r'class="card__main-features"[^>]*>(.*?)</ul>', block, text_flags)
    if features:
        for span in re.findall(r'<span>(.*?)</span>', features.group(1), re.DOTALL):
            hit = re.search(r'([\d][.\d,]*)\s*m[²2]', inner_text(span), re.IGNORECASE)
            if hit is None:
                continue
            parsed = _spanish_number(hit.group(1))
            if parsed is not None:
                area = parsed
                break
    if not area:
        info = re.search(r'class="card__info[^"]*"[^>]*>(.*?)</p>', block, text_flags)
        if info:
            hit = re.search(r'superficie[^:]*:\s*([\d][.\d,]*)\s*m[²2]', inner_text(info.group(1)), re.IGNORECASE)
            if hit:
                parsed = _spanish_number(hit.group(1))
                if parsed is not None:
                    area = parsed

    # Property type taken from the start of the URL slug
    slug_type = re.search(r'^/(galpon|deposito|bodega|nave|local|oficina|campo|terreno|casa|departamento)',
                          href, re.IGNORECASE)
    prop_type = slug_type.group(1).capitalize() if slug_type else 'Industrial'

    # Photo from the carousel (not the agency logo)
    img = None
    gallery = re.search(r'class="card__photos"[^>]*data-carousel[^>]*>(.*?)</ul>', block, text_flags)
    if gallery:
        photo = re.search(r'<img\b[^>]*(?:\bsrc|\bdata-src)="([^"]+)"', gallery.group(1), re.IGNORECASE)
        if photo:
            img = photo.group(1)

    # Coordinates, when the markup embeds them as data attributes
    lat = _first_float_attr(block, ('data-lat', 'data-latitud', 'data-latitude'))
    lon = _first_float_attr(block, ('data-lng', 'data-lon', 'data-longitud', 'data-longitude'))

    return {
        'id':       listing_id,
        'title':    title,
        'type':     prop_type,
        'price':    price,
        'currency': currency,
        'address':  full_address,
        'area_m2':  area,
        'rooms':    None,
        'lat':      lat,
        'lng':      lon,
        'url':      url,
        'img':      img,
        'source':   'ArgenProp',
    }


def parse_listings(html):
    """Extract every listing card found in a results page.

    The page is split at each ``<div … class="listing__item`` opening tag and
    the first 20 000 characters of every chunk are handed to
    ``extract_block``. Parsing is best-effort: a card that raises is skipped,
    but the failure is reported on stderr so that a change in the site's
    markup does not go unnoticed.

    Args:
        html: Full HTML of one results page.

    Returns:
        A list of property dicts, one per card that produced a non-empty id.
    """
    parts = re.split(r'(?=<div[^>]+class="listing__item)', html)
    results = []
    # parts[0] is whatever precedes the first card, so it is never a listing.
    for part in parts[1:]:
        try:
            prop = extract_block(part[:20000])
        except Exception as exc:
            print(f'  [aviso omitido: {exc!r}]', file=sys.stderr)
            continue
        if prop['id']:
            results.append(prop)
    return results


def total_from_html(html):
    """Guess the total number of results announced by a search page.

    Several textual and JSON-like patterns are tried in order; the first one
    yielding a count strictly between 0 and 500 000 wins. Dots inside the
    number are treated as thousands separators.

    Returns:
        The count as ``int``, or ``None`` when no pattern gives a usable value.
    """
    patterns = (
        r'(\d[\d.]*)\s*(?:propiedades?|resultados?|avisos?)\s*encontrad',
        r'Encontramos\s+(\d[\d.]*)',
        r'"totalCount"\s*:\s*(\d+)',
        r'"total"\s*:\s*(\d+)',
    )
    for pattern in patterns:
        found = re.search(pattern, html, re.IGNORECASE)
        if found is None:
            continue
        try:
            count = int(found.group(1).replace('.', ''))
        except Exception:
            continue
        # Implausible values are ignored and the next pattern gets a chance.
        if 0 < count < 500_000:
            return count
    return None


def page_url(base, page):
    """Build the URL of results page *page* for the search at *base*.

    Page 1 is *base* exactly as given. Any other page gets a ``?pagina-N``
    suffix, replacing one that *base* may already end with.
    """
    if page != 1:
        stem = re.sub(r'\?pagina-\d+$', '', base)
        return f'{stem}?pagina-{page}'
    return base


def clean_url(url):
    """Normalise a search URL: drop the fragment, a trailing ``?pagina-N`` and trailing slashes."""
    # Order matters: the fragment must go first so the page suffix is at the end.
    for junk in (r'#.*$', r'\?pagina-\d+$'):
        url = re.sub(junk, '', url)
    return url.rstrip('/')


def scrape(url, max_pages=200):
    """Download the result pages of a search and collect its listings.

    Pages are requested one after another, starting at page 1, with progress
    printed to stdout. The loop ends when ``max_pages`` is reached, when a
    request fails, when a page has no listings, when a page adds no unseen
    ids, or when a page holds fewer than 5 listings.

    Args:
        url: Search URL; it is normalised with ``clean_url`` first.
        max_pages: Upper bound on the number of pages to request.

    Returns:
        A list of property dicts (as built by ``parse_listings``), with each
        listing id appearing at most once.

    Exits the process with status 1 when the cleaned URL does not contain
    'argenprop.com'.
    """
    url = clean_url(url)
    if 'argenprop.com' not in url:
        print('ERROR: La URL debe ser de argenprop.com'); sys.exit(1)

    print(f'\nFuente : ArgenProp')
    print(f'URL    : {url}')
    print(f'Máximo : {max_pages} páginas\n')

    all_props = []
    # Ids already collected, used to drop listings repeated across pages.
    seen_ids  = set()

    for page in range(1, max_pages + 1):
        purl = page_url(url, page)
        # Path part of the page URL, cut to 72 characters for the progress line.
        slug = purl.split('argenprop.com')[-1][:72]
        print(f'  Pág {page:3d}  {slug}', end='  ', flush=True)

        # Any download failure ends the run; what was collected so far is kept.
        try:
            html = fetch(purl)
        except urllib.error.HTTPError as e:
            # A 404/400 past the first page is reported as "fin", anything else as "error".
            print(f'HTTP {e.code} — {"fin" if e.code in (404, 400) and page > 1 else "error"}')
            break
        except Exception as e:
            print(f'Error: {e}'); break

        if page == 1:
            total = total_from_html(html)
            if total:
                # Ceiling division by 20 — presumably the number of listings per page; confirm.
                pages_est = -(-total // 20)
                print(f'\n  Total: {total:,} propiedades (~{pages_est} páginas)')
                if max_pages < pages_est:
                    print(f'  Descargando {max_pages} de {pages_est} páginas')
                # The summary broke the progress line, so print it again for page 1.
                print(f'\n  Pág {page:3d}  {slug}', end='  ', flush=True)

        props = parse_listings(html)

        if not props:
            print('sin listings — deteniendo')
            if page == 1:
                # Nothing parsed on the very first page: print diagnostics about the HTML received.
                print(f'  Página: {len(html)//1024} KB')
                print(f'  "listing__item": {"listing__item" in html}')
                print(f'  "card__address":  {"card__address" in html}')
                print(f'\n  Formato: argenprop.com/galpones/alquiler/partido-de-ZONA')
            break

        # Keep only the listings whose id has not been seen on an earlier page.
        new = [p for p in props if p['id'] not in seen_ids]
        for p in new: seen_ids.add(p['id'])
        all_props.extend(new)

        # Counts listings with a truthy 'lat' value.
        with_coords = sum(1 for p in new if p.get('lat'))
        print(f'+{len(new):3d}  ({with_coords} con coords)  total={len(all_props)}')

        # A page made up only of already-seen listings is taken as the end of the results.
        if len(new) == 0:
            print('  Sin novedades — fin de resultados'); break

        # A page with fewer than 5 listings is taken to be the last one.
        if len(props) < 5:
            print('  Última página — fin'); break

        # Pause between consecutive page requests.
        time.sleep(0.8)

    return all_props


def to_geojson(props):
    """Convert property dicts into a GeoJSON ``FeatureCollection``.

    Each property becomes one ``Feature``. Its geometry is a ``Point`` in
    ``[lng, lat]`` order when both coordinates are present, otherwise
    ``None``. Every key other than ``lat``/``lng`` is copied into the
    feature's ``properties``.

    Args:
        props: Iterable of property dicts (``lat``/``lng`` keys are optional).

    Returns:
        A dict with ``type``, ``features`` and a ``meta`` summary holding
        ``source``, ``total``, ``mapped`` and ``no_coords``.
    """
    features = []
    for p in props:
        lat, lng = p.get('lat'), p.get('lng')
        # Compare against None rather than truthiness: 0.0 is a valid coordinate.
        has_point = lat is not None and lng is not None
        features.append({
            'type': 'Feature',
            'geometry': {'type': 'Point', 'coordinates': [lng, lat]} if has_point else None,
            'properties': {k: v for k, v in p.items() if k not in ('lat', 'lng')},
        })

    total = len(features)
    mapped = sum(1 for f in features if f['geometry'] is not None)
    return {
        'type': 'FeatureCollection',
        'features': features,
        'meta': {'source': 'ArgenProp', 'total': total,
                 'mapped': mapped, 'no_coords': total - mapped},
    }


def make_filename(url):
    """Derive an output file name (``<slug>.geojson``) from a search URL.

    The scheme and host are dropped, the rest is lower-cased, every character
    outside ``a-z0-9-`` becomes a dash, dash runs are squeezed and trimmed,
    and the slug is cut to 80 characters.
    """
    stem = re.sub(r'https?://[^/]+/?', '', url)
    stem = re.sub(r'[?=&]', '-', stem).lower()
    stem = re.sub(r'[^a-z0-9-]', '-', stem)
    stem = re.sub(r'-+', '-', stem).strip('-')
    return f'{stem[:80]}.geojson'


if __name__ == '__main__':
    # Command-line entry point: argv[1] is the search URL, argv[2] an
    # optional page limit. Without a URL the usage text is shown.
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(0)

    url = sys.argv[1]

    # Validate the page limit up front so a typo yields a clear message
    # instead of a traceback (non-numeric) or a silent empty run (<= 0).
    max_pages = 200
    if len(sys.argv) > 2:
        try:
            max_pages = int(sys.argv[2])
        except ValueError:
            max_pages = 0
        if max_pages < 1:
            sys.exit(f'ERROR: max_paginas debe ser un entero positivo (recibido: {sys.argv[2]!r})')

    props = scrape(url, max_pages)

    # Take the count from the GeoJSON summary so the figures printed here
    # always agree with what is written to the file.
    gj     = to_geojson(props)
    mapped = gj['meta']['mapped']

    print(f'\n{"="*54}')
    print(f'  Total descargados  : {len(props):,}')
    print(f'  Con coordenadas    : {mapped:,}')
    print(f'  Sin coordenadas    : {len(props) - mapped:,}')

    # Nothing downloaded: signal failure and do not write a file.
    if not props:
        sys.exit(1)

    # Name the file after the cleaned URL: scrape() always starts at page 1,
    # so a "?pagina-N" suffix or "#fragment" typed by the user must not leak
    # into the file name.
    outfile = make_filename(clean_url(url))
    with open(outfile, 'w', encoding='utf-8') as f:
        json.dump(gj, f, ensure_ascii=False)

    size_kb = os.path.getsize(outfile) // 1024
    print(f'\n  Guardado → {outfile}  ({size_kb} KB)')
    print('='*54)
