#!/usr/bin/env python3
"""Download every result page of an ArgenProp search and save it as GeoJSON.

Run as a script with the search URL and an optional page limit; the output
file name is derived from the URL.  Only the Python standard library is used.
The (Spanish) command-line help lives in ``USAGE``.
"""

import gzip
import json
import os
import re
import sys
import time
import urllib.error
import urllib.request
import zlib
from html import unescape as html_unescape

# Help text shown when the script is started without arguments.
USAGE = """
argenprop-to-geojson.py
───────────────────────
Descarga TODAS las páginas de una búsqueda de ArgenProp y guarda un GeoJSON.

Uso:
    python3 argenprop-to-geojson.py "URL" [max_paginas]

Ejemplos:
    python3 argenprop-to-geojson.py "https://www.argenprop.com/galpones/alquiler/partido-de-escobar-o-partido-de-malvinas-argentinas-o-partido-de-pilar-o-partido-de-san-miguel-o-partido-de-tigre"
    python3 argenprop-to-geojson.py "https://www.argenprop.com/galpones/venta/partido-de-tigre" 50

Sin dependencias externas — solo Python stdlib.
"""

HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'es-AR,es;q=0.9,en;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Cache-Control': 'no-cache',
}

# idmoneda attribute value -> ISO currency code.
MONEDA = {'1': 'ARS', '2': 'USD', '3': 'ARS'}

BASE_URL = 'https://www.argenprop.com'
PAGE_SIZE = 20            # listings per page, used only for the page estimate
MAX_BLOCK_CHARS = 20000   # per-listing HTML cap handed to extract_block
MIN_FULL_PAGE = 5         # fewer listings than this means "last page"

# NOTE(review): the element names in the patterns below (div/a/h2/p/ul/span/
# img) follow the field comments in extract_block — confirm against live
# ArgenProp markup.
_LISTING_SPLIT_RE = re.compile(r'(?=<div[^>]+class="listing__item)')
_LISTING_ID_RE = re.compile(r'class="listing__item[^"]*"\s+id="(\d+)"')
_CARD_ANCHOR_RE = re.compile(r'<a\b([^>]*class="card[^"]*"[^>]*)>', re.IGNORECASE)
_TITLE_RE = re.compile(
    r'<h2[^>]*class="card__title"[^>]*>(.*?)</h2>', re.DOTALL | re.IGNORECASE)
_ADDRESS_RE = re.compile(
    r'data-card-direccion[^>]*>(.*?)</p>', re.DOTALL | re.IGNORECASE)
_LOCATION_RE = re.compile(
    r'class="card__title--primary"[^>]*>(.*?)</p>', re.DOTALL | re.IGNORECASE)
_FEATURES_RE = re.compile(
    r'class="card__main-features"[^>]*>(.*?)</ul>', re.DOTALL | re.IGNORECASE)
_SPAN_RE = re.compile(r'<span[^>]*>(.*?)</span>', re.DOTALL | re.IGNORECASE)
_INFO_RE = re.compile(
    r'class="card__info[^"]*"[^>]*>(.*?)</p>', re.DOTALL | re.IGNORECASE)
_PHOTOS_RE = re.compile(
    r'class="card__photos"[^>]*data-carousel[^>]*>(.*?)</ul>',
    re.DOTALL | re.IGNORECASE)
_IMG_RE = re.compile(r'<img[^>]*(?:\bsrc|\bdata-src)="([^"]+)"', re.IGNORECASE)
_AREA_RE = re.compile(r'([\d][.\d,]*)\s*m[²2]', re.IGNORECASE)
_AREA_LABELLED_RE = re.compile(
    r'superficie[^:]*:\s*([\d][.\d,]*)\s*m[²2]', re.IGNORECASE)
_TYPE_RE = re.compile(
    r'^/(galpon|deposito|bodega|nave|local|oficina|campo|terreno|casa|departamento)',
    re.IGNORECASE)
_PAGE_SUFFIX_RE = re.compile(r'[?&]pagina-\d+$')


def _decode_body(raw, content_encoding=''):
    """Undo the transfer compression of a response body (gzip or deflate)."""
    # Gzip is detected by magic number so it works even without the header.
    if len(raw) > 2 and raw[:2] == b'\x1f\x8b':
        return gzip.decompress(raw)
    if 'deflate' in (content_encoding or '').lower():
        try:
            return zlib.decompress(raw)
        except zlib.error:
            # Some servers send a raw deflate stream without the zlib header.
            return zlib.decompress(raw, -zlib.MAX_WBITS)
    return raw


def fetch(url):
    """Download *url* with browser-like headers and return the body as text.

    The body is decompressed when needed and decoded as UTF-8, replacing
    undecodable bytes.  Errors from ``urllib`` propagate to the caller.
    """
    req = urllib.request.Request(url, headers=HEADERS)
    with urllib.request.urlopen(req, timeout=30) as r:
        raw = r.read()
        content_encoding = r.headers.get('Content-Encoding', '')
    return _decode_body(raw, content_encoding).decode('utf-8', errors='replace')


def get_attr(html, name):
    """Return the first double-quoted value of attribute *name*, or ''."""
    m = re.search(rf'\b{re.escape(name)}="([^"]*)"', html, re.IGNORECASE)
    return m.group(1) if m else ''


def inner_text(html_fragment):
    """Strip tags from *html_fragment*, decode entities, collapse whitespace."""
    text = re.sub(r'<[^>]+>', ' ', html_fragment)
    # Single-pass decoding: "&amp;lt;" becomes "&lt;", never "<".
    text = html_unescape(text).replace('\xa0', ' ')
    return re.sub(r'\s+', ' ', text).strip()


def _parse_es_number(text):
    """Convert a number written as 1.234,5 ('.' thousands, ',' decimal)."""
    return float(text.replace('.', '').replace(',', '.'))


def _find_area(text, pattern):
    """Return the area captured by *pattern* in *text*, or None."""
    m = pattern.search(text)
    if not m:
        return None
    try:
        return _parse_es_number(m.group(1))
    except ValueError:
        return None


def _first_float_attr(block, names):
    """Return the first attribute in *names* that parses as a float, or None."""
    for name in names:
        value = get_attr(block, name)
        if value:
            try:
                return float(value)
            except ValueError:
                continue
    return None


def _has_coords(prop):
    """True when both latitude and longitude are present (0.0 is valid)."""
    return prop.get('lat') is not None and prop.get('lng') is not None


def extract_block(block):
    """Parse the HTML of one listing into a flat property dict.

    Field locations used by the patterns:
        id        → id attr of <div class="listing__item ...">
        url       → href of the <a class="card ..."> anchor
        price     → montonormalizado attr on the anchor (0 = price on request)
        currency  → idmoneda attr (2=USD, 1=ARS)
        title     → <h2 class="card__title">
        address   → <p data-card-direccion>
        location  → <p class="card__title--primary"> (barrio, partido)
        area_m2   → first <span> with "NNN m²" inside card__main-features
        img       → first <img> src or data-src inside card__photos

    Returns a dict with keys id, title, type, price, currency, address,
    area_m2, rooms, lat, lng, url, img and source.  Missing values are ''
    for id/title/address/url and None for the rest.
    """
    # ── ID ───────────────────────────────────────────────────────────────
    m = _LISTING_ID_RE.search(block)
    listing_id = m.group(1) if m else ''

    # ── Card anchor attrs ────────────────────────────────────────────────
    card_m = _CARD_ANCHOR_RE.search(block)
    card_tag = card_m.group(1) if card_m else ''

    href = get_attr(card_tag, 'href') or get_attr(block[:300], 'href')
    url = (BASE_URL + href) if href and not href.startswith('http') else href

    idmoneda = get_attr(card_tag, 'idmoneda') or '2'
    currency = MONEDA.get(idmoneda, 'USD')

    raw_price = (get_attr(card_tag, 'montonormalizado')
                 or get_attr(card_tag, 'montooperacion'))
    try:
        price = float(raw_price) if raw_price else None
    except ValueError:
        price = None
    if price == 0:
        price = None  # "Consultar precio"

    # ── Title: the property headline, not the agency name ────────────────
    title_m = _TITLE_RE.search(block)
    title = inner_text(title_m.group(1)) if title_m else ''

    # ── Address (street) and location (neighbourhood + partido) ──────────
    addr_m = _ADDRESS_RE.search(block)
    address = inner_text(addr_m.group(1)) if addr_m else ''
    loc_m = _LOCATION_RE.search(block)
    location = inner_text(loc_m.group(1)) if loc_m else ''

    # Full address = street + ", " + location, without repeating the location.
    full_address = address
    if location and location not in address:
        full_address = f'{address}, {location}' if address else location

    # ── Area m²: first feature span mentioning m² ────────────────────────
    area = None
    feat_m = _FEATURES_RE.search(block)
    if feat_m:
        for span in _SPAN_RE.findall(feat_m.group(1)):
            area = _find_area(inner_text(span), _AREA_RE)
            if area is not None:
                break
    # Fallback: description text mentions "superficie ...: NNN m2".
    if not area:
        info_m = _INFO_RE.search(block)
        if info_m:
            labelled = _find_area(inner_text(info_m.group(1)), _AREA_LABELLED_RE)
            if labelled is not None:
                area = labelled

    # ── Property type from URL slug ──────────────────────────────────────
    type_m = _TYPE_RE.search(href)
    prop_type = type_m.group(1).capitalize() if type_m else 'Industrial'

    # ── Image: first src/data-src inside card__photos (photo, not logo) ──
    img = None
    photos_m = _PHOTOS_RE.search(block)
    if photos_m:
        img_m = _IMG_RE.search(photos_m.group(1))
        if img_m:
            img = img_m.group(1)

    # ── Coordinates (embedded in data attrs if present) ──────────────────
    lat = _first_float_attr(
        block, ('data-lat', 'data-latitud', 'data-latitude'))
    lon = _first_float_attr(
        block, ('data-lng', 'data-lon', 'data-longitud', 'data-longitude'))

    return {
        'id': listing_id,
        'title': title,
        'type': prop_type,
        'price': price,
        'currency': currency,
        'address': full_address,
        'area_m2': area,
        'rooms': None,
        'lat': lat,
        'lng': lon,
        'url': url,
        'img': img,
        'source': 'ArgenProp',
    }


def parse_listings(html):
    """Split a results page into listings and parse each one.

    Listings without an id are dropped.  A listing whose parsing raises is
    skipped, but the error is reported on stderr instead of being hidden.
    """
    results = []
    for part in _LISTING_SPLIT_RE.split(html)[1:]:
        try:
            prop = extract_block(part[:MAX_BLOCK_CHARS])
        except Exception as exc:  # best effort: one bad card must not stop the page
            print(f'  [aviso omitido: {exc!r}]', file=sys.stderr)
            continue
        if prop['id']:
            results.append(prop)
    return results


def total_from_html(html):
    """Return the total result count announced by the page, or None."""
    patterns = (
        r'(\d[\d.]*)\s*(?:propiedades?|resultados?|avisos?)\s*encontrad',
        r'Encontramos\s+(\d[\d.]*)',
        r'"totalCount"\s*:\s*(\d+)',
        r'"total"\s*:\s*(\d+)',
    )
    for pat in patterns:
        m = re.search(pat, html, re.IGNORECASE)
        if not m:
            continue
        try:
            n = int(m.group(1).replace('.', ''))
        except ValueError:
            continue
        # Sanity bound: ignore zero and implausibly large numbers.
        if 0 < n < 500_000:
            return n
    return None


def page_url(base, page):
    """Return the URL of result page *page* (1-based) for search URL *base*."""
    if page == 1:
        return base
    base = _PAGE_SUFFIX_RE.sub('', base)
    # Append to an existing query string instead of adding a second '?'.
    sep = '&' if '?' in base else '?'
    return f'{base}{sep}pagina-{page}'


def clean_url(url):
    """Drop the fragment, a trailing page suffix and trailing slashes."""
    url = re.sub(r'#.*$', '', url)
    url = _PAGE_SUFFIX_RE.sub('', url)
    return url.rstrip('/')


def scrape(url, max_pages=200, delay=0.8):
    """Download up to *max_pages* result pages and return the listings found.

    Progress is printed to stdout.  Scraping stops on the first fetch error,
    on a page without listings, on a page with no new ids, or on a page with
    fewer than MIN_FULL_PAGE listings.  *delay* is the pause in seconds
    between pages.  Exits the process with status 1 if *url* is not an
    argenprop.com URL.
    """
    url = clean_url(url)
    if 'argenprop.com' not in url:
        print('ERROR: La URL debe ser de argenprop.com')
        sys.exit(1)

    print(f'\nFuente : ArgenProp')
    print(f'URL : {url}')
    print(f'Máximo : {max_pages} páginas\n')

    all_props = []
    seen_ids = set()

    for page in range(1, max_pages + 1):
        purl = page_url(url, page)
        slug = purl.split('argenprop.com')[-1][:72]
        print(f' Pág {page:3d} {slug}', end=' ', flush=True)

        try:
            html = fetch(purl)
        except urllib.error.HTTPError as e:
            print(f'HTTP {e.code} — {"fin" if e.code in (404, 400) and page > 1 else "error"}')
            break
        except Exception as e:  # network/decoding failure: report and stop
            print(f'Error: {e}')
            break

        if page == 1:
            total = total_from_html(html)
            if total:
                pages_est = -(-total // PAGE_SIZE)  # ceiling division
                print(f'\n Total: {total:,} propiedades (~{pages_est} páginas)')
                if max_pages < pages_est:
                    print(f' Descargando {max_pages} de {pages_est} páginas')
                # Re-print the progress prefix interrupted by the summary.
                print(f'\n Pág {page:3d} {slug}', end=' ', flush=True)

        props = parse_listings(html)
        if not props:
            print('sin listings — deteniendo')
            if page == 1:
                print(f' Página: {len(html)//1024} KB')
                print(f' "listing__item": {"listing__item" in html}')
                print(f' "card__address": {"card__address" in html}')
                print(f'\n Formato: argenprop.com/galpones/alquiler/partido-de-ZONA')
            break

        # Record ids as we go so duplicates within one page are dropped too.
        new = []
        for p in props:
            if p['id'] not in seen_ids:
                seen_ids.add(p['id'])
                new.append(p)
        all_props.extend(new)

        with_coords = sum(1 for p in new if _has_coords(p))
        print(f'+{len(new):3d} ({with_coords} con coords) total={len(all_props)}')

        if not new:
            print(' Sin novedades — fin de resultados')
            break
        if len(props) < MIN_FULL_PAGE:
            print(' Última página — fin')
            break
        time.sleep(delay)

    return all_props


def to_geojson(props):
    """Wrap property dicts in a GeoJSON FeatureCollection.

    Properties without both coordinates get a null geometry.  ``lat``/``lng``
    are moved into the geometry; every other key goes into ``properties``.
    A non-standard ``meta`` member carries the mapped/unmapped counts.
    """
    features = []
    for p in props:
        geometry = None
        if _has_coords(p):
            # GeoJSON positions are [longitude, latitude].
            geometry = {'type': 'Point', 'coordinates': [p['lng'], p['lat']]}
        features.append({
            'type': 'Feature',
            'geometry': geometry,
            'properties': {k: v for k, v in p.items() if k not in ('lat', 'lng')},
        })
    mapped = sum(1 for f in features if f['geometry'])
    return {
        'type': 'FeatureCollection',
        'features': features,
        'meta': {
            'source': 'ArgenProp',
            'total': len(props),
            'mapped': mapped,
            'no_coords': len(props) - mapped,
        },
    }


def make_filename(url):
    """Derive a ``.geojson`` file name (slug of at most 80 chars) from *url*."""
    slug = re.sub(r'https?://[^/]+/?', '', url)
    slug = re.sub(r'[?=&]', '-', slug)
    slug = re.sub(r'[^a-z0-9-]', '-', slug.lower())
    slug = re.sub(r'-+', '-', slug).strip('-')
    # Fall back to a fixed name so an empty slug never yields ".geojson".
    return (slug[:80] or 'argenprop') + '.geojson'


def main(argv=None):
    """Command-line entry point; returns the process exit status."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print(USAGE)
        return 0

    url = argv[1]
    max_pages = 200
    if len(argv) > 2:
        try:
            max_pages = int(argv[2])
        except ValueError:
            print(f'ERROR: max_paginas debe ser un número entero: {argv[2]!r}')
            return 1

    props = scrape(url, max_pages)
    mapped = sum(1 for p in props if _has_coords(p))

    print(f'\n{"="*54}')
    print(f' Total descargados : {len(props):,}')
    print(f' Con coordenadas : {mapped:,}')
    print(f' Sin coordenadas : {len(props) - mapped:,}')

    if not props:
        return 1

    gj = to_geojson(props)
    outfile = make_filename(url)
    with open(outfile, 'w', encoding='utf-8') as f:
        json.dump(gj, f, ensure_ascii=False)

    size_kb = os.path.getsize(outfile) // 1024
    print(f'\n Guardado → {outfile} ({size_kb} KB)')
    print('='*54)
    return 0


if __name__ == '__main__':
    sys.exit(main())