#!/usr/bin/env python3
"""
Cadastre Viewer — Local listings proxy
python3 listings-proxy.py   →   http://localhost:3001

/listings?url=URL   → GeoJSON FeatureCollection
/structure?url=URL  → debug info
/?url=URL           → raw passthrough / health check
"""

import gzip
import html as html_module
import json
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
import zlib
from http.server import BaseHTTPRequestHandler, HTTPServer

# Port the local proxy listens on (loopback only; see __main__).
PORT   = 3001
# Domains the proxy will fetch from; every ?url= is checked against this
# list by substring match before any request goes out.
ALLOWED = ['zonaprop.com.ar', 'argenprop.com', 'nominatim.openstreetmap.org']

# Browser-like request headers sent on every fetch; individual calls can
# override/extend them via fetch(extra_headers=...).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'es-AR,es;q=0.9,en;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Cache-Control': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
}


# ── HTTP fetch ────────────────────────────────────────────────────────────────

def fetch(url, extra_headers=None):
    """GET *url* and return ``(text, content_type)``.

    Sends the browser-like HEADERS (optionally overridden/extended by
    *extra_headers*), decompresses the body if it is gzip- or
    deflate-compressed, and decodes it as UTF-8 with replacement of
    invalid bytes.

    Raises urllib.error.HTTPError / URLError on failures; callers handle
    those themselves.
    """
    h = dict(HEADERS)
    if extra_headers:
        h.update(extra_headers)
    req = urllib.request.Request(url, headers=h)
    with urllib.request.urlopen(req, timeout=25) as r:
        raw = r.read()
        ct  = r.headers.get('Content-Type', 'text/html')
        enc = (r.headers.get('Content-Encoding') or '').lower()
    if raw[:2] == b'\x1f\x8b':
        # gzip magic bytes — trust the payload over the header.
        raw = gzip.decompress(raw)
    elif 'deflate' in enc:
        # HEADERS advertises deflate, so handle it too: try zlib-wrapped
        # first, then raw deflate (servers send either).
        try:
            raw = zlib.decompress(raw)
        except zlib.error:
            raw = zlib.decompress(raw, -zlib.MAX_WBITS)
    return raw.decode('utf-8', errors='replace'), ct


# ── Data extraction strategies ────────────────────────────────────────────────

def strategy_next_data(html):
    """Classic Next.js pages: page data lives in a __NEXT_DATA__ script tag.

    Returns the decoded JSON object, or None when the tag is absent,
    unparseable, or is only the App Router stub ({'namedChunks': ...}).
    """
    tag = re.search(r'<script[^>]+id=["\']__NEXT_DATA__["\'][^>]*>(.*?)</script>',
                    html, re.DOTALL | re.IGNORECASE)
    if tag is None:
        return None
    try:
        payload = json.loads(tag.group(1))
        if list(payload.keys()) == ['namedChunks']:
            # App Router pages ship a minimal stub here — not useful.
            print('  __NEXT_DATA__ is App Router stub (namedChunks only), skipping')
            return None
        print('  Found classic __NEXT_DATA__')
        return payload
    except Exception as exc:
        print(f'  __NEXT_DATA__ parse error: {exc}')
        return None


def strategy_next_f(html):
    """Next.js App Router: self.__next_f.push([1, "rsc_payload"]) chunks.

    Collects every RSC payload string pushed via __next_f, joins them, and
    tries three extraction methods of decreasing precision:
      A) a known listing key followed directly by a JSON array of objects,
      B) per-line "<id>:<json>" RSC records, parsed and walked,
      C) aggressive scan for any array whose first element has an id key.
    Returns {'_direct': [listing, ...]} or None.
    """
    chunks = []
    # Collect all RSC chunk strings
    for m in re.finditer(r'self\.__next_f\.push\(\[(?:1|2),("(?:[^"\\]|\\.)*")\]\)',
                         html, re.DOTALL):
        try:
            text = json.loads(m.group(1))   # decode the JSON-encoded string
            chunks.append(text)
        except Exception:
            pass

    if not chunks:
        print('  No __next_f chunks found')
        return None

    print(f'  Found {len(chunks)} __next_f RSC chunks')
    full = '\n'.join(chunks)

    # Method A: look for "listPostings":[...] directly in the RSC text.
    # BUGFIX: the capture group was unterminated ("(\[{.*?},"), so
    # re.search raised re.error whenever any chunk existed.
    for key in ('listPostings', 'postings', 'results', 'listings', 'items'):
        m = re.search(rf'"{key}"\s*:\s*(\[{{.*?}},)', full, re.DOTALL)
        if m:
            # group 1 starts at the '[' — bracket-match to grab the full array
            start = m.start(1)
            depth, i = 0, start
            while i < len(full):
                if full[i] == '[': depth += 1
                elif full[i] == ']':
                    depth -= 1
                    if depth == 0:
                        try:
                            arr = json.loads(full[start:i+1])
                            if arr and is_listing(arr[0]):
                                print(f'  Found "{key}" array in RSC chunks: {len(arr)} items')
                                return {'_direct': arr}
                        except Exception:
                            pass
                        break
                i += 1

    # Method B: parse each RSC line as JSON and walk for listings
    for line in full.split('\n'):
        m = re.match(r'^\d+:([\[{].*)', line)
        if not m:
            continue
        try:
            val = json.loads(m.group(1))
            lst = walk(val)
            if lst:
                print(f'  Found listing array in RSC line: {len(lst)} items')
                return {'_direct': lst}
        except Exception:
            pass

    # Method C: aggressive JSON extraction from RSC text — find any JSON
    # array whose first element looks like a listing.
    # BUGFIX: this capture group was also unterminated and raised re.error.
    for m in re.finditer(r'(\[{"(?:id|postingId)":)', full):
        start = m.start(1)
        depth, i = 0, start
        while i < min(len(full), start + 500000):   # bound the scan window
            if full[i] == '[': depth += 1
            elif full[i] == ']':
                depth -= 1
                if depth == 0:
                    try:
                        arr = json.loads(full[start:i+1])
                        if arr and is_listing(arr[0]):
                            print(f'  Found listing array via aggressive scan: {len(arr)} items')
                            return {'_direct': arr}
                    except Exception:
                        pass
                    break
            i += 1

    print('  RSC chunks found but no listing array extracted')
    return None


def strategy_escaped_json(html):
    """
    ZonaProp stores data as a doubly-encoded JSON string in a JS variable.
    Raw HTML contains:  listPostings\":[{\"postingId\":\"123\"...}]
    We find listPostings, bracket-match the array treating single backslash
    as an escape prefix, then unescape and parse.
    Returns {'_direct': [listing, ...]} or None.
    """
    key_pos = html.find('listPostings')
    if key_pos == -1:
        return None

    open_pos = html.find('[', key_pos)
    if open_pos == -1 or open_pos - key_pos > 30:
        # No nearby array — the key is not a direct assignment here.
        return None

    # Scan forward balancing brackets; a backslash escapes the next char.
    depth = 0
    pos = open_pos
    end = min(len(html), open_pos + 8_000_000)
    while pos < end:
        c = html[pos]
        if c == '\\':
            pos += 2
            continue
        if c == '[':
            depth += 1
        elif c == ']':
            depth -= 1
            if depth == 0:
                raw = html[open_pos:pos + 1]
                # Unescape \x -> x (covers \" -> " and \/ -> /) …
                unescaped = re.sub(r'\\(.)', r'\1', raw)
                # … plus a gentler variant and the raw text as fallbacks.
                simple = raw.replace('\\"', '"').replace('\\/', '/')
                for candidate in (unescaped, simple, raw):
                    try:
                        arr = json.loads(candidate)
                    except Exception:
                        continue
                    if isinstance(arr, list) and arr and is_listing(arr[0]):
                        print(f'  Found {len(arr)} listings via escaped-JSON strategy')
                        return {'_direct': arr}
                break
        pos += 1
    return None

def strategy_direct_scan(html):

    """
    Direct bracket-matching: find any known listing key in raw HTML
    and extract the full JSON array by counting brackets.
    Works regardless of how the data is embedded (var, window., etc.)

    Returns {'_direct': [listing, ...]} or None.  NOTE: the bracket
    counter below does not skip over string contents, so a ']' inside a
    string value can end the scan early — json.loads then fails and the
    key is simply skipped.
    """
    # Unquoted keys get quotes added at find() time; pre-quoted entries
    # (e.g. '"listings"') are searched for as-is.
    keys_to_try = [
        'listPostings', 'postings', '"listings"', '"results"',
        '"avisos"', '"publicaciones"', '"propiedades"',
    ]
    for key in keys_to_try:
        idx = html.find(f'"{key}"' if not key.startswith('"') else key)
        if idx == -1:
            continue
        # Find the '[' that opens the array after this key
        bracket_pos = html.find('[', idx)
        close_pos   = html.find(']', idx)  # might be closing something else
        if bracket_pos == -1 or (close_pos != -1 and close_pos < bracket_pos):
            continue
        if bracket_pos - idx > 200:
            continue  # too far away, not a direct assignment

        # Bracket-match to find the full array
        depth, i = 0, bracket_pos
        limit = min(len(html), bracket_pos + 5_000_000)  # 5MB max scan
        while i < limit:
            c = html[i]
            if c == '[':   depth += 1
            elif c == ']':
                depth -= 1
                if depth == 0:
                    arr_str = html[bracket_pos:i+1]
                    try:
                        arr = json.loads(arr_str)
                        if isinstance(arr, list) and arr and is_listing(arr[0]):
                            print(f'  Found "{key}" via direct scan: {len(arr)} items')
                            return {'_direct': arr}
                    except json.JSONDecodeError:
                        pass
                    break
            i += 1

    # Also try: find the enclosing object around "listPostings"
    idx = html.find('"listPostings"')
    if idx != -1:
        # Walk back to find the opening '{' of the parent object
        # Look in a window of 10KB before the key
        window = html[max(0, idx-10000):idx+500000]
        # Try to parse as JSON object starting from various '{'
        # (json.loads tolerates trailing garbage only by failing, so most
        # candidates raise and are skipped silently).
        for start_m in re.finditer(r'\{', window[:10000]):
            try:
                candidate = window[start_m.start():]
                # Quick check before full parse
                if '"listPostings"' not in candidate[:100000]:
                    continue
                obj = json.loads(candidate)
                lst = walk(obj)
                if lst:
                    print(f'  Found listings in parent object of listPostings: {len(lst)} items')
                    return {'_direct': lst}
            except Exception:
                pass

    return None


def strategy_zonaprop_api(url, html):
    """
    Last-resort strategy: try ZonaProp's internal search API.

    First scans the page JS for search-API-looking URLs (logged for
    debugging only — they are not called), then hits the known
    /api/v2/search endpoint with the operation type inferred from the
    listing URL slug.

    Returns the decoded JSON response, or None on any failure.
    """
    # Look for API URL pattern in the page HTML/JS (debug aid only).
    api_patterns = [
        r'["\'](/(?:api|nfs|rest|search)[^"\']*search[^"\']*)["\']',
        r'fetch\(["\']([^"\']+search[^"\']+)["\']',
        r'axios\.[a-z]+\(["\']([^"\']+search[^"\']+)["\']',
    ]
    for pat in api_patterns:
        for m in re.finditer(pat, html):
            candidate = m.group(1)
            if 'search' in candidate.lower() and len(candidate) < 200:
                print(f'  Candidate API path: {candidate}')

    # Infer the operation ("venta"/"alquiler") from the URL slug.
    slug = re.sub(r'https?://[^/]+', '', url).strip('/')
    slug = re.sub(r'\.html$', '', slug)
    operation = 'venta' if 'venta' in slug else 'alquiler'

    # Known ZonaProp search API (may still work)
    api_url = 'https://www.zonaprop.com.ar/api/v2/search'
    params = urllib.parse.urlencode({
        'country': 'ar',
        'searchType': '1',
        'operation': operation,
    })
    try:
        resp, _ = fetch(f'{api_url}?{params}', extra_headers={
            'Accept': 'application/json',
            'Referer': url,
            'X-Requested-With': 'XMLHttpRequest',
        })
        data = json.loads(resp)
        if data:
            print('  ZonaProp API responded')
            return data
    except Exception as e:
        print(f'  ZonaProp API attempt failed: {e}')

    return None


def strategy_inline_json(html):
    """Scan all <script> tags for large JSON objects containing listings.

    Two sub-cases per script: the whole body is JSON, or a JS variable is
    assigned a large JSON object.  Returns {'_direct': [...]} or None.
    """
    scripts = re.finditer(r'<script(?:[^>]*)>(.*?)</script>', html, re.DOTALL)
    for idx, script in enumerate(scripts):
        body = script.group(1).strip()
        if len(body) < 300:
            continue  # too small to hold a listings payload

        # Case 1: the entire script body is one JSON document.
        if body[:1] in ('{', '['):
            try:
                hits = walk(json.loads(body))
            except Exception:
                hits = None
            if hits:
                print(f'  Found listings in inline script {idx}')
                return {'_direct': hits}

        # Case 2: a variable assignment whose value is a large JSON object.
        for assign in re.finditer(r'=\s*(\{[^=]{50,}\})\s*[;,\n]', body, re.DOTALL):
            try:
                hits = walk(json.loads(assign.group(1)))
            except Exception:
                continue
            if hits:
                print(f'  Found listings in variable assignment, script {idx}')
                return {'_direct': hits}

    return None


# ── JSON utilities ────────────────────────────────────────────────────────────

def is_listing(o):
    """Heuristic: does dict *o* look like a property listing?

    Joins every key name into one lowercase string and substring-checks it
    for an id-like token plus either a price-like or a location-like token.
    """
    if not isinstance(o, dict):
        return False
    key_blob = ' '.join(str(k).lower() for k in o)
    id_tokens    = ('id', 'postingid', 'propertyid')
    price_tokens = ('price', 'precio', 'operation', 'priceoper')
    place_tokens = ('address', 'location', 'geo', 'city', 'ciudad')
    if not any(t in key_blob for t in id_tokens):
        return False
    return (any(t in key_blob for t in price_tokens)
            or any(t in key_blob for t in place_tokens))


def walk(obj, depth=0):
    """Depth-first search for the first list of listing-like dicts.

    Returns the list, or None if nothing matches within 10 nesting levels.
    """
    if obj is None or depth > 10:
        return None

    if isinstance(obj, list):
        if obj and is_listing(obj[0]):
            return obj
        # Only peek into the first few elements to bound the search.
        for element in obj[:5]:
            hit = walk(element, depth + 1)
            if hit:
                return hit
        return None

    if isinstance(obj, dict):
        # Check well-known container keys before recursing blindly.
        preferred = ('listPostings', 'postings', 'results', 'listings', 'items',
                     'data', 'properties', 'list', 'avisos', 'publicaciones',
                     'searchResults', 'posting', 'realEstate')
        for key in preferred:
            candidate = obj.get(key)
            if isinstance(candidate, list) and candidate and is_listing(candidate[0]):
                return candidate
        for value in obj.values():
            hit = walk(value, depth + 1)
            if hit:
                return hit

    return None


# ── Extractor ─────────────────────────────────────────────────────────────────

def get(obj, *keys):
    """Drill into nested dicts/lists by successive keys/indices.

    Returns the value at the end of the path, or None as soon as any step
    fails (missing key, bad index, non-subscriptable value).
    """
    current = obj
    for key in keys:
        try:
            current = current[key]
        except (KeyError, IndexError, TypeError):
            return None
    return current


def extract(l: dict, source_domain: str) -> dict:
    """Normalize one raw listing dict into the flat record used by to_geojson.

    *source_domain* is checked for the substring 'zonaprop' to pick the
    base URL for relative links and the 'source' label.  Every field access
    goes through get()/dict.get with fallbacks, so missing keys yield
    None/'' rather than raising.
    """
    is_zona = 'zonaprop' in source_domain

    # ── ZonaProp-specific paths (confirmed from /sample) ─────────────────────
    # Coordinates: postingLocation.postingGeolocation.geolocation.latitude/longitude
    # NOTE(review): the `or` chains treat a legitimate 0/0.0 coordinate as
    # missing and fall through to the next candidate — confirm acceptable.
    lat = (get(l,'postingLocation','postingGeolocation','geolocation','latitude')
           or get(l,'geo','lat') or get(l,'geo','latitude')
           or get(l,'address','geoLocation','lat')
           or get(l,'geolocation','lat')
           or l.get('latitude') or l.get('lat'))
    lon = (get(l,'postingLocation','postingGeolocation','geolocation','longitude')
           or get(l,'geo','lon') or get(l,'geo','lng') or get(l,'geo','longitude')
           or get(l,'address','geoLocation','lon') or get(l,'address','geoLocation','lng')
           or get(l,'geolocation','lng') or get(l,'geolocation','lon')
           or l.get('longitude') or l.get('lng') or l.get('lon'))

    # Address: house.address.name has the full readable string
    address = (get(l,'house','address','name')
               or get(l,'postingLocation','address','name')
               or '')
    if address:
        # Clean up the ZonaProp "Type Venta X Zone, City, Province" prefix format
        # e.g. "Casas Venta Barrio ..., La Concepción, Luján, GBA Oeste"
        # → keep as-is, it's readable enough
        pass
    else:
        addr = l.get('address') or l.get('location') or {}
        if isinstance(addr, str):
            address = addr
        else:
            # Build from location hierarchy (city + parent region names)
            loc = get(l,'postingLocation','location') or {}
            city   = loc.get('name','')
            parent = loc.get('parent') or {}
            ciudad = parent.get('name','')
            address = ', '.join(filter(None,[city, ciudad]))

    # Price: try each known layout in turn, first non-None amount wins.
    # The last lambda ignores its argument and closes over `l` directly —
    # it only accepts a scalar top-level 'price'.
    price, currency = None, 'USD'
    for fn in [
        lambda x: (get(x,'priceOperationTypes',0,'prices',0,'amount'),
                   get(x,'priceOperationTypes',0,'prices',0,'currency') or 'USD'),
        lambda x: (get(x,'operations',0,'prices',0,'price') or get(x,'operations',0,'prices',0,'amount'),
                   get(x,'operations',0,'prices',0,'currency') or 'USD'),
        lambda x: (get(x,'price','amount'), get(x,'price','currency') or 'USD'),
        lambda x: (l.get('price'), 'USD') if not isinstance(l.get('price'),dict) else (None,'USD'),
    ]:
        try:
            p, c = fn(l)
            if p is not None: price, currency = p, c; break
        except Exception: pass

    # Area: mainFeatures.CFT100 = superficie total, CFT101 = cubierta
    area = None
    mf = l.get('mainFeatures') or {}
    for feat_id in ('CFT100','CFT101'):
        v = mf.get(feat_id,{}).get('value')
        if v:
            try: area = float(v); break
            except Exception: pass
    if not area:
        # Generic fallbacks: value may be a scalar or a [{'value': ...}] list.
        for f in ('surfaceTotals','totalSurface','area','surface','totalArea','coveredArea'):
            v = l.get(f)
            if v is not None:
                area = v[0].get('value') if isinstance(v,list) and v else (v if not isinstance(v,list) else None)
                if area: break

    # Rooms: mainFeatures.CFT1, then generic keys.
    rooms = None
    if mf.get('CFT1',{}).get('value'):
        try: rooms = int(mf['CFT1']['value'])
        except Exception: pass
    if not rooms:
        rooms = l.get('rooms') or l.get('roomsAmount') or l.get('bedrooms')

    # Image: visiblePictures.pictures[0].url730x532, then smaller/other keys.
    img = (get(l,'visiblePictures','pictures',0,'url730x532')
           or get(l,'visiblePictures','pictures',0,'url360x266')
           or get(l,'house','image'))
    if not img:
        photos = l.get('photos') or []
        if photos and isinstance(photos[0],dict):
            img = photos[0].get('image') or photos[0].get('url')

    # Listing URL: make relative paths absolute against the source site.
    url = l.get('url') or l.get('slug') or l.get('link') or ''
    base = 'https://www.zonaprop.com.ar' if is_zona else 'https://www.argenprop.com'
    if url and not url.startswith('http'):
        url = base + url

    # Title: explicit title fields, else "<type> en <first address part>".
    title = (l.get('title') or l.get('generatedTitle') or l.get('developmentName') or
             (get(l,'realEstateType','name') or '') + (' en ' + address.split(',')[0] if address else ''))

    prop_type = get(l,'realEstateType','name') or ''

    return {
        'id':       str(l.get('postingId') or l.get('id') or l.get('propertyId') or ''),
        'title':    str(title or '').strip(),
        'price':    price,
        'currency': currency,
        'address':  address,
        'area':     area,
        'rooms':    rooms,
        'type':     prop_type,
        'lat':      float(lat) if lat is not None else None,
        'lng':      float(lon) if lon is not None else None,
        'url':      url,
        'img':      img,
        'source':   'ZonaProp' if is_zona else 'ArgenProp',
    }


def geocode(address):
    """Forward-geocode *address* via Nominatim (scoped to Buenos Aires, AR).

    Returns (lat, lon) as floats, or (None, None) on any failure — every
    exception is deliberately swallowed (best-effort lookup).
    """
    try:
        query = urllib.parse.quote(address + ', Buenos Aires, Argentina')
        body, _ = fetch(f'https://nominatim.openstreetmap.org/search?format=json&limit=1&q={query}',
                        extra_headers={'User-Agent': 'CadastreViewer/1.0'})
        hits = json.loads(body)
        if hits:
            return float(hits[0]['lat']), float(hits[0]['lon'])
    except Exception:
        pass
    return None, None


# ── Main parse entry ──────────────────────────────────────────────────────────

def parse_listings(html, url):
    """Run the extraction strategies over *html*; return normalized listings.

    Strategy order: __NEXT_DATA__, __next_f RSC chunks, direct bracket
    scan, inline <script> JSON, then the ZonaProp API probe as a last
    resort.  (NOTE(review): strategy_escaped_json is only wired into the
    /sample endpoint, not into this chain — confirm whether intentional.)
    Listings still missing coordinates are geocoded via Nominatim.
    """
    print(f'  Page size: {len(html)//1024} KB | '
          f'has __NEXT_DATA__: {"__NEXT_DATA__" in html} | '
          f'has __next_f: {"__next_f" in html} | '
          f'has listPostings: {"listPostings" in html}')

    data = None
    for strategy in [strategy_next_data, strategy_next_f, strategy_direct_scan, strategy_inline_json]:
        data = strategy(html)
        if data:
            break
    if not data:
        data = strategy_zonaprop_api(url, html)

    if not data:
        print('  All strategies failed — no listing data found')
        return []

    # Get the raw list: strategies return either {'_direct': [...]} or a
    # nested structure that walk() searches.
    if isinstance(data, dict) and '_direct' in data:
        raw_list = data['_direct'] if isinstance(data['_direct'], list) else walk(data['_direct'])
    else:
        raw_list = walk(data)

    if not raw_list:
        print('  Data found but no listing array identified')
        return []

    print(f'  Extracting {len(raw_list)} items...')
    # Log first item keys and coords fields for debugging
    if raw_list:
        sample = raw_list[0]
        print(f'  Sample listing keys: {list(sample.keys())}')
        # Print any key that might hold coordinates
        for k,v in sample.items():
            if isinstance(v, dict) and any(c in str(v).lower() for c in ('lat','lon','lng','coord','geo')):
                print(f'  Coord-like field [{k}]: {str(v)[:150]}')
        # Print address field
        for k in ('address','location','geo','postingLocation','mapLocation','geoLocation'):
            if k in sample:
                print(f'  [{k}]: {str(sample[k])[:150]}')
    domain = 'zonaprop' if 'zonaprop' in url else 'argenprop'
    listings = []
    for item in raw_list:
        try:
            lst = extract(item, domain)
            if lst.get('title') or lst.get('address'):
                listings.append(lst)
        except Exception:
            pass

    # Geocode every listing that lacks coordinates (no cap is applied;
    # each lookup is followed by a 0.4 s pause to be polite to Nominatim)
    missing = [l for l in listings if not l.get('lat') and l.get('address')]
    if missing:
        print(f'  Geocoding {len(missing)} listings without coordinates...')
        for i, l in enumerate(missing):
            lat, lon = geocode(l['address'])
            if lat:
                l['lat'] = lat
                l['lng'] = lon
                print(f'    geocoded [{i+1}/{len(missing)}]: {l["address"][:50]} -> {lat:.4f},{lon:.4f}')
            time.sleep(0.4)

    mapped   = sum(1 for l in listings if l.get('lat'))
    print(f'  Result: {len(listings)} listings, {mapped} with coordinates')
    return listings


def to_geojson(listings):
    """Convert normalized listings into a GeoJSON FeatureCollection.

    Listings with both coordinates become Point features (GeoJSON order:
    [lng, lat]); the rest are only counted in meta.no_coords.  The lat/lng
    keys are folded into the geometry and dropped from properties.
    """
    features = []
    for l in listings:
        # Compare against None (not truthiness) so a legitimate 0.0
        # coordinate is not silently dropped.
        if l.get('lat') is not None and l.get('lng') is not None:
            features.append({
                'type': 'Feature',
                'geometry': {'type': 'Point', 'coordinates': [l['lng'], l['lat']]},
                'properties': {k: v for k, v in l.items() if k not in ('lat', 'lng')},
            })
    return {
        'type': 'FeatureCollection',
        'features': features,
        'meta': {
            'total':     len(listings),
            'mapped':    len(features),
            'no_coords': len(listings) - len(features),
        },
    }


# ── HTTP handler ──────────────────────────────────────────────────────────────

class Handler(BaseHTTPRequestHandler):
    """Request handler exposing the proxy endpoints (GET + OPTIONS only).

    Routes:
      /listings?url=URL&pages=N  → paginated scrape, GeoJSON out
      /structure?url=URL         → page-structure debug info
      /sample?url=URL            → first 2 raw listing objects
      /?url=URL                  → raw passthrough; no url = health check
    Every response carries permissive CORS headers (see _cors).
    """

    def do_OPTIONS(self):
        # CORS preflight: empty 200 plus the CORS headers.
        self.send_response(200)
        self._cors()
        self.end_headers()

    def do_GET(self):
        parsed = urllib.parse.urlparse(self.path)
        params = urllib.parse.parse_qs(parsed.query)
        path   = parsed.path

        # /listings?url=...&pages=N
        if path == '/listings':
            url = params.get('url', [''])[0]
            if not url:
                return self._json({'error': 'Missing ?url='}, 400)
            if not any(a in url for a in ALLOWED):
                return self._json({'error': 'Domain not allowed'}, 403)
            # NOTE(review): a non-numeric ?pages= raises ValueError here,
            # outside the try below — unhandled 500 rather than a 400.
            max_pages  = min(int(params.get('pages', ['5'])[0]), 50)
            is_zona    = 'zonaprop.com.ar' in url
            # Strip -map suffix, query string, and existing -pagina-N
            url = re.sub(r'-map\.html$', '.html', url)
            url = re.sub(r'[?#].*$', '', url)
            url = re.sub(r'-pagina-\d+\.html$', '.html', url)
            print(f'\n→ /listings  {url}  (max {max_pages} pages)')
            try:
                all_listings = []
                seen_ids     = set()
                for page in range(1, max_pages + 1):
                    if page == 1:
                        page_url = url
                    elif is_zona:
                        # ZonaProp: foo.html → foo-pagina-N.html
                        page_url = url.replace('.html', f'-pagina-{page}.html')
                    else:
                        # ArgenProp: append --pagina-N
                        page_url = url.rstrip('/') + f'--pagina-{page}'
                    print(f'  Page {page}: {page_url.split("/")[-1]}')
                    try:
                        html, _ = fetch(page_url)
                    except urllib.error.HTTPError as e:
                        print(f'  HTTP {e.code} on page {page} — stopping')
                        break
                    listings = parse_listings(html, url)
                    if not listings:
                        print(f'  No listings on page {page} — stopping')
                        break
                    # Deduplicate by id
                    new = [l for l in listings if l.get('id') not in seen_ids]
                    seen_ids.update(l.get('id','') for l in new)
                    all_listings.extend(new)
                    print(f'  Page {page}: {len(new)} new  (total: {len(all_listings)})')
                    # Heuristic: a short page is assumed to be the last one.
                    if len(listings) < 20:
                        print(f'  Last page reached ({len(listings)} < 20)')
                        break
                    time.sleep(0.5)  # be polite

                gj = to_geojson(all_listings)
                print(f'  → {gj["meta"]["mapped"]} mapped / {gj["meta"]["total"]} total')
                return self._json(gj)
            except Exception as e:
                import traceback; traceback.print_exc()
                return self._json({'error': str(e)}, 502)

        # /structure?url=...  — debug
        if path == '/structure':
            url = params.get('url', [''])[0]
            if not url:
                return self._json({'error': 'Missing ?url='}, 400)
            try:
                html, _ = fetch(url)
                info = {
                    'page_kb':          len(html) // 1024,
                    'has___NEXT_DATA__': '__NEXT_DATA__' in html,
                    'has___next_f':     '__next_f' in html,
                    'has_listPostings': 'listPostings' in html,
                    'has_postings':     '"postings"' in html,
                    'has_results':      '"results"' in html,
                    'script_count':     len(re.findall(r'<script', html)),
                    'next_f_chunks':    len(re.findall(r'self\.__next_f\.push', html)),
                }
                # Try to get __NEXT_DATA__ keys
                m = re.search(r'<script[^>]+id=["\']__NEXT_DATA__["\'][^>]*>(.*?)</script>',
                              html, re.DOTALL)
                if m:
                    try:
                        nd = json.loads(m.group(1))
                        info['__NEXT_DATA__keys'] = list(nd.keys())[:20]
                    except Exception:
                        info['__NEXT_DATA__'] = 'parse error'

                # Show context around "listPostings" if present
                lp_idx = html.find('"listPostings"')
                if lp_idx != -1:
                    info['listPostings_context'] = html[max(0,lp_idx-80):lp_idx+200]

                return self._json(info)
            except Exception as e:
                return self._json({'error': str(e)}, 502)

        # /sample?url=...  — return first 2 raw listing objects for debugging
        if path == '/sample':
            url = params.get('url', [''])[0]
            if not url: return self._json({'error': 'Missing ?url='}, 400)
            url = re.sub(r'-map\.html$', '.html', url)
            url = re.sub(r'[?#].*$', '', url)
            try:
                html_s, _ = fetch(url)
                raw_list = None
                # NOTE: this chain includes strategy_escaped_json, which the
                # main parse_listings() chain does not.
                for strategy in [strategy_next_data, strategy_next_f,
                                  strategy_escaped_json, strategy_direct_scan]:
                    data = strategy(html_s)
                    if data:
                        raw_list = data.get('_direct') or walk(data)
                        if raw_list: break
                if not raw_list:
                    return self._json({'error': 'no listings found', 'has_listPostings': 'listPostings' in html_s})
                # Return first 2 items so we can inspect structure
                sample = raw_list[:2]
                return self._json({'count': len(raw_list), 'sample': sample})
            except Exception as e:
                return self._json({'error': str(e)}, 502)

        # / or /?url=...  — health check / raw proxy
        url = params.get('url', [''])[0]
        if not url:
            return self._json({'status': 'running', 'port': PORT,
                               'usage': '/listings?url=URL or /structure?url=URL'})
        if not any(a in url for a in ALLOWED):
            return self._json({'error': 'Domain not allowed'}, 403)
        print(f'\n→ /proxy  {url[:90]}')
        try:
            html, ct = fetch(url)
            body = html.encode('utf-8')
            self.send_response(200)
            self._cors()
            self.send_header('Content-Type', ct.split(';')[0] + '; charset=utf-8')
            self.send_header('Content-Length', str(len(body)))
            self.end_headers()
            self.wfile.write(body)
            print(f'  OK  {len(body)//1024} KB')
        except Exception as e:
            self._json({'error': str(e)}, 502)

    def _cors(self):
        # Permissive CORS so a locally-served frontend can call us freely.
        self.send_header('Access-Control-Allow-Origin',  '*')
        self.send_header('Access-Control-Allow-Methods', 'GET, OPTIONS')
        self.send_header('Access-Control-Allow-Headers', '*')

    def _json(self, data, status=200):
        # Serialize and send *data* as a JSON response with CORS headers.
        # default=str stringifies anything json can't encode natively.
        body = json.dumps(data, ensure_ascii=False, default=str).encode('utf-8')
        self.send_response(status)
        self._cors()
        self.send_header('Content-Type', 'application/json; charset=utf-8')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, *a):
        # Silence the default per-request stderr logging; we print our own.
        pass


if __name__ == '__main__':
    # Bind to loopback only — this proxy is meant for local use.
    server = HTTPServer(('127.0.0.1', PORT), Handler)
    print('=' * 52)
    print(f'  Cadastre proxy      http://localhost:{PORT}')
    print('=' * 52)
    print('  /listings?url=URL   → GeoJSON listings')
    print('  /structure?url=URL  → debug page info')
    print()
    print('  Press Ctrl+C to stop')
    print()
    try:
        # Blocks forever; Ctrl+C raises KeyboardInterrupt for a clean exit.
        server.serve_forever()
    except KeyboardInterrupt:
        print('\nStopped.')
        sys.exit(0)
