#!/usr/bin/env python3
"""Regenerate exchange datasets for Arbitrum research.

Outputs:
- arbitrum_portal_exchanges.csv
- arbitrum_llama_exchange_subset.csv
- arbitrum_exchange_sources.csv

The script expects:
- data/raw_arbitrum_portal_projects.json (Portal `/api/projects` dump)
- arbitrum_llama_exchanges.csv (DeFiLlama export)
"""
from __future__ import annotations

import csv
import json
import re
from pathlib import Path

ROOT = Path(__file__).resolve().parents[4]  # repo root
DATA_DIR = ROOT / "docs" / "5_development" / "mev_research" / "datasets"
PORTAL_RAW = ROOT / "data" / "raw_arbitrum_portal_projects.json"
LLAMA_RAW = DATA_DIR / "arbitrum_llama_exchanges.csv"

PORTAL_EXCHANGES = DATA_DIR / "arbitrum_portal_exchanges.csv"
LLAMA_SUBSET = DATA_DIR / "arbitrum_llama_exchange_subset.csv"
MERGED = DATA_DIR / "arbitrum_exchange_sources.csv"

# Portal subcategories that mark a project as an exchange.
EXCHANGE_TAGS = {
    "DEX",
    "DEX Aggregator",
    "Perpetuals",
    "Options",
    "Derivatives",
    "Centralized Exchange",
}
# DeFiLlama categories kept in the exchange subset (compared case-insensitively).
LLAMA_ALLOWED = {"dexs", "dex aggregator", "derivatives", "options"}


def load_portal_projects() -> list[dict]:
    with PORTAL_RAW.open() as f:
        return json.load(f)


def write_portal_exchange_csv(projects: list[dict]) -> list[dict]:
    """Filter Portal projects down to exchanges and write PORTAL_EXCHANGES."""
    records: list[dict] = []
    for project in projects:
        subs = [sub["title"].strip() for sub in project.get("subcategories", [])]
        if not subs:
            continue
        tags = sorted(EXCHANGE_TAGS.intersection(subs))
        if not tags:
            continue
        records.append(
            {
                "name": project.get("title", "").strip(),
                # Coerce to str defensively in case the Portal dump uses numeric IDs.
                "portal_id": str(project.get("id", "")).strip(),
                "portal_exchange_tags": ";".join(tags),
                "portal_subcategories": ";".join(sorted(subs)),
                "chains": ";".join(project.get("chains", [])),
                "portal_url": project.get("url", "").strip(),
            }
        )
    records.sort(key=lambda r: r["name"].lower())
    with PORTAL_EXCHANGES.open("w", newline="") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=[
                "name",
                "portal_id",
                "portal_exchange_tags",
                "portal_subcategories",
                "chains",
                "portal_url",
            ],
        )
        writer.writeheader()
        writer.writerows(records)
    return records


def write_llama_subset() -> list[dict]:
    """Filter the DeFiLlama export to exchange categories and write LLAMA_SUBSET."""
    records: list[dict] = []
    with LLAMA_RAW.open() as f:
        reader = csv.DictReader(f)
        for row in reader:
            category = row["category"].strip()
            if category.lower() not in LLAMA_ALLOWED:
                continue
            records.append(
                {
                    "name": row["name"].strip(),
                    "defillama_slug": row["slug"].strip(),
                    "defillama_category": category,
                    "defillama_tvl": row.get("arbitrum_tvl", "").strip(),
                    "defillama_url": row.get("website", "").strip()
                    or row.get("twitter", "").strip(),
                }
            )
    records.sort(key=lambda r: r["name"].lower())
    with LLAMA_SUBSET.open("w", newline="") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=[
                "name",
                "defillama_slug",
                "defillama_category",
                "defillama_tvl",
                "defillama_url",
            ],
        )
        writer.writeheader()
        writer.writerows(records)
    return records


def _norm(name: str) -> str:
    """Normalize a project name for cross-source matching.

    Drops standalone version suffixes ("v2", "v3", ...) and all
    non-alphanumerics, so e.g. "Uniswap V3" and "uniswap" share a key.
    """
    cleaned = re.sub(r"\bv\d+\b", "", name.lower())
    return re.sub(r"[^a-z0-9]", "", cleaned)


def write_merged_dataset(
    portal_records: list[dict], llama_records: list[dict]
) -> None:
    """Join the two sources on normalized name and write MERGED."""
    # If two records normalize to the same key, the later one wins.
    portal_map = {_norm(row["name"]): row for row in portal_records}
    llama_map = {_norm(row["name"]): row for row in llama_records}
    all_keys = sorted(set(portal_map) | set(llama_map))
    with MERGED.open("w", newline="") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=[
                "canonical_name",
                "sources",
                "portal_id",
                "portal_exchange_tags",
                "portal_subcategories",
                "portal_chains",
                "portal_url",
                "defillama_slug",
                "defillama_category",
                "defillama_tvl",
                "defillama_url",
            ],
        )
        writer.writeheader()
        for key in all_keys:
            portal_row = portal_map.get(key)
            llama_row = llama_map.get(key)
            if portal_row and llama_row:
                # Both sources know this project; prefer the shorter name as
                # canonical (it usually lacks a version suffix).
                name = (
                    portal_row["name"]
                    if len(portal_row["name"]) <= len(llama_row["name"])
                    else llama_row["name"]
                )
                sources = "Portal;DeFiLlama"
            elif portal_row:
                name = portal_row["name"]
                sources = "Portal"
            else:
                name = llama_row["name"]  # type: ignore[union-attr]
                sources = "DeFiLlama"
            writer.writerow(
                {
                    "canonical_name": name,
                    "sources": sources,
                    "portal_id": portal_row.get("portal_id", "")
                    if portal_row
                    else "",
                    "portal_exchange_tags": portal_row.get("portal_exchange_tags", "")
                    if portal_row
                    else "",
                    "portal_subcategories": portal_row.get("portal_subcategories", "")
                    if portal_row
                    else "",
                    "portal_chains": portal_row.get("chains", "")
                    if portal_row
                    else "",
                    "portal_url": portal_row.get("portal_url", "")
                    if portal_row
                    else "",
                    "defillama_slug": llama_row.get("defillama_slug", "")
                    if llama_row
                    else "",
                    "defillama_category": llama_row.get("defillama_category", "")
                    if llama_row
                    else "",
                    "defillama_tvl": llama_row.get("defillama_tvl", "")
                    if llama_row
                    else "",
                    "defillama_url": llama_row.get("defillama_url", "")
                    if llama_row
                    else "",
                }
            )


def main() -> None:
    if not PORTAL_RAW.exists():
        raise FileNotFoundError(
            f"Missing {PORTAL_RAW}. Fetch via "
            f"`curl -s https://portal-data.arbitrum.io/api/projects > {PORTAL_RAW}`"
        )
    if not LLAMA_RAW.exists():
        raise FileNotFoundError(
            f"Missing {LLAMA_RAW}. Pull a fresh DeFiLlama export first."
        )
    portal_records = write_portal_exchange_csv(load_portal_projects())
    llama_records = write_llama_subset()
    write_merged_dataset(portal_records, llama_records)
    print(
        f"Generated {PORTAL_EXCHANGES.name}, {LLAMA_SUBSET.name}, and {MERGED.name}"
    )


if __name__ == "__main__":
    main()