#!/usr/bin/env python3
"""
Regenerate exchange datasets for Arbitrum research.

Outputs:
- arbitrum_portal_exchanges.csv
- arbitrum_llama_exchange_subset.csv
- arbitrum_exchange_sources.csv

The script expects:
- data/raw_arbitrum_portal_projects.json (Portal `/api/projects` dump)
- arbitrum_llama_exchanges.csv (DeFiLlama export)
"""

from __future__ import annotations

import csv
import json
import re
from pathlib import Path

ROOT = Path(__file__).resolve().parents[4]  # repo root
DATA_DIR = ROOT / "docs" / "5_development" / "mev_research" / "datasets"
PORTAL_RAW = ROOT / "data" / "raw_arbitrum_portal_projects.json"
LLAMA_RAW = DATA_DIR / "arbitrum_llama_exchanges.csv"

PORTAL_EXCHANGES = DATA_DIR / "arbitrum_portal_exchanges.csv"
LLAMA_SUBSET = DATA_DIR / "arbitrum_llama_exchange_subset.csv"
MERGED = DATA_DIR / "arbitrum_exchange_sources.csv"

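# Portal subcategory titles that mark a project as an exchange-type venue.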
EXCHANGE_TAGS = {
    "DEX",
    "DEX Aggregator",
    "Perpetuals",
    "Options",
    "Derivatives",
    "Centralized Exchange",
}

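# DeFiLlama categories (compared lowercase) that count as exchanges.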
LLAMA_ALLOWED = {"dexs", "dex aggregator", "derivatives", "options"}


def load_portal_projects() -> list[dict]:
    with PORTAL_RAW.open() as f:
        return json.load(f)


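# Emit one row per Portal project whose subcategories intersect EXCHANGE_TAGS.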
def write_portal_exchange_csv(projects: list[dict]) -> list[dict]:
    records: list[dict] = []
    for project in projects:
        subs = [sub["title"].strip() for sub in project.get("subcategories", [])]
        if not subs:
            continue
        tags = sorted(EXCHANGE_TAGS.intersection(subs))
        if not tags:
            continue
        records.append(
            {
                "name": project.get("title", "").strip(),
                "portal_id": project.get("id", "").strip(),
                "portal_exchange_tags": ";".join(tags),
                "portal_subcategories": ";".join(sorted(subs)),
                "chains": ";".join(project.get("chains", [])),
                "portal_url": project.get("url", "").strip(),
            }
        )

    records.sort(key=lambda r: r["name"].lower())
    with PORTAL_EXCHANGES.open("w", newline="") as f:
        writer = csv.DictWriter(
            f,
            [
                "name",
                "portal_id",
                "portal_exchange_tags",
                "portal_subcategories",
                "chains",
                "portal_url",
            ],
        )
        writer.writeheader()
        writer.writerows(records)
    return records


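# Keep only DeFiLlama rows whose category is in LLAMA_ALLOWED; the export's
# "arbitrum_tvl" column is carried through as defillama_tvl.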
def write_llama_subset() -> list[dict]:
    records: list[dict] = []
    with LLAMA_RAW.open() as f:
        reader = csv.DictReader(f)
        for row in reader:
            category = row["category"].strip()
            if category.lower() not in LLAMA_ALLOWED:
                continue
            records.append(
                {
                    "name": row["name"].strip(),
                    "defillama_slug": row["slug"].strip(),
                    "defillama_category": category,
                    "defillama_tvl": row.get("arbitrum_tvl", "").strip(),
                    "defillama_url": row.get("website", "").strip()
                    or row.get("twitter", "").strip(),
                }
            )
    records.sort(key=lambda r: r["name"].lower())
    with LLAMA_SUBSET.open("w", newline="") as f:
        writer = csv.DictWriter(
            f,
            [
                "name",
                "defillama_slug",
                "defillama_category",
                "defillama_tvl",
                "defillama_url",
            ],
        )
        writer.writeheader()
        writer.writerows(records)
    return records


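# Build a fuzzy join key: lowercase, drop standalone version tokens such as
# "v2"/"v3", then strip non-alphanumerics, so "Uniswap V3" and "uniswap-v3"
# both normalize to "uniswap".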
def _norm(name: str) -> str:
    cleaned = re.sub(r"\bv\d+\b", "", name.lower())
    return re.sub(r"[^a-z0-9]", "", cleaned)


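# Outer-join both sources on the normalized name; when a project appears in
# both, the shorter display name wins (ties prefer the Portal spelling).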
def write_merged_dataset(
    portal_records: list[dict], llama_records: list[dict]
) -> None:
    portal_map = {_norm(row["name"]): row for row in portal_records}
    llama_map = {_norm(row["name"]): row for row in llama_records}
    all_keys = sorted(set(portal_map) | set(llama_map))

    with MERGED.open("w", newline="") as f:
        writer = csv.DictWriter(
            f,
            [
                "canonical_name",
                "sources",
                "portal_id",
                "portal_exchange_tags",
                "portal_subcategories",
                "portal_chains",
                "portal_url",
                "defillama_slug",
                "defillama_category",
                "defillama_tvl",
                "defillama_url",
            ],
        )
        writer.writeheader()
        for key in all_keys:
            portal_row = portal_map.get(key)
            llama_row = llama_map.get(key)
            if portal_row and llama_row:
                name = (
                    portal_row["name"]
                    if len(portal_row["name"]) <= len(llama_row["name"])
                    else llama_row["name"]
                )
                sources = "Portal;DeFiLlama"
            elif portal_row:
                name = portal_row["name"]
                sources = "Portal"
            else:
                name = llama_row["name"]  # type: ignore[union-attr]
                sources = "DeFiLlama"

            writer.writerow(
                {
                    "canonical_name": name,
                    "sources": sources,
                    "portal_id": portal_row.get("portal_id", "") if portal_row else "",
                    "portal_exchange_tags": portal_row.get("portal_exchange_tags", "")
                    if portal_row
                    else "",
                    "portal_subcategories": portal_row.get("portal_subcategories", "")
                    if portal_row
                    else "",
                    "portal_chains": portal_row.get("chains", "") if portal_row else "",
                    "portal_url": portal_row.get("portal_url", "") if portal_row else "",
                    "defillama_slug": llama_row.get("defillama_slug", "")
                    if llama_row
                    else "",
                    "defillama_category": llama_row.get("defillama_category", "")
                    if llama_row
                    else "",
                    "defillama_tvl": llama_row.get("defillama_tvl", "")
                    if llama_row
                    else "",
                    "defillama_url": llama_row.get("defillama_url", "")
                    if llama_row
                    else "",
                }
            )


def main() -> None:
    if not PORTAL_RAW.exists():
        raise FileNotFoundError(
            f"Missing {PORTAL_RAW}. Fetch via `curl -s https://portal-data.arbitrum.io/api/projects > {PORTAL_RAW}`"
        )
    if not LLAMA_RAW.exists():
        raise FileNotFoundError(
            f"Missing {LLAMA_RAW}. Pull fresh DeFiLlama export first."
        )

    portal_records = write_portal_exchange_csv(load_portal_projects())
    llama_records = write_llama_subset()
    write_merged_dataset(portal_records, llama_records)
    print(
        f"Generated {PORTAL_EXCHANGES.name}, {LLAMA_SUBSET.name}, and {MERGED.name}"
    )


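# Typical invocation (the script filename below is assumed; adjust to this
# file's actual path in the repo):
#   curl -s https://portal-data.arbitrum.io/api/projects > data/raw_arbitrum_portal_projects.json
#   python regenerate_exchange_datasets.py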
if __name__ == "__main__":
    main()