Pin upstream dataset revisions and move the data-root helpers to a leaf module.

  * Add judgearena/dataset_revisions.py: HF_DATASET_REVISIONS and
    RAW_URL_REVISIONS map repo ids to commit SHAs; hf_revision() looks one up
    and all_dataset_revisions() returns the merged table.
  * Pass revision=hf_revision(<repo id>) at each download call site touched in
    arenas_utils, arena_hard, m_arenahard, mt_bench, paths and utils.
  * Build the FastChat GPT-4 reference URL from the pinned commit, falling
    back to "main" when no pin is recorded.
  * Move data_root, download_hf and read_df from judgearena/utils.py into the
    new judgearena/paths.py and re-import them in utils.

NOTE(review): utils.py now defines __all__ with only data_root, download_hf
and read_df, so "from judgearena.utils import *" is limited to those three
names -- confirm that narrowing is intended.
NOTE(review): hf_revision() returns None for a repo id missing from the table,
so a mistyped id is passed on as revision=None instead of failing -- confirm
that is acceptable.

diff --git a/judgearena/arenas_utils.py b/judgearena/arenas_utils.py
index e85750f..ec2754e 100644
--- a/judgearena/arenas_utils.py
+++ b/judgearena/arenas_utils.py
@@ -5,6 +5,7 @@
 from fast_langdetect import detect_language
 from huggingface_hub import snapshot_download
 
+from judgearena.dataset_revisions import hf_revision
 from judgearena.log import get_logger
 
 logger = get_logger(__name__)
@@ -30,11 +31,13 @@ def _load_arena_dataframe(
 ) -> pd.DataFrame:
     assert arena in KNOWN_ARENAS
     if arena == "LMArena-55k":
+        repo_id = "lmarena-ai/arena-human-preference-55k"
         path = snapshot_download(
-            repo_id="lmarena-ai/arena-human-preference-55k",
+            repo_id=repo_id,
             repo_type="dataset",
             allow_patterns="*.csv",
             force_download=False,
+            revision=hf_revision(repo_id),
         )
         df = pd.read_csv(Path(path) / "train.csv")
 
@@ -70,11 +73,13 @@ def _winner_55k(row) -> str | None:
 
     elif "LMArena" in arena:
         size = arena.split("-")[1]  # "100k" or "140k"
+        repo_id = f"lmarena-ai/arena-human-preference-{size}"
         path = snapshot_download(
-            repo_id=f"lmarena-ai/arena-human-preference-{size}",
+            repo_id=repo_id,
             repo_type="dataset",
             allow_patterns="*parquet",
             force_download=False,
+            revision=hf_revision(repo_id),
         )
         parquet_files = sorted((Path(path) / "data").glob("*.parquet"))
         df = pd.concat([pd.read_parquet(f) for f in parquet_files], ignore_index=True)
@@ -171,9 +176,12 @@ def get_winner(
     return df
 
 
+_DEFAULT_COMPARIA_REVISION = hf_revision("ministere-culture/comparia-votes")
+
+
 def load_arena_dataframe(
     arena: str | None,
-    comparia_revision: str = "7a40bce496c1f2aa3be4001da85a49cb4743042b",
+    comparia_revision: str | None = _DEFAULT_COMPARIA_REVISION,
 ) -> pd.DataFrame:
     """Load battles from one or all arenas.
 
diff --git a/judgearena/dataset_revisions.py b/judgearena/dataset_revisions.py
new file mode 100644
index 0000000..eb65178
--- /dev/null
+++ b/judgearena/dataset_revisions.py
@@ -0,0 +1,64 @@
+"""Pinned upstream revisions for every dataset/space JudgeArena downloads.
+
+Pinning lets the run metadata answer "exactly which version of the data did
+this run see?". When upstream rewrites a dataset (e.g. ComparIA periodically
+republishes), an unpinned ``snapshot_download`` will silently start returning
+different bytes; pinned revisions force callers to opt into upgrades.
+
+To bump a revision, paste the new commit SHA from the dataset's HuggingFace
+revision page (or the GitHub commit page for the FastChat raw URL).
+"""
+
+from __future__ import annotations
+
+# HuggingFace dataset / space revisions. Keys are HuggingFace ``repo_id``
+# strings; values are commit SHAs. ``None`` is allowed for repos where we
+# do not yet have a stable pin and is recorded as such in the metadata so
+# the gap is visible.
+HF_DATASET_REVISIONS: dict[str, str | None] = {
+    # LMArena human-preference battles
+    "lmarena-ai/arena-human-preference-100k": "72e85b3ddc9c81bf7b659d6b03d4126dfd8fb34a",
+    "lmarena-ai/arena-human-preference-140k": "6322995ab34d7c2693e3f47dd13fa5caa0789a74",
+    "lmarena-ai/arena-human-preference-55k": "18c298340948c0e7f7727399fd459cca6ce0ca6f",
+    # ComparIA (already pinned via the legacy comparia_revision argument).
+    "ministere-culture/comparia-votes": "7a40bce496c1f2aa3be4001da85a49cb4743042b",
+    # m-ArenaHard (Cohere release)
+    "CohereLabs/m-ArenaHard": "ab393a96cd0b134a1acfa96e080af31e5e73a393",
+    "CohereLabs/m-ArenaHard-v2.0": "24c65eff42cec85e30dd5db99d1a702c7ebaa8ab",
+    # AlpacaEval instructions / model_outputs (geoalgo redistribution; the
+    # repo now redirects to ``judge-arena/judge-arena-dataset`` upstream, but
+    # ``snapshot_download`` follows the redirect transparently).
+    "geoalgo/llmjudge": "004c4a992956eeefffd36b63ade470f32fd0a582",
+    # MT-Bench questions (LMSYS Space).
+    "lmsys/mt-bench": "a4b674ca573c24143824ac7f60d9173e7081e37d",
+    # Multilingual fluency contexts.
+    "geoalgo/multilingual-contexts-to-be-completed": "06e73c95ad18d71a04b5a1b6464ed89d38195039",
+    # Arena-Hard official source (revision consumed by download_arena_hard).
+    "lmarena-ai/arena-hard-auto": "15f3746e21432264ce9b453999bde4f3c946d2e6",
+}
+
+
+# Raw-URL pins (e.g. FastChat reference answers fetched as a raw GitHub URL).
+# Mapping is "logical name" -> commit SHA on the upstream repo. The downloader
+# rewrites the URL to point at the pinned SHA.
+RAW_URL_REVISIONS: dict[str, str | None] = {
+    "lm-sys/FastChat": "587d5cfa1609a43d192cedb8441cac3c17db105d",
+}
+
+
+def hf_revision(repo_id: str) -> str | None:
+    """Return the pinned revision for ``repo_id`` (or ``None`` if not pinned)."""
+    return HF_DATASET_REVISIONS.get(repo_id)
+
+
+def all_dataset_revisions() -> dict[str, str | None]:
+    """Return a copy of every pin recorded in this module.
+
+    Used by :func:`judgearena.repro.write_run_metadata` to record the
+    pin table alongside each run so future readers know which version of
+    the data was visible at the time of the run.
+    """
+    return {
+        **HF_DATASET_REVISIONS,
+        **{f"raw:{k}": v for k, v in RAW_URL_REVISIONS.items()},
+    }
diff --git a/judgearena/instruction_dataset/__init__.py b/judgearena/instruction_dataset/__init__.py
index 7d3185c..1a2955e 100644
--- a/judgearena/instruction_dataset/__init__.py
+++ b/judgearena/instruction_dataset/__init__.py
@@ -20,7 +20,7 @@ def load_instructions(dataset: str, n_instructions: int | None = None) -> pd.Dat
 
         df_instructions = load_mt_bench()
     elif (parsed := split_m_arena_hard_dataset(dataset)) is not None:
-        from judgearena.utils import data_root
+        from judgearena import utils as judgearena_utils
 
         version_key, lang_or_subset = parsed
         logger.info(
@@ -29,7 +29,9 @@ def load_instructions(dataset: str, n_instructions: int | None = None) -> pd.Dat
             lang_or_subset,
         )
         df_instructions = load_m_arenahard(
-            local_path=data_root, version=version_key, language=lang_or_subset
+            local_path=judgearena_utils.data_root,
+            version=version_key,
+            language=lang_or_subset,
         )
         # sort by question_id, then language so that we get multiple languages if we truncate
         df_instructions.sort_values(["question_id", "lang"], inplace=True)
@@ -48,14 +50,16 @@ def load_instructions(dataset: str, n_instructions: int | None = None) -> pd.Dat
             "arena-hard-v0.1",
             "arena-hard-v2.0",
         ]
-        from judgearena.utils import data_root, download_hf, read_df
+        from judgearena import utils as judgearena_utils
 
-        local_path_tables = data_root / "tables"
+        local_path_tables = judgearena_utils.data_root / "tables"
         if is_arena_hard_dataset(dataset):
             download_arena_hard(dataset=dataset, local_tables_path=local_path_tables)
         else:
-            download_hf(name=dataset, local_path=local_path_tables)
-        df_instructions = read_df(local_path_tables / "instructions" / f"{dataset}.csv")
+            judgearena_utils.download_hf(name=dataset, local_path=local_path_tables)
+        df_instructions = judgearena_utils.read_df(
+            local_path_tables / "instructions" / f"{dataset}.csv"
+        )
         df_instructions = df_instructions.set_index("instruction_index").sort_index()
 
     logger.info("Loaded %d instructions for %s.", len(df_instructions), dataset)
diff --git a/judgearena/instruction_dataset/arena_hard.py b/judgearena/instruction_dataset/arena_hard.py
index a6681fe..f00ed4a 100644
--- a/judgearena/instruction_dataset/arena_hard.py
+++ b/judgearena/instruction_dataset/arena_hard.py
@@ -5,6 +5,8 @@
 import pandas as pd
 from huggingface_hub import snapshot_download
 
+from judgearena.dataset_revisions import hf_revision
+
 ARENA_HARD_HF_REPO_ID = "lmarena-ai/arena-hard-auto"
 
 # Mirrors upstream's `JUDGE_SETTINGS` baseline assignment in
@@ -87,6 +89,7 @@ def download_arena_hard(dataset: str, local_tables_path: Path) -> None:
             f"data/{variant}/model_answer/*.jsonl",
         ],
         force_download=False,
+        revision=hf_revision(ARENA_HARD_HF_REPO_ID),
     )
     raw_df = _read_arena_hard_jsonl_frames(
         variant_dir=Path(snapshot_root) / "data" / variant
diff --git a/judgearena/instruction_dataset/m_arenahard.py b/judgearena/instruction_dataset/m_arenahard.py
index 81b4f2e..3d2b1ac 100644
--- a/judgearena/instruction_dataset/m_arenahard.py
+++ b/judgearena/instruction_dataset/m_arenahard.py
@@ -13,6 +13,8 @@
 import pandas as pd
 from huggingface_hub import snapshot_download
 
+from judgearena.dataset_revisions import hf_revision
+
 EU_LANGUAGES: tuple[str, ...] = (
     "cs",
     "de",
@@ -120,6 +122,7 @@ def load_m_arenahard(
         allow_patterns="*",
         local_dir=local_path / local_subdir,
         force_download=False,
+        revision=hf_revision(repo_id),
     )
 
     m_arena_root = local_path / local_subdir
@@ -146,6 +149,6 @@ def load_m_arenahard(
 
 
 if __name__ == "__main__":
-    from judgearena.utils import data_root
+    from judgearena.paths import data_root
 
     load_m_arenahard(local_path=data_root, version="m-arena-hard-v0.1", language="EU")
diff --git a/judgearena/instruction_dataset/mt_bench.py b/judgearena/instruction_dataset/mt_bench.py
index 291e13e..137af0f 100644
--- a/judgearena/instruction_dataset/mt_bench.py
+++ b/judgearena/instruction_dataset/mt_bench.py
@@ -5,15 +5,25 @@
 import pandas as pd
 from huggingface_hub import snapshot_download
 
-from judgearena.utils import data_root
+from judgearena.dataset_revisions import RAW_URL_REVISIONS, hf_revision
+from judgearena.paths import data_root
 
 MT_BENCH_SPACE_ID = "lmsys/mt-bench"
 MT_BENCH_QUESTION_PATTERN = "data/mt_bench/question.jsonl"
 MT_BENCH_MODEL_ANSWER_DIR = Path("data") / "mt_bench" / "model_answer"
-FASTCHAT_GPT4_REFERENCE_URL = (
-    "https://raw.githubusercontent.com/lm-sys/FastChat/main/"
-    "fastchat/llm_judge/data/mt_bench/reference_answer/gpt-4.jsonl"
-)
+
+
+def _fastchat_reference_url() -> str:
+    """Return the GPT-4 reference URL at the pinned FastChat commit, else ``main``."""
+    revision = RAW_URL_REVISIONS.get("lm-sys/FastChat")
+    rev = revision if revision else "main"
+    return (
+        f"https://raw.githubusercontent.com/lm-sys/FastChat/{rev}/"
+        "fastchat/llm_judge/data/mt_bench/reference_answer/gpt-4.jsonl"
+    )
+
+
+FASTCHAT_GPT4_REFERENCE_URL = _fastchat_reference_url()
 
 # Mirrors ``ARENA_HARD_BASELINES`` / ``M_ARENA_HARD_BASELINES``: dataset name ->
 # dataset-native pairwise baseline. MT-Bench ships only one variant today, and
@@ -54,6 +64,7 @@ def _snapshot_mt_bench_files(
             allow_patterns=allow_patterns,
             local_dir=local_dir,
             force_download=False,
+            revision=hf_revision(MT_BENCH_SPACE_ID),
         )
     except Exception as e:
         raise RuntimeError(
diff --git a/judgearena/paths.py b/judgearena/paths.py
new file mode 100644
index 0000000..934d3f3
--- /dev/null
+++ b/judgearena/paths.py
@@ -0,0 +1,55 @@
+"""Filesystem paths and small file-IO helpers anchored at JudgeArena's data root.
+
+This is a tiny leaf module so it can be imported by every other module
+(including ``judgearena.instruction_dataset``) without pulling in the rest of
+``judgearena.utils``, which would create an import cycle with the
+``instruction_dataset`` package.
+
+Symbols here are re-exported from :mod:`judgearena.utils` for backward
+compatibility, so existing ``from judgearena.utils import data_root`` /
+``from judgearena.utils import download_hf, read_df`` callers keep working.
+"""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import pandas as pd
+from huggingface_hub import snapshot_download
+
+from judgearena.dataset_revisions import hf_revision
+
+
+def _data_root_path() -> Path:
+    raw = os.environ.get("JUDGEARENA_DATA") or os.environ.get("OPENJURY_DATA")
+    if raw:
+        return Path(raw).expanduser()
+    return Path("~/judgearena-data/").expanduser()
+
+
+data_root: Path = _data_root_path()
+
+
+def download_hf(name: str, local_path: Path) -> None:
+    """Download AlpacaEval-style instruction/output tables into ``local_path``."""
+    local_path.mkdir(exist_ok=True, parents=True)
+    repo_id = "geoalgo/llmjudge"
+    snapshot_download(
+        repo_id=repo_id,
+        repo_type="dataset",
+        allow_patterns=f"*{name}*",
+        local_dir=local_path,
+        force_download=False,
+        revision=hf_revision(repo_id),
+    )
+
+
+def read_df(filename: Path, **pandas_kwargs) -> pd.DataFrame:
+    """Read a CSV/CSV-zip/parquet dataframe from disk."""
+    assert filename.exists(), f"Dataframe file not found at {filename}"
+    if filename.name.endswith(".csv.zip") or filename.name.endswith(".csv"):
+        return pd.read_csv(filename, **pandas_kwargs)
+    else:
+        assert filename.name.endswith(".parquet"), f"Unsupported extension {filename}"
+        return pd.read_parquet(filename, **pandas_kwargs)
diff --git a/judgearena/utils.py b/judgearena/utils.py
index ccde6b0..0cc1576 100644
--- a/judgearena/utils.py
+++ b/judgearena/utils.py
@@ -14,50 +14,29 @@
 from tqdm.asyncio import tqdm
 from tqdm.contrib.logging import logging_redirect_tqdm
 
+from judgearena.dataset_revisions import hf_revision
 from judgearena.instruction_dataset.arena_hard import (
     download_arena_hard,
     is_arena_hard_dataset,
 )
 from judgearena.log import get_logger
 
-logger = get_logger(__name__)
-
-
-def _data_root_path() -> Path:
-    raw = os.environ.get("JUDGEARENA_DATA") or os.environ.get("OPENJURY_DATA")
-    if raw:
-        return Path(raw).expanduser()
-    return Path("~/judgearena-data/").expanduser()
+# ``data_root``, ``download_hf`` and ``read_df`` live in the leaf
+# :mod:`judgearena.paths` module so that ``judgearena.instruction_dataset`` can
+# import them without going through ``judgearena.utils``. We re-export them
+# here so existing callers that do ``from judgearena.utils import data_root``
+# (or ``download_hf`` / ``read_df``) keep working.
+from judgearena.paths import data_root, download_hf, read_df
 
+logger = get_logger(__name__)
 
-data_root = _data_root_path()
+__all__ = ["data_root", "download_hf", "read_df"]
 
 
 def set_langchain_cache():
     set_llm_cache(SQLiteCache(database_path=str(data_root / ".langchain.db")))
 
 
-def download_hf(name: str, local_path: Path):
-    local_path.mkdir(exist_ok=True, parents=True)
-    # downloads the model from huggingface into `local_path` folder
-    snapshot_download(
-        repo_id="geoalgo/llmjudge",
-        repo_type="dataset",
-        allow_patterns=f"*{name}*",
-        local_dir=local_path,
-        force_download=False,
-    )
-
-
-def read_df(filename: Path, **pandas_kwargs) -> pd.DataFrame:
-    assert filename.exists(), f"Dataframe file not found at {filename}"
-    if filename.name.endswith(".csv.zip") or filename.name.endswith(".csv"):
-        return pd.read_csv(filename, **pandas_kwargs)
-    else:
-        assert filename.name.endswith(".parquet"), f"Unsupported extension {filename}"
-        return pd.read_parquet(filename, **pandas_kwargs)
-
-
 def compute_pref_summary(prefs: pd.Series) -> dict[str, float | int]:
     """Compute win/loss/tie stats for preference series (0=A, 0.5=tie, 1=B)."""
     prefs = pd.Series(prefs, dtype="float64")
@@ -483,12 +462,14 @@ def download_all():
         else:
             download_hf(name=dataset, local_path=local_path_tables)
 
+    contexts_repo = "geoalgo/multilingual-contexts-to-be-completed"
     snapshot_download(
-        repo_id="geoalgo/multilingual-contexts-to-be-completed",
+        repo_id=contexts_repo,
         repo_type="dataset",
         allow_patterns="*",
         local_dir=data_root / "contexts",
         force_download=False,
+        revision=hf_revision(contexts_repo),
     )
 
     from judgearena.instruction_dataset.mt_bench import download_mt_bench