-
Notifications
You must be signed in to change notification settings - Fork 5
Pin dataset revisions for reproducibility #39
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,64 @@ | ||
| """Pinned upstream revisions for every dataset/space JudgeArena downloads. | ||
| Pinning lets the run metadata answer "exactly which version of the data did | ||
| this run see?". When upstream rewrites a dataset (e.g. ComparIA periodically | ||
| republishes), an unpinned ``snapshot_download`` will silently start returning | ||
| different bytes; pinned revisions force callers to opt into upgrades. | ||
| To bump a revision, paste the new commit SHA from the dataset's HuggingFace | ||
| revision page (or the GitHub commit page for the FastChat raw URL). | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| # HuggingFace dataset / space revisions. Keys are HuggingFace ``repo_id`` | ||
| # strings; values are commit SHAs. ``None`` is allowed for repos where we | ||
| # do not yet have a stable pin and is recorded as such in the metadata so | ||
| # the gap is visible. | ||
| HF_DATASET_REVISIONS: dict[str, str | None] = { | ||
| # LMArena human-preference battles | ||
| "lmarena-ai/arena-human-preference-100k": "72e85b3ddc9c81bf7b659d6b03d4126dfd8fb34a", | ||
| "lmarena-ai/arena-human-preference-140k": "6322995ab34d7c2693e3f47dd13fa5caa0789a74", | ||
| "lmarena-ai/arena-human-preference-55k": "18c298340948c0e7f7727399fd459cca6ce0ca6f", | ||
| # ComparIA (already pinned via the legacy comparia_revision argument). | ||
| "ministere-culture/comparia-votes": "7a40bce496c1f2aa3be4001da85a49cb4743042b", | ||
| # m-ArenaHard (Cohere release) | ||
| "CohereLabs/m-ArenaHard": "ab393a96cd0b134a1acfa96e080af31e5e73a393", | ||
| "CohereLabs/m-ArenaHard-v2.0": "24c65eff42cec85e30dd5db99d1a702c7ebaa8ab", | ||
| # AlpacaEval instructions / model_outputs (geoalgo redistribution; the | ||
| # repo now redirects to ``judge-arena/judge-arena-dataset`` upstream, but | ||
| # ``snapshot_download`` follows the redirect transparently). | ||
| "geoalgo/llmjudge": "004c4a992956eeefffd36b63ade470f32fd0a582", | ||
| # MT-Bench questions (LMSYS Space). | ||
| "lmsys/mt-bench": "a4b674ca573c24143824ac7f60d9173e7081e37d", | ||
| # Multilingual fluency contexts. | ||
| "geoalgo/multilingual-contexts-to-be-completed": "06e73c95ad18d71a04b5a1b6464ed89d38195039", | ||
| # Arena-Hard official source (used via datasets.load_dataset). | ||
| "lmarena-ai/arena-hard-auto": "15f3746e21432264ce9b453999bde4f3c946d2e6", | ||
| } | ||
|
|
||
|
|
||
| # Raw-URL pins (e.g. FastChat reference answers fetched as a raw GitHub URL). | ||
| # Mapping is "logical name" -> commit SHA on the upstream repo. The downloader | ||
| # rewrites the URL to point at the pinned SHA. | ||
| RAW_URL_REVISIONS: dict[str, str | None] = { | ||
| "lm-sys/FastChat": "587d5cfa1609a43d192cedb8441cac3c17db105d", | ||
| } | ||
|
|
||
|
|
||
def hf_revision(repo_id: str) -> str | None:
    """Look up the commit SHA pinned for a HuggingFace repository.

    Args:
        repo_id: HuggingFace ``repo_id`` string used as a key in
            ``HF_DATASET_REVISIONS``.

    Returns:
        The value recorded for ``repo_id``; ``None`` when the repo has no
        entry in the table or its entry is itself ``None``.
    """
    try:
        return HF_DATASET_REVISIONS[repo_id]
    except KeyError:
        # Unknown repos are reported as "not pinned" rather than as an error.
        return None
|
|
||
|
|
||
def all_dataset_revisions() -> dict[str, str | None]:
    """Build a fresh dictionary holding every pin recorded in this module.

    HuggingFace pins keep their ``repo_id`` as key; raw-URL pins are added
    afterwards under ``"raw:<logical name>"`` so the two tables share one
    flat mapping.

    Used by :func:`judgearena.repro.write_run_metadata` to record the pin
    table alongside each run so future readers know which version of the
    data was visible at the time of the run.

    Returns:
        A new dict; mutating it does not affect the module-level tables.
    """
    pins: dict[str, str | None] = dict(HF_DATASET_REVISIONS)
    for logical_name, sha in RAW_URL_REVISIONS.items():
        pins[f"raw:{logical_name}"] = sha
    return pins
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,55 @@ | ||
| """Filesystem paths and small file-IO helpers anchored at JudgeArena's data root. | ||
|
|
||
| This is a tiny leaf module so it can be imported by every other module | ||
| (including ``judgearena.instruction_dataset``) without pulling in the rest of | ||
| ``judgearena.utils``, which would create an import cycle with the | ||
| ``instruction_dataset`` package. | ||
|
|
||
| Symbols here are re-exported from :mod:`judgearena.utils` for backward | ||
| compatibility, so existing ``from judgearena.utils import data_root`` / | ||
| ``from judgearena.utils import download_hf, read_df`` callers keep working. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import os | ||
| from pathlib import Path | ||
|
|
||
| import pandas as pd | ||
| from huggingface_hub import snapshot_download | ||
|
|
||
| from judgearena.dataset_revisions import hf_revision | ||
|
|
||
|
|
||
| def _data_root_path() -> Path: | ||
| raw = os.environ.get("JUDGEARENA_DATA") or os.environ.get("OPENJURY_DATA") | ||
| if raw: | ||
| return Path(raw).expanduser() | ||
| return Path("~/judgearena-data/").expanduser() | ||
|
|
||
|
|
||
| data_root: Path = _data_root_path() | ||
|
|
||
|
|
||
def download_hf(name: str, local_path: Path) -> None:
    """Fetch AlpacaEval-style instruction/output tables into ``local_path``.

    Args:
        name: Substring selecting which files to fetch; it is wrapped in
            ``*`` wildcards to form the ``allow_patterns`` filter.
        local_path: Destination directory; created (with parents) if it is
            missing.
    """
    source_repo = "geoalgo/llmjudge"
    local_path.mkdir(parents=True, exist_ok=True)
    snapshot_download(
        repo_id=source_repo,
        repo_type="dataset",
        allow_patterns=f"*{name}*",
        local_dir=local_path,
        force_download=False,
        # Use the revision recorded for this repo in the pin table.
        revision=hf_revision(source_repo),
    )
|
|
||
|
|
||
def read_df(filename: Path, **pandas_kwargs) -> pd.DataFrame:
    """Read a CSV, zipped-CSV or parquet dataframe from disk.

    Args:
        filename: Path to a ``.csv``, ``.csv.zip`` or ``.parquet`` file.
        **pandas_kwargs: Forwarded unchanged to ``pd.read_csv`` or
            ``pd.read_parquet``.

    Returns:
        The loaded dataframe.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        ValueError: If the file name does not end in a supported extension.
    """
    # Validation uses explicit raises rather than ``assert`` so the checks
    # still run when Python is started with ``-O`` (which strips asserts).
    if not filename.exists():
        raise FileNotFoundError(f"Dataframe file not found at {filename}")
    if filename.name.endswith((".csv.zip", ".csv")):
        return pd.read_csv(filename, **pandas_kwargs)
    if filename.name.endswith(".parquet"):
        return pd.read_parquet(filename, **pandas_kwargs)
    raise ValueError(f"Unsupported extension {filename}")
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,50 +14,29 @@ | |
| from tqdm.asyncio import tqdm | ||
| from tqdm.contrib.logging import logging_redirect_tqdm | ||
|
|
||
| from judgearena.dataset_revisions import hf_revision | ||
| from judgearena.instruction_dataset.arena_hard import ( | ||
| download_arena_hard, | ||
| is_arena_hard_dataset, | ||
| ) | ||
| from judgearena.log import get_logger | ||
|
|
||
| logger = get_logger(__name__) | ||
|
|
||
|
|
||
| def _data_root_path() -> Path: | ||
| raw = os.environ.get("JUDGEARENA_DATA") or os.environ.get("OPENJURY_DATA") | ||
| if raw: | ||
| return Path(raw).expanduser() | ||
| return Path("~/judgearena-data/").expanduser() | ||
| # ``data_root``, ``download_hf`` and ``read_df`` live in the leaf | ||
| # :mod:`judgearena.paths` module so that ``judgearena.instruction_dataset`` can | ||
| # import them without going through ``judgearena.utils``. We re-export them | ||
| # here so existing callers that do ``from judgearena.utils import data_root`` | ||
| # (or ``download_hf`` / ``read_df``) keep working. | ||
| from judgearena.paths import data_root, download_hf, read_df | ||
|
|
||
| logger = get_logger(__name__) | ||
|
|
||
| data_root = _data_root_path() | ||
| __all__ = ["data_root", "download_hf", "read_df"] | ||
|
|
||
|
|
||
def set_langchain_cache():
    """Register a ``SQLiteCache`` stored at ``data_root / ".langchain.db"``.

    The cache object is handed to ``set_llm_cache``; the database path is
    passed as a plain string.
    """
    cache_file = data_root / ".langchain.db"
    set_llm_cache(SQLiteCache(database_path=str(cache_file)))
|
|
||
|
|
||
| def download_hf(name: str, local_path: Path): | ||
| local_path.mkdir(exist_ok=True, parents=True) | ||
| # downloads the model from huggingface into `local_path` folder | ||
| snapshot_download( | ||
| repo_id="geoalgo/llmjudge", | ||
| repo_type="dataset", | ||
| allow_patterns=f"*{name}*", | ||
| local_dir=local_path, | ||
| force_download=False, | ||
| ) | ||
|
|
||
|
|
||
| def read_df(filename: Path, **pandas_kwargs) -> pd.DataFrame: | ||
| assert filename.exists(), f"Dataframe file not found at {filename}" | ||
| if filename.name.endswith(".csv.zip") or filename.name.endswith(".csv"): | ||
| return pd.read_csv(filename, **pandas_kwargs) | ||
| else: | ||
| assert filename.name.endswith(".parquet"), f"Unsupported extension {filename}" | ||
| return pd.read_parquet(filename, **pandas_kwargs) | ||
|
|
||
|
|
||
| def compute_pref_summary(prefs: pd.Series) -> dict[str, float | int]: | ||
| """Compute win/loss/tie stats for preference series (0=A, 0.5=tie, 1=B).""" | ||
| prefs = pd.Series(prefs, dtype="float64") | ||
|
|
@@ -483,12 +462,14 @@ def download_all(): | |
| else: | ||
| download_hf(name=dataset, local_path=local_path_tables) | ||
|
|
||
| contexts_repo = "geoalgo/multilingual-contexts-to-be-completed" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same here.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Keeping the local variable here intentionally: |
||
| snapshot_download( | ||
| repo_id="geoalgo/multilingual-contexts-to-be-completed", | ||
| repo_id=contexts_repo, | ||
| repo_type="dataset", | ||
| allow_patterns="*", | ||
| local_dir=data_root / "contexts", | ||
| force_download=False, | ||
| revision=hf_revision(contexts_repo), | ||
| ) | ||
|
|
||
| from judgearena.instruction_dataset.mt_bench import download_mt_bench | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We also have m-ArenaHard-v2.1 now! Can be added as a follow-up PR. https://huggingface.co/datasets/CohereLabs/m-ArenaHard-v2.1