Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions judgearena/arenas_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from fast_langdetect import detect_language
from huggingface_hub import snapshot_download

from judgearena.dataset_revisions import hf_revision
from judgearena.log import get_logger

logger = get_logger(__name__)
Expand All @@ -30,11 +31,13 @@ def _load_arena_dataframe(
) -> pd.DataFrame:
assert arena in KNOWN_ARENAS
if arena == "LMArena-55k":
repo_id = "lmarena-ai/arena-human-preference-55k"
path = snapshot_download(
repo_id="lmarena-ai/arena-human-preference-55k",
repo_id=repo_id,
repo_type="dataset",
allow_patterns="*.csv",
force_download=False,
revision=hf_revision(repo_id),
)
df = pd.read_csv(Path(path) / "train.csv")

Expand Down Expand Up @@ -70,11 +73,13 @@ def _winner_55k(row) -> str | None:

elif "LMArena" in arena:
size = arena.split("-")[1] # "100k" or "140k"
repo_id = f"lmarena-ai/arena-human-preference-{size}"
path = snapshot_download(
repo_id=f"lmarena-ai/arena-human-preference-{size}",
repo_id=repo_id,
repo_type="dataset",
allow_patterns="*parquet",
force_download=False,
revision=hf_revision(repo_id),
)
parquet_files = sorted((Path(path) / "data").glob("*.parquet"))
df = pd.concat([pd.read_parquet(f) for f in parquet_files], ignore_index=True)
Expand Down Expand Up @@ -171,9 +176,12 @@ def get_winner(
return df


_DEFAULT_COMPARIA_REVISION = hf_revision("ministere-culture/comparia-votes")


def load_arena_dataframe(
arena: str | None,
comparia_revision: str = "7a40bce496c1f2aa3be4001da85a49cb4743042b",
comparia_revision: str | None = _DEFAULT_COMPARIA_REVISION,
) -> pd.DataFrame:
"""Load battles from one or all arenas.
Expand Down
64 changes: 64 additions & 0 deletions judgearena/dataset_revisions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""Pinned upstream revisions for every dataset/space JudgeArena downloads.
Pinning lets the run metadata answer "exactly which version of the data did
this run see?". When upstream rewrites a dataset (e.g. ComparIA periodically
republishes), an unpinned ``snapshot_download`` will silently start returning
different bytes; pinned revisions force callers to opt into upgrades.
To bump a revision, paste the new commit SHA from the dataset's HuggingFace
revision page (or the GitHub commit page for the FastChat raw URL).
"""

from __future__ import annotations

# HuggingFace dataset / space revisions. Keys are HuggingFace ``repo_id``
# strings; values are commit SHAs. ``None`` is allowed for repos where we
# do not yet have a stable pin and is recorded as such in the metadata so
# the gap is visible.
HF_DATASET_REVISIONS: dict[str, str | None] = {
# LMArena human-preference battles
"lmarena-ai/arena-human-preference-100k": "72e85b3ddc9c81bf7b659d6b03d4126dfd8fb34a",
"lmarena-ai/arena-human-preference-140k": "6322995ab34d7c2693e3f47dd13fa5caa0789a74",
"lmarena-ai/arena-human-preference-55k": "18c298340948c0e7f7727399fd459cca6ce0ca6f",
# ComparIA (already pinned via the legacy comparia_revision argument).
"ministere-culture/comparia-votes": "7a40bce496c1f2aa3be4001da85a49cb4743042b",
# m-ArenaHard (Cohere release)
"CohereLabs/m-ArenaHard": "ab393a96cd0b134a1acfa96e080af31e5e73a393",
"CohereLabs/m-ArenaHard-v2.0": "24c65eff42cec85e30dd5db99d1a702c7ebaa8ab",
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We also have m-ArenaHard-v2.1 now! Can be added as a follow-up PR. https://huggingface.co/datasets/CohereLabs/m-ArenaHard-v2.1

# AlpacaEval instructions / model_outputs (geoalgo redistribution; the
# repo now redirects to ``judge-arena/judge-arena-dataset`` upstream, but
# ``snapshot_download`` follows the redirect transparently).
"geoalgo/llmjudge": "004c4a992956eeefffd36b63ade470f32fd0a582",
# MT-Bench questions (LMSYS Space).
"lmsys/mt-bench": "a4b674ca573c24143824ac7f60d9173e7081e37d",
# Multilingual fluency contexts.
"geoalgo/multilingual-contexts-to-be-completed": "06e73c95ad18d71a04b5a1b6464ed89d38195039",
# Arena-Hard official source (used via datasets.load_dataset).
"lmarena-ai/arena-hard-auto": "15f3746e21432264ce9b453999bde4f3c946d2e6",
}


# Raw-URL pins (e.g. FastChat reference answers fetched as a raw GitHub URL).
# Mapping is "logical name" -> commit SHA on the upstream repo. The downloader
# rewrites the URL to point at the pinned SHA.
# A ``None`` (or empty) value means "not pinned": the MT-Bench reference URL
# builder then falls back to the ``main`` branch instead of a fixed commit.
RAW_URL_REVISIONS: dict[str, str | None] = {
    # Key is the GitHub "owner/repo" slug; it is looked up verbatim when the
    # FastChat GPT-4 reference-answer URL is built.
    "lm-sys/FastChat": "587d5cfa1609a43d192cedb8441cac3c17db105d",
}


def hf_revision(repo_id: str) -> str | None:
    """Look up the revision recorded for a HuggingFace ``repo_id``.

    Returns the value stored in ``HF_DATASET_REVISIONS`` for ``repo_id``.
    The result is ``None`` both when the repo is missing from the table and
    when it is listed with an explicit ``None`` (no stable pin yet).
    """
    try:
        return HF_DATASET_REVISIONS[repo_id]
    except KeyError:
        # Unknown repo: behave like an unpinned one.
        return None


def all_dataset_revisions() -> dict[str, str | None]:
    """Return a fresh dict containing every pin recorded in this module.

    HuggingFace pins keep their ``repo_id`` as key; raw-URL pins are added
    under ``"raw:<logical name>"`` so the two namespaces stay distinguishable.
    The module-level tables themselves are not modified.

    Used by :func:`judgearena.repro.write_run_metadata` to record the
    pin table alongside each run so future readers know which version of
    the data was visible at the time of the run.
    """
    pins: dict[str, str | None] = dict(HF_DATASET_REVISIONS)
    for logical_name, sha in RAW_URL_REVISIONS.items():
        pins[f"raw:{logical_name}"] = sha
    return pins
16 changes: 10 additions & 6 deletions judgearena/instruction_dataset/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def load_instructions(dataset: str, n_instructions: int | None = None) -> pd.Dat
df_instructions = load_mt_bench()

elif (parsed := split_m_arena_hard_dataset(dataset)) is not None:
from judgearena.utils import data_root
from judgearena import utils as judgearena_utils

version_key, lang_or_subset = parsed
logger.info(
Expand All @@ -29,7 +29,9 @@ def load_instructions(dataset: str, n_instructions: int | None = None) -> pd.Dat
lang_or_subset,
)
df_instructions = load_m_arenahard(
local_path=data_root, version=version_key, language=lang_or_subset
local_path=judgearena_utils.data_root,
version=version_key,
language=lang_or_subset,
)
# sort by question_id, then language so that we get multiple languages if we truncate
df_instructions.sort_values(["question_id", "lang"], inplace=True)
Expand All @@ -48,14 +50,16 @@ def load_instructions(dataset: str, n_instructions: int | None = None) -> pd.Dat
"arena-hard-v0.1",
"arena-hard-v2.0",
]
from judgearena.utils import data_root, download_hf, read_df
from judgearena import utils as judgearena_utils

local_path_tables = data_root / "tables"
local_path_tables = judgearena_utils.data_root / "tables"
if is_arena_hard_dataset(dataset):
download_arena_hard(dataset=dataset, local_tables_path=local_path_tables)
else:
download_hf(name=dataset, local_path=local_path_tables)
df_instructions = read_df(local_path_tables / "instructions" / f"{dataset}.csv")
judgearena_utils.download_hf(name=dataset, local_path=local_path_tables)
df_instructions = judgearena_utils.read_df(
local_path_tables / "instructions" / f"{dataset}.csv"
)

df_instructions = df_instructions.set_index("instruction_index").sort_index()
logger.info("Loaded %d instructions for %s.", len(df_instructions), dataset)
Expand Down
3 changes: 3 additions & 0 deletions judgearena/instruction_dataset/arena_hard.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import pandas as pd
from huggingface_hub import snapshot_download

from judgearena.dataset_revisions import hf_revision

ARENA_HARD_HF_REPO_ID = "lmarena-ai/arena-hard-auto"

# Mirrors upstream's `JUDGE_SETTINGS` baseline assignment in
Expand Down Expand Up @@ -87,6 +89,7 @@ def download_arena_hard(dataset: str, local_tables_path: Path) -> None:
f"data/{variant}/model_answer/*.jsonl",
],
force_download=False,
revision=hf_revision(ARENA_HARD_HF_REPO_ID),
)
raw_df = _read_arena_hard_jsonl_frames(
variant_dir=Path(snapshot_root) / "data" / variant
Expand Down
5 changes: 4 additions & 1 deletion judgearena/instruction_dataset/m_arenahard.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
import pandas as pd
from huggingface_hub import snapshot_download

from judgearena.dataset_revisions import hf_revision

EU_LANGUAGES: tuple[str, ...] = (
"cs",
"de",
Expand Down Expand Up @@ -120,6 +122,7 @@ def load_m_arenahard(
allow_patterns="*",
local_dir=local_path / local_subdir,
force_download=False,
revision=hf_revision(repo_id),
)
m_arena_root = local_path / local_subdir

Expand All @@ -146,6 +149,6 @@ def load_m_arenahard(


if __name__ == "__main__":
from judgearena.utils import data_root
from judgearena.paths import data_root

load_m_arenahard(local_path=data_root, version="m-arena-hard-v0.1", language="EU")
21 changes: 16 additions & 5 deletions judgearena/instruction_dataset/mt_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,25 @@
import pandas as pd
from huggingface_hub import snapshot_download

from judgearena.utils import data_root
from judgearena.dataset_revisions import RAW_URL_REVISIONS, hf_revision
from judgearena.paths import data_root

MT_BENCH_SPACE_ID = "lmsys/mt-bench"
MT_BENCH_QUESTION_PATTERN = "data/mt_bench/question.jsonl"
MT_BENCH_MODEL_ANSWER_DIR = Path("data") / "mt_bench" / "model_answer"
FASTCHAT_GPT4_REFERENCE_URL = (
"https://raw.githubusercontent.com/lm-sys/FastChat/main/"
"fastchat/llm_judge/data/mt_bench/reference_answer/gpt-4.jsonl"
)


def _fastchat_reference_url() -> str:
    """Build the raw GitHub URL of the FastChat MT-Bench GPT-4 references.

    The ref segment of the URL is the SHA recorded under ``"lm-sys/FastChat"``
    in ``RAW_URL_REVISIONS``; when that entry is missing, ``None`` or empty,
    the ``main`` branch is used instead.
    """
    pinned_sha = RAW_URL_REVISIONS.get("lm-sys/FastChat")
    ref = pinned_sha or "main"
    repo_root = f"https://raw.githubusercontent.com/lm-sys/FastChat/{ref}/"
    return repo_root + "fastchat/llm_judge/data/mt_bench/reference_answer/gpt-4.jsonl"


FASTCHAT_GPT4_REFERENCE_URL = _fastchat_reference_url()

# Mirrors ``ARENA_HARD_BASELINES`` / ``M_ARENA_HARD_BASELINES``: dataset name ->
# dataset-native pairwise baseline. MT-Bench ships only one variant today, and
Expand Down Expand Up @@ -54,6 +64,7 @@ def _snapshot_mt_bench_files(
allow_patterns=allow_patterns,
local_dir=local_dir,
force_download=False,
revision=hf_revision(MT_BENCH_SPACE_ID),
)
except Exception as e:
raise RuntimeError(
Expand Down
55 changes: 55 additions & 0 deletions judgearena/paths.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""Filesystem paths and small file-IO helpers anchored at JudgeArena's data root.

This is a tiny leaf module so it can be imported by every other module
(including ``judgearena.instruction_dataset``) without pulling in the rest of
``judgearena.utils``, which would create an import cycle with the
``instruction_dataset`` package.

Symbols here are re-exported from :mod:`judgearena.utils` for backward
compatibility, so existing ``from judgearena.utils import data_root`` /
``from judgearena.utils import download_hf, read_df`` callers keep working.
"""

from __future__ import annotations

import os
from pathlib import Path

import pandas as pd
from huggingface_hub import snapshot_download

from judgearena.dataset_revisions import hf_revision


def _data_root_path() -> Path:
raw = os.environ.get("JUDGEARENA_DATA") or os.environ.get("OPENJURY_DATA")
if raw:
return Path(raw).expanduser()
return Path("~/judgearena-data/").expanduser()


data_root: Path = _data_root_path()


def download_hf(name: str, local_path: Path) -> None:
    """Download AlpacaEval-style instruction/output tables into ``local_path``.

    ``local_path`` is created (with parents) if needed. Only files of the
    ``geoalgo/llmjudge`` dataset repo matching ``*{name}*`` are requested, at
    the revision that ``hf_revision`` returns for that repo.
    """
    repo_id = "geoalgo/llmjudge"
    local_path.mkdir(parents=True, exist_ok=True)
    request = {
        "repo_id": repo_id,
        "repo_type": "dataset",
        "allow_patterns": f"*{name}*",
        "local_dir": local_path,
        "force_download": False,
        # Same variable feeds repo_id and the pin lookup so they cannot drift.
        "revision": hf_revision(repo_id),
    }
    snapshot_download(**request)


def read_df(filename: Path, **pandas_kwargs) -> pd.DataFrame:
    """Load a dataframe from a ``.csv``, ``.csv.zip`` or ``.parquet`` file.

    Extra keyword arguments are forwarded unchanged to the pandas reader.
    An ``AssertionError`` is raised when the file does not exist or its name
    ends with none of the supported extensions.
    """
    assert filename.exists(), f"Dataframe file not found at {filename}"
    basename = filename.name
    if basename.endswith((".csv.zip", ".csv")):
        return pd.read_csv(filename, **pandas_kwargs)
    # Anything that is not CSV must be parquet.
    assert basename.endswith(".parquet"), f"Unsupported extension {filename}"
    return pd.read_parquet(filename, **pandas_kwargs)
43 changes: 12 additions & 31 deletions judgearena/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,50 +14,29 @@
from tqdm.asyncio import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm

from judgearena.dataset_revisions import hf_revision
from judgearena.instruction_dataset.arena_hard import (
download_arena_hard,
is_arena_hard_dataset,
)
from judgearena.log import get_logger

logger = get_logger(__name__)


def _data_root_path() -> Path:
raw = os.environ.get("JUDGEARENA_DATA") or os.environ.get("OPENJURY_DATA")
if raw:
return Path(raw).expanduser()
return Path("~/judgearena-data/").expanduser()
# ``data_root``, ``download_hf`` and ``read_df`` live in the leaf
# :mod:`judgearena.paths` module so that ``judgearena.instruction_dataset`` can
# import them without going through ``judgearena.utils``. We re-export them
# here so existing callers that do ``from judgearena.utils import data_root``
# (or ``download_hf`` / ``read_df``) keep working.
from judgearena.paths import data_root, download_hf, read_df

logger = get_logger(__name__)

data_root = _data_root_path()
__all__ = ["data_root", "download_hf", "read_df"]


def set_langchain_cache():
    """Install a SQLite LLM cache stored at ``.langchain.db`` under ``data_root``."""
    cache_db = data_root / ".langchain.db"
    sqlite_cache = SQLiteCache(database_path=str(cache_db))
    set_llm_cache(sqlite_cache)


def download_hf(name: str, local_path: Path):
local_path.mkdir(exist_ok=True, parents=True)
# downloads the model from huggingface into `local_path` folder
snapshot_download(
repo_id="geoalgo/llmjudge",
repo_type="dataset",
allow_patterns=f"*{name}*",
local_dir=local_path,
force_download=False,
)


def read_df(filename: Path, **pandas_kwargs) -> pd.DataFrame:
assert filename.exists(), f"Dataframe file not found at {filename}"
if filename.name.endswith(".csv.zip") or filename.name.endswith(".csv"):
return pd.read_csv(filename, **pandas_kwargs)
else:
assert filename.name.endswith(".parquet"), f"Unsupported extension {filename}"
return pd.read_parquet(filename, **pandas_kwargs)


def compute_pref_summary(prefs: pd.Series) -> dict[str, float | int]:
"""Compute win/loss/tie stats for preference series (0=A, 0.5=tie, 1=B)."""
prefs = pd.Series(prefs, dtype="float64")
Expand Down Expand Up @@ -483,12 +462,14 @@ def download_all():
else:
download_hf(name=dataset, local_path=local_path_tables)

contexts_repo = "geoalgo/multilingual-contexts-to-be-completed"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Keeping the local variable here intentionally: contexts_repo is used both as the repo_id and as the key into hf_revision(...), so the two stay coupled if this repo id changes later. This mirrors the other pinned snapshot_download sites in this PR.

snapshot_download(
repo_id="geoalgo/multilingual-contexts-to-be-completed",
repo_id=contexts_repo,
repo_type="dataset",
allow_patterns="*",
local_dir=data_root / "contexts",
force_download=False,
revision=hf_revision(contexts_repo),
)

from judgearena.instruction_dataset.mt_bench import download_mt_bench
Expand Down