Skip to content

Commit 9248e8c

Browse files
committed
Generate jsonl ists evaluation data
1 parent 7f9ac9f commit 9248e8c

File tree

1 file changed

+42
-0
lines changed

1 file changed

+42
-0
lines changed
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import argparse
2+
from pathlib import Path
3+
4+
import jsonlines
5+
6+
from rsd.experiments.utils import load_summary_benchmarks
7+
8+
# Export the ISTS summary benchmarks of one split into a single JSON Lines
# file, tagging every sample with the benchmark it came from and a
# per-benchmark running id.

# Command-line options: which split to export and an optional per-benchmark cap.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--split",
    type=str,
    default="test",
    help="Dataset split (train or test)",
    choices=["train", "test"],
)
parser.add_argument(
    "--limit-samples",
    type=int,
    default=None,
    help="Maximum number of samples to export per benchmark (default: all)",
)
args = parser.parse_args()

# The output lives four directory levels above this script, in
# ists_evaluation_data/<split>/rsd.jsonl.
out_dir = Path(__file__).parent.parent.parent.parent / "ists_evaluation_data" / args.split
out_dir.mkdir(parents=True, exist_ok=True)
out_file = out_dir / "rsd.jsonl"

# Subset labels, matched to the loaded benchmarks by position.
# NOTE(review): this relies on load_summary_benchmarks() returning the
# benchmarks in exactly this order -- confirm against its implementation.
benchmark_names = [
    "ists",
    "ists_negatives",
    "ists_documents",
    "ists_permutations",
    "ists_de",
    "ists_es",
    "ists_fr",
    "ists_ja",
    "ists_ko",
    "ists_zh",
]

# Load the benchmarks before opening the output file in "w" mode, so that a
# loading failure does not truncate an already existing output file.
benchmarks = load_summary_benchmarks(args.split)

with jsonlines.open(out_file, "w") as f:
    for j, benchmark in enumerate(benchmarks):
        dataset = benchmark.to_dataset()
        # Presumably a Hugging Face `datasets.Dataset`: its shuffle() returns
        # a new dataset instead of shuffling in place, so the result has to
        # be re-assigned for the shuffle to take effect.
        dataset = dataset.shuffle(seed=42)
        # A limit of 0 (or no limit given) exports every sample.
        if args.limit_samples:
            # Clamp the limit so that asking for more samples than the
            # benchmark contains exports all of them instead of failing.
            dataset = dataset.select(range(min(args.limit_samples, len(dataset))))
        for i, sample in enumerate(dataset):
            sample = dict(sample)
            sample["subset"] = benchmark_names[j]
            # The id is the sample's position within the (shuffled, possibly
            # truncated) benchmark, prefixed with the benchmark name.
            sample["id"] = f"{benchmark_names[j]}_{i}"
            f.write(sample)

0 commit comments

Comments
 (0)