Source code for lightning_ir.data.external_datasets.rank_distillm
1from .ir_datasets_utils import ParquetScoredDocs, register_new_dataset
2
3
[docs]
def register_rank_distillm():
    """Register the four rank-distillm scored-docs files as datasets.

    For every entry in the table below, ``register_new_dataset`` is called
    once with the dataset id ``msmarco-passage/train/<name>``, the
    ``msmarco-passage`` docs, the ``msmarco-passage/train`` queries and
    qrels, and a download descriptor (``url``, ``expected_md5``,
    ``cache_path``) for the parquet file, which is handled through
    ``ParquetScoredDocs``.

    NOTE(review): what ``register_new_dataset`` does with the descriptor
    (presumably a lazy, checksum-verified download) is not visible from
    this function -- confirm against ``ir_datasets_utils``.
    """
    hub_root = "https://huggingface.co/datasets/webis/rank-distillm/resolve/main"

    # (dataset name suffix, remote parquet file name, expected md5 checksum)
    parquet_files = (
        (
            "rank-distillm-rankzephyr",
            "__rankzephyr-colbert-10000-sampled-100__msmarco-passage-train-judged.parquet",
            "38f69a3c8a5ed21c639e882a6c2eff7c",
        ),
        (
            "rank-distillm-set-encoder",
            "__set-encoder-colbert-all-100__msmarco-passage-train-judged.parquet",
            "a47206da7dc551e3ebd4e5b6866be78a",
        ),
        (
            "rank-distillm-monoelectra",
            "__monoelectra-colbert-all-100__msmarco-passage-train-judged.parquet",
            "6561f33476a6c8408737f38ea85c848f",
        ),
        (
            "rank-distillm-colbert",
            "__colbert__msmarco-passage-train-judged.parquet",
            "1e927d52af085516bf5a3de2865809d5",
        ),
    )

    for suffix, remote_file, checksum in parquet_files:
        dataset_id = f"msmarco-passage/train/{suffix}"
        # The cache path is the dataset id with a ".parquet" extension.
        download_spec = {
            "url": "/".join((hub_root, remote_file)),
            "expected_md5": checksum,
            "cache_path": f"{dataset_id}.parquet",
        }
        register_new_dataset(
            dataset_id,
            docs="msmarco-passage",
            queries="msmarco-passage/train",
            qrels="msmarco-passage/train",
            scoreddocs=download_spec,
            ScoreddocsType=ParquetScoredDocs,
        )