Source code for lightning_ir.data.external_datasets.rank_distillm
1from ir_datasets.util import GzipExtract
2
3from .ir_datasets_utils import ParquetScoredDocs, register_new_dataset
4
5
[docs]
def register_rank_distillm():
    """Register the four rank-distillm scored-docs datasets for MS MARCO passage train.

    Every dataset is registered through ``register_new_dataset`` with the same
    docs (``msmarco-passage``), queries and qrels (``msmarco-passage/train``);
    only the scored-docs download description differs. The rankzephyr run is
    registered without extractors, the set-encoder and monoelectra runs list
    ``GzipExtract`` as extractor, and the colbert file is a parquet file that is
    additionally registered with ``ScoreddocsType=ParquetScoredDocs``.
    """
    zenodo_files = "https://zenodo.org/records/15753974/files/"
    id_prefix = "msmarco-passage/train/rank-distillm-"

    # One row per dataset:
    # (id suffix, remote file name, md5, cache file extension, extractors, extra kwargs)
    # ``extractors`` is None when the download dict must not carry that key at all.
    specs = (
        (
            "rankzephyr",
            "__rankzephyr-colbert-10000-sampled-100__msmarco-passage-train-judged.run",
            "49f8dbf2c1ee7a2ca1fe517eda528af6",
            ".run",
            None,
            {},
        ),
        (
            "set-encoder",
            "__set-encoder-colbert__msmarco-passage-train-judged.run.gz",
            "1f069d0daa9842a54a858cc660149e1a",
            ".run",
            [GzipExtract],
            {},
        ),
        (
            "monoelectra",
            "__monoelectra-colbert__msmarco-passage-train-judged.run.gz",
            "5abc9a6c2cdf986c0aedcea853f0b34c",
            ".run",
            [GzipExtract],
            {},
        ),
        (
            "colbert",
            "__colbert__msmarco-passage-train-judged.parquet",
            "1e927d52af085516bf5a3de2865809d5",
            ".parquet",
            None,
            {"ScoreddocsType": ParquetScoredDocs},
        ),
    )

    for suffix, remote_name, md5, extension, extractors, extra_kwargs in specs:
        dataset_id = f"{id_prefix}{suffix}"
        download = {
            "url": f"{zenodo_files}{remote_name}?download=1",
            "expected_md5": md5,
            "cache_path": f"{dataset_id}{extension}",
        }
        # Only the gzip-compressed runs carry an "extractors" entry.
        if extractors is not None:
            download["extractors"] = extractors
        register_new_dataset(
            dataset_id,
            docs="msmarco-passage",
            queries="msmarco-passage/train",
            qrels="msmarco-passage/train",
            scoreddocs=download,
            **extra_kwargs,
        )