Source code for lightning_ir.data.external_datasets.rank_distillm
1from .ir_datasets_utils import ParquetScoredDocs, register_new_dataset
2
3
[docs]
def register_rank_distillm():
    """Register the rank-distillm scored-docs datasets for MS MARCO passage (train).

    Issues one ``register_new_dataset`` call per parquet file listed in the
    table below, in listing order. Every call uses ``msmarco-passage`` for the
    documents, ``msmarco-passage/train`` for both queries and qrels, and
    ``ParquetScoredDocs`` as the scored-docs type; only the dataset id, the
    remote file URL, the expected MD5 and the cache path differ per entry.
    """
    hub_root = "https://huggingface.co/datasets/webis/rank-distillm/resolve/main/"
    dataset_prefix = "msmarco-passage/train/rank-distillm-"
    file_suffix = "__msmarco-passage-train-judged.parquet"

    # (dataset-id suffix, run tag embedded in the remote file name, expected MD5)
    variants = (
        (
            "rankzephyr",
            "rankzephyr-colbert-10000-sampled-100",
            "02a245c712b4ea1804d0cb504005c0e2",
        ),
        (
            "set-encoder",
            "set-encoder-colbert-all-100",
            "a47206da7dc551e3ebd4e5b6866be78a",
        ),
        (
            "monoelectra",
            "monoelectra-colbert-all-100",
            "6561f33476a6c8408737f38ea85c848f",
        ),
        (
            "colbert",
            "colbert",
            "1e927d52af085516bf5a3de2865809d5",
        ),
    )

    for id_suffix, run_tag, md5_digest in variants:
        dataset_id = f"{dataset_prefix}{id_suffix}"
        register_new_dataset(
            dataset_id,
            docs="msmarco-passage",
            queries="msmarco-passage/train",
            qrels="msmarco-passage/train",
            # A fresh dict per registration, keys in the same order as before.
            scoreddocs={
                "url": f"{hub_root}__{run_tag}{file_suffix}",
                "expected_md5": md5_digest,
                # The cache file mirrors the dataset id with a .parquet extension.
                "cache_path": f"{dataset_id}.parquet",
            },
            ScoreddocsType=ParquetScoredDocs,
        )