Source code for lightning_ir.data.external_datasets.rank_distillm

 1from .ir_datasets_utils import ParquetScoredDocs, register_new_dataset
 2
 3
def register_rank_distillm():
    """Register the rank-distillm scored-docs variants of ``msmarco-passage/train``.

    Four dataset ids of the form ``msmarco-passage/train/rank-distillm-<variant>``
    are passed to ``register_new_dataset``, one per entry in the table below
    (``rankzephyr``, ``set-encoder``, ``monoelectra``, ``colbert``), in that order.
    Every variant reuses the ``msmarco-passage`` docs and the
    ``msmarco-passage/train`` queries and qrels; only the scored docs differ.

    The scored docs of each variant are described by a dict holding the remote
    parquet ``url``, its ``expected_md5`` and a ``cache_path``, handed over
    together with ``ParquetScoredDocs`` as the ``ScoreddocsType``.
    """
    hf_root = "https://huggingface.co/datasets/webis/rank-distillm/resolve/main/"
    train_split = "msmarco-passage/train"

    # One row per variant: (dataset id suffix, remote parquet file name, md5).
    # Row order is the registration order.
    variants = (
        (
            "rankzephyr",
            "__rankzephyr-colbert-10000-sampled-100__msmarco-passage-train-judged.parquet",
            "02a245c712b4ea1804d0cb504005c0e2",
        ),
        (
            "set-encoder",
            "__set-encoder-colbert-all-100__msmarco-passage-train-judged.parquet",
            "a47206da7dc551e3ebd4e5b6866be78a",
        ),
        (
            "monoelectra",
            "__monoelectra-colbert-all-100__msmarco-passage-train-judged.parquet",
            "6561f33476a6c8408737f38ea85c848f",
        ),
        (
            "colbert",
            "__colbert__msmarco-passage-train-judged.parquet",
            "1e927d52af085516bf5a3de2865809d5",
        ),
    )

    for suffix, remote_file, md5 in variants:
        dataset_id = f"{train_split}/rank-distillm-{suffix}"
        register_new_dataset(
            dataset_id,
            docs="msmarco-passage",
            queries=train_split,
            qrels=train_split,
            scoreddocs={
                "url": hf_root + remote_file,
                "expected_md5": md5,
                # The cache file is named after the dataset id itself.
                "cache_path": f"{dataset_id}.parquet",
            },
            ScoreddocsType=ParquetScoredDocs,
        )