Source code for lightning_ir.data.external_datasets.rank_distillm

 1from .ir_datasets_utils import ParquetScoredDocs, register_new_dataset
 2
 3
def register_rank_distillm():
    """Register the rank-distillm scored-docs datasets for MS MARCO passage train.

    For each entry in the table below, ``register_new_dataset`` is called once
    with the dataset id ``msmarco-passage/train/<suffix>``. Documents come from
    ``msmarco-passage``; queries and qrels come from ``msmarco-passage/train``.
    The scored docs are described by a dict holding the remote parquet URL, the
    expected MD5 checksum and the cache path, and are typed as
    ``ParquetScoredDocs``.

    NOTE(review): how ``register_new_dataset`` consumes the download dict
    (e.g. whether it downloads eagerly or lazily) is not visible here.
    """
    base_url = "https://huggingface.co/datasets/webis/rank-distillm/resolve/main"

    # Each row: (dataset id suffix, remote parquet file name, expected MD5 hex digest).
    remote_files = (
        (
            "rank-distillm-rankzephyr",
            "__rankzephyr-colbert-10000-sampled-100__msmarco-passage-train-judged.parquet",
            "38f69a3c8a5ed21c639e882a6c2eff7c",
        ),
        (
            "rank-distillm-set-encoder",
            "__set-encoder-colbert-all-100__msmarco-passage-train-judged.parquet",
            "a47206da7dc551e3ebd4e5b6866be78a",
        ),
        (
            "rank-distillm-monoelectra",
            "__monoelectra-colbert-all-100__msmarco-passage-train-judged.parquet",
            "6561f33476a6c8408737f38ea85c848f",
        ),
        (
            "rank-distillm-colbert",
            "__colbert__msmarco-passage-train-judged.parquet",
            "1e927d52af085516bf5a3de2865809d5",
        ),
    )

    for suffix, parquet_name, checksum in remote_files:
        dataset_id = f"msmarco-passage/train/{suffix}"
        register_new_dataset(
            dataset_id,
            docs="msmarco-passage",
            queries="msmarco-passage/train",
            qrels="msmarco-passage/train",
            scoreddocs={
                "url": f"{base_url}/{parquet_name}",
                "expected_md5": checksum,
                # The cache file sits at the dataset id plus a .parquet suffix.
                "cache_path": f"{dataset_id}.parquet",
            },
            ScoreddocsType=ParquetScoredDocs,
        )