Source code for lightning_ir.data.external_datasets.rank_distillm

 1from ir_datasets.util import GzipExtract
 2
 3from .ir_datasets_utils import ParquetScoredDocs, register_new_dataset
 4
 5
def register_rank_distillm():
    """Register the rank-distillm scored-document runs for MS MARCO passage (train).

    Four dataset ids are registered through ``register_new_dataset``, all under
    ``msmarco-passage/train/``:

    * ``rank-distillm-rankzephyr``
    * ``rank-distillm-set-encoder``
    * ``rank-distillm-monoelectra``
    * ``rank-distillm-colbert``

    Every entry uses ``msmarco-passage`` for documents and
    ``msmarco-passage/train`` for both queries and qrels. The scored docs are
    described by a dict holding the Zenodo download URL, the expected MD5
    checksum and the cache path; the two ``.run.gz`` downloads additionally
    list ``GzipExtract`` under ``"extractors"``, and the parquet run is
    registered with ``ScoreddocsType=ParquetScoredDocs``.
    """
    files_root = "https://zenodo.org/records/15753974/files/"

    # One row per run:
    # (dataset id, remote file name, md5, cache-file suffix, gzipped?, extra kwargs)
    run_table = (
        (
            "msmarco-passage/train/rank-distillm-rankzephyr",
            "__rankzephyr-colbert-10000-sampled-100__msmarco-passage-train-judged.run",
            "49f8dbf2c1ee7a2ca1fe517eda528af6",
            ".run",
            False,
            {},
        ),
        (
            "msmarco-passage/train/rank-distillm-set-encoder",
            "__set-encoder-colbert__msmarco-passage-train-judged.run.gz",
            "1f069d0daa9842a54a858cc660149e1a",
            ".run",
            True,
            {},
        ),
        (
            "msmarco-passage/train/rank-distillm-monoelectra",
            "__monoelectra-colbert__msmarco-passage-train-judged.run.gz",
            "5abc9a6c2cdf986c0aedcea853f0b34c",
            ".run",
            True,
            {},
        ),
        (
            "msmarco-passage/train/rank-distillm-colbert",
            "__colbert__msmarco-passage-train-judged.parquet",
            "1e927d52af085516bf5a3de2865809d5",
            ".parquet",
            False,
            # Only the parquet run overrides the scored-docs class.
            {"ScoreddocsType": ParquetScoredDocs},
        ),
    )

    for dataset_id, remote_name, md5_sum, suffix, is_gzipped, extra_kwargs in run_table:
        download_spec = {
            "url": f"{files_root}{remote_name}?download=1",
            "expected_md5": md5_sum,
            # The cache file is named after the dataset id plus the file suffix.
            "cache_path": f"{dataset_id}{suffix}",
        }
        if is_gzipped:
            # Fresh list per entry, so no two specs share the same list object.
            download_spec["extractors"] = [GzipExtract]

        register_new_dataset(
            dataset_id,
            docs="msmarco-passage",
            queries="msmarco-passage/train",
            qrels="msmarco-passage/train",
            scoreddocs=download_spec,
            **extra_kwargs,
        )