Source code for lightning_ir.data.data

  1"""
  2Basic sample classes for Lightning IR.
  3
  4This module defines the basic sample classes for Lightning IR. A sample is a single entry in a dataset and can be grouped
  5into batches for processing.
  6"""
  7
  8from dataclasses import dataclass
  9from typing import Any, Dict, List, Sequence
 10
 11import torch
 12from ir_datasets.formats.base import GenericDoc, GenericQuery
 13
 14
[docs] 15@dataclass 16class RankSample: 17 """A sample of ranking data containing a query, a ranked list of documents, and optionally targets and qrels. 18 19 :param query_id: Id of the query 20 :type query_id: str 21 :param query: Query text 22 :type query_id: str 23 :param doc_ids: List of document ids 24 :type doc_ids: Sequence[str] 25 :param docs: List of document texts 26 :type docs: Sequence[str] 27 :param targets: Optional list of target labels denoting the relevane of a document for the query 28 :type targets: torch.Tensor, optional 29 :param qrels: Optional list of dictionaries mapping document ids to relevance labels 30 """ 31 32 query_id: str 33 query: str 34 doc_ids: Sequence[str] 35 docs: Sequence[str] 36 targets: torch.Tensor | None = None 37 qrels: List[Dict[str, Any]] | None = None
38 39
@dataclass
class QuerySample:
    """A sample of query data containing a query and its id.

    :param query_id: Id of the query
    :type query_id: str
    :param query: Query text
    :type query: str
    :param qrels: Optional list of dictionaries mapping document ids to relevance labels
    :type qrels: List[Dict[str, Any]], optional
    """

    query_id: str
    query: str
    qrels: List[Dict[str, Any]] | None = None

    @classmethod
    def from_ir_dataset_sample(cls, sample: GenericQuery) -> "QuerySample":
        """Create a QuerySample from an ir_datasets sample.

        The first two positional entries of ``sample`` are used as the query id and the query text; ``qrels`` keeps
        its default of None.

        :param sample: ir_datasets sample
        :type sample: GenericQuery
        :return: Query sample
        :rtype: QuerySample
        """
        # Positional access (rather than attribute access) is kept on purpose so any tuple-like sample works.
        query_id = sample[0]
        query = sample[1]
        return cls(query_id, query)
66 67
@dataclass
class DocSample:
    """A sample of document data containing a document and its id.

    :param doc_id: Id of the document
    :type doc_id: str
    :param doc: Document text
    :type doc: str
    """

    doc_id: str
    doc: str

    @classmethod
    def from_ir_dataset_sample(cls, sample: GenericDoc, text_fields: Sequence[str] | None = None) -> "DocSample":
        """Create a DocSample from an ir_datasets sample.

        :param sample: ir_datasets sample
        :type sample: GenericDoc
        :param text_fields: Optional names of the sample attributes whose values are joined with a single space to
            form the document text. If None, the sample's ``default_text()`` is used; defaults to None
        :type text_fields: Sequence[str] | None, optional
        :return: Doc sample
        :rtype: DocSample
        """
        # The first positional entry of the sample is used as the document id.
        doc_id = sample[0]
        if text_fields is None:
            return cls(doc_id, sample.default_text())
        # Only ``None`` selects the default text; an empty sequence of fields yields an empty document string.
        return cls(doc_id, " ".join(getattr(sample, name) for name in text_fields))
96 97
[docs] 98@dataclass 99class RankBatch: 100 """A batch of ranking data combining multiple :py:class:`.RankSample` instances 101 102 :param queries: List of query texts 103 :type queries: Sequence[str] 104 :param docs: List of list of document texts 105 :type docs: Sequence[Sequence[str]] 106 :param query_ids: Optional list of query ids 107 :type query_ids: Sequence[str], optional 108 :param doc_ids: Optional list of list of document ids 109 :type doc_ids: Sequence[Sequence[str]], optional 110 :param qrels: Optional list of dictionaries mapping document ids to relevance labels 111 :type qrels: List[Dict[str, Any]], optional 112 """ 113 114 queries: Sequence[str] 115 docs: Sequence[Sequence[str]] 116 query_ids: Sequence[str] | None = None 117 doc_ids: Sequence[Sequence[str]] | None = None 118 qrels: List[Dict[str, int]] | None = None
119 120
@dataclass
class TrainBatch(RankBatch):
    """A batch of ranking data that combines multiple :py:class:`.RankSample` instances

    :param queries: List of query texts
    :type queries: Sequence[str]
    :param docs: List of list of document texts
    :type docs: Sequence[Sequence[str]]
    :param query_ids: Optional list of query ids
    :type query_ids: Sequence[str], optional
    :param doc_ids: Optional list of list of document ids
    :type doc_ids: Sequence[Sequence[str]], optional
    :param qrels: Optional list of dictionaries mapping document ids to relevance labels
    :type qrels: List[Dict[str, Any]], optional
    :param targets: Optional list of target labels denoting the relevance of a document for the query
    :type targets: torch.Tensor, optional
    """

    # Added after the fields inherited from RankBatch, so it is the last positional argument.
    targets: torch.Tensor | None = None
140 141
@dataclass
class IndexBatch:
    """A batch of index data that combines multiple :py:class:`.DocSample` instances

    :param doc_ids: List of document ids
    :type doc_ids: Sequence[str]
    :param docs: List of document texts
    :type docs: Sequence[str]
    """

    doc_ids: Sequence[str]
    docs: Sequence[str]
154 155
[docs] 156@dataclass 157class SearchBatch: 158 """A batch of search data that combines multiple :py:class:`.QuerySample` instances. Optionaly includes document ids 159 and qrels. 160 161 :param query_ids: List of query ids 162 :type query_ids: Sequence[str] 163 :param queries: List of query texts 164 :type queries: Sequence[str] 165 :param doc_ids: Optional list of list of document ids 166 :type doc_ids: Sequence[Sequence[str]], optional 167 :param qrels: Optional list of dictionaries mapping document ids to relevance labels 168 :type qrels: List[Dict[str, Any]], optional 169 """ 170 171 query_ids: Sequence[str] 172 queries: Sequence[str] 173 doc_ids: Sequence[Sequence[str]] | None = None 174 qrels: List[Dict[str, int]] | None = None