Source code for lightning_ir.data.data
1"""
Basic sample classes for Lightning IR.

This module defines the basic sample classes for Lightning IR. A sample is a single entry in a dataset and can be grouped
into batches for processing.
6"""
7
8from dataclasses import dataclass
9from typing import Any, Dict, List, Sequence
10
11import torch
12from ir_datasets.formats.base import GenericDoc, GenericQuery
13
14
[docs]
15@dataclass
16class RankSample:
17 """A sample of ranking data containing a query, a ranked list of documents, and optionally targets and qrels.
18
19 :param query_id: Id of the query
20 :type query_id: str
21 :param query: Query text
22 :type query_id: str
23 :param doc_ids: List of document ids
24 :type doc_ids: Sequence[str]
25 :param docs: List of document texts
26 :type docs: Sequence[str]
27 :param targets: Optional list of target labels denoting the relevane of a document for the query
28 :type targets: torch.Tensor, optional
29 :param qrels: Optional list of dictionaries mapping document ids to relevance labels
30 """
31
32 query_id: str
33 query: str
34 doc_ids: Sequence[str]
35 docs: Sequence[str]
36 targets: torch.Tensor | None = None
37 qrels: List[Dict[str, Any]] | None = None
38
39
[docs]
@dataclass
class QuerySample:
    """A single query entry consisting of the query text and its id.

    :param query_id: Id of the query
    :type query_id: str
    :param query: Query text
    :type query: str
    :param qrels: Optional list of dictionaries mapping document ids to relevance labels, defaults to None
    :type qrels: List[Dict[str, Any]] | None, optional
    """

    query_id: str
    query: str
    # Left at None unless relevance judgments are supplied explicitly.
    qrels: List[Dict[str, Any]] | None = None

    @classmethod
    def from_ir_dataset_sample(cls, sample: GenericQuery) -> "QuerySample":
        """Build a QuerySample from an ir_datasets query.

        The first two positional entries of ``sample`` are used as the query id and the query text; ``qrels``
        keeps its default of None.

        :param sample: ir_datasets sample
        :type sample: GenericQuery
        :return: Query sample
        :rtype: QuerySample
        """
        # Index positionally instead of unpacking so that any extra entries in the sample are ignored.
        query_id = sample[0]
        text = sample[1]
        return cls(query_id, text)
66
67
[docs]
@dataclass
class DocSample:
    """A single document entry consisting of the document text and its id.

    :param doc_id: Id of the document
    :type doc_id: str
    :param doc: Document text
    :type doc: str
    """

    doc_id: str
    doc: str

    @classmethod
    def from_ir_dataset_sample(cls, sample: GenericDoc, text_fields: Sequence[str] | None = None) -> "DocSample":
        """Build a DocSample from an ir_datasets document.

        The first positional entry of ``sample`` is used as the document id. The text is either the sample's
        ``default_text()`` or, when ``text_fields`` is given, the named attributes joined by single spaces in
        the order listed.

        :param sample: ir_datasets sample
        :type sample: GenericDoc
        :param text_fields: Optional attribute names to build the text from. If None, the sample's
            ``default_text()`` is used, defaults to None
        :type text_fields: Sequence[str] | None, optional
        :return: Doc sample
        :rtype: DocSample
        """
        doc_id = sample[0]
        # Only None selects the default text; an empty sequence still takes the join path.
        if text_fields is None:
            return cls(doc_id, sample.default_text())
        parts = (getattr(sample, name) for name in text_fields)
        return cls(doc_id, " ".join(parts))
96
97
[docs]
98@dataclass
99class RankBatch:
100 """A batch of ranking data combining multiple :py:class:`.RankSample` instances
101
102 :param queries: List of query texts
103 :type queries: Sequence[str]
104 :param docs: List of list of document texts
105 :type docs: Sequence[Sequence[str]]
106 :param query_ids: Optional list of query ids
107 :type query_ids: Sequence[str], optional
108 :param doc_ids: Optional list of list of document ids
109 :type doc_ids: Sequence[Sequence[str]], optional
110 :param qrels: Optional list of dictionaries mapping document ids to relevance labels
111 :type qrels: List[Dict[str, Any]], optional
112 """
113
114 queries: Sequence[str]
115 docs: Sequence[Sequence[str]]
116 query_ids: Sequence[str] | None = None
117 doc_ids: Sequence[Sequence[str]] | None = None
118 qrels: List[Dict[str, int]] | None = None
119
120
[docs]
@dataclass
class TrainBatch(RankBatch):
    """A :py:class:`.RankBatch` of multiple :py:class:`.RankSample` instances that can also carry targets

    :param queries: List of query texts
    :type queries: Sequence[str]
    :param docs: List of list of document texts
    :type docs: Sequence[Sequence[str]]
    :param query_ids: Optional list of query ids, defaults to None
    :type query_ids: Sequence[str] | None, optional
    :param doc_ids: Optional list of list of document ids, defaults to None
    :type doc_ids: Sequence[Sequence[str]] | None, optional
    :param qrels: Optional list of dictionaries mapping document ids to relevance labels, defaults to None
    :type qrels: List[Dict[str, Any]] | None, optional
    :param targets: Optional target labels denoting the relevance of a document for the query, defaults to None
    :type targets: torch.Tensor | None, optional
    """

    # The only field declared here; all others are inherited from RankBatch.
    targets: torch.Tensor | None = None
140
141
[docs]
@dataclass
class IndexBatch:
    """A batch of documents to index, combining multiple :py:class:`.DocSample` instances

    :param doc_ids: Ids of the documents
    :type doc_ids: Sequence[str]
    :param docs: Texts of the documents
    :type docs: Sequence[str]
    """

    # Both fields are required; neither has a default.
    doc_ids: Sequence[str]
    docs: Sequence[str]
154
155
[docs]
156@dataclass
157class SearchBatch:
158 """A batch of search data that combines multiple :py:class:`.QuerySample` instances. Optionaly includes document ids
159 and qrels.
160
161 :param query_ids: List of query ids
162 :type query_ids: Sequence[str]
163 :param queries: List of query texts
164 :type queries: Sequence[str]
165 :param doc_ids: Optional list of list of document ids
166 :type doc_ids: Sequence[Sequence[str]], optional
167 :param qrels: Optional list of dictionaries mapping document ids to relevance labels
168 :type qrels: List[Dict[str, Any]], optional
169 """
170
171 query_ids: Sequence[str]
172 queries: Sequence[str]
173 doc_ids: Sequence[Sequence[str]] | None = None
174 qrels: List[Dict[str, int]] | None = None