Source code for lightning_ir.data.data
"""
Basic sample classes for Lightning IR.

This module defines the basic sample classes for Lightning IR. A sample is a single entry in a dataset and can be
grouped into batches for processing.
"""
7
8from dataclasses import dataclass
9from typing import Any, Dict, List, Sequence
10
11import torch
12from ir_datasets.formats.base import GenericDoc, GenericQuery
13
14
[docs]
15@dataclass
16class RankSample:
17 """A sample of ranking data containing a query, a ranked list of documents, and optionally targets and qrels.
18
19 Attributes:
20 query_id (str): Id of the query.
21 query (str): Query text.
22 doc_ids (Sequence[str]): List of document ids.
23 docs (Sequence[str]): List of document texts.
24 targets (torch.Tensor): Optional list of target labels denoting the relevance of a document for the query.
25 Defaults to None.
26 qrels (List[Dict[str, Any]]): Optional list of dictionaries mapping document ids to relevance labels.
27 Defaults to None.
28 """
29
30 query_id: str
31 query: str
32 doc_ids: Sequence[str]
33 docs: Sequence[str]
34 targets: torch.Tensor | None = None
35 qrels: List[Dict[str, Any]] | None = None
36
37
[docs]
38@dataclass
39class QuerySample:
40 """A sample of query data containing a query and its id.
41
42 Attributes:
43 query_id (str): Id of the query.
44 query (str): Query text.
45 qrels (List[Dict[str, Any]] | None): Optional list of dictionaries mapping document ids to relevance labels.
46 Defaults to None.
47 """
48
49 query_id: str
50 query: str
51 qrels: List[Dict[str, Any]] | None = None
52
[docs]
53 @classmethod
54 def from_ir_dataset_sample(cls, sample: GenericQuery) -> "QuerySample":
55 """Create a QuerySample from a an ir_datasets sample.
56
57 Args:
58 sample (GenericQuery): ir_datasets sample.
59 Returns:
60 QuerySample: Query sample.
61 """
62 return cls(str(sample[0]), sample.default_text())
63
64
[docs]
65@dataclass
66class DocSample:
67 """A sample of document data containing a document and its id.
68
69 Attributes:
70 doc_id (str): Id of the document.
71 doc (str): Document text.
72 """
73
74 doc_id: str
75 doc: str
76
[docs]
77 @classmethod
78 def from_ir_dataset_sample(cls, sample: GenericDoc, text_fields: Sequence[str] | None = None) -> "DocSample":
79 """Create a DocSample from an ir_datasets sample.
80
81 Args:
82 sample (GenericDoc): ir_datasets sample.
83 text_fields (Sequence[str] | None): Optional fields to parse the text. If None uses the sample's
84 `default_text()`. Defaults to None.
85 Returns:
86 DocSample: Document sample.
87 """
88 if text_fields is not None:
89 return cls(sample[0], " ".join(getattr(sample, field) for field in text_fields))
90 return cls(str(sample[0]), sample.default_text())
91
92
[docs]
93@dataclass
94class RankBatch:
95 """A batch of ranking data combining multiple :py:class:`.RankSample` instances
96
97 Attributes:
98 queries (Sequence[str]): List of query texts.
99 docs (Sequence[Sequence[str]]): List of list of document texts.
100 query_ids (Sequence[str] | None): Optional list of query ids. Defaults to None.
101 doc_ids (Sequence[Sequence[str]] | None): Optional list of list of document ids. Defaults to None.
102 qrels (List[Dict[str, int]] | None): Optional list of dictionaries mapping document ids to relevance labels.
103 Defaults to None.
104 """
105
106 queries: Sequence[str]
107 docs: Sequence[Sequence[str]]
108 query_ids: Sequence[str] | None = None
109 doc_ids: Sequence[Sequence[str]] | None = None
110 qrels: List[Dict[str, int]] | None = None
111
112
[docs]
@dataclass
class TrainBatch(RankBatch):
    """Ranking batch extended with optional training targets.

    Combines multiple :py:class:`.RankSample` instances; all fields except ``targets`` are inherited from
    :py:class:`.RankBatch`.

    Attributes:
        queries (Sequence[str]): Query texts, one per sample.
        docs (Sequence[Sequence[str]]): For each query, the list of document texts.
        query_ids (Sequence[str] | None): Optional list of query ids. Defaults to None.
        doc_ids (Sequence[Sequence[str]] | None): Optional list of list of document ids. Defaults to None.
        qrels (List[Dict[str, int]] | None): Optional list of dictionaries mapping document ids to relevance
            labels. Defaults to None.
        targets (torch.Tensor | None): Optional target labels denoting the relevance of a document for the
            query. Defaults to None.
    """

    # Appended after the inherited RankBatch fields.
    targets: torch.Tensor | None = None
129
130
[docs]
@dataclass
class IndexBatch:
    """Batch of documents to index, built from several :py:class:`.DocSample` instances.

    Attributes:
        doc_ids (Sequence[str]): Identifiers of the documents.
        docs (Sequence[str]): Texts of the documents.
    """

    doc_ids: Sequence[str]
    docs: Sequence[str]
142
143
[docs]
144@dataclass
145class SearchBatch:
146 """A batch of search data that combines multiple :py:class:`.QuerySample` instances. Optionaly includes document ids
147 and qrels.
148
149 Attributes:
150 query_ids (Sequence[str]): List of query ids.
151 queries (Sequence[str]): List of query texts.
152 doc_ids (Sequence[Sequence[str]] | None): Optional list of list of document ids. Defaults to None.
153 qrels (List[Dict[str, int]] | None): Optional list of dictionaries mapping document ids to relevance labels.
154 Defaults to None.
155 """
156
157 query_ids: Sequence[str]
158 queries: Sequence[str]
159 doc_ids: Sequence[Sequence[str]] | None = None
160 qrels: List[Dict[str, int]] | None = None