Source code for lightning_ir.data.data
1"""
Basic sample classes for Lightning IR.

This module defines the basic sample classes for Lightning IR. A sample is a single entry in a dataset and can be
grouped into batches for processing.
6"""
7
8from collections.abc import Sequence
9from dataclasses import dataclass
10from typing import Any
11
12import torch
13from ir_datasets.formats.base import GenericDoc, GenericQuery
14
15
[docs]
16@dataclass
17class RankSample:
18 """A sample of ranking data containing a query, a ranked list of documents, and optionally targets and qrels.
19
20 Attributes:
21 query_id (str): Id of the query.
22 query (str): Query text.
23 doc_ids (Sequence[str]): list of document ids.
24 docs (Sequence[str]): list of document texts.
25 targets (torch.Tensor): Optional list of target labels denoting the relevance of a document for the query.
26 Defaults to None.
27 qrels (list[dict[str, Any]]): Optional list of dictionaries mapping document ids to relevance labels.
28 Defaults to None.
29 """
30
31 query_id: str
32 query: str
33 doc_ids: Sequence[str]
34 docs: Sequence[str]
35 targets: torch.Tensor | None = None
36 qrels: list[dict[str, Any]] | None = None
37
38
[docs]
@dataclass
class QuerySample:
    """Single query example pairing a query text with its identifier.

    Attributes:
        query_id (str): Identifier of the query.
        query (str): Text of the query.
        qrels (list[dict[str, Any]] | None): Optional list of dictionaries mapping document ids to relevance
            labels. Defaults to None.
    """

    query_id: str
    query: str
    qrels: list[dict[str, Any]] | None = None

    @classmethod
    def from_ir_dataset_sample(cls, sample: GenericQuery) -> "QuerySample":
        """Build a QuerySample from an ir_datasets sample.

        The first element of the sample becomes the query id (converted to ``str``) and the value returned by
        ``sample.default_text()`` becomes the query text. No qrels are passed to the constructor.

        Args:
            sample (GenericQuery): ir_datasets sample.
        Returns:
            QuerySample: Query sample.
        """
        query_id = str(sample[0])
        query_text = sample.default_text()
        return cls(query_id=query_id, query=query_text)
64
65
[docs]
@dataclass
class DocSample:
    """A sample of document data containing a document and its id.

    Attributes:
        doc_id (str): Id of the document.
        doc (str): Document text.
    """

    doc_id: str
    doc: str

    @classmethod
    def from_ir_dataset_sample(cls, sample: GenericDoc, text_fields: Sequence[str] | None = None) -> "DocSample":
        """Create a DocSample from an ir_datasets sample.

        The first element of the sample is used as the document id and is always converted to ``str``,
        regardless of how the text is assembled.

        Args:
            sample (GenericDoc): ir_datasets sample.
            text_fields (Sequence[str] | None): Optional names of the sample attributes whose values are joined
                with a single space to form the document text. If None uses the sample's `default_text()`.
                Defaults to None.
        Returns:
            DocSample: Document sample.
        """
        # Normalize the id once so both branches produce a ``str`` id, matching the ``doc_id`` field type.
        # Previously the ``text_fields`` branch passed the raw ``sample[0]`` through unconverted.
        doc_id = str(sample[0])
        if text_fields is None:
            return cls(doc_id, sample.default_text())
        return cls(doc_id, " ".join(getattr(sample, field) for field in text_fields))
92
93
[docs]
94@dataclass
95class RankBatch:
96 """A batch of ranking data combining multiple :py:class:`.RankSample` instances
97
98 Attributes:
99 queries (Sequence[str]): list of query texts.
100 docs (Sequence[Sequence[str]]): list of list of document texts.
101 query_ids (Sequence[str] | None): Optional list of query ids. Defaults to None.
102 doc_ids (Sequence[Sequence[str]] | None): Optional list of list of document ids. Defaults to None.
103 qrels (list[dict[str, int]] | None): Optional list of dictionaries mapping document ids to relevance labels.
104 Defaults to None.
105 """
106
107 queries: Sequence[str]
108 docs: Sequence[Sequence[str]]
109 query_ids: Sequence[str] | None = None
110 doc_ids: Sequence[Sequence[str]] | None = None
111 qrels: list[dict[str, int]] | None = None
112
113
[docs]
@dataclass
class TrainBatch(RankBatch):
    """Batch of ranking data built from multiple :py:class:`.RankSample` instances, with optional targets.

    Attributes:
        queries (Sequence[str]): Query texts.
        docs (Sequence[Sequence[str]]): Document texts, given as a list of lists.
        query_ids (Sequence[str] | None): Optional query ids. Defaults to None.
        doc_ids (Sequence[Sequence[str]] | None): Optional document ids, given as a list of lists.
            Defaults to None.
        qrels (list[dict[str, int]] | None): Optional list of dictionaries mapping document ids to relevance
            labels. Defaults to None.
        targets (torch.Tensor | None): Optional target labels denoting the relevance of a document for the
            query. Defaults to None.
    """

    # The only field added on top of the inherited RankBatch fields.
    targets: torch.Tensor | None = None
130
131
[docs]
@dataclass
class IndexBatch:
    """Batch of index data built from multiple :py:class:`.DocSample` instances.

    Attributes:
        doc_ids (Sequence[str]): Document ids.
        docs (Sequence[str]): Document texts.
    """

    # Both fields are required.
    doc_ids: Sequence[str]
    docs: Sequence[str]
143
144
[docs]
145@dataclass
146class SearchBatch:
147 """A batch of search data that combines multiple :py:class:`.QuerySample` instances. Optionaly includes document ids
148 and qrels.
149
150 Attributes:
151 query_ids (Sequence[str]): list of query ids.
152 queries (Sequence[str]): list of query texts.
153 doc_ids (Sequence[Sequence[str]] | None): Optional list of list of document ids. Defaults to None.
154 qrels (list[dict[str, int]] | None): Optional list of dictionaries mapping document ids to relevance labels.
155 Defaults to None.
156 """
157
158 query_ids: Sequence[str]
159 queries: Sequence[str]
160 doc_ids: Sequence[Sequence[str]] | None = None
161 qrels: list[dict[str, int]] | None = None