Source code for lightning_ir.data.data

  1"""
  2Basic sample classes for Lightning IR.
  3
  4This module defines the basic sample classes for Lightning IR. A sample is a single entry in a dataset and can be grouped
  5into batches for processing.
  6"""
  7
  8from collections.abc import Sequence
  9from dataclasses import dataclass
 10from typing import Any
 11
 12import torch
 13from ir_datasets.formats.base import GenericDoc, GenericQuery
 14
 15
[docs] 16@dataclass 17class RankSample: 18 """A sample of ranking data containing a query, a ranked list of documents, and optionally targets and qrels. 19 20 Attributes: 21 query_id (str): Id of the query. 22 query (str): Query text. 23 doc_ids (Sequence[str]): list of document ids. 24 docs (Sequence[str]): list of document texts. 25 targets (torch.Tensor): Optional list of target labels denoting the relevance of a document for the query. 26 Defaults to None. 27 qrels (list[dict[str, Any]]): Optional list of dictionaries mapping document ids to relevance labels. 28 Defaults to None. 29 """ 30 31 query_id: str 32 query: str 33 doc_ids: Sequence[str] 34 docs: Sequence[str] 35 targets: torch.Tensor | None = None 36 qrels: list[dict[str, Any]] | None = None
37 38
[docs] 39@dataclass 40class QuerySample: 41 """A sample of query data containing a query and its id. 42 43 Attributes: 44 query_id (str): Id of the query. 45 query (str): Query text. 46 qrels (list[dict[str, Any]] | None): Optional list of dictionaries mapping document ids to relevance labels. 47 Defaults to None. 48 """ 49 50 query_id: str 51 query: str 52 qrels: list[dict[str, Any]] | None = None 53
[docs] 54 @classmethod 55 def from_ir_dataset_sample(cls, sample: GenericQuery) -> "QuerySample": 56 """Create a QuerySample from a an ir_datasets sample. 57 58 Args: 59 sample (GenericQuery): ir_datasets sample. 60 Returns: 61 QuerySample: Query sample. 62 """ 63 return cls(str(sample[0]), sample.default_text())
64 65
[docs] 66@dataclass 67class DocSample: 68 """A sample of document data containing a document and its id. 69 70 Attributes: 71 doc_id (str): Id of the document. 72 doc (str): Document text. 73 """ 74 75 doc_id: str 76 doc: str 77
[docs] 78 @classmethod 79 def from_ir_dataset_sample(cls, sample: GenericDoc, text_fields: Sequence[str] | None = None) -> "DocSample": 80 """Create a DocSample from an ir_datasets sample. 81 82 Args: 83 sample (GenericDoc): ir_datasets sample. 84 text_fields (Sequence[str] | None): Optional fields to parse the text. If None uses the sample's 85 `default_text()`. Defaults to None. 86 Returns: 87 DocSample: Document sample. 88 """ 89 if text_fields is not None: 90 return cls(sample[0], " ".join(getattr(sample, field) for field in text_fields)) 91 return cls(str(sample[0]), sample.default_text())
92 93
[docs] 94@dataclass 95class RankBatch: 96 """A batch of ranking data combining multiple :py:class:`.RankSample` instances 97 98 Attributes: 99 queries (Sequence[str]): list of query texts. 100 docs (Sequence[Sequence[str]]): list of list of document texts. 101 query_ids (Sequence[str] | None): Optional list of query ids. Defaults to None. 102 doc_ids (Sequence[Sequence[str]] | None): Optional list of list of document ids. Defaults to None. 103 qrels (list[dict[str, int]] | None): Optional list of dictionaries mapping document ids to relevance labels. 104 Defaults to None. 105 """ 106 107 queries: Sequence[str] 108 docs: Sequence[Sequence[str]] 109 query_ids: Sequence[str] | None = None 110 doc_ids: Sequence[Sequence[str]] | None = None 111 qrels: list[dict[str, int]] | None = None
112 113
@dataclass
class TrainBatch(RankBatch):
    """A :py:class:`.RankBatch` that additionally carries optional target labels.

    Attributes:
        queries (Sequence[str]): Query texts.
        docs (Sequence[Sequence[str]]): List of lists of document texts.
        query_ids (Sequence[str] | None): Optional query ids. Defaults to None.
        doc_ids (Sequence[Sequence[str]] | None): Optional list of lists of document ids. Defaults to None.
        qrels (list[dict[str, int]] | None): Optional list of dictionaries mapping document ids to relevance
            labels. Defaults to None.
        targets (torch.Tensor | None): Optional target labels denoting the relevance of a document for the
            query. Defaults to None.
    """

    # The only field added on top of the ones inherited from RankBatch.
    targets: torch.Tensor | None = None
130 131
@dataclass
class IndexBatch:
    """A batch of documents to index that groups several :py:class:`.DocSample` instances.

    Attributes:
        doc_ids (Sequence[str]): Document ids.
        docs (Sequence[str]): Document texts.
    """

    # Both fields are required.
    doc_ids: Sequence[str]
    docs: Sequence[str]
143 144
[docs] 145@dataclass 146class SearchBatch: 147 """A batch of search data that combines multiple :py:class:`.QuerySample` instances. Optionaly includes document ids 148 and qrels. 149 150 Attributes: 151 query_ids (Sequence[str]): list of query ids. 152 queries (Sequence[str]): list of query texts. 153 doc_ids (Sequence[Sequence[str]] | None): Optional list of list of document ids. Defaults to None. 154 qrels (list[dict[str, int]] | None): Optional list of dictionaries mapping document ids to relevance labels. 155 Defaults to None. 156 """ 157 158 query_ids: Sequence[str] 159 queries: Sequence[str] 160 doc_ids: Sequence[Sequence[str]] | None = None 161 qrels: list[dict[str, int]] | None = None