Source code for lightning_ir.data.data

  1"""
  2Basic sample classes for Lightning IR.
  3
  4This module defines the basic sample classes for Lightning IR. A sample is a single entry in a dataset and can be grouped
  5into batches for processing.
  6"""
  7
  8from dataclasses import dataclass
  9from typing import Any, Dict, List, Sequence
 10
 11import torch
 12from ir_datasets.formats.base import GenericDoc, GenericQuery
 13
 14
[docs] 15@dataclass 16class RankSample: 17 """A sample of ranking data containing a query, a ranked list of documents, and optionally targets and qrels. 18 19 Attributes: 20 query_id (str): Id of the query. 21 query (str): Query text. 22 doc_ids (Sequence[str]): List of document ids. 23 docs (Sequence[str]): List of document texts. 24 targets (torch.Tensor): Optional list of target labels denoting the relevance of a document for the query. 25 Defaults to None. 26 qrels (List[Dict[str, Any]]): Optional list of dictionaries mapping document ids to relevance labels. 27 Defaults to None. 28 """ 29 30 query_id: str 31 query: str 32 doc_ids: Sequence[str] 33 docs: Sequence[str] 34 targets: torch.Tensor | None = None 35 qrels: List[Dict[str, Any]] | None = None
36 37
@dataclass
class QuerySample:
    """A single query paired with its id and, optionally, its qrels.

    Attributes:
        query_id (str): Identifier of the query.
        query (str): Text of the query.
        qrels (List[Dict[str, Any]] | None): Dictionaries mapping document ids to relevance labels, when
            available. Defaults to None.
    """

    query_id: str
    query: str
    qrels: List[Dict[str, Any]] | None = None

    @classmethod
    def from_ir_dataset_sample(cls, sample: GenericQuery) -> "QuerySample":
        """Build a QuerySample from an ir_datasets sample.

        The first entry of ``sample`` is converted to ``str`` and used as the query id; the query text is
        whatever ``sample.default_text()`` returns. ``qrels`` is left at its default of None.

        Args:
            sample (GenericQuery): ir_datasets sample.
        Returns:
            QuerySample: Query sample.
        """
        query_id = str(sample[0])
        query_text = sample.default_text()
        return cls(query_id, query_text)
63 64
@dataclass
class DocSample:
    """A sample of document data containing a document and its id.

    Attributes:
        doc_id (str): Id of the document.
        doc (str): Document text.
    """

    doc_id: str
    doc: str

    @classmethod
    def from_ir_dataset_sample(cls, sample: GenericDoc, text_fields: Sequence[str] | None = None) -> "DocSample":
        """Create a DocSample from an ir_datasets sample.

        The first entry of ``sample`` is converted to ``str`` and used as the document id, regardless of how
        the text is selected.

        Args:
            sample (GenericDoc): ir_datasets sample.
            text_fields (Sequence[str] | None): Optional names of attributes of ``sample`` whose values are
                joined with single spaces to form the text. If None uses the sample's `default_text()`.
                Defaults to None.
        Returns:
            DocSample: Document sample.
        """
        # Normalise the id once so both branches honour the `doc_id: str` contract.
        doc_id = str(sample[0])
        if text_fields is None:
            return cls(doc_id, sample.default_text())
        return cls(doc_id, " ".join(getattr(sample, field) for field in text_fields))
91 92
[docs] 93@dataclass 94class RankBatch: 95 """A batch of ranking data combining multiple :py:class:`.RankSample` instances 96 97 Attributes: 98 queries (Sequence[str]): List of query texts. 99 docs (Sequence[Sequence[str]]): List of list of document texts. 100 query_ids (Sequence[str] | None): Optional list of query ids. Defaults to None. 101 doc_ids (Sequence[Sequence[str]] | None): Optional list of list of document ids. Defaults to None. 102 qrels (List[Dict[str, int]] | None): Optional list of dictionaries mapping document ids to relevance labels. 103 Defaults to None. 104 """ 105 106 queries: Sequence[str] 107 docs: Sequence[Sequence[str]] 108 query_ids: Sequence[str] | None = None 109 doc_ids: Sequence[Sequence[str]] | None = None 110 qrels: List[Dict[str, int]] | None = None
111 112
@dataclass
class TrainBatch(RankBatch):
    """A :py:class:`.RankBatch` that additionally carries optional target labels.

    Attributes:
        queries (Sequence[str]): Query texts.
        docs (Sequence[Sequence[str]]): Document texts, as a list of lists.
        query_ids (Sequence[str] | None): Query ids, when available. Defaults to None.
        doc_ids (Sequence[Sequence[str]] | None): Document ids, as a list of lists, when available.
            Defaults to None.
        qrels (List[Dict[str, int]] | None): Dictionaries mapping document ids to relevance labels, when
            available. Defaults to None.
        targets (torch.Tensor | None): Target labels denoting the relevance of a document for the query, when
            available. Defaults to None.
    """

    # The only field declared here; all other fields are inherited from RankBatch.
    targets: torch.Tensor | None = None
129 130
@dataclass
class IndexBatch:
    """A batch of documents to index, built by combining several :py:class:`.DocSample` instances.

    Attributes:
        doc_ids (Sequence[str]): Document ids.
        docs (Sequence[str]): Document texts.
    """

    # Both fields are required.
    doc_ids: Sequence[str]
    docs: Sequence[str]
142 143
[docs] 144@dataclass 145class SearchBatch: 146 """A batch of search data that combines multiple :py:class:`.QuerySample` instances. Optionaly includes document ids 147 and qrels. 148 149 Attributes: 150 query_ids (Sequence[str]): List of query ids. 151 queries (Sequence[str]): List of query texts. 152 doc_ids (Sequence[Sequence[str]] | None): Optional list of list of document ids. Defaults to None. 153 qrels (List[Dict[str, int]] | None): Optional list of dictionaries mapping document ids to relevance labels. 154 Defaults to None. 155 """ 156 157 query_ids: Sequence[str] 158 queries: Sequence[str] 159 doc_ids: Sequence[Sequence[str]] | None = None 160 qrels: List[Dict[str, int]] | None = None