Source code for lightning_ir.models.bi_encoders.dpr

  1"""Configuration and model for DPR (Dense Passage Retriever) type models.
  2
  3DPR type models use two separate neural networks, known as dual encoders, to independently map a user's question and
  4the database of documents into a shared mathematical space. During a search, DPR finds relevant documents by
  5measuring the distance or similarity between the single vector of the query and the vectors of the documents.
  6
  7Originally proposed in \
  8`Dense Passage Retrieval for Open-Domain Question Answering \
  9<https://arxiv.org/abs/2004.04906>`_. This model type is also known as a SentenceTransformer model:
 10`Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks \
 11<https://arxiv.org/abs/1908.10084>`_.
 12"""
 13
 14from typing import Literal
 15
 16import torch
 17from transformers import BatchEncoding
 18
 19from ...bi_encoder import BiEncoderEmbedding, SingleVectorBiEncoderConfig, SingleVectorBiEncoderModel
 20from ...modeling_utils.embedding_post_processing import Pooler, Sparsifier
 21
 22
class DprConfig(SingleVectorBiEncoderConfig):
    """Configuration class for a DPR model."""

    model_type = "lir-dpr"
    """Model type for a DPR model."""

    def __init__(
        self,
        query_length: int | None = 32,
        doc_length: int | None = 512,
        similarity_function: Literal["cosine", "dot"] = "dot",
        normalization_strategy: Literal["l2"] | None = None,
        sparsification_strategy: Literal["relu", "relu_log", "relu_2xlog"] | None = None,
        add_marker_tokens: bool = False,
        pooling_strategy: Literal["first", "mean", "max", "sum"] = "first",
        embedding_dim: int | None = None,
        projection: Literal["linear", "linear_no_bias"] | None = "linear",
        **kwargs,
    ) -> None:
        """A DPR model encodes queries and documents separately. Before computing the similarity score, the
        contextualized token embeddings are aggregated into a single embedding via a pooling strategy. The
        pooled embeddings can optionally be projected with a linear layer.

        Args:
            query_length (int | None): Maximum number of tokens per query. If None does not truncate. Defaults to 32.
            doc_length (int | None): Maximum number of tokens per document. If None does not truncate. Defaults to 512.
            similarity_function (Literal["cosine", "dot"]): Similarity function to compute scores between query and
                document embeddings. Defaults to "dot".
            normalization_strategy (Literal['l2'] | None): Whether (and how) to normalize query and document
                embeddings. Defaults to None.
            sparsification_strategy (Literal['relu', 'relu_log', 'relu_2xlog'] | None): Whether and which
                sparsification function to apply to the embeddings. Defaults to None.
            add_marker_tokens (bool): Whether to add marker tokens to the input sequences. Defaults to False.
            pooling_strategy (Literal["first", "mean", "max", "sum"]): Pooling strategy for query and document
                embeddings. Defaults to "first".
            embedding_dim (int | None): Dimension of the final embeddings. If None, it will be set to the hidden size
                of the backbone model. Defaults to None.
            projection (Literal["linear", "linear_no_bias"] | None): Type of projection layer to apply on the pooled
                embeddings. If None, no projection is applied and the embedding dimension equals the backbone's
                hidden size. Defaults to "linear".
        """
        super().__init__(
            query_length=query_length,
            doc_length=doc_length,
            similarity_function=similarity_function,
            normalization_strategy=normalization_strategy,
            sparsification_strategy=sparsification_strategy,
            add_marker_tokens=add_marker_tokens,
            pooling_strategy=pooling_strategy,
            **kwargs,
        )
        # The backbone's hidden size may not be known yet (it is set by the backbone
        # config); fall back to None in that case.
        backbone_dim = getattr(self, "hidden_size", None)
        # Without a projection layer the output dimension is fixed to the backbone's
        # hidden size; otherwise an explicit embedding_dim takes precedence over it.
        self.embedding_dim = backbone_dim if projection is None else (embedding_dim or backbone_dim)
        # Type of projection layer ("linear", "linear_no_bias", or None).
        self.projection = projection
class DprModel(SingleVectorBiEncoderModel):
    """A single-vector DPR model. See :class:`DprConfig` for configuration options."""

    config_class = DprConfig
    """Configuration class for a DPR model."""

    def __init__(self, config: SingleVectorBiEncoderConfig, *args, **kwargs) -> None:
        """Initializes a DPR model given a :class:`DprConfig`.

        Args:
            config (SingleVectorBiEncoderConfig): Configuration for the DPR model.
        Raises:
            ValueError: If the embedding dimension is not specified in the configuration.
        """
        super().__init__(config, *args, **kwargs)
        projection = self.config.projection
        if projection is not None:
            # A linear projection maps the pooled hidden states to the configured
            # embedding dimension; "linear_no_bias" disables the bias term.
            if self.config.embedding_dim is None:
                raise ValueError("Unable to determine embedding dimension.")
            self.projection: torch.nn.Module = torch.nn.Linear(
                self.config.hidden_size,
                self.config.embedding_dim,
                bias="no_bias" not in projection,
            )
        else:
            # No projection configured: pass pooled embeddings through unchanged.
            self.projection = torch.nn.Identity()
        self.pooler = Pooler(config)
        self.sparsifier = Sparsifier(config)

    def encode(self, encoding: BatchEncoding, input_type: Literal["query", "doc"]) -> BiEncoderEmbedding:
        """Encodes a batched tokenized text sequences and returns the embeddings and scoring mask.

        Args:
            encoding (BatchEncoding): Tokenizer encodings for the text sequence.
            input_type (Literal["query", "doc"]): Type of input, either "query" or "doc".
        Returns:
            BiEncoderEmbedding: Embeddings and scoring mask.
        """
        # Contextualized token embeddings from the backbone model.
        token_embeddings = self._backbone_forward(**encoding).last_hidden_state
        # Aggregate token embeddings into a single vector per sequence.
        pooled = self.pooler(token_embeddings, encoding["attention_mask"])
        projected = self.projection(pooled)
        sparse = self.sparsifier(projected)
        if self.config.normalization_strategy == "l2":
            sparse = torch.nn.functional.normalize(sparse, p=2, dim=-1)
        return BiEncoderEmbedding(sparse, None, encoding)