1"""Configuration and model for DPR (Dense Passage Retriever) type models.
2
3DPR type models use two separate neural networks, known as dual encoders, to independently map a user's question and
4the database of documents into a shared mathematical space. During a search, DPR finds relevant documents by
5measuring the distance or similarity between the single vector of the query and the vectors of the documents.
6
7Originally proposed in \
8`Dense Passage Retrieval for Open-Domain Question Answering \
9<https://arxiv.org/abs/2004.04906>`_. This model type is also known as a SentenceTransformer model:
10`Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks \
11<https://arxiv.org/abs/1908.10084>`_.
12"""
13
14from typing import Literal
15
16import torch
17from transformers import BatchEncoding
18
19from ...bi_encoder import BiEncoderEmbedding, SingleVectorBiEncoderConfig, SingleVectorBiEncoderModel
20from ...modeling_utils.embedding_post_processing import Pooler, Sparsifier
21
22
class DprConfig(SingleVectorBiEncoderConfig):
    """Configuration class for a DPR model."""

    model_type = "lir-dpr"
    """Model type for a DPR model."""

    def __init__(
        self,
        query_length: int | None = 32,
        doc_length: int | None = 512,
        similarity_function: Literal["cosine", "dot"] = "dot",
        normalization_strategy: Literal["l2"] | None = None,
        sparsification_strategy: Literal["relu", "relu_log", "relu_2xlog"] | None = None,
        add_marker_tokens: bool = False,
        pooling_strategy: Literal["first", "mean", "max", "sum"] = "first",
        embedding_dim: int | None = None,
        projection: Literal["linear", "linear_no_bias"] | None = "linear",
        **kwargs,
    ) -> None:
        """A DPR model encodes queries and documents separately. Before computing the similarity score, the
        contextualized token embeddings are aggregated to obtain a single embedding using a pooling strategy.
        Optionally, the pooled embeddings can be projected using a linear layer.

        Args:
            query_length (int | None): Maximum number of tokens per query. If None does not truncate. Defaults to 32.
            doc_length (int | None): Maximum number of tokens per document. If None does not truncate. Defaults to 512.
            similarity_function (Literal["cosine", "dot"]): Similarity function to compute scores between query and
                document embeddings. Defaults to "dot".
            normalization_strategy (Literal["l2"] | None): Whether to normalize query and document embeddings.
                Defaults to None.
            sparsification_strategy (Literal["relu", "relu_log", "relu_2xlog"] | None): Whether and which
                sparsification function to apply to the embeddings. Defaults to None.
            add_marker_tokens (bool): Whether to add marker tokens to the input sequences. Defaults to False.
            pooling_strategy (Literal["first", "mean", "max", "sum"]): Pooling strategy for query and document
                embeddings. Defaults to "first".
            embedding_dim (int | None): Dimension of the final embeddings. If None, it will be set to the hidden size
                of the backbone model. Defaults to None.
            projection (Literal["linear", "linear_no_bias"] | None): Type of projection layer to apply on the pooled
                embeddings. If None, no projection is applied. Defaults to "linear".
        """
        super().__init__(
            query_length=query_length,
            doc_length=doc_length,
            similarity_function=similarity_function,
            normalization_strategy=normalization_strategy,
            sparsification_strategy=sparsification_strategy,
            add_marker_tokens=add_marker_tokens,
            pooling_strategy=pooling_strategy,
            **kwargs,
        )
        # hidden_size is set by the backbone configuration (if already merged in via
        # kwargs); it may be absent at this point, hence the getattr fallback.
        hidden_size = getattr(self, "hidden_size", None)
        if projection is None:
            # Without a projection layer the output dimension is necessarily the
            # backbone's hidden size, so a user-supplied embedding_dim is overridden.
            self.embedding_dim = hidden_size
        else:
            self.embedding_dim = embedding_dim or hidden_size
        self.projection = projection
81
class DprModel(SingleVectorBiEncoderModel):
    """A single-vector DPR model. See :class:`DprConfig` for configuration options."""

    config_class = DprConfig
    """Configuration class for a DPR model."""

    def __init__(self, config: SingleVectorBiEncoderConfig, *args, **kwargs) -> None:
        """Initializes a DPR model given a :class:`DprConfig`.

        Args:
            config (SingleVectorBiEncoderConfig): Configuration for the DPR model.

        Raises:
            ValueError: If a projection layer is configured but the embedding dimension cannot be determined
                from the configuration.
        """
        super().__init__(config, *args, **kwargs)
        if self.config.projection is None:
            # No projection: pooled backbone embeddings are used as-is.
            self.projection: torch.nn.Module = torch.nn.Identity()
        else:
            if self.config.embedding_dim is None:
                raise ValueError("Unable to determine embedding dimension.")
            # "linear" includes a bias term; "linear_no_bias" does not.
            self.projection = torch.nn.Linear(
                self.config.hidden_size,
                self.config.embedding_dim,
                bias="no_bias" not in self.config.projection,
            )
        self.pooler = Pooler(config)
        self.sparsifier = Sparsifier(config)

    def encode(self, encoding: BatchEncoding, input_type: Literal["query", "doc"]) -> BiEncoderEmbedding:
        """Encodes batched tokenized text sequences and returns the embeddings and scoring mask.

        Args:
            encoding (BatchEncoding): Tokenizer encodings for the text sequence.
            input_type (Literal["query", "doc"]): Type of input, either "query" or "doc". Unused here since
                queries and documents are encoded identically; presumably required by the base-class
                interface — TODO confirm.

        Returns:
            BiEncoderEmbedding: Embeddings and scoring mask.
        """
        embeddings = self._backbone_forward(**encoding).last_hidden_state
        # Pool token embeddings to one vector per sequence, then optionally project,
        # sparsify, and l2-normalize according to the configuration.
        embeddings = self.pooler(embeddings, encoding["attention_mask"])
        embeddings = self.projection(embeddings)
        embeddings = self.sparsifier(embeddings)
        if self.config.normalization_strategy == "l2":
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=-1)
        return BiEncoderEmbedding(embeddings, None, encoding)