truncate-embedding-dimension #10
opened by jupyterjazz

Files changed:
- configuration_xlm_roberta.py +4 -0
- modeling_xlm_roberta.py +20 -0
configuration_xlm_roberta.py CHANGED

@@ -31,6 +31,8 @@ class XLMRobertaFlashConfig(PretrainedConfig):
         use_flash_attn=True,
         torch_dtype=None,
         emb_pooler=None,
+        matryoshka_dimensions=None,
+        truncate_dim=None,
         **kwargs,
     ):
         super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@@ -59,6 +61,8 @@ class XLMRobertaFlashConfig(PretrainedConfig):
         self.lora_main_params_trainable = lora_main_params_trainable
         self.use_flash_attn = use_flash_attn
         self.emb_pooler = emb_pooler
+        self.matryoshka_dimensions = matryoshka_dimensions
+        self.truncate_dim = truncate_dim
         if torch_dtype and hasattr(torch, torch_dtype) and type(getattr(torch, torch_dtype)) is torch.dtype:
             self.torch_dtype = getattr(torch, torch_dtype)
         else:
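For reference, a minimal sketch of how the two new config fields could be set; the import path, dimension list, and default value below are illustrative assumptions, not taken from this PR:

# Hypothetical sketch (assumed values, not part of the PR).
from configuration_xlm_roberta import XLMRobertaFlashConfig

config = XLMRobertaFlashConfig(
    matryoshka_dimensions=[64, 128, 256, 512, 768],  # assumed: dimensions the checkpoint supports
    truncate_dim=256,  # default used by encode() when no per-call value is given
)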
modeling_xlm_roberta.py CHANGED

@@ -452,6 +452,7 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
         convert_to_tensor: bool = False,
         device: Optional[torch.device] = None,
         normalize_embeddings: bool = False,
+        truncate_dim: Optional[int] = None,
         **tokenizer_kwargs,
     ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]:
         """
@@ -481,6 +482,8 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
                 If set to true, returned vectors will have length 1. In that case, the
                 faster dot-product (util.dot_score) instead of cosine similarity can
                 be used.
+            truncate_dim(`int`, *optional*, defaults to None):
+                The dimension to truncate sentence embeddings to. `None` does no truncation.
             tokenizer_kwargs(`Dict[str, Any]`, *optional*, defaults to {}):
                 Keyword arguments for the tokenizer
         Returns:
@@ -575,6 +578,10 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
 
             all_embeddings = [all_embeddings[idx] for idx in inverse_permutation]
 
+        truncate_dim = truncate_dim or self.config.truncate_dim
+        if truncate_dim:
+            all_embeddings = self.truncate_embeddings(all_embeddings, truncate_dim)
+
         if convert_to_tensor:
             all_embeddings = torch.stack(all_embeddings)
         elif convert_to_numpy:
@@ -586,6 +593,19 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
         self.train(is_training)
         return all_embeddings
 
+
+    def truncate_embeddings(self, embeddings, truncate_dim):
+        if not self.config.matryoshka_dimensions:
+            logger.warning(
+                'Matryoshka embeddings are not supported, so dimension truncation will not be performed.'
+            )
+            return embeddings
+        elif truncate_dim in self.config.matryoshka_dimensions:
+            return [tensor[:truncate_dim] for tensor in embeddings]
+        else:
+            raise ValueError(f'The provided `truncate_dim` value of {truncate_dim} is not supported. '
+                             f'Supported dimensions are {self.config.matryoshka_dimensions}.')
+
     def mean_pooling(
         self, token_embeddings: torch.Tensor, attention_mask: torch.Tensor
     ):
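A usage sketch of the new per-call override; the model id and the trust_remote_code loading path are assumptions about how this custom code would be consumed, not part of the PR:

# Hypothetical sketch (placeholder model id, not part of the PR).
from transformers import AutoModel

model = AutoModel.from_pretrained('org/xlm-roberta-embedding-model', trust_remote_code=True)

# Per-call override; encode() falls back to config.truncate_dim when this is None.
embeddings = model.encode(['hello world'], truncate_dim=128)

# A value outside config.matryoshka_dimensions raises a ValueError; if
# matryoshka_dimensions is unset, a warning is logged and the embeddings
# are returned untruncated.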