""" Compared to standard Qwen3, we're using bidirectional attention and not causal attention, but it's specified with `is_causal=False` in the config. This file supports two loading paths: 1. Sentence Transformers: `SparseEncoder("naver/splade-code-06B", trust_remote_code=True)` via AutoModelForMaskedLM -> Qwen3ForCausalLM 2. Transformers: `AutoModelForCausalLM.from_pretrained("naver/splade-code-06B", trust_remote_code=True)` -> Splade """ import torch import os from transformers import Qwen3ForCausalLM as TransformersQwen3ForCausalLM from transformers import PretrainedConfig, PreTrainedModel, AutoConfig from transformers.utils import is_flash_attn_2_available from .utils import prepare_tokenizer, splade_max, similarity, encode class Qwen3ForCausalLM(TransformersQwen3ForCausalLM): def tie_weights(self, *args, **kwargs): """Explicitly re-tie lm_head to embed_tokens to hopefully avoid meta tensor errors.""" if ( self.config.tie_word_embeddings and hasattr(self, "lm_head") and hasattr(self, "model") ): self.lm_head.weight = self.model.embed_tokens.weight missing_keys = kwargs.get("missing_keys") if missing_keys is not None: missing_keys.discard("lm_head.weight") else: super().tie_weights(*args, **kwargs) def _init_weights(self, module): """Skip lm_head init when it will be tied to embed_tokens later.""" if module is getattr(self, "lm_head", None) and self.config.tie_word_embeddings: return super()._init_weights(module) class SpladeConfig(PretrainedConfig): model_type = "qwen3" def __init__( self, model_name_or_path: str = "Qwen/Qwen3-0.6B", attn_implementation: str = "flash_attention_2", bidirectional: bool = True, # only for decoder models padding_side: str = "left", **kwargs, ): super().__init__(**kwargs) self.model_name_or_path = model_name_or_path self.attn_implementation = attn_implementation self.bidirectional = bidirectional self.padding_side = padding_side class Splade(PreTrainedModel): config_class = SpladeConfig # methods for MTEB's interface similarity = similarity encode = encode def __init__(self, config, weights_path=None, token=None): super().__init__(config) self.name = "splade" base_cfg = AutoConfig.from_pretrained( weights_path, attn_implementation=config.attn_implementation, torch_dtype="auto", ) self.tokenizer = prepare_tokenizer( weights_path, padding_side=config.padding_side ) if is_flash_attn_2_available(): config.attn_implementation = "flash_attention_2" else: config.attn_implementation = "sdpa" self.model = Qwen3ForCausalLM.from_pretrained( weights_path, config=base_cfg, torch_dtype=torch.bfloat16, attn_implementation=config.attn_implementation, token=token, ) def save_pretrained(self, save_directory, *args, **kwargs): self.model.save_pretrained(os.path.join(save_directory, "lora")) self.config.save_pretrained(save_directory) @classmethod def from_pretrained(cls, model_name_or_path, *args, **kwargs): token = kwargs.get("token", None) config = SpladeConfig.from_pretrained( model_name_or_path, token=token, ) model = cls(config, weights_path=model_name_or_path, token=token) model.reverse_voc = {v: k for k, v in model.tokenizer.vocab.items()} return model def forward(self, **tokens): output = self.model(**tokens) splade_reps, _ = splade_max(output.logits, tokens["attention_mask"]) return (splade_reps,) def get_width(self): return self.model.config.vocab_size def create_batch_dict(self, input_texts, max_length): return self.tokenizer( input_texts, add_special_tokens=True, padding="longest", truncation=True, max_length=max_length, return_attention_mask=True, return_tensors="pt", ) __all__ = ["Qwen3ForCausalLM", "Splade"]