fix: align _init_weights with Qwen2Moe using nn.init API
Browse files
Use @torch.no_grad() decorator, call super()._init_weights(), and only init MoE gate weights (nn.Linear/nn.Embedding handled by PreTrainedModel base class in transformers v5).
- modeling_llada2_moe.py +3 -8
modeling_llada2_moe.py
CHANGED
|
@@ -686,17 +686,12 @@ class LLaDA2MoePreTrainedModel(PreTrainedModel):
|
|
| 686 |
_supports_flex_attn = True
|
| 687 |
_supports_cache_class = True
|
| 688 |
|
|
|
|
| 689 |
def _init_weights(self, module):
|
| 690 |
super()._init_weights(module)
|
| 691 |
std = self.config.initializer_range
|
| 692 |
-
if isinstance(module, nn.Linear):
|
| 693 |
-
module.weight.data.normal_(mean=0.0, std=std)
|
| 694 |
-
if module.bias is not None:
|
| 695 |
-
module.bias.data.zero_()
|
| 696 |
-
elif isinstance(module, nn.Embedding):
|
| 697 |
-
module.weight.data.normal_(mean=0.0, std=std)
|
| 698 |
-
if module.padding_idx is not None:
|
| 699 |
-
module.weight.data[module.padding_idx].zero_()
|
| 700 |
|
| 701 |
|
| 702 |
LLADA2MOE_INPUTS_DOCSTRING = r"""
|
|
|
|
| 686 |
_supports_flex_attn = True
|
| 687 |
_supports_cache_class = True
|
| 688 |
|
| 689 |
+
@torch.no_grad()
|
| 690 |
def _init_weights(self, module):
|
| 691 |
super()._init_weights(module)
|
| 692 |
std = self.config.initializer_range
|
| 693 |
+
if isinstance(module, LLaDA2MoeGate):
|
| 694 |
+
nn.init.normal_(module.weight, mean=0.0, std=std)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 695 |
|
| 696 |
|
| 697 |
LLADA2MOE_INPUTS_DOCSTRING = r"""
|