2 changes: 2 additions & 0 deletions lmdeploy/messages.py
@@ -350,6 +350,7 @@ class PytorchEngineConfig:
         dllm_denoising_steps (int): Dllm denoising steps.
         dllm_confidence_threshold (float): dllm unmasking threshold for
             dynamic unmasking.
+        enforce_fp32_head (bool): Enforce lm_head to use fp32 in forward.
     """
     dtype: str = 'auto'
     tp: int = 1
@@ -387,6 +388,7 @@ class PytorchEngineConfig:
     hf_overrides: Optional[Dict[str, Any]] = None
     disable_vision_encoder: bool = False
     logprobs_mode: str = None
+    enforce_fp32_head: bool = False
     # router replay
     enable_return_routed_experts: bool = False
     enable_transfer_obj_ref: bool = False
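
The new option is exposed on the user-facing PytorchEngineConfig, so it can be switched on when building a pipeline. A minimal usage sketch (the model path below is a placeholder, not part of this PR):

from lmdeploy import pipeline, PytorchEngineConfig

# enforce_fp32_head=True asks the PyTorch engine to run the lm_head in fp32.
backend_config = PytorchEngineConfig(enforce_fp32_head=True)
pipe = pipeline('internlm/internlm2_5-7b-chat',  # placeholder model path
                backend_config=backend_config)
print(pipe(['Hello, world!']))
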
2 changes: 2 additions & 0 deletions lmdeploy/pytorch/config.py
@@ -435,6 +435,7 @@ class MiscConfig:
     logprobs_mode: str = None
     dllm_config: DLLMConfig = None
     enable_return_routed_experts: bool = False
+    enforce_fp32_head: bool = False
 
     @classmethod
     def from_engine_config(cls, engine_config: PytorchEngineConfig):
@@ -454,6 +455,7 @@ def from_engine_config(cls, engine_config: PytorchEngineConfig):
             logprobs_mode=engine_config.logprobs_mode,
             dllm_config=dllm_config,
             enable_return_routed_experts=engine_config.enable_return_routed_experts,
+            enforce_fp32_head=engine_config.enforce_fp32_head,
         )
         return misc_config
 
1 change: 1 addition & 0 deletions lmdeploy/pytorch/engine/model_agent.py
@@ -1019,6 +1019,7 @@ def _build_model(self):
             dllm_config=self.misc_config.dllm_config,
             strategy_factory=self.strategy_factory,
             enable_return_routed_experts=enable_return_routed_experts,
+            enforce_fp32_head=self.misc_config.enforce_fp32_head,
         )
         patched_model = build_patched_model(self.model_config,
                                             device=device,
1 change: 1 addition & 0 deletions lmdeploy/pytorch/model_inputs.py
@@ -492,6 +492,7 @@ class BuildModelContext:
     dllm_config: DLLMConfig = None
     strategy_factory: 'StrategyFactoryBase' = None
     enable_return_routed_experts: bool = False
+    enforce_fp32_head: bool = False
 
 
 class StepContextManager:
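
With the pieces above in place, the flag flows from the engine config through MiscConfig into the BuildModelContext that model construction sees. A simplified sketch of that plumbing (the BuildModelContext call is abbreviated; in the code it happens inside model_agent._build_model):

from lmdeploy.messages import PytorchEngineConfig
from lmdeploy.pytorch.config import MiscConfig

engine_config = PytorchEngineConfig(enforce_fp32_head=True)
misc_config = MiscConfig.from_engine_config(engine_config)
assert misc_config.enforce_fp32_head is True
# model_agent._build_model then forwards it into the build context:
#   BuildModelContext(..., enforce_fp32_head=self.misc_config.enforce_fp32_head)
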
31 changes: 14 additions & 17 deletions lmdeploy/pytorch/models/internlm2.py
@@ -7,12 +7,13 @@
 from transformers.configuration_utils import PretrainedConfig
 
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config
-from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj,
-                                        build_rowwise_linear)
+from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, Embedding, RMSNorm, SiluAndMul,
+                                 build_rotary_embedding_from_config)
+from lmdeploy.pytorch.nn.linear import build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .utils.cudagraph import CudaGraphMixin
+from .utils.model import DeployModelMixinV1
 
 
 class InternLM2Attention(nn.Module):
@@ -208,11 +209,11 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
 
-        self.tok_embeddings = nn.Embedding(config.vocab_size,
-                                           config.hidden_size,
-                                           self.padding_idx,
-                                           dtype=dtype,
-                                           device=device)
+        self.tok_embeddings = Embedding(config.vocab_size,
+                                        config.hidden_size,
+                                        self.padding_idx,
+                                        dtype=dtype,
+                                        device=device)
 
         # build all decode layers
         self.layers = nn.ModuleList([
@@ -269,7 +270,7 @@ def get_input_embeddings(self):
         return self.tok_embeddings
 
 
-class InternLM2ForCausalLM(nn.Module, CudaGraphMixin):
+class InternLM2ForCausalLM(nn.Module, DeployModelMixinV1, CudaGraphMixin):
     """Rewrote model of InternLM2ForCausalLM."""
 
     packed_modules_mapping = {
@@ -290,11 +291,7 @@ def __init__(self,
         # build Model
         self.model = InternLM2Model(config, dtype=dtype, device=device)
         # build lm_head
-        self.output = build_rowwise_linear(config.hidden_size,
-                                           config.vocab_size,
-                                           bias=False,
-                                           dtype=dtype,
-                                           device=device)
+        self.output = self.build_lm_head(config.hidden_size, config.vocab_size, bias=False, dtype=dtype, device=device)
 
     def forward(
         self,
@@ -315,9 +312,9 @@ def forward(
         )
         return hidden_states
 
-    def get_logits(self, hidden_states: torch.Tensor):
-        """Compute logits of the model output."""
-        return self.output(hidden_states)
+    def get_lm_head(self):
+        """Get lm_head."""
+        return self.output
 
     def get_input_embeddings(self):
         """Get input embeddings."""
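
These models now build the output projection through DeployModelMixinV1.build_lm_head() and expose it via get_lm_head() instead of implementing get_logits() themselves. The mixin itself (lmdeploy/pytorch/models/utils/model.py) is not part of this diff; the sketch below is only an assumed outline of the contract these call sites rely on, including how enforce_fp32_head from the build context might be honored:

# Illustrative only: an assumed shape of DeployModelMixinV1, which is not
# shown in this diff. Anything beyond the build_lm_head/get_lm_head/get_logits
# names is a guess.
import torch
from lmdeploy.pytorch.nn.linear import build_rowwise_linear


class DeployModelMixinV1Sketch:

    def build_lm_head(self, hidden_size, vocab_size, bias=False, dtype=None, device=None):
        """Build the output projection, optionally forcing fp32 (assumption)."""
        if getattr(self, 'enforce_fp32_head', False):  # assumed to originate from BuildModelContext
            dtype = torch.float32
        return build_rowwise_linear(hidden_size, vocab_size, bias=bias, dtype=dtype, device=device)

    def get_lm_head(self):
        """Default accessor; models with a differently named head override it."""
        return self.lm_head

    def get_logits(self, hidden_states: torch.Tensor):
        """Compute logits through whatever module get_lm_head() returns."""
        return self.get_lm_head()(hidden_states)
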
34 changes: 11 additions & 23 deletions lmdeploy/pytorch/models/internlm3.py
@@ -7,12 +7,13 @@
 from transformers.configuration_utils import PretrainedConfig
 
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config
-from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj,
-                                        build_rowwise_linear)
+from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, Embedding, RMSNorm, SiluAndMul,
+                                 build_rotary_embedding_from_config)
+from lmdeploy.pytorch.nn.linear import build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .utils.cudagraph import CudaGraphMixin
+from .utils.model import DeployModelMixinV1
 
 
 class InternLM3Attention(nn.Module):
@@ -210,11 +211,11 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
 
-        self.embed_tokens = nn.Embedding(config.vocab_size,
-                                         config.hidden_size,
-                                         self.padding_idx,
-                                         dtype=dtype,
-                                         device=device)
+        self.embed_tokens = Embedding(config.vocab_size,
+                                      config.hidden_size,
+                                      self.padding_idx,
+                                      dtype=dtype,
+                                      device=device)
 
         # build all decode layers
         self.layers = nn.ModuleList([
@@ -271,7 +272,7 @@ def get_input_embeddings(self):
         return self.embed_tokens
 
 
-class InternLM3ForCausalLM(nn.Module, CudaGraphMixin):
+class InternLM3ForCausalLM(nn.Module, DeployModelMixinV1, CudaGraphMixin):
     """Rewrote model of InternLM3ForCausalLM."""
 
     packed_modules_mapping = {
@@ -297,11 +298,7 @@ def __init__(self,
         # build InternLM3Model
         self.model = InternLM3Model(config, dtype=dtype, device=device)
         # build lm_head
-        self.lm_head = build_rowwise_linear(config.hidden_size,
-                                            config.vocab_size,
-                                            bias=False,
-                                            dtype=dtype,
-                                            device=device)
+        self.lm_head = self.build_lm_head(config.hidden_size, config.vocab_size, bias=False, dtype=dtype, device=device)
 
     def forward(
         self,
@@ -322,15 +319,6 @@ def forward(
         )
         return hidden_states
 
-    def update_weights(self):
-        """Update weights."""
-        if self.config.tie_word_embeddings:
-            self.lm_head.weight = self.model.embed_tokens.weight
-
-    def get_logits(self, hidden_states: torch.Tensor):
-        """Compute logits of the model output."""
-        return self.lm_head(hidden_states)
-
     def get_input_embeddings(self):
         """Get input embeddings."""
         return self.model.get_input_embeddings()
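
For InternLM3 (and Phi-3 below) the per-model update_weights() hook that tied lm_head.weight to embed_tokens.weight is removed along with get_logits(). The diff does not show where that responsibility moves; a plausible guess is a shared default on DeployModelMixinV1, roughly like the hedged sketch below (same logic as the deleted code, just centralized):

# Illustrative guess, not shown in this diff: a shared default that replaces
# the deleted per-model update_weights() overrides.
class TiedHeadMixinSketch:

    def update_weights(self):
        """Tie the output head to the input embedding when the config asks for it."""
        if getattr(self.config, 'tie_word_embeddings', False):
            self.get_lm_head().weight = self.get_input_embeddings().weight
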
10 changes: 5 additions & 5 deletions lmdeploy/pytorch/models/internvl.py
@@ -19,7 +19,7 @@
 
 from .patch import build_model_from_hf_config
 from .utils.cudagraph import CudaGraphMixin
-from .utils.model import DeployModelMixin, vlm_model
+from .utils.model import DeployModelMixinV1, vlm_model
 
 
 class Gating(nn.Module):
@@ -444,7 +444,7 @@ def forward(
         return last_hidden_state
 
 
-class InternVLChatModel(nn.Module, DeployModelMixin, CudaGraphMixin):
+class InternVLChatModel(nn.Module, DeployModelMixinV1, CudaGraphMixin):
 
     def __init__(self,
                  config: PretrainedConfig,
@@ -801,9 +801,9 @@ def forward(
                                           position_ids=position_ids,
                                           attn_metadata=attn_metadata)
 
-    def get_logits(self, hidden_states: torch.Tensor):
-        """Compute logits of the model output."""
-        return self.language_model.get_logits(hidden_states)
+    def get_lm_head(self):
+        """Get lm_head."""
+        return self.language_model.get_lm_head()
 
     def get_input_embeddings(self):
         """Get input embeddings."""
10 changes: 5 additions & 5 deletions lmdeploy/pytorch/models/internvl3_hf.py
@@ -20,7 +20,7 @@
 
 from .patch import build_model_from_hf_config
 from .utils.cudagraph import CudaGraphMixin
-from .utils.model import DeployModelMixin, vlm_model
+from .utils.model import DeployModelMixinV1, vlm_model
 
 
 @torch.compile(dynamic=True)
@@ -439,7 +439,7 @@ def forward(self, image_features):
         return hidden_states
 
 
-class InternVLForConditionalGeneration(nn.Module, DeployModelMixin, CudaGraphMixin):
+class InternVLForConditionalGeneration(nn.Module, DeployModelMixinV1, CudaGraphMixin):
 
     def __init__(self,
                  config: PretrainedConfig,
@@ -485,9 +485,9 @@ def _mark_dynamic_once(self, pixel_values, dims):
         torch._dynamo.mark_dynamic(pixel_values, dims)
         self.has_compiled_vit = True
 
-    def get_logits(self, hidden_states: torch.Tensor):
-        """Compute logits of the model output."""
-        return self.language_model.get_logits(hidden_states)
+    def get_lm_head(self):
+        """Get lm_head."""
+        return self.language_model.get_lm_head()
 
     def get_input_embeddings(self):
         """Get input embeddings."""
33 changes: 10 additions & 23 deletions lmdeploy/pytorch/models/phi3.py
@@ -7,13 +7,13 @@
 from transformers.configuration_utils import PretrainedConfig
 
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul
-from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj,
-                                        build_rowwise_linear)
+from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, Embedding, RMSNorm, SiluAndMul
+from lmdeploy.pytorch.nn.linear import build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj
 from lmdeploy.pytorch.nn.rotary_embedding import build_rotary_embedding_from_config
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .utils.cudagraph import CudaGraphMixin
+from .utils.model import DeployModelMixinV1
 
 
 class Phi3Attention(nn.Module):
@@ -211,11 +211,11 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
 
-        self.embed_tokens = nn.Embedding(config.vocab_size,
-                                         config.hidden_size,
-                                         self.padding_idx,
-                                         dtype=dtype,
-                                         device=device)
+        self.embed_tokens = Embedding(config.vocab_size,
+                                      config.hidden_size,
+                                      self.padding_idx,
+                                      dtype=dtype,
+                                      device=device)
 
         # build all decode layers
         self.layers = nn.ModuleList([
@@ -272,7 +272,7 @@ def get_input_embeddings(self):
         return self.embed_tokens
 
 
-class Phi3ForCausalLM(nn.Module, CudaGraphMixin):
+class Phi3ForCausalLM(nn.Module, DeployModelMixinV1, CudaGraphMixin):
     """ModelForCausalLM."""
 
     packed_modules_mapping = {
@@ -293,11 +293,7 @@ def __init__(self,
         # build model
         self.model = Phi3Model(config, dtype=dtype, device=device)
         # build lm_head
-        self.lm_head = build_rowwise_linear(config.hidden_size,
-                                            config.vocab_size,
-                                            bias=False,
-                                            dtype=dtype,
-                                            device=device)
+        self.lm_head = self.build_lm_head(config.hidden_size, config.vocab_size, bias=False, dtype=dtype, device=device)
 
     def forward(
         self,
@@ -318,15 +314,6 @@ def forward(
         )
         return hidden_states
 
-    def get_logits(self, hidden_states: torch.Tensor):
-        """Compute logits of the model output."""
-        return self.lm_head(hidden_states)
-
-    def update_weights(self):
-        """Update weights."""
-        if self.config.tie_word_embeddings:
-            self.lm_head.weight = self.model.embed_tokens.weight
-
     def get_input_embeddings(self):
         """Get input embeddings."""
         return self.model.get_input_embeddings()
11 changes: 3 additions & 8 deletions lmdeploy/pytorch/models/phi3_v.py
@@ -9,11 +9,10 @@
 from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
 from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
-from lmdeploy.pytorch.nn.linear import build_rowwise_linear
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .phi3 import Phi3ForCausalLM, Phi3Model
-from .utils.model import DeployModelMixin, vlm_model
+from .utils.model import vlm_model
 
 CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(attention_dropout=0.0,
                                                      dropout=0.0,
@@ -264,7 +263,7 @@ def forward(
         )
 
 
-class Phi3VForCausalLM(Phi3ForCausalLM, DeployModelMixin):
+class Phi3VForCausalLM(Phi3ForCausalLM):
 
     def __init__(self,
                  config: PretrainedConfig,
@@ -277,11 +276,7 @@ def __init__(self,
         # build model
         self.model = Phi3VModel(config, dtype=dtype, device=device)
         # build lm_head
-        self.lm_head = build_rowwise_linear(config.hidden_size,
-                                            config.vocab_size,
-                                            bias=False,
-                                            dtype=dtype,
-                                            device=device)
+        self.lm_head = self.build_lm_head(config.hidden_size, config.vocab_size, bias=False, dtype=dtype, device=device)
 
         self.input_processor = Phi3VInputProcessor(config, dtype)
 