
Commit 77afbbc

Rebase and Minor Fixes

Signed-off-by: Mohit Soni <[email protected]>
1 parent e877f9f · commit 77afbbc

3 files changed: +43 −7 lines

QEfficient/base/pytorch_transforms.py (4 additions, 1 deletion)

```diff
@@ -162,7 +162,10 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
             logger.info(f"[layer {layer_idx:02d}] loaded gate_proj & up_proj from fused tensor (shape {fused.shape})")
             transformed = True
 
-        model.language_model = model_tmp
+        if hasattr(model, "language_model"):
+            model.language_model = model_tmp
+        else:
+            model = model_tmp
         return model, transformed
 
```
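The guard matters because this transform runs on two shapes of model: multimodal wrappers that nest their text decoder under a `language_model` attribute, and text-only models where the transformed module must replace the model wholesale. A minimal sketch of that control flow, using hypothetical stand-in classes; only `attach` mirrors the committed logic:

```python
import torch.nn as nn

class TextDecoder(nn.Module):
    """Hypothetical stand-in for a text-only model."""

class MultimodalWrapper(nn.Module):
    """Hypothetical stand-in for a vision-language wrapper."""
    def __init__(self):
        super().__init__()
        self.language_model = TextDecoder()

def attach(model: nn.Module, model_tmp: nn.Module) -> nn.Module:
    # Mirrors the committed guard: nest the transformed decoder when
    # the wrapper exposes one, otherwise replace the model itself.
    if hasattr(model, "language_model"):
        model.language_model = model_tmp
    else:
        model = model_tmp
    return model

assert isinstance(attach(MultimodalWrapper(), TextDecoder()).language_model, TextDecoder)
assert isinstance(attach(TextDecoder(), TextDecoder()), TextDecoder)
```

Before this fix, the unconditional `model.language_model = model_tmp` would have bolted a stray `language_model` submodule onto text-only models instead of returning the transformed module.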

QEfficient/transformers/models/llama4/modeling_llama4.py (3 additions, 2 deletions)

```diff
@@ -10,7 +10,7 @@
 
 import torch
 from torch import nn
-from transformers.cache_utils import Cache, DynamicCache
+from transformers.cache_utils import Cache
 from transformers.modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPast,
@@ -32,6 +32,7 @@
     repeat_kv,
 )
 
+from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
 from QEfficient.utils import constants
 from QEfficient.utils._utils import IOInfo, get_padding_shape_from_config
@@ -704,7 +705,7 @@ def forward(
         return_legacy_cache = False
         if use_cache and not isinstance(past_key_values, Cache):
            return_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values = QEffDynamicCache.from_legacy_cache(past_key_values)
 
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
```
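The replacement keeps the usual legacy-cache promotion pattern and only changes which cache class is constructed. A runnable sketch using the stock `DynamicCache` from `transformers` as a stand-in; assuming, as in QEfficient's `cache_utils`, that `QEffDynamicCache` subclasses `DynamicCache`, `from_legacy_cache` is the inherited classmethod and the call shape is identical (tensor shapes below are illustrative):

```python
import torch
from transformers.cache_utils import Cache, DynamicCache

# Legacy format: one (key, value) pair per layer, each of shape
# (batch, num_heads, seq_len, head_dim).
legacy = tuple(
    (torch.zeros(1, 8, 4, 64), torch.zeros(1, 8, 4, 64)) for _ in range(2)
)

past_key_values = legacy
if not isinstance(past_key_values, Cache):
    # The commit swaps DynamicCache for QEffDynamicCache at this point,
    # so the QEfficient cache's update() applies without monkey-patching.
    past_key_values = DynamicCache.from_legacy_cache(past_key_values)

print(past_key_values.get_seq_length())  # 4
```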

QEfficient/transformers/models/pytorch_transforms.py (36 additions, 4 deletions)

```diff
@@ -8,7 +8,6 @@
 from types import MethodType
 from typing import Optional, Tuple
 
-import transformers
 from torch import nn
 from transformers.models.codegen.modeling_codegen import (
     CodeGenAttention,
@@ -50,6 +49,16 @@
     GraniteModel,
     GraniteRMSNorm,
 )
+from transformers.models.granitemoe.modeling_granitemoe import (
+    GraniteMoeAttention,
+    GraniteMoeForCausalLM,
+    GraniteMoeModel,
+    GraniteMoeMoE,
+    GraniteMoeParallelExperts,
+    GraniteMoeRMSNorm,
+    GraniteMoeRotaryEmbedding,
+    GraniteMoeTopKGating,
+)
 from transformers.models.llama.modeling_llama import (
     LlamaAttention,
     LlamaDecoderLayer,
@@ -72,6 +81,9 @@
 from transformers.models.llava.modeling_llava import (
     LlavaForConditionalGeneration,
 )
+from transformers.models.llava_next.modeling_llava_next import (
+    LlavaNextForConditionalGeneration,
+)
 from transformers.models.mistral.modeling_mistral import (
     MistralAttention,
     MistralDecoderLayer,
@@ -133,7 +145,6 @@
 
 from QEfficient.base.pytorch_transforms import ModuleMappingTransform, ModuleMethodMapperTransform
 from QEfficient.customop import CustomRMSNormAIC, GemmaCustomRMSNormAIC
-from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.models.codegen.modeling_codegen import (
     QEffCodeGenAttention,
     QeffCodeGenBlock,
@@ -181,6 +192,15 @@
     QEffGraniteForCausalLM,
     QEffGraniteModel,
 )
+from QEfficient.transformers.models.granitemoe.modeling_granitemoe import (
+    QEffGraniteMoeAttention,
+    QEffGraniteMoeForCausalLM,
+    QEffGraniteMoeModel,
+    QEffGraniteMoeMoE,
+    QEffGraniteMoeParallelExperts,
+    QEffGraniteMoeRotaryEmbedding,
+    QEffGraniteMoeTopKGating,
+)
 from QEfficient.transformers.models.internvl.modeling_internvl import (
     QEffInternVisionEmbeddings,
     QEffInternVLModel,
@@ -205,6 +225,9 @@
 from QEfficient.transformers.models.llava.modeling_llava import (
     QEffLlavaForConditionalGeneration,
 )
+from QEfficient.transformers.models.llava_next.modeling_llava_next import (
+    QEffLlavaNextForConditionalGeneration,
+)
 from QEfficient.transformers.models.mistral.modeling_mistral import (
     QEffMistralAttention,
     QEffMistralDecoderLayer,
@@ -287,6 +310,7 @@ class CustomOpsTransform(ModuleMappingTransform):
         Qwen2RMSNorm: CustomRMSNormAIC,
         MllamaTextRMSNorm: CustomRMSNormAIC,
         GraniteRMSNorm: CustomRMSNormAIC,
+        GraniteMoeRMSNorm: CustomRMSNormAIC,
     }
 
 
@@ -329,6 +353,8 @@ class KVCacheTransform(ModuleMappingTransform):
         Llama4TextExperts: QEffLlama4TextExperts,
         # Llava
         LlavaForConditionalGeneration: QEffLlavaForConditionalGeneration,
+        # Llava Next
+        LlavaNextForConditionalGeneration: QEffLlavaNextForConditionalGeneration,
         # Gemma
         GemmaAttention: QEffGemmaAttention,
         GemmaDecoderLayer: QEffGemmaDecoderLayer,
@@ -343,6 +369,14 @@ class KVCacheTransform(ModuleMappingTransform):
         GraniteModel: QEffGraniteModel,
         GraniteForCausalLM: QEffGraniteForCausalLM,
         GraniteAttention: QEffGraniteAttention,
+        # GraniteMoe
+        GraniteMoeModel: QEffGraniteMoeModel,
+        GraniteMoeForCausalLM: QEffGraniteMoeForCausalLM,
+        GraniteMoeAttention: QEffGraniteMoeAttention,
+        GraniteMoeRotaryEmbedding: QEffGraniteMoeRotaryEmbedding,
+        GraniteMoeParallelExperts: QEffGraniteMoeParallelExperts,
+        GraniteMoeTopKGating: QEffGraniteMoeTopKGating,
+        GraniteMoeMoE: QEffGraniteMoeMoE,
         # mllama
         MllamaTextRMSNorm: CustomRMSNormAIC,
         MllamaTextSelfAttention: QEffMllamaTextSelfAttention,
@@ -407,8 +441,6 @@ class KVCacheTransform(ModuleMappingTransform):
     @classmethod
     def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
         model, transformed = super().apply(model)
-        # FIXME: see if we can merge into _module_mapping dict
-        transformers.cache_utils.DynamicCache.update = QEffDynamicCache.update
         return model, transformed
```
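With GraniteMoe and LlavaNext wired in purely through the `_module_mapping` dictionaries, and with llama4 constructing `QEffDynamicCache` directly (see the diff above), the global `DynamicCache.update` monkey-patch is no longer needed, and `apply` reduces to the base-class mapping lookup. A minimal sketch of that class-swap pattern; the names are illustrative, not the QEfficient implementation:

```python
import torch.nn as nn

class Original(nn.Linear):
    """Hypothetical module targeted for replacement."""

class Patched(Original):
    """Hypothetical drop-in with modified forward behavior."""
    def forward(self, x):
        return super().forward(x) * 2

MAPPING = {Original: Patched}

def apply(model: nn.Module) -> tuple[nn.Module, bool]:
    transformed = False
    for module in model.modules():
        if type(module) in MAPPING:
            # Re-class in place: parameters are kept, only forward() changes.
            module.__class__ = MAPPING[type(module)]
            transformed = True
    return model, transformed

model, transformed = apply(nn.Sequential(Original(4, 4)))
assert transformed and isinstance(model[0], Patched)
```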
