
Commit f88d7a9

Fix lora test by removing LoRA extra vocab (#1156)
Signed-off-by: Xiongfei Wei <[email protected]>
1 parent 37afd29 commit f88d7a9


4 files changed (+1, -19 lines)


tests/lora/test_layers.py

Lines changed: 0 additions & 6 deletions

```diff
@@ -91,7 +91,6 @@ def populate_loras(
     index_to_id: list[Optional[int]],
     lora_layer: BaseLayerWithLoRA,
     baselayer_weights: torch.Tensor,
-    generate_embeddings_tensor: int = 0,
     repeats: int = 1,
 ) -> tuple[dict[int, LoRALayerWeights], dict[int, list[LoRALayerWeights]]]:
     """This method populates the lora weights (lora_a and lora_b) in the lora layers (BaseLayerWithLoRA).
@@ -103,8 +102,6 @@ def populate_loras(
         lora_layer: the LoRAlayer to populate.
         baselayer_weights: the PyTorch tensor containing the layer's
             weights.
-        generate_embeddings_tensor: whether to generate an
-            embeddings tensor for each LoRA.
         repeats: must only be set for column parallel packed
             layers. Indicates the number of loras to compose
             together to create a single lora layer.
@@ -131,7 +128,6 @@ def populate_loras(
                     baselayer_weights.device).init_random_lora(
                         module_name=f"fake_{i}",
                         weight=baselayer_weights,
-                        generate_embeddings_tensor=generate_embeddings_tensor,
                     )
                 sublora.lora_b = sublora.lora_b[(sublora_len *
                                                  i):(sublora_len * (i + 1)), :]
@@ -147,7 +143,6 @@ def populate_loras(
             slot_idx,
             lora_a=lora.lora_a,
             lora_b=lora.lora_b,
-            embeddings_tensor=lora.embeddings_tensor,
         )

         lora_dict[lora_id] = lora
@@ -546,7 +541,6 @@ def _update_punica_wrapper_metadata(punica_wrapper, index_mapping,
         index_to_id,
         lora_config.max_loras,
         vocab_size=512,
-        extra_vocab_size=lora_config.lora_extra_vocab_size,
     )
     assert jax_view(punica_wrapper._lora_indices_per_batch).platform(
     ) == 'tpu', 'punica_wrapper._lora_indices_per_batch should have been moved to TPU.'
```
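
With the embeddings tensor gone, the test helper only slices each sub-LoRA's lora_b when composing column-parallel packed layers (the @@ -131 hunk above). A minimal standalone sketch of that slicing pattern, with assumed shapes and an illustrative helper name (not the test's code):

```python
import torch


# Minimal sketch: split a packed lora_b into `repeats` equal sub-LoRA slices along
# the output dimension, mirroring the slicing in the hunk above.
# Assumes lora_b is laid out as (output_dim, rank) and output_dim % repeats == 0.
def split_packed_lora_b(lora_b: torch.Tensor, repeats: int) -> list[torch.Tensor]:
    sublora_len = lora_b.shape[0] // repeats
    return [
        lora_b[sublora_len * i:sublora_len * (i + 1), :] for i in range(repeats)
    ]


packed = torch.rand(12, 8)  # e.g. a packed layer with output_dim=12, rank=8
chunks = split_packed_lora_b(packed, repeats=3)
assert all(chunk.shape == (4, 8) for chunk in chunks)
```

Each slice covers an equal chunk of the packed output dimension, so the output dimension must be divisible by `repeats`.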

tests/lora/utils.py

Lines changed: 0 additions & 8 deletions

```diff
@@ -24,7 +24,6 @@ def init_random_lora(
         module_name: str,
         weight: torch.Tensor,
         rank: int = 8,
-        generate_embeddings_tensor: int = 0,
     ):
         lora = LoRALayerWeights(
             module_name,
@@ -37,13 +36,6 @@ def init_random_lora(
                               dtype=weight.dtype,
                               device=self._device),
         )
-        if generate_embeddings_tensor:
-            lora.embeddings_tensor = torch.rand(
-                5,
-                generate_embeddings_tensor,
-                dtype=weight.dtype,
-                device=self._device,
-            )
         self.set_module_lora(module_name, lora)

         return lora
```
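
After this change, init_random_lora only fills lora_a and lora_b; no embeddings tensor is attached. A rough standalone sketch of that idea, where the (in_features, rank) / (rank, out_features) layout and the make_dummy_lora helper are assumptions for illustration, not the DummyLoRAManager API:

```python
import torch


# Rough sketch only: a dummy LoRA is just two random low-rank factors; nothing like
# an embeddings tensor is created anymore. The factor layout below is assumed.
def make_dummy_lora(weight: torch.Tensor,
                    rank: int = 8) -> tuple[torch.Tensor, torch.Tensor]:
    out_features, in_features = weight.shape
    lora_a = torch.rand(in_features, rank, dtype=weight.dtype)
    lora_b = torch.rand(rank, out_features, dtype=weight.dtype)
    return lora_a, lora_b


base = torch.rand(16, 32)  # fake base-layer weight: out_features=16, in_features=32
lora_a, lora_b = make_dummy_lora(base, rank=4)
delta = (lora_a @ lora_b).T  # (out_features, in_features), same shape as the base weight
assert delta.shape == base.shape
```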

tpu_inference/lora/torch_punica_tpu.py

Lines changed: 1 addition & 2 deletions

```diff
@@ -239,7 +239,6 @@ def _update_base_metadata(
         lora_index_to_id: list[Optional[int]],
         max_loras: int,
         vocab_size: int,
-        extra_vocab_size: int,
     ):
         # Pad the prompt mapping to avoid running into recompiles on the TPU
         # TODO: Should this happen inside mapping internally? If so how can we
@@ -258,7 +257,7 @@ def _update_base_metadata(
             lora_index_to_id,
             max_loras,
             vocab_size,
-            extra_vocab_size,
+            0,  # extra_vocab_size
             "cpu",
         )
         with torchax.default_env():
```
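
On the TPU path, extra_vocab_size is no longer accepted at all; the wrapper hardcodes 0 when delegating to the base metadata update (the `+ 0,  # extra_vocab_size` line above). A sketch of that forwarding pattern with placeholder class names, not the actual PunicaWrapper API:

```python
from typing import Optional


# Placeholder names only; this illustrates the forwarding idea, not the real classes.
class BaseMetadataWrapper:

    def update_metadata(self, mapping, index_to_id: list[Optional[int]],
                        max_loras: int, vocab_size: int, extra_vocab_size: int,
                        device: str):
        self.meta = (mapping, index_to_id, max_loras, vocab_size,
                     extra_vocab_size, device)


class TPUMetadataWrapper(BaseMetadataWrapper):

    # The TPU-side signature drops extra_vocab_size entirely and always forwards 0,
    # since extra LoRA vocabulary is not supported on this path.
    def update_metadata(self, mapping, index_to_id: list[Optional[int]],
                        max_loras: int, vocab_size: int, device: str):
        super().update_metadata(mapping, index_to_id, max_loras, vocab_size, 0,
                                device)


wrapper = TPUMetadataWrapper()
wrapper.update_metadata([0, 1], [None, 7], max_loras=2, vocab_size=512, device="cpu")
assert wrapper.meta[4] == 0  # extra_vocab_size is always 0 now
```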

tpu_inference/runner/tpu_runner.py

Lines changed: 0 additions & 3 deletions

```diff
@@ -472,9 +472,6 @@ def _init_inputs(self) -> None:

         # tensors for structured decoding
         self.vocab_size = self.model_config.get_vocab_size()
-        if self.lora_config is not None:
-            # lora_config.lora_extra_vocab_size is the "Maximum size of extra vocabulary that can be present in a LoRA adapter" per https://github.com/vanbasten23/vllm/blob/7f4a8b6705622fde952a2e633e86716f902d6e1b/vllm/config.py#L3040
-            self.vocab_size += self.lora_config.lora_extra_vocab_size
         self.grammar_bitmask_cpu = np.zeros(
             (self.max_num_reqs, cdiv(self.vocab_size, 32)),
             dtype=np.int32,
```
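
With the extra LoRA vocabulary dropped from self.vocab_size, the structured-decoding bitmask is sized from the base vocabulary alone. A small worked example of the resulting shape, where the 32,000-token vocabulary and the 256-token extra vocab are assumed values for illustration:

```python
import numpy as np


def cdiv(a: int, b: int) -> int:
    """Ceiling division, as used to size the bitmask in 32-bit words."""
    return -(a // -b)


max_num_reqs = 8
vocab_size = 32_000  # assumed base vocabulary size for this example
grammar_bitmask = np.zeros((max_num_reqs, cdiv(vocab_size, 32)), dtype=np.int32)
assert grammar_bitmask.shape == (8, 1000)  # 32_000 tokens -> 1000 int32 words per request

# Before this change, an extra LoRA vocabulary (e.g. 256 tokens) widened every row:
assert cdiv(vocab_size + 256, 32) == 1008
```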
