
Commit f2887f6

Author: Lu Fang
Parent: 8b7b409

add assertions and merge impl

Signed-off-by: Lu Fang <[email protected]>

File tree: 3 files changed (+21 / -3 lines)

vllm/v1/attention/backends/flash_attn.py

Lines changed: 1 addition & 1 deletion
@@ -743,7 +743,7 @@ def use_cascade_attention(
     if common_prefix_len < 256:
         return False
     # Cascade attention is currently not supported with these variants.
-    if use_alibi or use_sliding_window:
+    if use_alibi or use_sliding_window or use_local_attention:
         return False
     # Too few queries. Probably not worth using cascade attention.
     # We use an arbitrary threshold of 8 queries. TODO: Tune this threshold.
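The net effect of this one-line change is that chunked local attention now also disqualifies cascade attention. A minimal sketch of the guard's behavior, assuming use_local_attention reaches use_cascade_attention as a plain boolean alongside use_alibi and use_sliding_window (the signature change is not visible in this hunk):

# Illustrative sketch only, not the vLLM implementation: any of these
# attention variants rules out cascade attention.
def cascade_supported(use_alibi: bool, use_sliding_window: bool,
                      use_local_attention: bool) -> bool:
    return not (use_alibi or use_sliding_window or use_local_attention)

assert cascade_supported(False, False, False)      # eligible for cascade
assert not cascade_supported(False, False, True)   # chunked local attention -> fall back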

vllm/v1/kv_cache_interface.py

Lines changed: 14 additions & 0 deletions
@@ -114,6 +114,9 @@ def merge(cls, specs: list[Self]) -> Self:
         merged_spec = super().merge(specs)
         sliding_window = set(spec.sliding_window for spec in specs
                              if spec.sliding_window is not None)
+        attention_chunk_size = set(spec.attention_chunk_size for spec in specs
+                                   if spec.attention_chunk_size is not None)
+
         if len(sliding_window) == 0:
             merged_spec.sliding_window = None
         elif len(sliding_window) == 1:
@@ -122,6 +125,17 @@ def merge(cls, specs: list[Self]) -> Self:
             raise ValueError(
                 "All sliding window layers in the same KV cache group "
                 "must have the same window size.")
+        if len(attention_chunk_size) == 0:
+            merged_spec.attention_chunk_size = None
+        elif len(attention_chunk_size) == 1:
+            merged_spec.attention_chunk_size = attention_chunk_size.pop()
+        else:
+            raise ValueError(
+                "All chunked local attention layers in the same KV cache group "
+                "must have the same chunk size.")
+        assert len(sliding_window) + len(attention_chunk_size) <= 1, (
+            "Model with both sliding window layers and chunked local attention "
+            "layers is not supported.")
         return merged_spec
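To illustrate the merge rules added here with a toy example (a simplified stand-in dataclass, not the vLLM spec classes): a group whose specs agree on one attention_chunk_size merges to that value, a group with no chunk size merges to None, and mixed sizes raise.

from dataclasses import dataclass
from typing import Optional

@dataclass
class ToySpec:  # hypothetical stand-in carrying only the merged field
    attention_chunk_size: Optional[int] = None

def merge_chunk_size(specs: list[ToySpec]) -> Optional[int]:
    # Mirrors the new branch: collect the distinct non-None chunk sizes.
    sizes = {s.attention_chunk_size for s in specs
             if s.attention_chunk_size is not None}
    if len(sizes) == 0:
        return None
    if len(sizes) == 1:
        return sizes.pop()
    raise ValueError("All chunked local attention layers in the same "
                     "KV cache group must have the same chunk size.")

assert merge_chunk_size([ToySpec(8192), ToySpec(8192)]) == 8192
assert merge_chunk_size([ToySpec(), ToySpec()]) is None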

vllm/v1/worker/gpu_model_runner.py

Lines changed: 6 additions & 2 deletions
@@ -2330,6 +2330,8 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:

             # TODO: Support other attention modules, e.g., cross-attention
             if attn_module.attn_type == AttentionType.DECODER:
+                use_local_attention = (self.attention_chunk_size is not None
+                                       and attn_module.use_irope)
                 if attn_module.sliding_window is not None:
                     kv_cache_spec[layer_name] = SlidingWindowSpec(
                         block_size=block_size,
@@ -2338,8 +2340,10 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
                         dtype=self.kv_cache_dtype,
                         sliding_window=attn_module.sliding_window,
                         use_mla=use_mla)
-                elif self.attention_chunk_size is not None \
-                        and attn_module.use_irope:
+                    assert not use_local_attention, (
+                        "attention module can not be with ",
+                        "both local attention and sliding window")
+                elif use_local_attention:
                     kv_cache_spec[layer_name] = \
                         ChunkedLocalAttentionSpec(
                             block_size=block_size,
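For orientation, a hypothetical helper (not part of the commit) that classifies a decoder layer with the same branch order as the changed block; the real code constructs SlidingWindowSpec or ChunkedLocalAttentionSpec objects rather than returning labels.

from typing import Optional

def classify_decoder_layer(sliding_window: Optional[int],
                           attention_chunk_size: Optional[int],
                           use_irope: bool) -> str:
    # Sketch of the dispatch above, with the new flag computed up front.
    use_local_attention = attention_chunk_size is not None and use_irope
    if sliding_window is not None:
        # The added assertion: a layer must not be both sliding-window
        # and chunked local attention.
        assert not use_local_attention, (
            "attention module can not be with "
            "both local attention and sliding window")
        return "sliding_window"
    elif use_local_attention:
        return "chunked_local_attention"
    return "full_attention"

assert classify_decoder_layer(4096, None, False) == "sliding_window"
assert classify_decoder_layer(None, 8192, True) == "chunked_local_attention"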
