Commit 320ab71
Author: Lu Fang

add attention_chunk_size in full attention spec

Signed-off-by: Lu Fang <[email protected]>

Parent: 78385bc

File tree

vllm/v1/core/kv_cache_utils.py
vllm/v1/core/single_type_kv_cache_manager.py
vllm/v1/kv_cache_interface.py

3 files changed: +13 -10 lines

vllm/v1/core/kv_cache_utils.py (1 addition, 0 deletions)

@@ -927,6 +927,7 @@ def is_hybrid(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
                 head_size=spec.head_size,
                 dtype=spec.dtype,
                 use_mla=spec.use_mla,
+                attention_chunk_size=spec.attention_chunk_size,
             )

     if is_hybrid(kv_cache_spec):

vllm/v1/core/single_type_kv_cache_manager.py (11 additions, 10 deletions)

@@ -386,7 +386,7 @@ def get_num_common_prefix_blocks(self, request_id: str,
         NOTE(Chen): The prefix blocks are null blocks for sliding window layers.
         So it's not correct to count ref_cnt like FullAttentionManager. Return
         0 here for correctness. Need to support cascade attention + sliding
-        window in the future
+        window in the future.
         """
         return 0

@@ -414,22 +414,23 @@ def find_longest_cache_hit(
                 "chunked local attention groups")
         max_num_blocks = max_length // kv_cache_spec.block_size
         if max_length > 0:
-            local_attention_start_idx = (
-                (max_length-1) // kv_cache_spec.attention_chunk_size
-                * kv_cache_spec.attention_chunk_size)
+            local_attention_start_idx = ((max_length - 1) //
+                                         kv_cache_spec.attention_chunk_size *
+                                         kv_cache_spec.attention_chunk_size)
         else:
             local_attention_start_idx = 0
         # [ block 0, ..., block x(x_start<=first_attention_token),
         # block x+1, .., block N (N_end <=max_len), ...]
-        local_attention_start_block_idx = (
-            local_attention_start_idx // kv_cache_spec.block_size)
+        local_attention_start_block_idx = (local_attention_start_idx //
+                                           kv_cache_spec.block_size)
         computed_blocks: tuple[list[KVCacheBlock], ...] = tuple(
             [block_pool.null_block] * local_attention_start_block_idx
             for _ in range(len(kv_cache_group_ids)))
-        # for local chunked attention, we marked blocks out of window as computed
-        # with null blocks, and blocks inside window based on cache lookup result
-        # [null] [null] ... [null] [hit block 1 (1st block contain last window)]
-        # [hit block 2] ... [hit block x][
+        # we marked blocks out of window as computed
+        # with null blocks, and blocks inside window
+        # based on cache lookup result
+        # [null] [null] ... [null] [hit block 1 (1st block contain last window)]
+        # [hit block 2] ... [hit block x]
         for i in range(local_attention_start_block_idx, max_num_blocks):
             block_hash = block_hashes[i]
             if cached_block := block_pool.get_cached_block(
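
As a rough illustration of the index arithmetic in this hunk, here is a small standalone sketch; the concrete numbers are made up for the example and are not taken from any real model config:

    # Hypothetical values, for illustration only.
    attention_chunk_size = 8192   # tokens covered by one local-attention chunk
    block_size = 16               # tokens per KV-cache block
    max_length = 10000            # prefix length being checked for a cache hit

    # Same arithmetic as the diff: start of the chunk containing the last token.
    local_attention_start_idx = ((max_length - 1) //
                                 attention_chunk_size *
                                 attention_chunk_size)          # -> 8192

    # First block that can hold tokens of the active chunk.
    local_attention_start_block_idx = (local_attention_start_idx //
                                       block_size)              # -> 512

    # Blocks 0..511 lie entirely before the active chunk, so they are treated
    # as computed and filled with the null block; blocks from 512 up to
    # max_num_blocks - 1 go through the normal cache lookup.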

vllm/v1/kv_cache_interface.py (1 addition, 0 deletions)

@@ -86,6 +86,7 @@ def page_size_bytes(self) -> int:
 @dataclass
 class FullAttentionSpec(AttentionSpec):
     sliding_window: Optional[int] = None
+    attention_chunk_size: Optional[int] = None
     """
     When hybrid allocator is disabled and the model contains both full
     attention layers and sliding window attention layers, sliding
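
For context, a minimal sketch of the spec after this change, assuming a simplified AttentionSpec base that carries only the fields referenced elsewhere in this commit (the real definition in vllm/v1/kv_cache_interface.py has more fields and logic):

    from dataclasses import dataclass
    from typing import Optional

    import torch

    @dataclass
    class AttentionSpec:
        # Simplified stand-in; only fields referenced in this commit are shown.
        block_size: int
        head_size: int
        dtype: torch.dtype
        use_mla: bool

    @dataclass
    class FullAttentionSpec(AttentionSpec):
        sliding_window: Optional[int] = None
        # New field added by this commit: chunk size for chunked local
        # attention layers that get folded into a full-attention spec.
        attention_chunk_size: Optional[int] = None

    # Hypothetical construction mirroring the kv_cache_utils.py hunk, which now
    # propagates attention_chunk_size from the per-layer spec:
    spec = FullAttentionSpec(block_size=16, head_size=128, dtype=torch.bfloat16,
                             use_mla=False, attention_chunk_size=8192)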

0 commit comments