
Commit d5f10fb

Clean up and use the all-layer KV cache pool in the FlashMLA kernel
Signed-off-by: Chang Liu (Enterprise Products) <9713593+chang-l@users.noreply.github.com>
1 parent dfb99bc commit d5f10fb

File tree

7 files changed: +41 -67 lines changed


cpp/tensorrt_llm/flash_mla/CMakeLists.txt

Lines changed: 8 additions & 4 deletions
@@ -144,8 +144,8 @@ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
       ${FLASH_MLA_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu)
 endif()

-# Disable LTO before creating target (matching DeepEP's approach) Let CMake
-# generate fatbinData for CUDA separable compilation
+# Disable LTO before creating target (similar to DeepEP) Let CMake generate
+# fatbinData for CUDA separable compilation
 set(CMAKE_INTERPROCEDURAL_OPTIMIZATION FALSE)

 pybind11_add_module(flash_mla_cpp_tllm ${FLASH_MLA_SOURCES})
@@ -174,7 +174,7 @@ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
 endif()
 set_cuda_architectures(flash_mla_cpp_tllm ${FLASH_MLA_BUILD_ARCHS})

-# Compiler options matching FlashMLA setup.py
+# Copy of compiler options from FlashMLA setup.py
 target_compile_options(
   flash_mla_cpp_tllm
   PRIVATE
@@ -215,7 +215,11 @@ target_include_directories(
 # Link libraries (matching FlashMLA setup.py: cuda, cudart + torch)
 target_link_libraries(
   flash_mla_cpp_tllm PRIVATE ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIB}
-  CUDA::cuda_driver CUDA::cudart CUDA::cudart_static)
+  CUDA::cuda_driver CUDA::cudart)
+target_link_options(
+  flash_mla_cpp_tllm PRIVATE
+  -Wl,--version-script,${CMAKE_CURRENT_SOURCE_DIR}/flash_mla_cpp_tllm.version
+  -Wl,--no-undefined-version)

 # Link directories
 target_link_directories(

examples/llm-api/llm_sparse_attention.py

Lines changed: 5 additions & 16 deletions
@@ -64,18 +64,10 @@ def parse_arguments():
                         type=int,
                         default=2048,
                         help="The prompt budget for RocketKV.")
-    parser.add_argument('--index_n_heads',
+    parser.add_argument('--index_max_chunk_size',
                         type=int,
-                        default=64,
-                        help="The number of heads for the indexer.")
-    parser.add_argument('--index_head_dim',
-                        type=int,
-                        default=128,
-                        help="The dimension of the indexer heads.")
-    parser.add_argument('--index_topk',
-                        type=int,
-                        default=2048,
-                        help="The topk for the indexer.")
+                        default=32768,
+                        help="The maximum chunk size for the indexer.")
     parser.add_argument("--max_seq_len",
                         type=int,
                         default=8192,
@@ -198,11 +190,8 @@ def run_RocketKV(args):

 def run_DSA(args):
     sparse_attention_config = DSASparseAttentionConfig(
-        index_n_heads=args.index_n_heads,
-        index_head_dim=args.index_head_dim,
-        index_topk=args.index_topk,
-    )
-    run_llm(args, None)
+        indexer_max_chunk_size=args.index_max_chunk_size, )
+    run_llm(args, sparse_attention_config)


 def main():

tensorrt_llm/_torch/attention_backend/sparse/dsa.py

Lines changed: 15 additions & 31 deletions
@@ -69,34 +69,17 @@ def transform_local_topk_and_prepare_pool_view(
     """
     assert topk_indices.dtype == torch.int32

-    # Get KV cache pool: [num_blocks, 1, tokens_per_block, 1, head_dim]
-    kv_pool = kv_cache_manager.get_buffers(layer_idx)
-    num_blocks, _, tokens_per_block, _, head_dim = kv_pool.shape
-    assert kv_pool.shape[1] == 1 and kv_pool.shape[3] == 1
-
-    # Squeeze to [num_blocks, tokens_per_block, head_dim]
-    kv_pool = kv_pool.squeeze(1).squeeze(2)
-
-    # Auto-detect stride and prepare view
-    if kv_pool.is_contiguous():
-        stride_factor = tokens_per_block
-        kv_pool = kv_pool.view(-1, 1, head_dim)
-    else:
-        # Here we simply do:
-        # kv_pool = kv_pool.reshape(-1, 1, head_dim) to make it contiguous
-        # however, using strided layout and directly offset topk tokens in the
-        # (layer-interleaved) pool MIGHT be (not benchmarked) more efficient as its zero-copy.
-
-        # Strided layout: compute stride and create efficient view
-        block_stride = kv_pool.stride(0)
-        token_stride = kv_pool.stride(1)
-        assert token_stride == head_dim
-        stride_factor = block_stride // token_stride
-        view_size = (num_blocks - 1) * stride_factor + tokens_per_block
-        kv_pool = torch.as_strided(kv_pool,
-                                   size=(view_size, 1, head_dim),
-                                   stride=(token_stride, 0, 1),
-                                   storage_offset=kv_pool.storage_offset())
+    # Get all layer KV cache pool: [num_blocks, num_layers, kv_factor, blockSize]
+    all_layer_kv_pool = kv_cache_manager.get_unique_primary_pool(
+    )  # [num_blocks, num_layers, kv_factor, blockSize]
+    num_blocks, num_layers, _, _ = all_layer_kv_pool.shape
+    tokens_per_block = kv_cache_manager.tokens_per_block
+    head_dim = kv_cache_manager.head_dim
+    assert num_layers == kv_cache_manager.num_local_layers, "PP is not enabled yet for DS32"
+    assert all_layer_kv_pool.is_contiguous(
+    ), "all_layer_kv_pool should be contiguous"
+    all_layer_kv_pool = all_layer_kv_pool.squeeze(2).view(-1, 1, head_dim)
+    stride_factor = num_layers * tokens_per_block

     # Get block_table and request indices for this phase
     if is_generation:
@@ -114,12 +97,13 @@ def transform_local_topk_and_prepare_pool_view(
         req_idx,
         block_table,
         topk_indices,
-        BLOCK_SIZE=attn_metadata.tokens_per_block,
+        BLOCK_SIZE=tokens_per_block,
         NUM_TOPK_TOKENS=topk_indices.shape[1],
         stride_factor=stride_factor,
+        layer_id=layer_idx,
     )

-    return global_indices, kv_pool
+    return global_indices, all_layer_kv_pool


 def split_prefill_chunks(
@@ -1262,7 +1246,7 @@ def __init__(
         assert not kv_cache_config.enable_block_reuse, "DSA cache requires block reuse to be disabled in KV cache config"
         self.quant_block_size = 128
         self.index_head_dim = sparse_attn_config.index_head_dim
-        # Use a fixed tokens_per_block for indexer k cache
+        # Use a fixed tokens_per_block for indexer k cache due to DG kernel constraints
        self.indexer_k_cache_tokens_per_block = 64

        super().__init__(
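
For reference, the flat view created above addresses the all-layer pool with stride_factor = num_layers * tokens_per_block, so a token's row is determined jointly by its block, its layer, and its in-block offset. Below is a minimal sketch with toy shapes (illustrative only, not part of the commit; the real tensor comes from kv_cache_manager.get_unique_primary_pool(), and kv_factor is assumed to be 1, matching the squeeze(2) above):

import torch

num_blocks, num_layers, kv_factor, tokens_per_block, head_dim = 4, 3, 1, 8, 16
block_size = tokens_per_block * head_dim  # "blockSize", the last pool dim

# Toy stand-in for the all-layer pool: [num_blocks, num_layers, kv_factor, blockSize]
pool = torch.arange(num_blocks * num_layers * kv_factor * block_size,
                    dtype=torch.float32).reshape(num_blocks, num_layers,
                                                 kv_factor, block_size)

# Same flattening as transform_local_topk_and_prepare_pool_view
flat = pool.squeeze(2).view(-1, 1, head_dim)
stride_factor = num_layers * tokens_per_block

# Row of token t in block b of layer l within the flat view
b, l, t = 2, 1, 5
row = b * stride_factor + l * tokens_per_block + t
expected = pool[b, l, 0].view(tokens_per_block, head_dim)[t]
assert torch.equal(flat[row, 0], expected)

The layer term l * tokens_per_block is exactly the extra in-block offset the Triton kernel now adds via layer_id (see kernel.py below).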

tensorrt_llm/_torch/attention_backend/sparse/kernel.py

Lines changed: 13 additions & 10 deletions
@@ -318,9 +318,9 @@ def _convert_req_index_to_global_index_kernel_with_stride_factor(
     # shapes (compile-time where possible)
     max_num_blocks_per_req: tl.constexpr,
     BLOCK_SIZE: tl.constexpr,
-    BLOCK_N: tl.constexpr,  # tile width along columns
+    BLOCK_N: tl.constexpr,  # tile width along columns # strides (in elements)
     stride_factor: tl.constexpr,  # for strided memory layout adjustment
-    # strides (in elements)
+    layer_id: tl.constexpr,  # for layer interleaving layout
     bt_stride0,
     bt_stride1,
     ti_stride0,
@@ -352,7 +352,7 @@ def _convert_req_index_to_global_index_kernel_with_stride_factor(

     # Compute block id and in-block offset
     block_id = tok // BLOCK_SIZE
-    inblock_off = tok % BLOCK_SIZE
+    inblock_off = tok % BLOCK_SIZE + layer_id * BLOCK_SIZE

     # Guard block_table access
     valid_block = block_id < max_num_blocks_per_req
@@ -371,14 +371,16 @@ def _convert_req_index_to_global_index_kernel_with_stride_factor(


 def triton_convert_req_index_to_global_index(
-    req_id: torch.Tensor,  # int32 [num_tokens]
-    block_table: torch.Tensor,  # int32 [num_requests, max_num_blocks_per_req]
-    token_indices: torch.Tensor,  # int32 [num_tokens, NUM_TOPK_TOKENS]
-    BLOCK_SIZE: int = 64,
-    NUM_TOPK_TOKENS: int = 2048,
-    BLOCK_N: int = 128,  # tile width along columns
-    stride_factor:
+    req_id: torch.Tensor,  # int32 [num_tokens]
+    block_table: torch.
+    Tensor,  # int32 [num_requests, max_num_blocks_per_req]
+    token_indices: torch.Tensor,  # int32 [num_tokens, NUM_TOPK_TOKENS]
+    BLOCK_SIZE: int,
+    NUM_TOPK_TOKENS: int = 2048,
+    BLOCK_N: int = 128,  # tile width along columns
+    stride_factor:
     int = None,  # for strided memory layout (with layer interleaving), defaults to BLOCK_SIZE
+    layer_id: int = 0,  # for layer interleaving layout
 ):
     """
     Convert request-local token indices to global KV cache pool indices.
@@ -436,6 +438,7 @@ def triton_convert_req_index_to_global_index(
         BLOCK_N,
         stride_factor,
         # strides
+        layer_id,
         bt_stride0,
         bt_stride1,
         ti_stride0,
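
To make the changed index math concrete, here is a hedged plain-PyTorch reference of the per-token conversion with the new layer_id term. The final combination with stride_factor is assumed from the parameter semantics above (the line that applies it lies outside this diff), the kernel's bounds guards are not reproduced, and the function name is illustrative:

import torch

def convert_req_index_to_global_index_ref(
        req_id: torch.Tensor,         # int32 [num_tokens]
        block_table: torch.Tensor,    # int32 [num_requests, max_num_blocks_per_req]
        token_indices: torch.Tensor,  # int32 [num_tokens, NUM_TOPK_TOKENS]
        BLOCK_SIZE: int,
        stride_factor: int,
        layer_id: int = 0) -> torch.Tensor:
    # Request-local token index -> (block slot, offset inside that block)
    block_id = token_indices // BLOCK_SIZE
    inblock_off = token_indices % BLOCK_SIZE + layer_id * BLOCK_SIZE
    # Look up the pool block for each token via its request's block table row
    pool_block = block_table[req_id.long()].gather(1, block_id.long())
    # Assumed combine step: row in the flattened all-layer pool view
    return pool_block * stride_factor + inblock_off

At the call site in dsa.py above, stride_factor is num_layers * tokens_per_block and layer_id is layer_idx, so consecutive layers of the same block occupy adjacent tokens_per_block-row slices of the flat pool view.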

tensorrt_llm/_torch/modules/attention.py

Lines changed: 0 additions & 4 deletions
@@ -1530,7 +1530,6 @@ def forward_context(
         elif trtllm_attention.has_cached_kv_for_mla_context(attn_metadata):
             return self.forward_context_with_cached_kv(
                 q, latent_cache, attn_metadata, output)
-
         return self.forward_context_default(q, compressed_kv, k_pe,
                                             attn_metadata, output, latent_cache)

@@ -1601,9 +1600,6 @@ def forward_generation(
             out_scale=self.out_scale,
             latent_cache=latent_cache,  # kvcache and k_pe
             q_pe=q_pe,  # used by `invokeMLARopeGeneration`
-            hidden_states=hidden_states,
-            qr=qr,
-            position_ids=position_ids,
         )
         fused_q = None

tensorrt_llm/_torch/pyexecutor/model_loader.py

Lines changed: 0 additions & 1 deletion
@@ -325,7 +325,6 @@ def _load_and_validate_config(
             if hasattr(config.pretrained_config, sub_config):
                 getattr(config.pretrained_config,
                         sub_config).num_hidden_layers = num_layers_override
-
         return config

     def _call_load_weights(self, load_method: Callable, weights, weight_mapper):

tensorrt_llm/quantization/utils/fp8_utils.py

Lines changed: 0 additions & 1 deletion
@@ -444,7 +444,6 @@ def _per_token_quant_and_transform_kernel(
     )


-# TODO: Add more comments and tests for this function for future reuse
 def per_token_quant_and_transform(
     input: torch.Tensor,
     quant_group_size: int = 128,
