
Commit c134cf1

Merge branch 'main' into should_use_spec
2 parents: a01bc8d + 091b67a

File tree

132 files changed: 6690 additions, 1345 deletions


.coderabbit.yaml

Lines changed: 3 additions & 1 deletion
```diff
@@ -29,8 +29,10 @@ reviews:
   suggested_labels: true
   suggested_reviewers: true
   poem: false
+  review_status: false
   auto_review:
-    drafts: true
+    auto_incremental_review: false
+    drafts: false
     base_branches: ["main", "release/.+"]
 knowledge_base:
   code_guidelines:
```
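For a quick local sanity check of the merged settings, one can load the file and print the keys this hunk touches (a minimal sketch; assumes PyYAML is installed and the nesting shown in the diff above):

```python
# Minimal sketch: read .coderabbit.yaml from the repo root and echo the
# settings changed in this commit. Assumes PyYAML and the nesting above.
import yaml

with open(".coderabbit.yaml") as f:
    config = yaml.safe_load(f)

reviews = config.get("reviews", {})
auto_review = reviews.get("auto_review", {})
print("review_status:", reviews.get("review_status"))  # expected: False
print("auto_incremental_review:", auto_review.get("auto_incremental_review"))  # expected: False
print("drafts:", auto_review.get("drafts"))  # expected: False
```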

.gitattributes

Lines changed: 3 additions & 0 deletions
```diff
@@ -9,3 +9,6 @@ triton_backend/tools/gpt/input_data.json filter=lfs diff=lfs merge=lfs -text
 docs/source/blogs/media/tech_blog3_mla_absorb.png filter=lfs diff=lfs merge=lfs -text
 tests/integration/test_input_files/*.png filter=lfs diff=lfs merge=lfs -text
 tests/integration/test_input_files/*.jpg filter=lfs diff=lfs merge=lfs -text
+docs/source/blogs/media/tech_blog10_baseline_performance_detail.png filter=lfs diff=lfs merge=lfs -text
+docs/source/blogs/media/tech_blog10_full_strategy_performance.png filter=lfs diff=lfs merge=lfs -text
+docs/source/blogs/media/tech_blog10_context_wait_performance.png filter=lfs diff=lfs merge=lfs -text
```

.github/pull_request_template.md

Lines changed: 14 additions & 0 deletions
```diff
@@ -40,6 +40,20 @@ Please explain the issue and the solution in short.
 Please list clearly what are the relevant test(s) that can safeguard the changes in the PR. This helps us to ensure we have sufficient test coverage for the PR.
 -->
 
+## PR Checklist
+
+Please review the following before submitting your PR:
+- PR description clearly explains what and why. If using CodeRabbit's summary, please make sure it makes sense.
+- PR Follows [TRT-LLM CODING GUIDELINES](https://github.com/NVIDIA/TensorRT-LLM/blob/main/CODING_GUIDELINES.md) to the best of your knowledge.
+- Test cases are provided for new code paths (see [test instructions](https://github.com/NVIDIA/TensorRT-LLM/tree/main/tests#1-how-does-the-ci-work))
+- Any new dependencies have been scanned for license and vulnerabilities
+- [CODEOWNERS](https://github.com/NVIDIA/TensorRT-LLM/blob/main/.github/CODEOWNERS) updated if ownership changes
+- Documentation updated as needed
+- The reviewers assigned automatically/manually are appropriate for the PR.
+
+
+- [ ] Please check this after reviewing the above items as appropriate for this PR.
+
 ## GitHub Bot Help
 
 `/bot [-h] ['run', 'kill', 'skip', 'reuse-pipeline'] ...`
```
.github/scripts/pr_checklist_check.py

Lines changed: 120 additions & 0 deletions (new file)

```python
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re
import sys
from typing import List

# Matches a Markdown checklist item in the PR body.
# Expected format: "- [ ] Task description" or "* [x] Task description"
# Group 1 captures the checkbox state: ' ' (unchecked), 'x' or 'X' (checked).
# Group 2 captures the task content (the description of the checklist item).
TASK_PATTERN = re.compile(r'^\s*[-*]\s+\[( |x|X)\]\s*(.*)')


def find_all_tasks(pr_body: str) -> List[str]:
    """Return list of all task list items (both resolved and unresolved)."""
    tasks: List[str] = []
    for line in pr_body.splitlines():
        match = TASK_PATTERN.match(line)
        if match:
            tasks.append(match.group(0).strip())
    return tasks


def find_unresolved_tasks(pr_body: str) -> List[str]:
    """Return list of unresolved task list items.

    A task is considered resolved if it is checked (``[x]`` or ``[X]``)
    or if its text is struck through using ``~~`` markers.
    """
    unresolved: List[str] = []
    for line in pr_body.splitlines():
        match = TASK_PATTERN.match(line)
        if not match:
            continue
        state, content = match.groups()
        if state.lower() == 'x':
            continue
        # Check if the entire content is struck through
        if content.strip().startswith('~~') and content.strip().endswith('~~'):
            continue
        unresolved.append(match.group(0).strip())
    return unresolved


def check_pr_checklist_section(pr_body: str) -> tuple[bool, str]:
    """Check if the PR Checklist section exists with the required final checkbox.

    Returns:
        tuple: (is_valid, error_message)
    """
    # Check if "## PR Checklist" header exists
    pr_checklist_pattern = re.compile(r'^##\s+PR\s+Checklist',
                                      re.IGNORECASE | re.MULTILINE)
    if not pr_checklist_pattern.search(pr_body):
        return False, "Missing '## PR Checklist' header. Please ensure you haven't removed the PR template section."

    # Check if the final checkbox exists (the one users must check)
    final_checkbox_pattern = re.compile(
        r'^\s*[-*]\s+\[( |x|X)\]\s+Please check this after reviewing the above items',
        re.MULTILINE)
    if not final_checkbox_pattern.search(pr_body):
        return False, "Missing the required final checkbox '- [ ] Please check this after reviewing the above items as appropriate for this PR.' Please ensure you haven't removed this from the PR template."

    return True, ""


def main() -> None:
    pr_body = os.environ.get("PR_BODY", "")
    enforce_checklist = os.environ.get("ENFORCE_PR_HAS_CHECKLIST",
                                       "false").lower() == "true"

    # Always check for PR Checklist section when enforcement is enabled
    if enforce_checklist:
        is_valid, error_msg = check_pr_checklist_section(pr_body)
        if not is_valid:
            print(f"Error: {error_msg}")
            sys.exit(1)

    all_tasks = find_all_tasks(pr_body)
    unresolved = find_unresolved_tasks(pr_body)

    # Check if we need to enforce the presence of at least one checklist item
    if enforce_checklist and not all_tasks:
        print(
            "Error: PR body must contain at least one checklist item when ENFORCE_PR_HAS_CHECKLIST is enabled."
        )
        print(
            "Expected format: - [ ] Task description or * [ ] Task description")
        sys.exit(1)

    # If we have tasks, check if any are unresolved
    if unresolved:
        print("Unresolved checklist items found:")
        for item in unresolved:
            print(f"{item}")
        sys.exit(1)

    if all_tasks:
        print("All checklist items resolved.")
    else:
        print("No checklist items found in PR body.")


if __name__ == "__main__":
    main()
```
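As an illustration of the parsing rules above (an item passes if its box is checked or its text is struck through), here is a short self-contained sketch run against a made-up PR body:

```python
# Demonstration of TASK_PATTERN and the resolution rules from the script
# above, applied to a made-up PR body.
import re

TASK_PATTERN = re.compile(r'^\s*[-*]\s+\[( |x|X)\]\s*(.*)')

pr_body = """\
## PR Checklist
- [x] PR description clearly explains what and why.
- [ ] ~~Documentation updated as needed~~
- [ ] Please check this after reviewing the above items as appropriate for this PR.
"""

for line in pr_body.splitlines():
    match = TASK_PATTERN.match(line)
    if not match:
        continue
    state, content = match.groups()
    # Resolved if the box is checked or the whole text is struck through.
    resolved = state.lower() == 'x' or (content.strip().startswith('~~')
                                        and content.strip().endswith('~~'))
    print(f"{'resolved' if resolved else 'UNRESOLVED'}: {match.group(0).strip()}")
```

Running this marks the first two items as resolved and flags the final checkbox as unresolved, which is exactly the case the CI check fails on.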

.github/workflows/pr-check.yml

Lines changed: 18 additions & 0 deletions
```diff
@@ -53,3 +53,21 @@ jobs:
           echo " - [#1234][doc] Update documentation"
           echo " - [None][chore] Minor clean-up"
           exit 1
+
+  check-pr-body-checklist:
+    name: Check PR Checklist Resolution
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Validate PR Checklist
+        env:
+          PR_BODY: ${{ github.event.pull_request.body }}
+          ENFORCE_PR_HAS_CHECKLIST: false
+        run: python .github/scripts/pr_checklist_check.py
```
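To exercise the same check outside CI, one can set the two environment variables the workflow passes and invoke the script directly (a hypothetical local run; the PR body below is just a stand-in):

```python
# Hypothetical local dry run of the "Validate PR Checklist" step: feed the
# script a PR body and enforcement flag through the environment, as the
# workflow does, and inspect the exit code.
import os
import subprocess

env = dict(os.environ)
env["PR_BODY"] = "- [ ] Please check this after reviewing the above items as appropriate for this PR."
env["ENFORCE_PR_HAS_CHECKLIST"] = "false"  # matches the workflow setting

result = subprocess.run(
    ["python", ".github/scripts/pr_checklist_check.py"], env=env)
print("exit code:", result.returncode)  # 1 here: the only item is unchecked
```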
cpp/include/tensorrt_llm/batch_manager/kvCacheConnector.h

Lines changed: 46 additions & 0 deletions (new file)

```cpp
/*
 * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "tensorrt_llm/batch_manager/common.h"
#include "tensorrt_llm/batch_manager/llmRequest.h"
#include "tensorrt_llm/runtime/common.h"

#include <utility>
#include <vector>

using SizeType32 = tensorrt_llm::runtime::SizeType32;
using RequestIdType = tensorrt_llm::batch_manager::LlmRequest::RequestIdType;

/// See tensorrt_llm/_torch/pyexecutor/connector.py for details on the Connector API.

namespace tensorrt_llm::batch_manager::kv_connector
{

/// @brief The KV connector manager. This is passed into the C++ KV Cache Manager when adding sequences.
class KvCacheConnectorManager
{
public:
    KvCacheConnectorManager() = default;
    virtual ~KvCacheConnectorManager() = default;

    /// @brief Handle the getNumNewMatchedTokens call inside the C++ KV Cache Manager.
    /// @return The number of tokens that can be loaded from remote KV cache.
    virtual SizeType32 getNumNewMatchedTokens(LlmRequest const& request, SizeType32 numComputedTokens) = 0;
};

} // namespace tensorrt_llm::batch_manager::kv_connector
```
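The header points at tensorrt_llm/_torch/pyexecutor/connector.py for the Python side of the Connector API. As a rough, hypothetical illustration of the contract (not the actual binding), the interface amounts to:

```python
# Hypothetical Python mirror of KvCacheConnectorManager above; the real
# Python-side API lives in tensorrt_llm/_torch/pyexecutor/connector.py.
from abc import ABC, abstractmethod


class KvCacheConnectorManagerSketch(ABC):
    """Given a request and how many of its tokens are already computed
    locally, report how many additional tokens could be loaded from a
    remote KV cache instead of being recomputed."""

    @abstractmethod
    def get_num_new_matched_tokens(self, request: object,
                                   num_computed_tokens: int) -> int:
        ...
```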

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 15 additions & 5 deletions
```diff
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include "tensorrt_llm/batch_manager/kvCacheConnector.h"
 #include "tensorrt_llm/batch_manager/kvCacheEventManager.h"
 #include "tensorrt_llm/batch_manager/kvCacheType.h"
 #include "tensorrt_llm/batch_manager/llmRequest.h" // TODO forward declare
@@ -538,7 +539,8 @@ class WindowBlockManager
         SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool,
         SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream,
         bool onboardBlocks, CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
-        std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse);
+        std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse,
+        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager);
 
     ~WindowBlockManager();
 
@@ -835,6 +837,8 @@
     bool mEnablePartialReuse;
     // Whether partially matched blocks that are already in use should be copied and reused.
    bool mCopyOnPartialReuse;
+    // The kv cache connector manager
+    std::shared_ptr<kv_connector::KvCacheConnectorManager> mKvCacheConnectorManager;
 };
 
 class BlockManager
@@ -852,7 +856,8 @@ class BlockManager
         SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
         std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
-        bool copyOnPartialReuse = true);
+        bool copyOnPartialReuse = true,
+        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager = nullptr);
 
     BlockManager(BlockManager const&) = delete;
     BlockManager& operator=(BlockManager const&) = delete;
@@ -1287,6 +1292,7 @@ class BaseKVCacheManager
         LlmRequest::RequestIdType requestId, SizeType32 windowSize) const
         = 0;
 
+    [[nodiscard]] virtual runtime::ITensor::SharedPtr getUniquePrimaryPool() const = 0;
     [[nodiscard]] virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0;
     [[nodiscard]] virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0;
 
@@ -1373,7 +1379,8 @@ class KVCacheManager : public BaseKVCacheManager
         bool enableBlockReuse = false, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
         std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
-        bool copyOnpartialReuse = true);
+        bool copyOnpartialReuse = true,
+        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager = nullptr);
 
     KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
         BlocksPerWindow const& blocksPerWindow, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
@@ -1383,7 +1390,8 @@ class KVCacheManager : public BaseKVCacheManager
         bool enableBlockReuse = false, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
         std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
-        bool copyOnpartialReuse = true);
+        bool copyOnpartialReuse = true,
+        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager = nullptr);
 
     KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
         BlocksPerWindow const& blocksPerWindow, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
@@ -1393,7 +1401,8 @@ class KVCacheManager : public BaseKVCacheManager
         bool enableBlockReuse = true, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
         std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
-        bool copyOnpartialReuse = true);
+        bool copyOnpartialReuse = true,
+        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager = nullptr);
 
     KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
         BlocksPerWindow const& blocksPerWindow, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
@@ -1624,6 +1633,7 @@ class KVCacheManager : public BaseKVCacheManager
     std::vector<SizeType32> getNewlyAllocatedBlockIds(
         LlmRequest::RequestIdType requestId, SizeType32 windowSize) const override;
 
+    runtime::ITensor::SharedPtr getUniquePrimaryPool() const override;
     runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const override;
 
     SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const
```

cpp/include/tensorrt_llm/deep_gemm/scheduler.cuh

Lines changed: 1 addition & 1 deletion
```diff
@@ -379,7 +379,7 @@ struct GroupedMaskedScheduler
     }
 };
 
-// Need to keep the same as the one in tests/unittest/_torch/thop/deep_gemm_tests.py
+// Need to keep the same as the one in tests/unittest/_torch/thop/parallel/deep_gemm_tests.py
 template <typename T_offset, typename T_index>
 __host__ __device__ __forceinline__ T_offset compute_padded_offset(T_offset offset, T_index problem_idx)
 {
```
