
Commit c134cf1

Merge branch 'main' into should_use_spec
2 parents: a01bc8d + 091b67a

File tree

132 files changed: 6690 additions, 1345 deletions


.coderabbit.yaml

Lines changed: 3 additions & 1 deletion
```diff
@@ -29,8 +29,10 @@ reviews:
   suggested_labels: true
   suggested_reviewers: true
   poem: false
+  review_status: false
   auto_review:
-    drafts: true
+    auto_incremental_review: false
+    drafts: false
     base_branches: ["main", "release/.+"]
 knowledge_base:
   code_guidelines:
```
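For a quick local sanity check of the merged settings, one can load the file and print the keys this hunk touches (a minimal sketch; assumes PyYAML is installed and the nesting shown in the diff above):

```python
# Minimal sketch: read .coderabbit.yaml from the repo root and echo the
# settings changed in this commit. Assumes PyYAML and the nesting above.
import yaml

with open(".coderabbit.yaml") as f:
    config = yaml.safe_load(f)

reviews = config.get("reviews", {})
auto_review = reviews.get("auto_review", {})
print("review_status:", reviews.get("review_status"))  # expected: False
print("auto_incremental_review:", auto_review.get("auto_incremental_review"))  # expected: False
print("drafts:", auto_review.get("drafts"))  # expected: False
```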

.gitattributes

Lines changed: 3 additions & 0 deletions
```diff
@@ -9,3 +9,6 @@ triton_backend/tools/gpt/input_data.json filter=lfs diff=lfs merge=lfs -text
 docs/source/blogs/media/tech_blog3_mla_absorb.png filter=lfs diff=lfs merge=lfs -text
 tests/integration/test_input_files/*.png filter=lfs diff=lfs merge=lfs -text
 tests/integration/test_input_files/*.jpg filter=lfs diff=lfs merge=lfs -text
+docs/source/blogs/media/tech_blog10_baseline_performance_detail.png filter=lfs diff=lfs merge=lfs -text
+docs/source/blogs/media/tech_blog10_full_strategy_performance.png filter=lfs diff=lfs merge=lfs -text
+docs/source/blogs/media/tech_blog10_context_wait_performance.png filter=lfs diff=lfs merge=lfs -text
```

.github/pull_request_template.md

Lines changed: 14 additions & 0 deletions
```diff
@@ -40,6 +40,20 @@ Please explain the issue and the solution in short.
 Please list clearly what are the relevant test(s) that can safeguard the changes in the PR. This helps us to ensure we have sufficient test coverage for the PR.
 -->
 
+## PR Checklist
+
+Please review the following before submitting your PR:
+- PR description clearly explains what and why. If using CodeRabbit's summary, please make sure it makes sense.
+- PR Follows [TRT-LLM CODING GUIDELINES](https://github.com/NVIDIA/TensorRT-LLM/blob/main/CODING_GUIDELINES.md) to the best of your knowledge.
+- Test cases are provided for new code paths (see [test instructions](https://github.com/NVIDIA/TensorRT-LLM/tree/main/tests#1-how-does-the-ci-work))
+- Any new dependencies have been scanned for license and vulnerabilities
+- [CODEOWNERS](https://github.com/NVIDIA/TensorRT-LLM/blob/main/.github/CODEOWNERS) updated if ownership changes
+- Documentation updated as needed
+- The reviewers assigned automatically/manually are appropriate for the PR.
+
+
+- [ ] Please check this after reviewing the above items as appropriate for this PR.
+
 ## GitHub Bot Help
 
 `/bot [-h] ['run', 'kill', 'skip', 'reuse-pipeline'] ...`
```
.github/scripts/pr_checklist_check.py

Lines changed: 120 additions & 0 deletions (new file)

```python
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re
import sys
from typing import List

# Matches a Markdown checklist item in the PR body.
# Expected format: "- [ ] Task description" or "* [x] Task description"
# Group 1 captures the checkbox state: ' ' (unchecked), 'x' or 'X' (checked).
# Group 2 captures the task content (the description of the checklist item).
TASK_PATTERN = re.compile(r'^\s*[-*]\s+\[( |x|X)\]\s*(.*)')


def find_all_tasks(pr_body: str) -> List[str]:
    """Return list of all task list items (both resolved and unresolved)."""
    tasks: List[str] = []
    for line in pr_body.splitlines():
        match = TASK_PATTERN.match(line)
        if match:
            tasks.append(match.group(0).strip())
    return tasks


def find_unresolved_tasks(pr_body: str) -> List[str]:
    """Return list of unresolved task list items.

    A task is considered resolved if it is checked (``[x]`` or ``[X]``)
    or if its text is struck through using ``~~`` markers.
    """
    unresolved: List[str] = []
    for line in pr_body.splitlines():
        match = TASK_PATTERN.match(line)
        if not match:
            continue
        state, content = match.groups()
        if state.lower() == 'x':
            continue
        # Check if the entire content is struck through
        if content.strip().startswith('~~') and content.strip().endswith('~~'):
            continue
        unresolved.append(match.group(0).strip())
    return unresolved


def check_pr_checklist_section(pr_body: str) -> tuple[bool, str]:
    """Check if the PR Checklist section exists with the required final checkbox.

    Returns:
        tuple: (is_valid, error_message)
    """
    # Check if "## PR Checklist" header exists
    pr_checklist_pattern = re.compile(r'^##\s+PR\s+Checklist',
                                      re.IGNORECASE | re.MULTILINE)
    if not pr_checklist_pattern.search(pr_body):
        return False, "Missing '## PR Checklist' header. Please ensure you haven't removed the PR template section."

    # Check if the final checkbox exists (the one users must check)
    final_checkbox_pattern = re.compile(
        r'^\s*[-*]\s+\[( |x|X)\]\s+Please check this after reviewing the above items',
        re.MULTILINE)
    if not final_checkbox_pattern.search(pr_body):
        return False, "Missing the required final checkbox '- [ ] Please check this after reviewing the above items as appropriate for this PR.' Please ensure you haven't removed this from the PR template."

    return True, ""


def main() -> None:
    pr_body = os.environ.get("PR_BODY", "")
    enforce_checklist = os.environ.get("ENFORCE_PR_HAS_CHECKLIST",
                                       "false").lower() == "true"

    # Always check for PR Checklist section when enforcement is enabled
    if enforce_checklist:
        is_valid, error_msg = check_pr_checklist_section(pr_body)
        if not is_valid:
            print(f"Error: {error_msg}")
            sys.exit(1)

    all_tasks = find_all_tasks(pr_body)
    unresolved = find_unresolved_tasks(pr_body)

    # Check if we need to enforce the presence of at least one checklist item
    if enforce_checklist and not all_tasks:
        print(
            "Error: PR body must contain at least one checklist item when ENFORCE_PR_HAS_CHECKLIST is enabled."
        )
        print(
            "Expected format: - [ ] Task description or * [ ] Task description")
        sys.exit(1)

    # If we have tasks, check if any are unresolved
    if unresolved:
        print("Unresolved checklist items found:")
        for item in unresolved:
            print(f"{item}")
        sys.exit(1)

    if all_tasks:
        print("All checklist items resolved.")
    else:
        print("No checklist items found in PR body.")


if __name__ == "__main__":
    main()
```
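As an illustration of the parsing rules above (an item passes if its box is checked or its text is struck through), here is a short self-contained sketch run against a made-up PR body:

```python
# Demonstration of TASK_PATTERN and the resolution rules from the script
# above, applied to a made-up PR body.
import re

TASK_PATTERN = re.compile(r'^\s*[-*]\s+\[( |x|X)\]\s*(.*)')

pr_body = """\
## PR Checklist
- [x] PR description clearly explains what and why.
- [ ] ~~Documentation updated as needed~~
- [ ] Please check this after reviewing the above items as appropriate for this PR.
"""

for line in pr_body.splitlines():
    match = TASK_PATTERN.match(line)
    if not match:
        continue
    state, content = match.groups()
    # Resolved if the box is checked or the whole text is struck through.
    resolved = state.lower() == 'x' or (content.strip().startswith('~~')
                                        and content.strip().endswith('~~'))
    print(f"{'resolved' if resolved else 'UNRESOLVED'}: {match.group(0).strip()}")
```

Running this marks the first two items as resolved and flags the final checkbox as unresolved, which is exactly the case the CI check fails on.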

.github/workflows/pr-check.yml

Lines changed: 18 additions & 0 deletions
```diff
@@ -53,3 +53,21 @@ jobs:
           echo " - [#1234][doc] Update documentation"
           echo " - [None][chore] Minor clean-up"
           exit 1
+
+  check-pr-body-checklist:
+    name: Check PR Checklist Resolution
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Validate PR Checklist
+        env:
+          PR_BODY: ${{ github.event.pull_request.body }}
+          ENFORCE_PR_HAS_CHECKLIST: false
+        run: python .github/scripts/pr_checklist_check.py
```
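To exercise the same check outside CI, one can set the two environment variables the workflow passes and invoke the script directly (a hypothetical local run; the PR body below is just a stand-in):

```python
# Hypothetical local dry run of the "Validate PR Checklist" step: feed the
# script a PR body and enforcement flag through the environment, as the
# workflow does, and inspect the exit code.
import os
import subprocess

env = dict(os.environ)
env["PR_BODY"] = "- [ ] Please check this after reviewing the above items as appropriate for this PR."
env["ENFORCE_PR_HAS_CHECKLIST"] = "false"  # matches the workflow setting

result = subprocess.run(
    ["python", ".github/scripts/pr_checklist_check.py"], env=env)
print("exit code:", result.returncode)  # 1 here: the only item is unchecked
```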
cpp/include/tensorrt_llm/batch_manager/kvCacheConnector.h

Lines changed: 46 additions & 0 deletions (new file)

```cpp
/*
 * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "tensorrt_llm/batch_manager/common.h"
#include "tensorrt_llm/batch_manager/llmRequest.h"
#include "tensorrt_llm/runtime/common.h"

#include <utility>
#include <vector>

using SizeType32 = tensorrt_llm::runtime::SizeType32;
using RequestIdType = tensorrt_llm::batch_manager::LlmRequest::RequestIdType;

/// See tensorrt_llm/_torch/pyexecutor/connector.py for details on the Connector API.

namespace tensorrt_llm::batch_manager::kv_connector
{

/// @brief The KV connector manager. This is passed into the C++ KV Cache Manager when adding sequences.
class KvCacheConnectorManager
{
public:
    KvCacheConnectorManager() = default;
    virtual ~KvCacheConnectorManager() = default;

    /// @brief Handle the getNumNewMatchedTokens call inside the C++ KV Cache Manager.
    /// @return The number of tokens that can be loaded from remote KV cache.
    virtual SizeType32 getNumNewMatchedTokens(LlmRequest const& request, SizeType32 numComputedTokens) = 0;
};

} // namespace tensorrt_llm::batch_manager::kv_connector
```
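The header points at tensorrt_llm/_torch/pyexecutor/connector.py for the Python side of the Connector API. As a rough, hypothetical illustration of the contract (not the actual binding), the interface amounts to:

```python
# Hypothetical Python mirror of KvCacheConnectorManager above; the real
# Python-side API lives in tensorrt_llm/_torch/pyexecutor/connector.py.
from abc import ABC, abstractmethod


class KvCacheConnectorManagerSketch(ABC):
    """Given a request and how many of its tokens are already computed
    locally, report how many additional tokens could be loaded from a
    remote KV cache instead of being recomputed."""

    @abstractmethod
    def get_num_new_matched_tokens(self, request: object,
                                   num_computed_tokens: int) -> int:
        ...
```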

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 15 additions & 5 deletions
```diff
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include "tensorrt_llm/batch_manager/kvCacheConnector.h"
 #include "tensorrt_llm/batch_manager/kvCacheEventManager.h"
 #include "tensorrt_llm/batch_manager/kvCacheType.h"
 #include "tensorrt_llm/batch_manager/llmRequest.h" // TODO forward declare
@@ -538,7 +539,8 @@ class WindowBlockManager
         SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool,
         SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream,
         bool onboardBlocks, CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
-        std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse);
+        std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse,
+        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager);
 
     ~WindowBlockManager();
 
@@ -835,6 +837,8 @@
     bool mEnablePartialReuse;
     // Whether partially matched blocks that are already in use should be copied and reused.
    bool mCopyOnPartialReuse;
+    // The kv cache connector manager
+    std::shared_ptr<kv_connector::KvCacheConnectorManager> mKvCacheConnectorManager;
 };
 
 class BlockManager
@@ -852,7 +856,8 @@ class BlockManager
         SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
         std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
-        bool copyOnPartialReuse = true);
+        bool copyOnPartialReuse = true,
+        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager = nullptr);
 
     BlockManager(BlockManager const&) = delete;
     BlockManager& operator=(BlockManager const&) = delete;
@@ -1287,6 +1292,7 @@ class BaseKVCacheManager
         LlmRequest::RequestIdType requestId, SizeType32 windowSize) const
         = 0;
 
+    [[nodiscard]] virtual runtime::ITensor::SharedPtr getUniquePrimaryPool() const = 0;
     [[nodiscard]] virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0;
     [[nodiscard]] virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0;
 
@@ -1373,7 +1379,8 @@ class KVCacheManager : public BaseKVCacheManager
         bool enableBlockReuse = false, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
         std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
-        bool copyOnpartialReuse = true);
+        bool copyOnpartialReuse = true,
+        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager = nullptr);
 
     KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
         BlocksPerWindow const& blocksPerWindow, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
@@ -1383,7 +1390,8 @@ class KVCacheManager : public BaseKVCacheManager
         bool enableBlockReuse = false, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
         std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
-        bool copyOnpartialReuse = true);
+        bool copyOnpartialReuse = true,
+        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager = nullptr);
 
     KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
         BlocksPerWindow const& blocksPerWindow, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
@@ -1393,7 +1401,8 @@ class KVCacheManager : public BaseKVCacheManager
         bool enableBlockReuse = true, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
         std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
-        bool copyOnpartialReuse = true);
+        bool copyOnpartialReuse = true,
+        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager = nullptr);
 
     KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
         BlocksPerWindow const& blocksPerWindow, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
@@ -1624,6 +1633,7 @@ class KVCacheManager : public BaseKVCacheManager
     std::vector<SizeType32> getNewlyAllocatedBlockIds(
         LlmRequest::RequestIdType requestId, SizeType32 windowSize) const override;
 
+    runtime::ITensor::SharedPtr getUniquePrimaryPool() const override;
     runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const override;
 
     SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const
```

cpp/include/tensorrt_llm/deep_gemm/scheduler.cuh

Lines changed: 1 addition & 1 deletion
```diff
@@ -379,7 +379,7 @@ struct GroupedMaskedScheduler
     }
 };
 
-// Need to keep the same as the one in tests/unittest/_torch/thop/deep_gemm_tests.py
+// Need to keep the same as the one in tests/unittest/_torch/thop/parallel/deep_gemm_tests.py
 template <typename T_offset, typename T_index>
 __host__ __device__ __forceinline__ T_offset compute_padded_offset(T_offset offset, T_index problem_idx)
 {
```
