
Commit 556438b

Merge from origin.
Signed-off-by: Zheyu Fu <zheyuf@NVIDIA.com>
2 parents: 6e75d48 + 08b2040

274 files changed (+11568 additions, -6298 deletions)


.gitattributes

Lines changed: 2 additions & 0 deletions
@@ -7,3 +7,5 @@
 triton_backend/tools/gpt/input_data.json filter=lfs diff=lfs merge=lfs -text
 *cubin.cpp filter=lfs diff=lfs merge=lfs -text
 docs/source/blogs/media/tech_blog3_mla_absorb.png filter=lfs diff=lfs merge=lfs -text
+tests/integration/test_input_files/*.png filter=lfs diff=lfs merge=lfs -text
+tests/integration/test_input_files/*.jpg filter=lfs diff=lfs merge=lfs -text

README.md

Lines changed: 4 additions & 4 deletions
@@ -9,7 +9,7 @@ TensorRT-LLM
 [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-12.9.1-green)](https://developer.nvidia.com/cuda-downloads)
 [![trt](https://img.shields.io/badge/TRT-10.11.0-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-1.1.0rc1-green)](./tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-1.1.0rc2-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
 
 [Architecture](./docs/source/torch/arch_overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Performance](./docs/source/performance/perf-overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](./docs/source/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)
@@ -18,10 +18,9 @@ TensorRT-LLM
 <div align="left">
 
 ## Tech Blogs
-* [08/06] Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM
+* [08/05] Running a High-Performance GPT-OSS-120B Inference Server with TensorRT-LLM
 [➡️ link](./docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md)
 
-
 * [08/01] Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)
 [➡️ link](./docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md)
 
@@ -44,6 +43,7 @@ TensorRT-LLM
 [➡️ link](./docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md)
 
 ## Latest News
+* [08/05] 🌟 TensorRT-LLM delivers Day-0 support for OpenAI's latest open-weights models: GPT-OSS-120B [➡️ link](https://huggingface.co/openai/gpt-oss-120b) and GPT-OSS-20B [➡️ link](https://huggingface.co/openai/gpt-oss-20b)
 * [07/15] 🌟 TensorRT-LLM delivers Day-0 support for LG AI Research's latest model, EXAONE 4.0 [➡️ link](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B)
 * [06/17] Join NVIDIA and DeepInfra for a developer meetup on June 26 ✨ [➡️ link](https://events.nvidia.com/scaletheunscalablenextgenai)
 * [05/22] Blackwell Breaks the 1,000 TPS/User Barrier With Meta’s Llama 4 Maverick
@@ -253,5 +253,5 @@ Deprecation is used to inform developers that some APIs and tools are no longer
 ## Useful Links
 - [Quantized models on Hugging Face](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4): A growing collection of quantized (e.g., FP8, FP4) and optimized LLMs, including [DeepSeek FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), ready for fast inference with TensorRT-LLM.
 - [NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo): A datacenter scale distributed inference serving framework that works seamlessly with TensorRT-LLM.
-- [AutoDeploy](./examples/auto_deploy/README.md): A prototype backend for TensorRT-LLM to simplify and accelerate the deployment of PyTorch models.
+- [AutoDeploy](https://nvidia.github.io/TensorRT-LLM/torch/auto_deploy/auto-deploy.html): A prototype backend for TensorRT-LLM to simplify and accelerate the deployment of PyTorch models.
 - [WeChat Discussion Group](https://github.com/NVIDIA/TensorRT-LLM/issues/5359): A real-time channel for TensorRT-LLM Q&A and news.

cpp/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ add_compile_definitions("TLLM_GEN_EXPORT_INTERFACE")
 add_compile_definitions("TLLM_ENABLE_CUDA")
 
 set(BINDING_TYPE
-    "pybind"
+    "nanobind"
     CACHE STRING
       "Binding type of Python bindings for C++ runtime and batch manager")
 

cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h

Lines changed: 0 additions & 32 deletions
@@ -24,7 +24,6 @@
 #include "tensorrt_llm/runtime/common.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/modelConfig.h"
-#include "tensorrt_llm/runtime/request.h"
 #include "tensorrt_llm/runtime/worldConfig.h"
 
 namespace tensorrt_llm::runtime
@@ -88,37 +87,6 @@ class CreateNewDecoderRequests : Algorithm
         SizeType32 maxSequenceLength, OptionalRef<MedusaBuffers const> medusaBuffers) const;
 
 private:
-    //! @brief Setups decoder internal tensors for new speculative decoding request
-    static void newRequestSpeculativeDecoding(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig,
-        DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream,
-        CudaStream const& decoderStream, SpeculativeDecodingMode const& speculativeDecodingMode,
-        SizeType32 maxDecodingEngineTokens);
-
-    //! @brief Setups decoder internal tensors for new request in Draft model Sps mode
-    static void newRequestDraftTokensExternal(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        SamplingConfig const& samplingConfig, DecodingInput& jointDecodingInput, CudaStream const& decoderStream);
-
-    //! @brief Setups decoder internal tensors for new Medusa request
-    static void newRequestMedusa(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingInput& jointDecodingInput, CudaStream const& decoderStream, SizeType32 maxDecodingEngineTokens);
-
-    //! @brief Setups decoder internal tensors for new Lookahead request
-    static void newRequestLookahead(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream);
-
-    //! @brief Setups decoder internal tensors for new Explicit draft tokens request
-    static void newRequestExplicitDraftTokens(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream);
-
-    //! @brief Setups decoder internal tensors for new Eagle request
-    static void newRequestEagle(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        runtime::ModelConfig const& modelConfig, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream);
-
-    [[nodiscard]] std::shared_ptr<runtime::ITensor> retrieveDraftLogits(runtime::ModelConfig const& modelConfig,
-        runtime::WorldConfig const& worldConfig, std::shared_ptr<runtime::ITensor> const& tensor,
-        runtime::BufferManager const& bufferManager) const;
-
     bool mSpeculativeDecodingFastLogits;
     bool mIsLeaderInOrchMode;
     bool mIsNormalizeLogProbs;
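
The private speculative-decoding helper declarations above are dropped from the public header. The diff alone does not show where (or whether) they were reimplemented; as a general C++ pattern, such per-request setup helpers can live as free functions in an unnamed namespace inside the implementation file, which keeps the header minimal. A hypothetical sketch of that layout (the names below are illustrative stand-ins, not code from this commit):

```cpp
// Illustrative .cpp-style layout only; functions and types are hypothetical
// stand-ins, not the actual TensorRT-LLM implementation.
#include <iostream>

namespace
{
// Visible only to this translation unit, so nothing needs to be declared in the header.
void setupNewRequest(int batchSlot, int numDraftTokens)
{
    std::cout << "init decoder state for slot " << batchSlot << " with " << numDraftTokens
              << " draft tokens\n";
}
} // namespace

// Stand-in for the public entry point that remains declared in the header.
void createNewDecoderRequest(int batchSlot, int numDraftTokens)
{
    setupNewRequest(batchSlot, numDraftTokens);
}

int main()
{
    createNewDecoderRequest(/*batchSlot=*/0, /*numDraftTokens=*/4);
    return 0;
}
```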

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 1 addition & 1 deletion
@@ -1110,7 +1110,7 @@ class GenericLlmRequest
 
     [[nodiscard]] SizeType32 getNumDraftTokens() const
     {
-        return mDraftTokens->size();
+        return hasDraftTokens() ? mDraftTokens->size() : 0;
     }
 
     void discardDraftTokens(SizeType32 numTokensToDiscard)
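
The one-line change above makes getNumDraftTokens() safe to call on a request that has no draft tokens attached, returning 0 instead of dereferencing an empty token container. A minimal standalone sketch of the same guard pattern (MiniRequest and its members are hypothetical stand-ins, not the GenericLlmRequest API):

```cpp
#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

// Hypothetical stand-in that mirrors the guarded-size pattern used above.
struct MiniRequest
{
    using TokenVec = std::vector<int32_t>;
    std::shared_ptr<TokenVec> mDraftTokens; // may be null when no draft tokens were provided

    [[nodiscard]] bool hasDraftTokens() const
    {
        return mDraftTokens && !mDraftTokens->empty();
    }

    [[nodiscard]] int32_t getNumDraftTokens() const
    {
        // An unconditional mDraftTokens->size() would dereference a null pointer here.
        return hasDraftTokens() ? static_cast<int32_t>(mDraftTokens->size()) : 0;
    }
};

int main()
{
    MiniRequest withoutDraft{};                                            // no draft tokens attached
    MiniRequest withDraft{std::make_shared<MiniRequest::TokenVec>(3, 42)}; // three draft tokens

    std::cout << withoutDraft.getNumDraftTokens() << "\n"; // 0 instead of a crash
    std::cout << withDraft.getNumDraftTokens() << "\n";    // 3
    return 0;
}
```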

cpp/include/tensorrt_llm/executor/dataTransceiverState.h

Lines changed: 20 additions & 12 deletions
@@ -52,29 +52,30 @@ class CacheState final
         AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2)
         : mModelConfig(std::move(modelConfig))
         , mParallelConfig{worldConfig.getTensorParallelism(), worldConfig.getPipelineParallelism(),
-              worldConfig.enableAttentionDP(), worldConfig.getTensorParallelRank(), worldConfig.getTensorParallelism()}
+              worldConfig.getContextParallelism(), worldConfig.enableAttentionDP(), worldConfig.getTensorParallelRank(),
+              worldConfig.getTensorParallelism()}
         , mDataType{dataType}
         , mAttentionConfig(attentionType, kvFactor)
     {
     }
 
     CacheState(std::vector<SizeType32> nbKvHeadPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
-        SizeType32 tensorParallelism, SizeType32 pipelineParallelism, nvinfer1::DataType dataType,
-        AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableAttentionDP = false,
-        int DPrank = 0, int DPsize = 0)
+        SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism,
+        nvinfer1::DataType dataType, AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2,
+        bool enableAttentionDP = false, int DPrank = 0, int DPsize = 0)
         : mModelConfig{std::move(nbKvHeadPerLayer), sizePerHead, tokensPerBlock}
-        , mParallelConfig{tensorParallelism, pipelineParallelism, enableAttentionDP, DPrank, DPsize}
+        , mParallelConfig{tensorParallelism, pipelineParallelism, contextParallelism, enableAttentionDP, DPrank, DPsize}
         , mDataType{dataType}
         , mAttentionConfig(attentionType, kvFactor)
     {
     }
 
     CacheState(SizeType32 nbAttentionLayers, SizeType32 nbKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
-        SizeType32 tensorParallelism, SizeType32 pipelineParallelism, nvinfer1::DataType dataType,
-        AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableAttentionDP = false,
-        int DPrank = 0, int DPsize = 0)
+        SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism,
+        nvinfer1::DataType dataType, AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2,
+        bool enableAttentionDP = false, int DPrank = 0, int DPsize = 0)
         : mModelConfig{std::vector(nbAttentionLayers, nbKvHeads), sizePerHead, tokensPerBlock}
-        , mParallelConfig{tensorParallelism, pipelineParallelism, enableAttentionDP, DPrank, DPsize}
+        , mParallelConfig{tensorParallelism, pipelineParallelism, contextParallelism, enableAttentionDP, DPrank, DPsize}
         , mDataType{dataType}
         , mAttentionConfig(attentionType, kvFactor)
     {
@@ -83,7 +84,7 @@ class CacheState final
     [[nodiscard]] bool operator==(kv_cache::CacheState const& other) const noexcept
     {
         return mModelConfig == other.mModelConfig && mParallelConfig == other.mParallelConfig
-            && mDataType == other.mDataType;
+            && mAttentionConfig == other.mAttentionConfig && mDataType == other.mDataType;
     }
 
     struct ModelConfig
@@ -103,15 +104,16 @@
     {
         SizeType32 mTensorParallelism;
         SizeType32 mPipelineParallelism;
+        SizeType32 mContextParallelism;
         bool mEnableAttentionDP;
         SizeType32 mDPrank;
         SizeType32 mDPsize;
 
         [[nodiscard]] bool operator==(ParallelConfig const& other) const noexcept
         {
             return mTensorParallelism == other.mTensorParallelism && mPipelineParallelism == other.mPipelineParallelism
-                && mEnableAttentionDP == other.mEnableAttentionDP && mDPrank == other.mDPrank
-                && mDPsize == other.mDPsize;
+                && mContextParallelism == other.mContextParallelism && mEnableAttentionDP == other.mEnableAttentionDP
+                && mDPrank == other.mDPrank && mDPsize == other.mDPsize;
         }
     };
 
@@ -125,6 +127,11 @@ class CacheState final
         {
         }
 
+        [[nodiscard]] bool operator==(AttentionConfig const& other) const noexcept
+        {
+            return mAttentionType == other.mAttentionType && mKvFactor == other.mKvFactor;
+        }
+
         // attentionType ;
         AttentionType mAttentionType;
         int mKvFactor;
@@ -162,6 +169,7 @@ class CacheState final
         sstring << "mTokensPerBlock:" << mModelConfig.mTokensPerBlock << "\n";
         sstring << "tp:" << mParallelConfig.mTensorParallelism << "\n";
         sstring << "pp:" << mParallelConfig.mPipelineParallelism << "\n";
+        sstring << "cp:" << mParallelConfig.mContextParallelism << "\n";
         sstring << "enableAttentionDP:" << mParallelConfig.mEnableAttentionDP << "\n";
         sstring << "datatype:" << static_cast<int32_t>(mDataType) << "\n";
         sstring << "attentionType:" << static_cast<int32_t>(mAttentionConfig.mAttentionType) << "\n";
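
Two themes run through the hunks above: the parallel configuration now carries a context-parallelism degree, and CacheState equality now also compares the attention configuration. The sketch below illustrates why a field-complete operator== matters here; MiniCacheState is a hypothetical stand-in, not the real class:

```cpp
#include <iostream>

// Hypothetical stand-in mirroring the fix above: equality must cover every
// field that distinguishes two cache layouts, otherwise incompatible states
// compare equal.
struct MiniCacheState
{
    int tensorParallelism = 1;
    int contextParallelism = 1; // newly tracked dimension
    int kvFactor = 2;

    // Old-style comparison that ignores contextParallelism and kvFactor.
    bool looselyEqual(MiniCacheState const& other) const
    {
        return tensorParallelism == other.tensorParallelism;
    }

    // Field-complete comparison, analogous to the updated operator==.
    bool operator==(MiniCacheState const& other) const
    {
        return tensorParallelism == other.tensorParallelism
            && contextParallelism == other.contextParallelism && kvFactor == other.kvFactor;
    }
};

int main()
{
    MiniCacheState a{/*tp=*/2, /*cp=*/1, /*kvFactor=*/2};
    MiniCacheState b{/*tp=*/2, /*cp=*/4, /*kvFactor=*/2}; // different context parallelism

    std::cout << std::boolalpha;
    std::cout << "loose:  " << a.looselyEqual(b) << "\n"; // true (a false match)
    std::cout << "strict: " << (a == b) << "\n";          // false
    return 0;
}
```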

cpp/include/tensorrt_llm/runtime/decodingInput.h

Lines changed: 2 additions & 0 deletions
@@ -102,11 +102,13 @@ class DecodingInput
     {
     public:
         TensorPtr draftLogits;
+        TensorPtr draftLogitsHost;
         TensorPtr draftProbs;
         TensorPtr targetProbs;
         TensorPtr numDraftTokens;
         TensorPtr numDraftTokensHost;
         TensorPtr draftTokenIds;
+        TensorPtr draftTokenIdsHost;
         TensorPtr useDraftLogits;
         TensorPtr useDraftLogitsHost;

cpp/include/tensorrt_llm/runtime/request.h

Lines changed: 0 additions & 54 deletions
This file was deleted.

cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h

Lines changed: 12 additions & 9 deletions
@@ -833,7 +833,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
     // Runs for 3 iterations or 1 second and picks the best option
     int pickBestTactic(MOEParallelismConfig parallelism_config, GemmToProfile gemm_to_profile)
     {
-        auto tactics = mMoERunner.getTactics();
+        auto tactics = mMoERunner.getTactics(static_cast<MoeGemmId>(gemm_to_profile));
         ::nvtx3::scoped_range nvtx(tensorrt_llm::common::nvtx::nextColor(),
             "Tactic Profiling GEMM " + std::to_string(static_cast<int>(gemm_to_profile)));
         // We save space by reusing the same workspace buffer for all tactics when doing full layer profiling. So we
@@ -925,12 +925,14 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
     std::pair<int, int> setTactic(
         int tactic_idx1, int tactic_idx2, MOEParallelismConfig parallelism_config, GemmToProfile gemm_to_profile)
     {
-        auto tactics = mMoERunner.getTactics();
+        auto tactics1 = mMoERunner.getTactics(MoeGemmId::GEMM_1);
+        auto tactics2 = mMoERunner.getTactics(MoeGemmId::GEMM_2);
         std::vector<std::pair<std::reference_wrapper<int>, GemmToProfile>> tactics_to_profile{
             {tactic_idx1, GemmToProfile::GEMM_1}, {tactic_idx2, GemmToProfile::GEMM_2}};
         for (auto& combo : tactics_to_profile)
         {
             auto& t = combo.first.get();
+            auto& tactics = combo.second == GemmToProfile::GEMM_1 ? tactics1 : tactics2;
             if (combo.second != gemm_to_profile && gemm_to_profile != GemmToProfile::LAYER)
             {
                 t = 0; // Unneeded tactic, set to 0
@@ -947,7 +949,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
             }
         }
 
-        mMoERunner.setTactic(tactics[tactic_idx1], tactics[tactic_idx2]);
+        mMoERunner.setTactic(tactics1[tactic_idx1], tactics2[tactic_idx2]);
         mBestTacticGemm1 = tactic_idx1;
         mBestTacticGemm2 = tactic_idx2;
         return {tactic_idx1, tactic_idx2};
@@ -965,7 +967,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
         auto expert_weights_size
             = gemm_to_profile == GemmToProfile::GEMM_1 ? mExpertWeight1Size : mExpertWeight2Size;
 
-        auto tactics = mMoERunner.getTactics()[tactic_idx];
+        auto tactics = mMoERunner.getTactics(static_cast<MoeGemmId>(gemm_to_profile))[tactic_idx];
         if (static_cast<int>(gemm_to_profile) != static_cast<int>(mGemmProfilerBackend.mGemmToProfile))
         {
             throw std::runtime_error("Configuration mismatch between mGemmProfilerBackend and runMoEPermute");
@@ -1074,11 +1076,12 @@ void MixtureOfExpertsBenchmark<TypeTuple_>::runBenchmark(benchmark::State& state
     }
     if (LOG_LEVEL >= INFO)
     {
-        auto tactics = mMoERunner.getTactics();
-        std::cout << "Selected tactic #1: " << tactic_idx1 << "/" << tactics.size() << "\n"
-                  << tactics[tactic_idx1].toString() << std::endl;
-        std::cout << "Selected tactic #2: " << tactic_idx2 << "/" << tactics.size() << "\n"
-                  << tactics[tactic_idx2].toString() << std::endl;
+        auto tactics1 = mMoERunner.getTactics(MoeGemmId::GEMM_1);
+        auto tactics2 = mMoERunner.getTactics(MoeGemmId::GEMM_2);
+        std::cout << "Selected tactic #1: " << tactic_idx1 << "/" << tactics1.size() << "\n"
+                  << tactics1[tactic_idx1].toString() << std::endl;
+        std::cout << "Selected tactic #2: " << tactic_idx2 << "/" << tactics2.size() << "\n"
+                  << tactics2[tactic_idx2].toString() << std::endl;
     }
     state.counters["tactic_idx1"] = tactic_idx1;
     state.counters["tactic_idx2"] = tactic_idx2;
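
The benchmark fixture now queries tactics per GEMM (getTactics(MoeGemmId::GEMM_1) versus GEMM_2) instead of a single shared list, so each tactic index is resolved against its own list. A hypothetical stand-in sketch of that lookup pattern (MiniRunner and MiniGemmId are illustrative, not the real MOE runner API):

```cpp
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for the per-GEMM tactic lookup used above: each GEMM
// has its own tactic list, so an index valid for GEMM_1 is not assumed to be
// valid (or meaningful) for GEMM_2.
enum class MiniGemmId { GEMM_1, GEMM_2 };

struct MiniRunner
{
    std::vector<std::string> gemm1Tactics{"tile_128x128", "tile_64x128", "tile_64x64"};
    std::vector<std::string> gemm2Tactics{"tile_128x64", "tile_32x128"};

    std::vector<std::string> const& getTactics(MiniGemmId id) const
    {
        return id == MiniGemmId::GEMM_1 ? gemm1Tactics : gemm2Tactics;
    }
};

int main()
{
    MiniRunner runner;
    auto const& tactics1 = runner.getTactics(MiniGemmId::GEMM_1);
    auto const& tactics2 = runner.getTactics(MiniGemmId::GEMM_2);

    // Each index is reported against the size of its own list before use.
    std::size_t idx1 = 2, idx2 = 1;
    std::cout << "GEMM_1: " << idx1 << "/" << tactics1.size() << " -> " << tactics1[idx1] << "\n";
    std::cout << "GEMM_2: " << idx2 << "/" << tactics2.size() << " -> " << tactics2[idx2] << "\n";
    return 0;
}
```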
