5 changes: 3 additions & 2 deletions jenkins/L0_Test.groovy
@@ -2199,8 +2199,9 @@ def launchTestJobs(pipeline, testFilter)
// "GB200-8_GPUs-2_Nodes-PyTorch-5": ["gb200-multi-node", "l0_gb200_multi_nodes", 5, 5, 8, 2],
// ]
multiNodesSBSAConfigs = [:]
multiNodesSBSAConfigs += (1..7).collectEntries { i ->
["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-multi-node", "l0_gb200_multi_nodes", i, 7, 8, 2]]
def numMultiNodeTests = 9
multiNodesSBSAConfigs += (1..numMultiNodeTests).collectEntries { i ->
["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-multi-node", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]]
}
fullSet += multiNodesSBSAConfigs.keySet()

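The change above widens the post-merge multi-node matrix from 7 to 9 shards and names the shard count. A hedged Python mirror of what the Groovy collectEntries produces, with the tuple fields read as [platform label, test-db list, shard index, shard count, GPU count, node count] (an inference from the surrounding config, not stated in the diff):

num_multi_node_tests = 9
multi_node_sbsa_configs = {
    f"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-{i}":
        ["gb200-multi-node", "l0_gb200_multi_nodes", i, num_multi_node_tests, 8, 2]
    for i in range(1, num_multi_node_tests + 1)
}
print(len(multi_node_sbsa_configs))  # 9 post-merge shards instead of 7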
16 changes: 8 additions & 8 deletions tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -404,12 +404,8 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig]):
self.norm = RMSNorm(hidden_size=config.hidden_size,
eps=config.rms_norm_eps,
dtype=config.torch_dtype)
if self.model_config.mapping.enable_attention_dp and \
getattr(self.model_config.mapping, 'enable_lm_head_tp_in_adp', False):
self.mapping_lm_head_tp = create_lm_head_tp_mapping(
self.model_config.mapping)
else:
self.mapping_lm_head_tp = self.model_config.mapping

self.mapping_lm_head_tp = None

@torch.compile(options={"max-autotune": True})
def get_last_token_states(self, hidden_states, attn_metadata):
@@ -433,19 +429,23 @@ def forward(self,
hidden_states = hidden_states[-1].unsqueeze(0)

enable_attention_dp = self.model_config.mapping.enable_attention_dp
enable_lm_head_tp_in_adp = self.model_config.mapping.enable_lm_head_tp_in_adp
enable_lm_head_tp_in_adp = enable_attention_dp and self.model_config.mapping.enable_lm_head_tp_in_adp

# Add pre-lm gather logic
if enable_lm_head_tp_in_adp:
# ADP + LM TP mode: perform All-Gather before LM_head
self.mapping_lm_head_tp = create_lm_head_tp_mapping(
self.model_config.mapping, hidden_states.shape[0])
hidden_states = allgather(hidden_states,
self.mapping_lm_head_tp,
dim=0)

# Temporarily disable gather_output when not in ADP mode or (in ADP mode and LM TP is enabled)
if not enable_attention_dp or enable_lm_head_tp_in_adp:
lm_head.gather_output = False
logits = lm_head(hidden_states, is_spec_decoding_head=True)
logits = lm_head(hidden_states,
mapping_lm_head_tp=self.mapping_lm_head_tp,
is_spec_decoding_head=True)
if not enable_attention_dp or enable_lm_head_tp_in_adp:
lm_head.gather_output = True
return logits
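Reading the interleaved forward() changes as one path: a condensed, hedged sketch of the new logits flow. This is a paraphrase of the diff, not the exact TRT-LLM source; the mapping, lm_head, and distributed helpers are taken as arguments rather than re-implemented.

import torch


def compute_spec_logits(hidden_states: torch.Tensor, mapping, lm_head,
                        create_lm_head_tp_mapping, allgather) -> torch.Tensor:
    enable_attention_dp = mapping.enable_attention_dp
    # LM-head TP only applies when attention DP is on.
    enable_lm_head_tp_in_adp = (enable_attention_dp
                                and mapping.enable_lm_head_tp_in_adp)

    mapping_lm_head_tp = None
    if enable_lm_head_tp_in_adp:
        # ADP + LM-head TP: build the mapping from the live token count,
        # then all-gather the per-rank tokens before the LM head.
        mapping_lm_head_tp = create_lm_head_tp_mapping(
            mapping, hidden_states.shape[0])
        hidden_states = allgather(hidden_states, mapping_lm_head_tp, dim=0)

    # Keep the LM-head output sharded while computing the speculative logits.
    toggle_gather = not enable_attention_dp or enable_lm_head_tp_in_adp
    if toggle_gather:
        lm_head.gather_output = False
    logits = lm_head(hidden_states,
                     mapping_lm_head_tp=mapping_lm_head_tp,
                     is_spec_decoding_head=True)
    if toggle_gather:
        lm_head.gather_output = True
    return logits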
26 changes: 10 additions & 16 deletions tensorrt_llm/_torch/modules/embedding.py
@@ -7,9 +7,9 @@

from tensorrt_llm.functional import AllReduceParams
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.math_utils import ceil_div

from ..distributed import allgather
from ..utils import create_lm_head_tp_mapping
from .linear import Linear, TensorParallelMode


@@ -38,8 +38,6 @@ def __init__(
mapping = mapping or Mapping()
self.enable_lm_head_tp_in_adp = mapping.enable_attention_dp and \
getattr(mapping, 'enable_lm_head_tp_in_adp', False)
if self.enable_lm_head_tp_in_adp:
mapping = create_lm_head_tp_mapping(mapping)

tp_size = mapping.tp_size

@@ -78,18 +76,6 @@ def __init__(
self.weight = Parameter(torch.empty(weight_shape, dtype=dtype))
self.register_parameter("bias", None)

# For LM head TP in ADP, we need to slice the weight for the LM head
self.lm_head_slice_obj = None
if self.enable_lm_head_tp_in_adp:
tp_rank = self.mapping.tp_rank
tp_size = self.mapping.tp_size
slice_width = math.ceil(self.out_features / tp_size)
slice_start = tp_rank * slice_width
slice_end = min((tp_rank + 1) * slice_width, self.out_features)
slice_obj = [slice(None)] * len(self.weight.shape)
slice_obj[0] = slice(slice_start, slice_end)
self.lm_head_slice_obj = tuple(slice_obj)

@property
def vocab_size_padded(self) -> int:
if self.tp_mode == TensorParallelMode.COLUMN and self.gather_output:
@@ -102,10 +88,18 @@ def forward(
input: torch.Tensor,
*,
all_reduce_params: Optional[AllReduceParams] = None,
mapping_lm_head_tp: Optional[Mapping] = None,
is_spec_decoding_head: bool = False,
) -> torch.Tensor:
if is_spec_decoding_head and self.enable_lm_head_tp_in_adp:
output = F.linear(input, self.weight[self.lm_head_slice_obj], None)
# For LM head TP in ADP, we need to slice the weight for the LM head
tp_rank = mapping_lm_head_tp.tp_rank
tp_size = mapping_lm_head_tp.tp_size
slice_width = ceil_div(self.out_features, tp_size)
slice_start = tp_rank * slice_width
slice_end = min((tp_rank + 1) * slice_width, self.out_features)
output = F.linear(input, self.weight[slice_start:slice_end, :],
None)
else:
output = super().forward(input, all_reduce_params=all_reduce_params)
if (self.tp_mode == TensorParallelMode.COLUMN and self.gather_output
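The forward() branch above replaces a precomputed slice object with an on-the-fly slice derived from the runtime LM-head TP mapping. A minimal runnable sketch of that slicing math (plain torch, single process; the loop stands in for the TP ranks, and ceil_div is redefined locally so the sketch is self-contained): concatenating the per-rank shards along the vocab dimension reproduces the full LM-head output.

import torch
import torch.nn.functional as F


def ceil_div(a: int, b: int) -> int:
    return -(-a // b)


torch.manual_seed(0)
hidden_size, vocab_size, tp_size = 16, 10, 4
weight = torch.randn(vocab_size, hidden_size)   # [out_features, in_features]
x = torch.randn(3, hidden_size)                 # 3 tokens

slice_width = ceil_div(vocab_size, tp_size)
partial_logits = []
for tp_rank in range(tp_size):
    start = tp_rank * slice_width
    end = min((tp_rank + 1) * slice_width, vocab_size)
    # Each "rank" multiplies against its slice of the vocab dimension.
    partial_logits.append(F.linear(x, weight[start:end, :]))

# Gathering the per-rank shards along the vocab dim matches the full LM head.
assert torch.allclose(torch.cat(partial_logits, dim=-1), F.linear(x, weight))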
10 changes: 7 additions & 3 deletions tensorrt_llm/_torch/utils.py
@@ -1,5 +1,4 @@
import contextlib
import os
import threading
from dataclasses import dataclass
from enum import Enum
@@ -288,8 +287,13 @@ def get_per_request_piecewise_cuda_graph_flag() -> bool:
return getattr(_global_attrs, 'per_request_piecewise_cuda_graph_flag', True)


def create_lm_head_tp_mapping(mapping: Mapping) -> Mapping:
lm_head_tp_size = int(os.getenv('LM_HEAD_TP_SIZE', 2))
def create_lm_head_tp_mapping(mapping: Mapping, token_count: int) -> Mapping:
# Heuristic for lm_head_tp_size: token_count around 256 is roughly where the
# LM-head GEMM becomes math-bound, so 256 // token_count is used as the
# starting point for the TP size.
lm_head_tp_size_raw = 256 // token_count
lm_head_tp_size = nearest_in_buckets(lm_head_tp_size_raw,
[1, mapping.gpus_per_node])
assert mapping.tp_size % lm_head_tp_size == 0
lm_head_pp_size = mapping.pp_size * mapping.tp_size // lm_head_tp_size

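A hedged sketch of the heuristic introduced in create_lm_head_tp_mapping. nearest_in_buckets is imported elsewhere and not shown in this diff, so the version below simply snaps to the closest bucket entry, which is an assumption about its behaviour rather than the real helper.

def nearest_in_buckets(value: int, buckets: list[int]) -> int:
    # Assumption: pick the closest bucket entry; the real helper may differ.
    return min(buckets, key=lambda b: abs(b - value))


def pick_lm_head_tp_size(token_count: int, gpus_per_node: int) -> int:
    # ~256 tokens is roughly where the LM-head GEMM turns math-bound, so
    # fewer live tokens leave room for a wider LM-head TP.
    lm_head_tp_size_raw = 256 // token_count
    return nearest_in_buckets(lm_head_tp_size_raw, [1, gpus_per_node])


print(pick_lm_head_tp_size(token_count=32, gpus_per_node=4))   # 8 -> snapped to 4
print(pick_lm_head_tp_size(token_count=256, gpus_per_node=4))  # 1 -> stays 1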
42 changes: 38 additions & 4 deletions tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1903,7 +1903,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):

@skip_pre_blackwell
@pytest.mark.parametrize(
"tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend",
"tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,enable_lm_head_tp_in_adp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend",
[
# Use a larger batch_size to speed up the tests
pytest.param(8,
@@ -1912,6 +1912,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
3,
False,
False,
False,
True,
True,
32,
@@ -1923,6 +1924,31 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
3,
False,
False,
False,
True,
True,
32,
"TRTLLM",
marks=pytest.mark.skip_less_mpi_world_size(8)),
pytest.param(8,
1,
4,
3,
False,
True,
True,
True,
True,
32,
"CUTLASS",
marks=pytest.mark.skip_less_mpi_world_size(8)),
pytest.param(8,
1,
4,
3,
False,
True,
True,
True,
True,
32,
@@ -1934,6 +1960,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
0,
True,
True,
False,
True,
True,
32,
@@ -1945,6 +1972,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
0,
True,
True,
False,
True,
True,
32,
@@ -1956,6 +1984,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
0,
True,
True,
False,
True,
True,
16,
@@ -1967,6 +1996,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
1,
True,
True,
False,
True,
True,
32,
@@ -1978,19 +2008,22 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
1,
True,
True,
False,
True,
True,
8,
"CUTLASS",
marks=pytest.mark.skip_less_mpi_world_size(8)),
],
ids=[
"latency", "latency_trtllmgen", "throughput", "throughput_tp8",
"latency", "latency_trtllmgen", "latency_adp_lmtp",
"latency_trtllmgen_adp_lmtp", "throughput", "throughput_tp8",
"throughput_tp4", "throughput_mtp", "throughput_bs8_mtp"
])
def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
attention_dp, cuda_graph, overlap_scheduler,
max_batch_size, moe_backend):
attention_dp, enable_lm_head_tp_in_adp,
cuda_graph, overlap_scheduler, max_batch_size,
moe_backend):
if moe_backend == "TRTLLM" and (get_sm_version() == 120
or get_sm_version() == 121):
pytest.skip(
@@ -2016,6 +2049,7 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
kv_cache_config=kv_cache_config,
**pytorch_config,
enable_attention_dp=attention_dp,
enable_lm_head_tp_in_adp=enable_lm_head_tp_in_adp,
speculative_config=mtp_config) as llm:

assert llm.args.moe_config.backend == moe_backend
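Outside the test harness, the new knob is exercised the same way the latency_adp_lmtp case does: attention DP plus the new constructor flag. A hedged usage sketch (the checkpoint path and parallel size are placeholders, not taken from this PR):

from tensorrt_llm import LLM

llm = LLM(
    model="path/to/DeepSeek-R1-FP4",   # placeholder checkpoint path
    tensor_parallel_size=8,
    enable_attention_dp=True,          # ADP on, as in latency_adp_lmtp
    enable_lm_head_tp_in_adp=True,     # new flag plumbed through by this PR
)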
2 changes: 2 additions & 0 deletions tests/integration/test_lists/qa/llm_function_core.txt
@@ -483,6 +483,8 @@ accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4]
2 changes: 2 additions & 0 deletions tests/integration/test_lists/qa/llm_function_core_sanity.txt
@@ -34,6 +34,8 @@ accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput]
2 changes: 2 additions & 0 deletions tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
@@ -34,6 +34,8 @@ l0_gb200_multi_nodes:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90)
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90)
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] TIMEOUT (90)