
Commit 9286223

SimengLiu-nv and mikeiovine authored and committed

[https://nvbugs/5515753][ci] Add NCCL_DEBUG=INFO flag to collect more info with CI failure. (#8440)

Signed-off-by: Simeng Liu <simengl@nvidia.com>
Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com>
Signed-off-by: Mike Iovine <miovine@nvidia.com>
1 parent ee6944b commit 9286223

File tree

4 files changed (+18, -4 lines)


jenkins/L0_Test.groovy

Lines changed: 13 additions & 0 deletions

@@ -2209,6 +2209,19 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
     def noIsolateTests = false
     def rerunFailed = false

+    echoNodeAndGpuInfo(pipeline, stageName)
+    sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'
+
+    def extraInternalEnv = ""
+    def pytestTestTimeout = "3600"
+
+    // TRT uses half of the host logic cores for engine building which is bad for multi-GPU machines.
+    extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\""
+    // CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest
+    extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}"
+    // Enable NCCL debug information for multi-GPU tests
+    extraInternalEnv += " NCCL_DEBUG=INFO"
+
     def testDBList = renderTestDB(testList, llmSrc, stageName)

     // Process shard test list and create separate files for regular and isolate tests
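For reference, NCCL reads the NCCL_DEBUG environment variable when a communicator is created, so passing the flag through extraInternalEnv is enough to make each rank print version, topology, and transport details into the CI logs. The sketch below is illustrative only and not part of this commit: it assumes a CUDA build of PyTorch, at least two GPUs, and a torchrun launcher, and the script name is hypothetical.

# nccl_debug_check.py -- illustrative sketch; launch with:
#   NCCL_DEBUG=INFO torchrun --nproc_per_node=2 nccl_debug_check.py
import os

import torch
import torch.distributed as dist


def main():
    # NCCL itself reads NCCL_DEBUG from the environment at init time,
    # so the flag only needs to be set before the process group is created.
    os.environ.setdefault("NCCL_DEBUG", "INFO")

    dist.init_process_group(backend="nccl")  # NCCL INFO lines are emitted here
    torch.cuda.set_device(int(os.environ.get("LOCAL_RANK", "0")))

    t = torch.ones(1, device="cuda")
    dist.all_reduce(t)  # collectives produce further INFO output

    dist.destroy_process_group()


if __name__ == "__main__":
    main()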

tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py

Lines changed: 0 additions & 1 deletion

@@ -194,7 +194,6 @@ def func(input, residual, norm_weight, eps, enable_fusion):
 )
 def test_row_linear_residual_norm_fusion(seq_len, hidden_size, dtype, strategy,
                                          fusion):
-
     if strategy == AllReduceStrategy.NCCL_SYMMETRIC and 2048 in seq_len:
         pytest.skip("https://nvbugspro.nvidia.com/bug/5573856")


tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py

Lines changed: 0 additions & 3 deletions

@@ -33,9 +33,6 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size):
     is_fp8 = quant == "fp8"
     is_fp4 = quant == "fp4"

-    if tp_size == 4:
-        pytest.skip(f"https://nvbugs/5515753")
-
     if torch.cuda.device_count() < tp_size:
         pytest.skip(f"Not enough GPUs available, need {tp_size} "
                     f"but only have {torch.cuda.device_count()}")

tests/unittest/_torch/thop/parallel/test_moe.py

Lines changed: 5 additions & 0 deletions

@@ -1056,6 +1056,7 @@ class TestMoeFp4:
     )
     def test_autotune(self, num_tokens, hidden_size, intermediate_size,
                       routing_info):
+        pytest.skip("https://nvbugs/5575841")

         self.run_moe_fp4_test(num_tokens,
                               hidden_size,
@@ -1138,6 +1139,7 @@ def test_autotune_fp8_fp4(self, num_tokens, hidden_size, intermediate_size,
                           ids=["use_score_as_input", "use_topk_as_input"])
     def test_no_autotune(self, num_tokens, hidden_size, intermediate_size,
                          routing_info, use_topk_as_input):
+        pytest.skip("https://nvbugs/5575841")

         self.run_moe_fp4_test(num_tokens,
                               hidden_size,
@@ -1234,6 +1236,9 @@ def run_moe_fp4_test(self, num_tokens: int, hidden_size: int,
         if padding >= 256:
             pytest.skip("Routing kernel requires that padding be less than 256")

+        if intermediate_size == 384:
+            pytest.skip("https://nvbugs/5434352")
+
         assert top_k <= num_experts
         assert top_k <= 10
         assert num_experts % 4 == 0
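
The additions above use the standard pytest pattern of short-circuiting a known-bad parameter combination with a skip that references the tracking bug. A minimal illustration of that pattern follows; the test name, sizes, and bug id are placeholders, not taken from this commit.

import pytest


@pytest.mark.parametrize("intermediate_size", [256, 384, 512])
def test_example_shape(intermediate_size):
    # Skip configurations with a known issue and point at the tracking bug,
    # mirroring the guards added in test_moe.py above.
    if intermediate_size == 384:
        pytest.skip("https://nvbugs/0000000")  # placeholder bug id

    assert intermediate_size % 128 == 0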
