Skip to content

Commit a375f0a

Browse files
SimengLiu-nv authored and mikeiovine committed
[https://nvbugs/5515753][ci] Add NCCL_DEBUG=INFO flag to collect more info with CI failure. (NVIDIA#8440)
Signed-off-by: Simeng Liu <simengl@nvidia.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com>
1 parent 5f58228 commit a375f0a

File tree

4 files changed

+18
-4
lines changed

4 files changed

+18
-4
lines changed

jenkins/L0_Test.groovy

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2166,6 +2166,19 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
21662166
def noIsolateTests = false
21672167
def rerunFailed = false
21682168

2169+
echoNodeAndGpuInfo(pipeline, stageName)
2170+
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'
2171+
2172+
def extraInternalEnv = ""
2173+
def pytestTestTimeout = "3600"
2174+
2175+
// TRT uses half of the host logic cores for engine building which is bad for multi-GPU machines.
2176+
extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\""
2177+
// CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest
2178+
extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}"
2179+
// Enable NCCL debug information for multi-GPU tests
2180+
extraInternalEnv += " NCCL_DEBUG=INFO"
2181+
21692182
def testDBList = renderTestDB(testList, llmSrc, stageName)
21702183

21712184
// Process shard test list and create separate files for regular and isolate tests

tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,6 @@ def func(input, residual, norm_weight, eps, enable_fusion):
194194
)
195195
def test_row_linear_residual_norm_fusion(seq_len, hidden_size, dtype, strategy,
196196
fusion):
197-
198197
if strategy == AllReduceStrategy.NCCL_SYMMETRIC and 2048 in seq_len:
199198
pytest.skip("https://nvbugspro.nvidia.com/bug/5573856")
200199

tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,6 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size):
3333
is_fp8 = quant == "fp8"
3434
is_fp4 = quant == "fp4"
3535

36-
if tp_size == 4:
37-
pytest.skip(f"https://nvbugs/5515753")
38-
3936
if torch.cuda.device_count() < tp_size:
4037
pytest.skip(f"Not enough GPUs available, need {tp_size} "
4138
f"but only have {torch.cuda.device_count()}")

tests/unittest/_torch/thop/parallel/test_moe.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1062,6 +1062,7 @@ class TestMoeFp4:
10621062
)
10631063
def test_autotune(self, num_tokens, hidden_size, intermediate_size,
10641064
routing_info):
1065+
pytest.skip("https://nvbugs/5575841")
10651066

10661067
self.run_moe_fp4_test(num_tokens,
10671068
hidden_size,
@@ -1148,6 +1149,7 @@ def test_autotune_fp8_fp4(self, num_tokens, hidden_size, intermediate_size,
11481149
ids=["use_score_as_input", "use_topk_as_input"])
11491150
def test_no_autotune(self, num_tokens, hidden_size, intermediate_size,
11501151
routing_info, use_topk_as_input):
1152+
pytest.skip("https://nvbugs/5575841")
11511153

11521154
self.run_moe_fp4_test(num_tokens,
11531155
hidden_size,
@@ -1215,6 +1217,9 @@ def run_moe_fp4_test(self, num_tokens: int, hidden_size: int,
12151217
if padding >= 256:
12161218
pytest.skip("Routing kernel requires that padding be less than 256")
12171219

1220+
if intermediate_size == 384:
1221+
pytest.skip("https://nvbugs/5434352")
1222+
12181223
assert top_k <= num_experts
12191224
assert top_k <= 10
12201225
assert num_experts % 4 == 0

0 commit comments

Comments (0)