Commit d145e87

[None][chore] Update disagg benchmark configs (#8289)
Signed-off-by: Xianjie <5410381+qiaoxj07@users.noreply.github.com>
Signed-off-by: Xianjie Qiao <5410381+qiaoxj07@users.noreply.github.com>
1 parent: d882c92 · commit: d145e87

File tree: 2 files changed (+11, -6 lines changed)

examples/disaggregated/slurm/benchmark/gen_worker_config.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -92,7 +92,8 @@ def gen_config_file(work_dir: str,
         },
         'tensor_parallel_size': gen_tp_size,
         'moe_expert_parallel_size': gen_tp_size,
-        'enable_attention_dp': True if gen_enable_attention_dp else False,
+        'enable_attention_dp': gen_enable_attention_dp,
+        'enable_lm_head_tp_in_adp': gen_enable_attention_dp and mtp_size > 0,
         'pipeline_parallel_size': gen_pp_size,
         'max_batch_size': gen_batch_size,
         'max_num_tokens': gen_max_num_tokens,
@@ -109,6 +110,7 @@ def gen_config_file(work_dir: str,
         },
         'moe_config': {
             'backend': gen_moe_backend,
+            'use_low_precision_moe_combine': True,
         },
         'cache_transceiver_config': {
             'max_tokens_in_buffer': cache_transceiver_max_num_tokens,
```
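
For readers skimming the change: below is a minimal, hypothetical Python sketch of the generation-worker fields this commit touches, reduced from `gen_config_file`. The surrounding keys and variable names come from the diff context above; the helper name and the argument values in the usage line are invented for illustration.

```python
# Hypothetical reduced sketch of gen_config_file: only the generation-worker
# fields touched by this commit, plus the surrounding keys from the diff
# context. The real function builds a much larger dict and writes it as YAML.
def gen_worker_options(gen_tp_size: int,
                       gen_pp_size: int,
                       gen_enable_attention_dp: bool,
                       mtp_size: int,
                       gen_moe_backend: str) -> dict:
    return {
        'tensor_parallel_size': gen_tp_size,
        'moe_expert_parallel_size': gen_tp_size,
        # The redundant `True if ... else False` collapses to the flag itself.
        'enable_attention_dp': gen_enable_attention_dp,
        # New: LM-head tensor parallelism under attention DP, enabled only
        # when MTP speculative decoding is in use (mtp_size > 0).
        'enable_lm_head_tp_in_adp': gen_enable_attention_dp and mtp_size > 0,
        'pipeline_parallel_size': gen_pp_size,
        'moe_config': {
            'backend': gen_moe_backend,
            # New: request the low-precision MoE combine path.
            'use_low_precision_moe_combine': True,
        },
    }

# Usage with made-up values:
print(gen_worker_options(gen_tp_size=8, gen_pp_size=1,
                         gen_enable_attention_dp=True,
                         mtp_size=3, gen_moe_backend='WIDEEP'))
```

The net effect, per the diff: `enable_lm_head_tp_in_adp` is switched on only when attention DP and MTP speculation are both active, and the MoE config now always requests the low-precision combine.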

examples/disaggregated/slurm/benchmark/start_worker.sh

Lines changed: 8 additions & 5 deletions
```diff
@@ -19,6 +19,8 @@ echo "enable_pdl: ${enable_pdl}, work_dir: ${work_dir}"
 echo "SLURM_PROCID: ${SLURM_PROCID}, hostname: $(hostname), instance_id: ${instance_id}"
 
 export TLLM_LOG_LEVEL=INFO
+export TRTLLM_SERVER_DISABLE_GC=1
+export TRTLLM_WORKER_DISABLE_GC=1
 
 if [ "${enable_pdl}" = "true" ]; then
     export TRTLLM_ENABLE_PDL=1
@@ -62,15 +64,16 @@ if [ "${enable_nsys}" != "true" ]; then
     trtllm-llmapi-launch ${numa_bind_cmd} trtllm-serve ${model_path} --host $(hostname) --port ${port} --extra_llm_api_options ${config_file}
 else
     nsys_prefix=""
-    nsys_file=${work_dir}/nsys_worker_proc_${instance_id}_${SLURM_PROCID}
+    nsys_file=${work_dir}/nsys_worker_proc_${role}_${instance_id}_${SLURM_PROCID}
     export TLLM_PROFILE_RECORD_GC=1
     export TLLM_NVTX_DEBUG=1
-    if [ "${role}" = "GEN" ] && [ "$SLURM_PROCID" = "0" ]; then
+    nsys_prefix="nsys profile -e \"NSYS_MPI_STORE_TEAMS_PER_RANK=1\" -o ${nsys_file} -f true -t cuda,nvtx,python-gil -c cudaProfilerApi --cuda-graph-trace node --capture-range-end=stop --gpu-metrics-devices=none"
+    if [ "${role}" = "GEN" ]; then
         export TLLM_PROFILE_START_STOP=200-250
-        nsys_prefix="nsys profile -e \"NSYS_MPI_STORE_TEAMS_PER_RANK=1\" -o ${nsys_file} -f true -t cuda,nvtx,python-gil -c cudaProfilerApi --cuda-graph-trace node --capture-range-end=stop --gpu-metrics-devices=none"
-        echo "nsys_prefix: ${nsys_prefix}"
+        echo "nsys is enabled on gen_gpus"
     elif [ "${role}" = "CTX" ]; then
-        echo "nsys is not enabled on ctx_gpus"
+        export TLLM_PROFILE_START_STOP=10-30
+        echo "nsys is enabled on ctx_gpus"
     fi
     ${nsys_prefix} trtllm-llmapi-launch ${numa_bind_cmd} \
         trtllm-serve ${model_path} \
```
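
For clarity, here is a small Python re-expression of the role-based profiling logic the script now implements. It is a sketch only: the function and parameter names are mine, while the nsys flag string and capture windows are copied from the diff above.

```python
# Illustrative Python re-expression of the nsys setup in start_worker.sh;
# names are invented, flag strings and windows are copied from the diff.
import os


def nsys_setup(role: str, work_dir: str, instance_id: str, proc_id: str) -> str:
    """Return the nsys command prefix and set the per-role capture window."""
    # The report name now embeds the role, so CTX and GEN workers that share
    # a work_dir, instance_id, and rank no longer overwrite each other.
    nsys_file = f"{work_dir}/nsys_worker_proc_{role}_{instance_id}_{proc_id}"
    prefix = (
        'nsys profile -e "NSYS_MPI_STORE_TEAMS_PER_RANK=1" '
        f"-o {nsys_file} -f true -t cuda,nvtx,python-gil -c cudaProfilerApi "
        "--cuda-graph-trace node --capture-range-end=stop "
        "--gpu-metrics-devices=none"
    )
    # Both roles are now profiled, each over its own iteration window;
    # previously only GEN rank 0 built a prefix at all.
    if role == "GEN":
        os.environ["TLLM_PROFILE_START_STOP"] = "200-250"
    elif role == "CTX":
        os.environ["TLLM_PROFILE_START_STOP"] = "10-30"
    return prefix


# Usage with made-up arguments:
print(nsys_setup("CTX", "/tmp/work", "0", "0"))
```

Compared to the previous version, the prefix is built for every rank of both roles rather than only GEN rank 0, and CTX workers get their own, earlier capture window (iterations 10-30) instead of being skipped.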
