
Commit 81a4b33

Tabrizian and mikeiovine authored and committed
[nvbug/5337601][fix] Fix disagg + speculative decoding (NVIDIA#5558)
Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com>
Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
Co-authored-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com>
1 parent af4eb9d commit 81a4b33

File tree

3 files changed (+68, -19 lines)


tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 5 additions & 6 deletions
@@ -879,8 +879,9 @@ def _executor_loop(self):
 
                 self._pad_attention_dp_dummy_request()
 
-                if self.draft_model_engine is not None or self.drafter is not None:
-                    self._prepare_draft_requests()
+                if self.draft_model_engine is not None or hasattr(
+                        self, 'drafter') and self.drafter is not None:
+                    self._prepare_draft_requests(self.active_requests)
 
                 scheduled_batch, fitting_disagg_gen_init_requests, num_fitting_reqs = self._schedule(
                 )
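The rewritten condition leans on Python operator precedence: 'and' binds tighter than 'or', so it evaluates as "draft_model_engine is not None or (hasattr(self, 'drafter') and self.drafter is not None)", and the hasattr check keeps executor instances that never set a drafter attribute from raising AttributeError. A minimal sketch of the same guard pattern, built around a hypothetical Executor stub (only the condition itself mirrors the diff):

class Executor:
    # Hypothetical stub; only the guard below mirrors the change above.
    def __init__(self, draft_model_engine=None, drafter=None, set_drafter_attr=True):
        self.draft_model_engine = draft_model_engine
        if set_drafter_attr:
            self.drafter = drafter  # some construction paths may never set this attribute

    def should_prepare_drafts(self):
        # 'and' binds tighter than 'or', so this reads as:
        # draft_model_engine is not None or (hasattr(self, 'drafter') and self.drafter is not None)
        return self.draft_model_engine is not None or hasattr(
            self, 'drafter') and self.drafter is not None

assert Executor(draft_model_engine=object()).should_prepare_drafts()
assert Executor(drafter=object()).should_prepare_drafts()
assert not Executor(set_drafter_attr=False).should_prepare_drafts()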
@@ -969,12 +970,11 @@ def _executor_loop(self):
                         iter_stats=iter_stats,
                         iter_start_time=iter_start_time))
 
-    def _prepare_draft_requests(self):
+    def _prepare_draft_requests(self, requests):
         try:
             # Set draft tokens here to make the KV cache manager
             # and scheduler aware of them.
-            for req in self.active_requests:
-                # TODO: enable draft tokens in context phase
+            for req in requests:
                 if req.state not in (LlmRequestState.GENERATION_IN_PROGRESS,
                                      LlmRequestState.DISAGG_GENERATION_INIT):
                     continue
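With the request list passed in explicitly, _prepare_draft_requests can now operate on any subset of requests rather than always reading self.active_requests; the state filter still skips everything that is neither generating nor waiting on disaggregated generation init. A simplified, self-contained sketch of that filter, using stand-in types for LlmRequestState and the request object (the real method also attaches draft tokens, which is omitted here):

from dataclasses import dataclass
from enum import Enum, auto

class LlmRequestState(Enum):
    # Stand-in for the real TensorRT-LLM enum; only the states used below.
    CONTEXT_INIT = auto()
    GENERATION_IN_PROGRESS = auto()
    DISAGG_GENERATION_INIT = auto()

@dataclass
class Request:
    state: LlmRequestState

def prepare_draft_requests(requests):
    prepared = []
    for req in requests:
        # Same filter as the diff: only requests that are actively generating
        # or waiting on disaggregated generation init get draft tokens.
        if req.state not in (LlmRequestState.GENERATION_IN_PROGRESS,
                             LlmRequestState.DISAGG_GENERATION_INIT):
            continue
        prepared.append(req)
    return prepared

reqs = [Request(LlmRequestState.CONTEXT_INIT),
        Request(LlmRequestState.DISAGG_GENERATION_INIT)]
assert len(prepare_draft_requests(reqs)) == 1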
@@ -1786,7 +1786,6 @@ def create_new_request(input_tokens):
                 # This is the first time the draft model is seeing this request.
                 # Prepare a context request. We discard the first token and take
                 # the newly decoded one - this is the convention for EAGLE 2 and 3.
-                assert num_draft_tokens == 0
                 new_request = create_new_request(input_tokens)
                 draft_batch.context_requests.append(new_request)
             elif num_accepted_tokens == 0:
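The deleted assertion guarded the EAGLE convention described in the surrounding comment: the draft model's first context request drops the first prompt token and takes the token the target model just decoded. A toy illustration of that token shift (purely illustrative; the real create_new_request builds a full LlmRequest):

def eagle_context_tokens(prompt_tokens, newly_decoded_token):
    # EAGLE 2/3 convention: discard the first token, append the newly decoded one.
    return prompt_tokens[1:] + [newly_decoded_token]

assert eagle_context_tokens([101, 7, 8, 9], 42) == [7, 8, 9, 42]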

tests/integration/defs/accuracy/test_disaggregated_serving.py

Lines changed: 62 additions & 13 deletions
@@ -91,6 +91,7 @@ def launch_disaggregated_llm(disaggregated_server_config: Dict[str, Any],
         trtllm_serve_path, model_name, "--host", "localhost", "--backend",
         "pytorch"
     ]
+
     if tensor_parallel_size > 1:
         common_args.append(f"--tp_size={tensor_parallel_size}")
 
@@ -103,18 +104,22 @@ def launch_disaggregated_llm(disaggregated_server_config: Dict[str, Any],
     env_gen["TRTLLM_USE_UCX_KVCACHE"] = "1"
     env_gen["CUDA_VISIBLE_DEVICES"] = ",".join(
         map(str, range(tensor_parallel_size, 2 * tensor_parallel_size)))
-
-    with (MyThreadPoolExecutor(max_workers=16) as thread_pool, temp_dir,
-          popen(common_args + [
-              "--port", "8001", "--extra_llm_api_options",
-              ctx_server_config_path
-          ],
-                env=env_ctx) as ctx_server,
-          popen(common_args + [
-              "--port", "8002", "--extra_llm_api_options",
-              gen_server_config_path
-          ],
-                env=env_gen) as gen_server,
+    ctx_server_args = common_args + [
+        "--port", "8001", "--extra_llm_api_options", ctx_server_config_path
+    ]
+    gen_server_args = common_args + [
+        "--port", "8002", "--extra_llm_api_options", gen_server_config_path
+    ]
+    if "max_num_tokens" in ctx_server_config:
+        ctx_server_args.append(
+            f"--max_num_tokens={ctx_server_config['max_num_tokens']}")
+    if "max_num_tokens" in gen_server_config:
+        gen_server_args.append(
+            f"--max_num_tokens={gen_server_config['max_num_tokens']}")
+
+    with (MyThreadPoolExecutor(max_workers=16) as
+          thread_pool, temp_dir, popen(ctx_server_args, env=env_ctx) as
+          ctx_server, popen(gen_server_args, env=env_gen) as gen_server,
           popen([
               trtllm_serve_path, "disaggregated", "-c",
               disaggregated_serving_config_path, "--server_start_timeout",
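Concretely, for a context server whose config sets max_num_tokens, the refactored helper assembles its launch command roughly as follows; the model path and temp-file path below are made up for illustration, while the flags come straight from the diff:

trtllm_serve_path = "trtllm-serve"                      # assumed executable name
model_name = "/models/Llama-3.1-8B-Instruct"            # hypothetical model path
ctx_server_config = {"max_num_tokens": 13393 * 2}       # mirrors test_eagle3 below
ctx_server_config_path = "/tmp/ctx_server_config.yaml"  # hypothetical temp file

common_args = [
    trtllm_serve_path, model_name, "--host", "localhost", "--backend",
    "pytorch"
]
ctx_server_args = common_args + [
    "--port", "8001", "--extra_llm_api_options", ctx_server_config_path
]
if "max_num_tokens" in ctx_server_config:
    ctx_server_args.append(
        f"--max_num_tokens={ctx_server_config['max_num_tokens']}")

# ctx_server_args now ends with:
# [..., "--port", "8001", "--extra_llm_api_options",
#  "/tmp/ctx_server_config.yaml", "--max_num_tokens=26786"]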
@@ -252,9 +257,53 @@ def test_ngram(self):
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
+    @pytest.mark.parametrize("overlap_scheduler", [False])
+    def test_eagle3(self, overlap_scheduler):
+        speculative_decoding_config = {
+            "decoding_type": "Eagle",
+            "max_draft_len": 4,
+            "pytorch_weights_path":
+            f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B",
+            "eagle3_one_model": False
+        }
+        kv_cache_config = {
+            "free_gpu_memory_fraction": 0.5,
+            "enable_block_reuse": False
+        }
+        ctx_server_config = {
+            "disable_overlap_scheduler": True,
+            "speculative_config": speculative_decoding_config,
+            "kv_cache_config": kv_cache_config,
+            "max_num_tokens": 13393 * 2
+        }
+        gen_server_config = {
+            "disable_overlap_scheduler": not overlap_scheduler,
+            "speculative_config": speculative_decoding_config,
+            "kv_cache_config": kv_cache_config,
+            "max_num_tokens": 13393 * 2
+        }
+        disaggregated_server_config = {
+            "hostname": "localhost",
+            "port": 8000,
+            "backend": "pytorch",
+            "context_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8001"]
+            },
+            "generation_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8002"]
+            }
+        }
+        with launch_disaggregated_llm(disaggregated_server_config,
+                                      ctx_server_config, gen_server_config,
+                                      self.MODEL_PATH) as llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
 
-@pytest.mark.timeout(3600)
 @pytest.mark.skip_less_device_memory(140000)
+@pytest.mark.timeout(3600)
 class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
     MODEL_PATH = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct"
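test_eagle3 hands launch_disaggregated_llm plain dicts for the context server, generation server, and disaggregated router; the helper presumably serializes them to the YAML files passed via --extra_llm_api_options and trtllm-serve disaggregated -c. A rough guess at that serialization step, assuming PyYAML and a temporary directory (the helper's actual internals are not part of this diff):

import tempfile
from pathlib import Path

import yaml  # PyYAML, assumed available in the test environment

def write_config(config: dict, directory: str, name: str) -> str:
    # Dump a config dict to <directory>/<name>.yaml and return the path.
    path = Path(directory) / f"{name}.yaml"
    path.write_text(yaml.safe_dump(config))
    return str(path)

with tempfile.TemporaryDirectory() as tmp:
    ctx_server_config_path = write_config({"disable_overlap_scheduler": True}, tmp,
                                          "ctx_server_config")
    gen_server_config_path = write_config({"disable_overlap_scheduler": False}, tmp,
                                          "gen_server_config")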

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 1 addition & 0 deletions
@@ -35,6 +35,7 @@ l0_dgx_h100:
   - accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False]
   - accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
+  - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[False]
   - test_e2e.py::test_ptp_quickstart_advanced_bs1
 - condition:
     ranges:
