Skip to content

Commit 9ae0936

Browse files
committed
support mtp eagle with 2 models style
Signed-off-by: qgai <qgai@nvidia.com>
1 parent 1e72721 commit 9ae0936

File tree

11 files changed

+1075
-854
lines changed

11 files changed

+1075
-854
lines changed

examples/llm-api/quickstart_advanced.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -169,17 +169,27 @@ def setup_llm(args, **kwargs):
169169
) if args.spec_decode_algo is not None else None
170170

171171
if spec_decode_algo == 'MTP':
172+
172173
if not args.use_one_model:
173-
print(
174-
"MTP only supports one model style spec decode; ignoring default use_one_model=False"
174+
print("Running MTP eagle with two model style.")
175+
spec_config = EagleDecodingConfig(
176+
max_draft_len=args.spec_decode_max_draft_len,
177+
speculative_model_dir=args.model_dir,
178+
eagle3_one_model=args.use_one_model,
179+
is_mtp_eagle=True,
180+
use_relaxed_acceptance_for_thinking=args.
181+
use_relaxed_acceptance_for_thinking,
182+
relaxed_topk=args.relaxed_topk,
183+
relaxed_delta=args.relaxed_delta,
175184
)
176-
177-
spec_config = MTPDecodingConfig(
178-
num_nextn_predict_layers=args.spec_decode_max_draft_len,
179-
use_relaxed_acceptance_for_thinking=args.
180-
use_relaxed_acceptance_for_thinking,
181-
relaxed_topk=args.relaxed_topk,
182-
relaxed_delta=args.relaxed_delta)
185+
else:
186+
spec_config = MTPDecodingConfig(
187+
num_nextn_predict_layers=args.spec_decode_max_draft_len,
188+
use_relaxed_acceptance_for_thinking=args.
189+
use_relaxed_acceptance_for_thinking,
190+
relaxed_topk=args.relaxed_topk,
191+
relaxed_delta=args.relaxed_delta,
192+
mtp_eagle_one_model=args.use_one_model)
183193
elif spec_decode_algo == "EAGLE3":
184194
spec_config = EagleDecodingConfig(
185195
max_draft_len=args.spec_decode_max_draft_len,

tensorrt_llm/_torch/models/modeling_auto.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ def from_config(
2020
"") # Strip the appended EAGLE3
2121
if hasattr(config.pretrained_config, "draft_vocab_size"):
2222
model_arch = "EAGLE3" + model_arch
23+
if model_arch == "DeepseekV3ForCausalLM" and config.spec_config.max_draft_len == 0:
24+
model_arch = "MTPDraftModelForCausalLM"
2325

2426
cls = MODEL_CLASS_MAPPING.get(model_arch)
2527
if cls is None:

0 commit comments

Comments (0)