
Commit 238a285

venkywonka authored and dominicshanshan committed
[https://nvbugs/5453667] [fix] reverting a breaking change: make trtllm-bench enable_chunked_context defaults backend-dependent (NVIDIA#6956)
Signed-off-by: Venky Ganesh <23023424+venkywonka@users.noreply.github.com>
Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
1 parent 2927c5d commit 238a285

File tree

1 file changed: +9 -5 lines changed


tensorrt_llm/bench/benchmark/throughput.py

Lines changed: 9 additions & 5 deletions
@@ -281,10 +281,11 @@
     help="Path where per request information is written to.",
 )
 @optgroup.option(
-    "--enable_chunked_context/--disable_chunked_context",
-    default=True,
-    help=
-    "Enable/disable chunking in prefill stage for enhanced throughput benchmark. "
+    "--enable_chunked_context",
+    is_flag=True,
+    default=None,
+    help="Enable chunking in prefill stage for enhanced throughput benchmark. "
+    "Default is False for PyTorch/AutoDeploy backend, True for TensorRT backend.",
 )
 @optgroup.option(
     "--scheduler_policy",
@@ -409,8 +410,11 @@ def throughput_command(
     kv_cache_percent = params.get("kv_cache_free_gpu_mem_fraction")
     beam_width = params.get("beam_width")
     streaming: bool = params.get("streaming")
-    enable_chunked_context: bool = params.get("enable_chunked_context")
     scheduler_policy: str = params.get("scheduler_policy")
+    enable_chunked_context: bool = params.get("enable_chunked_context")
+    if enable_chunked_context is None:
+        # Set default based on backend: True for TensorRT, False for others
+        enable_chunked_context = backend.lower() == "tensorrt"

     # Update configuration with runtime options
     exec_settings["settings_config"]["kv_cache_percent"] = kv_cache_percent
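For context, below is a minimal, self-contained sketch of the pattern this commit adopts: declaring the flag with is_flag=True and default=None so that "flag not given" is distinguishable from an explicit choice, then resolving the default from the backend. It is an assumption-laden illustration, not the real trtllm-bench CLI: it uses plain click rather than the click_option_group decorators in throughput.py, and the command and option names other than --enable_chunked_context are invented for the example.

# Minimal sketch of the backend-dependent default pattern (assumptions: plain
# click instead of click_option_group; command/option names other than
# --enable_chunked_context are illustrative only).
from typing import Optional

import click


@click.command()
@click.option("--backend",
              type=click.Choice(["pytorch", "autodeploy", "tensorrt"]),
              default="pytorch",
              help="Benchmark backend (illustrative choices).")
@click.option("--enable_chunked_context",
              is_flag=True,
              default=None,
              help="Enable chunking in the prefill stage. Defaults to True for "
              "the TensorRT backend and False otherwise.")
def bench(backend: str, enable_chunked_context: Optional[bool]) -> None:
    # is_flag=True combined with default=None leaves the value as None when
    # the flag is not passed, so an unset flag is distinguishable from an
    # explicit True and the effective default can depend on the backend.
    if enable_chunked_context is None:
        enable_chunked_context = backend.lower() == "tensorrt"
    click.echo(f"backend={backend} "
               f"enable_chunked_context={enable_chunked_context}")


if __name__ == "__main__":
    bench()

Running this sketch with --backend tensorrt resolves the flag to True, while --backend pytorch or --backend autodeploy resolves it to False unless --enable_chunked_context is passed explicitly, which mirrors the behavior the commit restores for trtllm-bench.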
