We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent f167b1f · commit b427966 (Copy full SHA for b427966)
tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -787,6 +787,11 @@ def release_batch(result: ScheduledRequests | None):
                     f"Run generation only CUDA graph warmup for batch size={bs}, draft_len={draft_len}"
                 )
                 self.enable_spec_decode = draft_len > 0 or self.is_draft_model
+                with self.no_cuda_graph(), autotune():
+                    self.forward(batch,
+                                 new_tensors_device=None,
+                                 resource_manager=resource_manager)
+                    torch.cuda.synchronize()
                 self.forward(batch,
                              new_tensors_device=None,
                              resource_manager=resource_manager)
0 commit comments