
Commit 6d752d3

ixlmar authored and govind-ramnarayan committed
[TRTLLM-8551][feat] add cache_salt in LLM.generate and refactor test_return_logits.py (NVIDIA#8317)
Signed-off-by: ixlmar <206748156+ixlmar@users.noreply.github.com>
1 parent 8dc30cc · commit 6d752d3

File tree: 4 files changed, +193 −87 lines

tensorrt_llm/llmapi/llm.py (5 additions, 1 deletion)

@@ -262,6 +262,7 @@ def generate(
                 DisaggregatedParams, Sequence[DisaggregatedParams]]] = None,
             scheduling_params: Optional[Union[SchedulingParams,
                                               List[SchedulingParams]]] = None,
+            cache_salt: Optional[Union[str, Sequence[str]]] = None,
     ) -> Union[RequestOutput, List[RequestOutput]]:
         """Generate output for the given prompts in the synchronous mode.
         Synchronous generation accepts either single prompt or batched prompts.
@@ -282,6 +283,7 @@ def generate(
                 Disaggregated parameters. Defaults to None.
             scheduling_params (tensorrt_llm.scheduling_params.SchedulingParams, List[tensorrt_llm.scheduling_params.SchedulingParams], optional):
                 Scheduling parameters. Defaults to None.
+            cache_salt (str, Sequence[str], optional): If specified, KV cache will be salted with the provided string to limit the kv cache reuse to the requests with the same string. Defaults to None.
         Returns:
             Union[tensorrt_llm.llmapi.RequestOutput, List[tensorrt_llm.llmapi.RequestOutput]]: The output data of the completion request to the LLM.
         """
@@ -312,7 +314,9 @@ def _item_at(maybe_batched: Union[Any, Sequence[Any]], pos: int) -> Any:
                     i),
                 disaggregated_params=_item_at(disaggregated_params, i),
                 scheduling_params=_item_at(scheduling_params, i),
-                streaming=False)
+                cache_salt=_item_at(cache_salt, i),
+                streaming=False,
+            )
             futures.append(future)
 
         for future in tqdm(futures,
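
A minimal usage sketch of the new parameter, not code from this commit: the checkpoint identifier and the salt strings ("tenant-a", "tenant-b") are assumptions for illustration, and KvCacheConfig is imported from the same module the test below uses. Cache reuse is confined to requests carrying the same salt string.

# Hypothetical usage sketch of cache_salt (not part of this commit).
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi.llm_utils import KvCacheConfig

llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # assumed checkpoint identifier
    kv_cache_config=KvCacheConfig(enable_block_reuse=True),
)
sampling_params = SamplingParams(max_tokens=8)
prompts = ["A B C", "A B C"]

# Same salt for both prompts: the second request may reuse cached KV blocks.
shared = llm.generate(prompts,
                      sampling_params=sampling_params,
                      cache_salt=["tenant-a", "tenant-a"])

# Distinct salts: reuse is limited to requests carrying the same salt,
# so these two prompts do not share cached blocks with each other.
isolated = llm.generate(prompts,
                        sampling_params=sampling_params,
                        cache_salt=["tenant-a", "tenant-b"])

for out in shared + isolated:
    print(out.outputs[0].text)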

tests/integration/test_lists/test-db/l0_a30.yml (1 addition, 0 deletions)

@@ -21,6 +21,7 @@ l0_a30:
       - unittest/_torch/modeling -k "modeling_out_of_tree"
       - unittest/_torch/auto_deploy/unit/singlegpu
       - unittest/_torch/sampler/test_beam_search.py
+      - unittest/_torch/sampler/test_return_logits.py
       - test_e2e.py::test_openai_completions_with_logit_bias[torch_sampler]
       - test_e2e.py::test_openai_chat_with_logit_bias[torch_sampler]
       - test_e2e.py::test_openai_completions_with_logit_bias[trtllm_sampler]

tests/unittest/_torch/sampler/test_return_logits.py (184 additions, 86 deletions)

@@ -9,23 +9,65 @@
 from tensorrt_llm.llmapi.llm_utils import BuildConfig, KvCacheConfig
 
 prompts = ["A B C"]
-global_kvcache_config = KvCacheConfig(max_tokens=10000)
+global_kvcache_config = KvCacheConfig(
+    max_tokens=10000,
+    enable_block_reuse=True,
+)
 
 
-@force_ampere  # Save H100 resource
-@pytest.mark.parametrize("return_log_probs", [False, True])
-@pytest.mark.parametrize("gather_generation_logits", [False, True])
-@pytest.mark.parametrize("gather_context_logits", [False, True])
-@pytest.mark.parametrize("sampler_type", ["TRTLLMSampler", "TorchSampler"])
-@pytest.mark.parametrize("disable_overlap_scheduler", [False, True])
-def test_generate_with_return_logits(disable_overlap_scheduler: bool,
-                                     sampler_type: str,
-                                     gather_context_logits: bool,
-                                     gather_generation_logits: bool,
-                                     return_log_probs: bool):
-    if not (gather_context_logits or gather_generation_logits
-            or return_log_probs):  # prune space
-        pytest.skip("Nothing to test")
+@pytest.fixture(scope="module", params=[False, True])
+def gather_generation_logits_fixture(request) -> bool:
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[False, True])
+def gather_context_logits_fixture(request) -> bool:
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[False, True])
+def disable_overlap_scheduler_fixture(request) -> bool:
+    return request.param
+
+
+@pytest.fixture(scope="module", params=["TRTLLMSampler", "TorchSampler"])
+def sampler_type_fixture(request) -> str:
+    return request.param
+
+
+class CacheSalter:
+
+    _salt = 0
+
+    @classmethod
+    def get_salt_unique(cls) -> str:
+        cls._salt += 1
+        return str(cls._salt)
+
+    @classmethod
+    def get_salt_shared(cls) -> str:
+        return str(0)
+
+    @classmethod
+    def get_salt(cls, reuse_cache: bool) -> str:
+        if reuse_cache:
+            salt = cls.get_salt_shared()
+        else:
+            salt = cls.get_salt_unique()
+        return salt
+
+
+@pytest.fixture(scope="module")
+def llm(
+    gather_context_logits_fixture: bool,
+    gather_generation_logits_fixture: bool,
+    sampler_type_fixture: str,
+    disable_overlap_scheduler_fixture: bool,
+):
+    gather_context_logits = gather_context_logits_fixture
+    gather_generation_logits = gather_generation_logits_fixture
+    sampler_type = sampler_type_fixture
+    disable_overlap_scheduler = disable_overlap_scheduler_fixture
 
     build_config = BuildConfig()
     build_config.gather_context_logits = gather_context_logits
@@ -42,100 +84,156 @@ def test_generate_with_return_logits(disable_overlap_scheduler: bool,
         disable_overlap_scheduler=disable_overlap_scheduler,
     )
 
+    # FIXME: Sometimes LLM shutdown hangs, might be related to https://nvbugs/5577178.
+    # Remove patch below once fixed.
+    old_exit = LLM.__exit__
+
+    def _exit_with_xfail_on_timeout(self, exc_type, exc_value,
+                                    traceback) -> bool:
+        import _pytest.outcomes
+        try:
+            return old_exit(self, exc_type, exc_value, traceback)
+        except _pytest.outcomes.Failed as e:
+            if e.msg and "pytest-timeout" in e.msg.lower():
+                pytest.xfail(
+                    "Known LLM shutdown issue (https://nvbugs/5577178).")
+            else:
+                raise
+
+    with pytest.MonkeyPatch.context() as patch:
+        patch.setattr(LLM, "__exit__", _exit_with_xfail_on_timeout)
+
+        with llm:
+            yield llm
+
+
+@force_ampere  # Save H100 resource
+@pytest.mark.parametrize("reuse_cache", [False, True])
+@pytest.mark.parametrize("return_log_probs", [False, True])
+# FIXME: sometimes LLM shutdown hangs, might be related to https://nvbugs/5577178
+# NB: Timeout covers fixtures https://github.com/pytest-dev/pytest-timeout/issues/134
+@pytest.mark.timeout(120, method="signal")
+@pytest.mark.threadleak(enabled=False)
+def test_generate_with_return_logits(
+    llm,
+    gather_context_logits_fixture: bool,
+    gather_generation_logits_fixture: bool,
+    reuse_cache: bool,
+    return_log_probs: bool,
+):
+    gather_context_logits = gather_context_logits_fixture
+    gather_generation_logits = gather_generation_logits_fixture
+
+    if not (gather_context_logits or gather_generation_logits
+            or return_log_probs):  # prune space
+        pytest.skip("Nothing to test")
+
     sampling_params = SamplingParams(
         max_tokens=8,
         return_context_logits=gather_context_logits,
         return_generation_logits=gather_generation_logits,
         logprobs=return_log_probs,
     )
 
-    with llm:
-        for output in llm.generate(prompts, sampling_params=sampling_params):
-            if gather_context_logits:
-                assert output.context_logits is not None
-                # NOTE: prompt_token_ids of "A B C" becomes [1, 319, 350, 315]
-                expected_len = len(prompts[0].split()) + 1
+    for output in llm.generate(
+            prompts,
+            sampling_params=sampling_params,
+            cache_salt=[CacheSalter.get_salt(reuse_cache) for _ in prompts],
+    ):
+        if gather_context_logits:
+            assert output.context_logits is not None
+            # NOTE: prompt_token_ids of "A B C" becomes [1, 319, 350, 315]
+            expected_len = len(prompts[0].split()) + 1
+            try:
                 assert expected_len == output.context_logits.shape[0]
-            else:
-                assert output.context_logits is None
-
-            if gather_generation_logits:
-                gen_logits = output.outputs[0].generation_logits
-                assert gen_logits is not None
-                assert gen_logits.ndim == 2
-                assert gen_logits.shape[0] == sampling_params.max_tokens
-                assert torch.argmax(
-                    gen_logits, dim=1).tolist() == output.outputs[0].token_ids
-            else:
-                assert output.outputs[0].generation_logits is None
-
-            if return_log_probs:
-                assert len(
-                    output.outputs[0].logprobs) == sampling_params.max_tokens
-            else:
-                assert len(output.outputs[0].logprobs) == 0
+            except AssertionError:
+                # FIXME: Remove this once the bug has been fixed
+                if gather_context_logits and reuse_cache:
+                    pytest.xfail("Known bug: https://nvbugs/5577178")
+                raise
+        else:
+            assert output.context_logits is None
+
+        if gather_generation_logits:
+            gen_logits = output.outputs[0].generation_logits
+            assert gen_logits is not None
+            assert gen_logits.ndim == 2
+            assert gen_logits.shape[0] == sampling_params.max_tokens
+            assert torch.argmax(gen_logits,
+                                dim=1).tolist() == output.outputs[0].token_ids
+        else:
+            assert output.outputs[0].generation_logits is None
+
+        if return_log_probs:
+            assert len(output.outputs[0].logprobs) == sampling_params.max_tokens
+        else:
+            assert len(output.outputs[0].logprobs) == 0
 
 
 @force_ampere  # Save H100 resource
+@pytest.mark.parametrize("reuse_cache", [False, True])
 @pytest.mark.parametrize("return_log_probs", [False, True])
-@pytest.mark.parametrize("gather_generation_logits", [False, True])
-@pytest.mark.parametrize("gather_context_logits", [False, True])
-@pytest.mark.parametrize("sampler_type", ["TRTLLMSampler", "TorchSampler"])
-@pytest.mark.parametrize("disable_overlap_scheduler", [False, True])
-def test_generate_async_with_return_logits(disable_overlap_scheduler: bool,
-                                           sampler_type: str,
-                                           gather_context_logits: bool,
-                                           gather_generation_logits: bool,
-                                           return_log_probs: bool):
+# FIXME: sometimes LLM shutdown hangs, might be related to https://nvbugs/5577178
+# NB: Timeout covers fixtures https://github.com/pytest-dev/pytest-timeout/issues/134
+@pytest.mark.timeout(120, method="signal")
+@pytest.mark.threadleak(enabled=False)
+def test_generate_async_with_return_logits(
+    llm,
+    gather_context_logits_fixture: bool,
+    gather_generation_logits_fixture: bool,
+    reuse_cache: bool,
+    return_log_probs: bool,
+):
+    gather_context_logits = gather_context_logits_fixture
+    gather_generation_logits = gather_generation_logits_fixture
+
     if not (gather_context_logits or gather_generation_logits
             or return_log_probs):  # prune space
         pytest.skip("Nothing to test")
 
-    build_config = BuildConfig()
-    build_config.gather_context_logits = gather_context_logits
-
-    llm = LLM(
-        model=os.path.join(llm_models_root(), "llama-models-v2",
-                           "TinyLlama-1.1B-Chat-v1.0"),
-        kv_cache_config=global_kvcache_config,
-        build_config=build_config,
-        gather_generation_logits=gather_generation_logits,
-        max_batch_size=
-        128,  # reduce buffer sizes, specially for generation logits
-        sampler_type=sampler_type,
-        disable_overlap_scheduler=disable_overlap_scheduler,
-    )
     sampling_params = SamplingParams(
         max_tokens=8,
         return_context_logits=gather_context_logits,
         return_generation_logits=gather_generation_logits,
         logprobs=return_log_probs)
 
-    with llm:
-        for idx, output in enumerate(
-                llm.generate_async(prompts[0],
-                                   sampling_params=sampling_params,
-                                   streaming=True)):
-            if gather_context_logits:
-                assert output.context_logits is not None
-                # NOTE: prompt_token_ids of "A B C" becomes [1, 319, 350, 315]
-                expected_len = len(prompts[0].split()) + 1
+    for idx, output in enumerate(
+            llm.generate_async(
                prompts[0],
                sampling_params=sampling_params,
                streaming=True,
                cache_salt=CacheSalter.get_salt(reuse_cache),
            )):
+        if gather_context_logits:
+            assert output.context_logits is not None
+            # NOTE: prompt_token_ids of "A B C" becomes [1, 319, 350, 315]
+            expected_len = len(prompts[0].split()) + 1
+            try:
                 assert expected_len == output.context_logits.shape[0]
-            else:
-                assert output.context_logits is None
-
-            if gather_generation_logits:
-                gen_logits = output.outputs[0].generation_logits
-                assert gen_logits is not None
-                assert gen_logits.ndim == 2
-                assert gen_logits.shape[0] == 1
+            except AssertionError:
+                # FIXME: Remove this once the bug has been fixed
+                if gather_context_logits and reuse_cache:
+                    pytest.xfail("Known bug: https://nvbugs/5577178")
+                raise
+        else:
+            assert output.context_logits is None
+
+        if gather_generation_logits:
+            gen_logits = output.outputs[0].generation_logits
+            assert gen_logits is not None
+            assert gen_logits.ndim == 2
+            assert gen_logits.shape[0] == 1
+            try:
                 assert torch.argmax(
                     gen_logits,
                     dim=1).tolist()[0] == output.outputs[0].token_ids[-1]
-            else:
-                assert output.outputs[0].generation_logits is None
-
-            if return_log_probs:
-                assert len(output.outputs[0].logprobs) == idx + 1
-            else:
-                assert len(output.outputs[0].logprobs) == 0
+            except AssertionError:
+                # FIXME: Remove xfail once the bug is fixed
+                pytest.xfail("Known bug: https://nvbugs/5573238")
+        else:
+            assert output.outputs[0].generation_logits is None
+
+        if return_log_probs:
+            assert len(output.outputs[0].logprobs) == idx + 1
+        else:
            assert len(output.outputs[0].logprobs) == 0
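
The refactor above relies on a pytest layout worth making explicit: the expensive LLM is built in a module-scoped fixture whose own parameters (sampler type, logits gathering, overlap scheduler) determine how many engines get constructed, while cheap dimensions such as reuse_cache and return_log_probs stay as per-test parametrize marks and reuse whichever engine is already live. A stripped-down, self-contained sketch of that pattern follows; ExpensiveEngine is a hypothetical stand-in for the real LLM.

# Toy illustration of the module-scoped fixture pattern used above.
# ExpensiveEngine is a hypothetical stand-in for the real LLM object.
import pytest


class ExpensiveEngine:
    instances = 0  # counts how many engines were actually constructed

    def __init__(self, flag: bool):
        type(self).instances += 1
        self.flag = flag


@pytest.fixture(scope="module", params=[False, True])
def flag_fixture(request) -> bool:
    # Module-scoped parameter: one fixture instance per parameter value.
    return request.param


@pytest.fixture(scope="module")
def engine(flag_fixture: bool) -> ExpensiveEngine:
    # Built once per flag value and shared by every test in the module,
    # regardless of the tests' own parametrize marks.
    return ExpensiveEngine(flag_fixture)


@pytest.mark.parametrize("cheap_option", [False, True])
def test_uses_engine(engine: ExpensiveEngine, cheap_option: bool):
    # Four test invocations (2 flag values x 2 cheap options), but at most
    # two ExpensiveEngine constructions thanks to the module scope.
    assert isinstance(engine, ExpensiveEngine)
    assert ExpensiveEngine.instances <= 2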

tests/unittest/api_stability/references/llm.yaml (3 additions, 0 deletions)

@@ -199,6 +199,9 @@ methods:
       scheduling_params:
         annotation: Union[tensorrt_llm.scheduling_params.SchedulingParams, List[tensorrt_llm.scheduling_params.SchedulingParams], NoneType]
         default: null
+      cache_salt:
+        annotation: Union[str, Sequence[str], NoneType]
+        default: null
     return_annotation: Union[tensorrt_llm.llmapi.llm.RequestOutput, List[tensorrt_llm.llmapi.llm.RequestOutput]]
   generate_async:
     parameters:
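
The reference above records cache_salt on generate; the new test also passes it through generate_async with streaming enabled. A hedged sketch of that path follows: the checkpoint identifier and the salt value are assumptions, and the call shape mirrors the test rather than documenting a guaranteed API surface.

# Hypothetical streaming sketch mirroring test_generate_async_with_return_logits.
from tensorrt_llm import LLM, SamplingParams

llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")  # assumed checkpoint identifier
sampling_params = SamplingParams(max_tokens=8)

# Reusing the same salt across requests allows their KV cache blocks to be
# shared; a unique salt per request keeps its cached blocks private.
for output in llm.generate_async("A B C",
                                 sampling_params=sampling_params,
                                 streaming=True,
                                 cache_salt="session-42"):
    print(output.outputs[0].text)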
