[Core] Add a random suffix to frontend-provided request IDs (#27987)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Mark McLoughlin 2025-12-23 21:05:39 +00:00 committed by GitHub
parent 34916ae37f
commit f790068600
16 changed files with 328 additions and 154 deletions
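
As a minimal, hedged sketch of what this change does (the helper below is illustrative and not vLLM API; it only mirrors the InputProcessor.assign_request_id() logic and the external-to-internal bookkeeping added in the diff that follows):

import uuid
from collections import defaultdict

def assign_internal_id(frontend_request_id: str) -> tuple[str, str]:
    """Keep the frontend-provided ID as the external ID and derive an
    internal ID by appending 8 random hex characters, so duplicate
    frontend IDs never collide inside the engine."""
    external_id = frontend_request_id
    internal_id = f"{external_id}-{uuid.uuid4().hex[:8]}"
    return internal_id, external_id

# External -> [internal, ...] mapping, analogous to what OutputProcessor
# tracks so that abort(request_id, internal=False) can fan out to every
# request that reused the same frontend ID.
external_to_internal: defaultdict[str, list[str]] = defaultdict(list)

internal_id, external_id = assign_internal_id("chatcmpl-123")
external_to_internal[external_id].append(internal_id)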

View File

@ -260,7 +260,7 @@ async def test_multi_abort(output_kind: RequestOutputKind):
# Use multi-abort to abort multiple requests at once
abort_request_ids = [request_ids[i] for i in REQUEST_IDS_TO_ABORT]
await engine.abort(abort_request_ids)
await engine.abort(abort_request_ids, internal=False)
# Wait for all tasks to complete
results = await asyncio.gather(*tasks, return_exceptions=True)
@ -609,7 +609,7 @@ async def test_abort_final_output(output_kind: RequestOutputKind):
await asyncio.sleep(0.5)
# Abort the request
await engine.abort(request_id)
await engine.abort(request_id, internal=False)
# Wait for generation to complete and return final output
final_output = await generated

View File

@ -40,10 +40,16 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
PROMPT = "I am Gyoubu Masataka Oniwa"
PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
_REQUEST_COUNTER = 0
def make_request() -> EngineCoreRequest:
global _REQUEST_COUNTER
_REQUEST_COUNTER += 1
request_id = f"request-{_REQUEST_COUNTER}"
return EngineCoreRequest(
request_id=str(uuid.uuid4()),
request_id=request_id,
external_req_id=f"{request_id}-{uuid.uuid4()}",
prompt_token_ids=PROMPT_TOKENS,
mm_features=None,
sampling_params=SamplingParams(),

View File

@ -45,6 +45,8 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
PROMPT = "Hello my name is Robert and I love quantization kernels"
PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
_REQUEST_COUNTER = 0
def make_request(
params: SamplingParams, prompt_tokens_ids: list[int] | None = None
@ -52,8 +54,12 @@ def make_request(
if not prompt_tokens_ids:
prompt_tokens_ids = PROMPT_TOKENS
global _REQUEST_COUNTER
_REQUEST_COUNTER += 1
request_id = f"request-{_REQUEST_COUNTER}"
return EngineCoreRequest(
request_id=str(uuid.uuid4()),
request_id=request_id,
external_req_id=f"{request_id}-{uuid.uuid4()}",
prompt_token_ids=prompt_tokens_ids,
mm_features=None,
sampling_params=params,

View File

@ -27,6 +27,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
params = SamplingParams(skip_special_tokens=True)
request = EngineCoreRequest(
request_id="test",
external_req_id="test-ext",
prompt_token_ids=prompt_token_ids,
mm_features=None,
sampling_params=params,

View File

@ -58,12 +58,12 @@ def test_incremental_detokenization(
output_processor = OutputProcessor(
dummy_test_vectors.tokenizer, log_stats=False, stream_interval=stream_interval
)
engine_core = MockEngineCore(tokens_list=dummy_test_vectors.generation_tokens)
# Make N requests.
requests = [
EngineCoreRequest(
request_id=f"request-{idx}",
request_id=f"request-{idx}-int",
external_req_id=f"request-{idx}",
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=None,
@ -83,6 +83,11 @@ def test_incremental_detokenization(
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
request_ids=[req.request_id for req in requests],
)
# Add requests to the detokenizer.
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
output_processor.add_request(request, prompt)
@ -438,15 +443,6 @@ def test_logprobs_processor(
dummy_test_vectors,
):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=None
if num_sample_logprobs is None
else dummy_test_vectors.generation_logprobs,
prompt_logprobs_raw=None
if num_prompt_logprobs is None
else dummy_test_vectors.prompt_logprobs,
)
# Make N requests.
request_id_list = [
@ -454,7 +450,8 @@ def test_logprobs_processor(
]
requests = [
EngineCoreRequest(
request_id=request_id_list[idx],
request_id=request_id_list[idx] + "-int",
external_req_id=request_id_list[idx],
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=None,
@ -476,6 +473,17 @@ def test_logprobs_processor(
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=None
if num_sample_logprobs is None
else dummy_test_vectors.generation_logprobs,
prompt_logprobs_raw=None
if num_prompt_logprobs is None
else dummy_test_vectors.prompt_logprobs,
request_ids=[req.request_id for req in requests],
)
# Add requests to the detokenizer.
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
output_processor.add_request(request, prompt)
@ -621,19 +629,12 @@ def test_stop_token(
]
prompt_string = dummy_test_vectors.prompt_strings[0]
prompt_tokens = dummy_test_vectors.prompt_tokens[0]
engine_core = MockEngineCore(
tokens_list=[generation_tokens],
generated_logprobs_raw=[generation_logprobs] if do_logprobs else None,
prompt_logprobs_raw=None,
eos_token_id=eos_token_id,
stop_token_ids=stop_token_ids,
ignore_eos=ignore_eos,
)
# Make request.
request_id = "request-0"
request = EngineCoreRequest(
request_id=request_id,
external_req_id=request_id + "-ext",
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=eos_token_id,
@ -655,6 +656,16 @@ def test_stop_token(
pooling_params=None,
)
engine_core = MockEngineCore(
tokens_list=[generation_tokens],
generated_logprobs_raw=[generation_logprobs] if do_logprobs else None,
prompt_logprobs_raw=None,
eos_token_id=eos_token_id,
stop_token_ids=stop_token_ids,
ignore_eos=ignore_eos,
request_ids=[request.request_id],
)
# Add request to the detokenizer.
output_processor.add_request(request, prompt_string)
@ -720,13 +731,6 @@ def test_stop_string(
dummy_test_vectors,
):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=dummy_test_vectors.generation_logprobs
if num_sample_logprobs
else None,
prompt_logprobs_raw=None,
)
# Make N requests.
request_id_list = [
@ -734,7 +738,8 @@ def test_stop_string(
]
requests = [
EngineCoreRequest(
request_id=request_id_list[idx],
request_id=request_id_list[idx] + "-int",
external_req_id=request_id_list[idx],
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=None,
@ -756,6 +761,15 @@ def test_stop_string(
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=dummy_test_vectors.generation_logprobs
if num_sample_logprobs
else None,
prompt_logprobs_raw=None,
request_ids=[req.request_id for req in requests],
)
# Add requests to the detokenizer.
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
output_processor.add_request(request, prompt)
@ -813,9 +827,12 @@ def test_stop_string(
for idx, (ref_gen_str, stop_str) in enumerate(
zip(dummy_test_vectors.generation_strings, STOP_STRINGS)
):
# Request should be aborted.
# Request should be aborted (check internal ID in abort list).
internal_request_id = f"request-{idx}-int"
assert internal_request_id in aborted
# Use external ID for collecting outputs
request_id = f"request-{idx}"
assert request_id in aborted
# Collected values that were generated.
gen_str = gen_strings[request_id]
@ -848,13 +865,13 @@ def test_stop_string(
def test_iteration_stats(dummy_test_vectors):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
engine_core_timestamp = time.monotonic()
# Make N requests.
requests = [
EngineCoreRequest(
request_id=f"request-{idx}",
external_req_id=f"request-{idx}-ext",
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=None,
@ -868,6 +885,11 @@ def test_iteration_stats(dummy_test_vectors):
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]
engine_core = MockEngineCore(
dummy_test_vectors.generation_tokens,
request_ids=[req.request_id for req in requests],
)
# Add all requests except one to the OutputProcessor.
num_active = len(dummy_test_vectors.generation_tokens) - 1
for request in requests[:num_active]:
@ -922,7 +944,6 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
output_processor = OutputProcessor(
dummy_test_vectors.tokenizer, log_stats=log_stats
)
engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
engine_core_timestamp = time.monotonic()
# Create LoRA requests
@ -936,7 +957,8 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
lora_assignments = [lora1, lora2, None]
requests = [
EngineCoreRequest(
request_id=f"request-{idx}",
request_id=f"request-{idx}-int",
external_req_id=f"request-{idx}",
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=None,
@ -950,6 +972,11 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]
engine_core = MockEngineCore(
dummy_test_vectors.generation_tokens,
request_ids=[req.request_id for req in requests],
)
# Add all requests to the OutputProcessor
for request in requests:
output_processor.add_request(request, None)
@ -1015,9 +1042,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
outputs = EngineCoreOutputs(
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
)
# Find and mark request-0 as finished (it uses lora-1)
# Find and mark request-0-int as finished (it uses lora-1)
for output in outputs.outputs:
if output.request_id == "request-0":
if output.request_id == "request-0-int":
output.finish_reason = FinishReason.LENGTH
break
@ -1040,9 +1067,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
outputs = EngineCoreOutputs(
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
)
# Find and mark request-1 as finished (it uses lora-2)
# Find and mark request-1-int as finished (it uses lora-2)
for output in outputs.outputs:
if output.request_id == "request-1":
if output.request_id == "request-1-int":
output.finish_reason = FinishReason.LENGTH
break
@ -1064,9 +1091,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
outputs = EngineCoreOutputs(
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
)
# Find and mark request-2 as finished (it has no LoRA)
# Find and mark request-2-int as finished (it has no LoRA)
for output in outputs.outputs:
if output.request_id == "request-2":
if output.request_id == "request-2-int":
output.finish_reason = FinishReason.LENGTH
break
@ -1107,7 +1134,9 @@ async def test_request_output_collector():
for idx in range(NUM_REQS)
]
collector = RequestOutputCollector(RequestOutputKind.DELTA)
collector = RequestOutputCollector(
RequestOutputKind.DELTA, request_id="my-request-id-int"
)
# CASE 1: Put then get.
outputs = make_outputs()
@ -1163,7 +1192,9 @@ async def test_request_output_collector():
@pytest.mark.asyncio
async def test_cumulative_output_collector_n():
"""Test collector correctly handles multiple outputs by index."""
collector = RequestOutputCollector(RequestOutputKind.CUMULATIVE)
collector = RequestOutputCollector(
RequestOutputKind.CUMULATIVE, request_id="my-request-id-int"
)
outputs = [
RequestOutput(
request_id="my-request-id",
@ -1242,11 +1273,13 @@ async def test_cumulative_output_collector_n():
@pytest.mark.parametrize("runner", ["generate", "pooling"])
def test_abort_requests(runner: str, dummy_test_vectors):
@pytest.mark.parametrize("abort_by", ["internal", "external"])
def test_abort_requests(runner: str, abort_by: str, dummy_test_vectors):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
requests = [
EngineCoreRequest(
request_id=f"request-{idx}",
external_req_id=f"external-{idx}",
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=None,
@ -1265,8 +1298,13 @@ def test_abort_requests(runner: str, dummy_test_vectors):
output_kind = request.sampling_params.output_kind
else:
output_kind = request.pooling_params.output_kind
queue = RequestOutputCollector(output_kind=output_kind)
queue = RequestOutputCollector(
output_kind=output_kind, request_id=request.request_id
)
output_processor.add_request(request, None, queue=queue)
for request in requests:
output_processor.abort_requests([request.request_id])
if abort_by == "internal":
output_processor.abort_requests([request.request_id], internal=True)
else:
output_processor.abort_requests([request.external_req_id], internal=False)

View File

@ -4,11 +4,12 @@
from vllm import SamplingParams
from vllm.outputs import CompletionOutput
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.parallel_sampling import ParentRequest
def test_parent_request_to_output_stream() -> None:
parent_request = ParentRequest("parent_id", SamplingParams(n=2))
parent_request = ParentRequest(make_request(SamplingParams(n=2)))
parent_request.child_requests = {"child_id_0", "child_id_1"}
output_0 = CompletionOutput(
index=0, text="child 0", token_ids=[], cumulative_logprob=None, logprobs=None
@ -17,51 +18,31 @@ def test_parent_request_to_output_stream() -> None:
index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
)
# Request not finished
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
"child_id_0", output_0
)
assert ("parent_id", [output_1], False) == parent_request.get_outputs(
"child_id_1", output_1
)
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
"child_id_0", output_0
)
assert ("parent_id", [output_1], False) == parent_request.get_outputs(
"child_id_1", output_1
)
assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
# output_1 finished
output_1.finish_reason = "ended"
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
"child_id_0", output_0
)
assert ("parent_id", [output_1], False) == parent_request.get_outputs(
"child_id_1", output_1
)
assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
# Finished output_1 has already been returned; DO NOT return it again
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
"child_id_0", output_0
)
assert parent_request.get_outputs("child_id_1", output_1) == (
"parent_id",
[],
False,
)
assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
# output_0 finished
output_0.finish_reason = "ended"
assert ("parent_id", [output_0], True) == parent_request.get_outputs(
"child_id_0", output_0
)
assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True)
assert ([output_0], True) == parent_request.get_outputs("child_id_0", output_0)
assert parent_request.get_outputs("child_id_1", output_1) == ([], True)
# Finished output_0 has already been returned; DO NOT return it again
assert parent_request.get_outputs("child_id_0", output_0) == ("parent_id", [], True)
assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True)
assert parent_request.get_outputs("child_id_0", output_0) == ([], True)
assert parent_request.get_outputs("child_id_1", output_1) == ([], True)
def test_parent_request_to_output_final_only() -> None:
parent_request = ParentRequest(
"parent_id", SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY)
make_request(SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY))
)
parent_request.child_requests = {"child_id_0", "child_id_1"}
output_0 = CompletionOutput(
@ -71,33 +52,33 @@ def test_parent_request_to_output_final_only() -> None:
index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
)
# Request not finished, return nothing
assert parent_request.get_outputs("child_id_0", output_0) == (
"parent_id",
[],
False,
)
assert parent_request.get_outputs("child_id_1", output_1) == (
"parent_id",
[],
False,
)
assert parent_request.get_outputs("child_id_0", output_0) == ([], False)
assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
# output_1 finished, but outputs won't be returned until all child requests finished
output_1.finish_reason = "ended"
assert parent_request.get_outputs("child_id_0", output_0) == (
"parent_id",
[],
False,
)
assert parent_request.get_outputs("child_id_1", output_1) == (
"parent_id",
[],
False,
)
assert parent_request.get_outputs("child_id_0", output_0) == ([], False)
assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
# output_0 finished; as all child requests have finished, the outputs are returned
output_0.finish_reason = "ended"
assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs(
assert ([output_0, output_1], True) == parent_request.get_outputs(
"child_id_0", output_0
)
assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs(
assert ([output_0, output_1], True) == parent_request.get_outputs(
"child_id_1", output_1
)
def make_request(sampling_params: SamplingParams) -> EngineCoreRequest:
return EngineCoreRequest(
request_id="parent_id",
external_req_id="ext_parent_id",
prompt_token_ids=None,
mm_features=None,
sampling_params=sampling_params,
pooling_params=None,
eos_token_id=None,
arrival_time=0.0,
lora_request=None,
cache_salt=None,
data_parallel_rank=None,
)

View File

@ -6,6 +6,7 @@ import pytest
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
from vllm.multimodal import MultiModalUUIDDict
from vllm.sampling_params import SamplingParams
from vllm.v1.engine import input_processor as input_processor_mod
from vllm.v1.engine.input_processor import InputProcessor
@ -166,7 +167,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False
)
captured: dict[str, object] = {}
captured: dict[str, MultiModalUUIDDict] = {}
def fake_preprocess(
prompt, *, tokenization_kwargs=None, lora_request=None, mm_uuids=None
@ -196,7 +197,16 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
)
# Expect request-id-based overrides are passed through
assert captured["mm_uuids"] == {
"image": [f"{request_id}-image-0", f"{request_id}-image-1"],
"video": [f"{request_id}-video-0"],
}
mm_uuids = captured["mm_uuids"]
assert set(mm_uuids.keys()) == {"image", "video"}
assert len(mm_uuids["image"]) == 2
assert len(mm_uuids["video"]) == 1
assert mm_uuids["image"][0].startswith(f"{request_id}-image-") and mm_uuids[
"image"
][0].endswith("-0")
assert mm_uuids["image"][1].startswith(f"{request_id}-image-") and mm_uuids[
"image"
][1].endswith("-1")
assert mm_uuids["video"][0].startswith(f"{request_id}-video-") and mm_uuids[
"video"
][0].endswith("-0")

View File

@ -343,6 +343,7 @@ class MockEngineCore:
eos_token_id: int | None = None,
stop_token_ids: list[int] | None = None,
ignore_eos: bool = False,
request_ids: list[str] | None = None,
) -> None:
self.num_requests = len(tokens_list)
self.tokens_list = tokens_list
@ -355,6 +356,11 @@ class MockEngineCore:
self.eos_token_id = eos_token_id
self.stop_token_ids = stop_token_ids
self.ignore_eos = ignore_eos
self.request_ids = (
request_ids
if request_ids is not None
else [f"request-{i}" for i in range(self.num_requests)]
)
def get_outputs(self) -> list[EngineCoreOutput]:
do_logprobs = self.do_logprobs
@ -386,7 +392,7 @@ class MockEngineCore:
prompt_logprobs = None
new_token_id = token_ids[token_idx]
output = EngineCoreOutput(
request_id=f"request-{req_idx}",
request_id=self.request_ids[req_idx],
new_token_ids=[new_token_id],
new_logprobs=logprobs,
new_prompt_logprobs_tensors=prompt_logprobs,

View File

@ -41,10 +41,13 @@ from vllm.distributed.kv_transfer.kv_transfer_state import (
has_kv_transfer_group,
)
from vllm.forward_context import ForwardContext
from vllm.outputs import RequestOutput
from vllm.platforms import current_platform
from vllm.platforms.interface import Platform
from vllm.sampling_params import SamplingParams
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.output_processor import OutputProcessor
from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
from vllm.v1.request import RequestStatus
@ -1265,6 +1268,22 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
run_test_and_cleanup()
class RequestIdMapper:
"""Helper class to map external request IDs to internal request IDs."""
def __init__(self, output_processor: OutputProcessor):
self.req_id_mapping: dict[str, str] = {}
self.original_add_request = output_processor.add_request
output_processor.add_request = self._add_request
def _add_request(self, request: EngineCoreRequest, *args, **kwargs):
self.req_id_mapping[request.external_req_id] = request.request_id
return self.original_add_request(request, *args, **kwargs)
def __call__(self, external_req_id: str) -> str:
return self.req_id_mapping[external_req_id]
def _run_abort_timeout_test(llm: LLM, timeout: int):
"""Helper function to run the abort timeout test logic."""
remote_prefill_opts = {
@ -1286,24 +1305,34 @@ def _run_abort_timeout_test(llm: LLM, timeout: int):
0
].req_to_blocks
id_mapper = RequestIdMapper(llm.llm_engine.output_processor)
def req_id(outputs: list[RequestOutput]) -> str:
assert len(outputs) == 1
return id_mapper(outputs[0].request_id)
padding = "Just making this request a little longer so that we're sure "
"we're not hitting the small-request lower bound beneath which we don't "
"actually trigger the whole kv transfer, but rather just recompute the "
"blocks on D."
_ = llm.generate([f"What is the capital of Japan? {padding}"], sampling_params)
req0_id = req_id(
llm.generate([f"What is the capital of Japan? {padding}"], sampling_params)
)
# Request finished but not freed
assert "0" in scheduler.finished_req_ids and "0" in req_to_blocks
assert req0_id in scheduler.finished_req_ids and req0_id in req_to_blocks
# Some other request, 0 still not freed
_ = llm.generate([f"What is the capital of Italy? {padding}"], sampling_params)
assert "0" in req_to_blocks
assert "1" in scheduler.finished_req_ids and "1" in req_to_blocks
req1_id = req_id(
llm.generate([f"What is the capital of Italy? {padding}"], sampling_params)
)
assert req0_id in req_to_blocks
assert req1_id in scheduler.finished_req_ids and req1_id in req_to_blocks
# Wait for timeout and trigger another scheduler loop
time.sleep(timeout)
_ = llm.generate([f"What is the capital of France? {padding}"], sampling_params)
# Request-0 times out and is cleared!
assert "0" not in req_to_blocks
assert req0_id not in req_to_blocks
# Need to shutdown the background thread to release NIXL side channel port
llm.llm_engine.engine_core.shutdown()

View File

@ -1621,7 +1621,7 @@ class LLM:
added_request_ids.append(request_id)
except Exception as e:
if added_request_ids:
self.llm_engine.abort_request(added_request_ids)
self.llm_engine.abort_request(added_request_ids, internal=True)
raise e
def _validate_mm_data_and_uuids(
@ -1731,7 +1731,7 @@ class LLM:
priority=priority,
prompt_text=prompt_text,
)
return request_id
return engine_request.request_id
def _run_engine(
self, *, use_tqdm: bool | Callable[..., tqdm] = True

View File

@ -75,6 +75,12 @@ class EngineCoreRequest(
trace_headers: Mapping[str, str] | None = None
# The user-provided request ID. This field is set internally by
# InputProcessor.assign_request_id(), which copies the originally supplied
# request_id here before replacing request_id with a suffixed internal ID.
# Used in outputs and to support abort(req_id, internal=False).
external_req_id: str | None = None
@property
def params(self) -> SamplingParams | PoolingParams:
"""Return the processed params (sampling or pooling)."""

View File

@ -290,12 +290,15 @@ class AsyncLLM(EngineClient):
is_pooling = isinstance(params, PoolingParams)
# Create a new output collector for the request.
queue = RequestOutputCollector(output_kind=params.output_kind)
# Convert Input --> Request.
if isinstance(prompt, EngineCoreRequest):
request = prompt
if request_id != request.request_id:
logger.warning_once(
"AsyncLLM.add_request() was passed a request_id parameter that "
"does not match the EngineCoreRequest.request_id attribute. The "
"latter will be used, and the former will be ignored."
)
else:
assert prompt_text is None
request = self.input_processor.process_inputs(
@ -314,6 +317,11 @@ class AsyncLLM(EngineClient):
elif isinstance(prompt, Mapping):
prompt_text = cast(str | None, prompt.get("prompt"))
self.input_processor.assign_request_id(request)
# Create a new output collector for the request.
queue = RequestOutputCollector(params.output_kind, request.request_id)
# Use cloned params that may have been updated in process_inputs()
params = request.params
@ -325,7 +333,7 @@ class AsyncLLM(EngineClient):
assert isinstance(parent_params, SamplingParams)
# Fan out child requests (for n>1).
parent_request = ParentRequest(request_id, parent_params)
parent_request = ParentRequest(request)
for idx in range(parent_params.n):
request_id, child_params = parent_request.get_child_info(idx)
child_request = request if idx == parent_params.n - 1 else copy(request)
@ -396,6 +404,7 @@ class AsyncLLM(EngineClient):
"prompt logprobs"
)
q: RequestOutputCollector | None = None
try:
# We start the output_handler on the first call to generate() so
# we can call __init__ before the event loop, which enables us
@ -446,7 +455,8 @@ class AsyncLLM(EngineClient):
# is cancelled or the generator is garbage collected. So,
# we abort the request if we end up here.
except (asyncio.CancelledError, GeneratorExit):
await self.abort(request_id)
if q is not None:
await self.abort(q.request_id, internal=True)
if self.log_requests:
logger.info("Request %s aborted.", request_id)
raise
@ -465,7 +475,8 @@ class AsyncLLM(EngineClient):
# Unexpected error in the generate() task (possibly recoverable).
except Exception as e:
await self.abort(request_id)
if q is not None:
await self.abort(q.request_id, internal=True)
if self.log_requests:
logger.info("Request %s failed.", request_id)
raise EngineGenerateError() from e
@ -541,13 +552,15 @@ class AsyncLLM(EngineClient):
self.output_handler = asyncio.create_task(output_handler())
async def abort(self, request_id: str | Iterable[str]) -> None:
async def abort(
self, request_id: str | Iterable[str], internal: bool = False
) -> None:
"""Abort RequestId in OutputProcessor and EngineCore."""
request_ids = (
(request_id,) if isinstance(request_id, str) else as_list(request_id)
)
all_request_ids = self.output_processor.abort_requests(request_ids)
all_request_ids = self.output_processor.abort_requests(request_ids, internal)
await self.engine_core.abort_requests_async(all_request_ids)
if self.log_requests:
@ -581,7 +594,7 @@ class AsyncLLM(EngineClient):
if not wait_for_inflight_requests:
request_ids = list(self.output_processor.request_states.keys())
if request_ids:
await self.abort(request_ids)
await self.abort(request_ids, internal=True)
# Wait for running requests to drain before clearing cache.
if self.output_processor.has_unfinished_requests():
@ -633,6 +646,7 @@ class AsyncLLM(EngineClient):
TODO: Remove truncate_prompt_tokens in v0.15.
"""
q: RequestOutputCollector | None = None
try:
# We start the output_handler on the first call to generate() so
# we can call __init__ before the event loop, which enables us
@ -687,7 +701,8 @@ class AsyncLLM(EngineClient):
# If the request is disconnected by the client, generate()
# is cancelled. So, we abort the request if we end up here.
except asyncio.CancelledError:
await self.abort(request_id)
if q is not None:
await self.abort(q.request_id, internal=True)
if self.log_requests:
logger.info("Request %s aborted.", request_id)
raise
@ -706,7 +721,8 @@ class AsyncLLM(EngineClient):
# Unexpected error in the generate() task (possibly recoverable).
except Exception as e:
await self.abort(request_id)
if q is not None:
await self.abort(q.request_id, internal=True)
if self.log_requests:
logger.info("Request %s failed.", request_id)
raise EngineGenerateError() from e

View File

@ -21,7 +21,7 @@ from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils import length_from_prompt_token_ids_or_embeds
from vllm.utils import length_from_prompt_token_ids_or_embeds, random_uuid
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.metrics.stats import MultiModalCacheStats
from vllm.v1.structured_output.backend_guidance import (
@ -406,6 +406,19 @@ class InputProcessor:
mm_uuids[modality] = [f"{request_id}-{modality}-{i}" for i in range(n)]
return mm_uuids
@staticmethod
def assign_request_id(request: EngineCoreRequest):
"""Replace the externally supplied request ID with an internal request ID
that appends 8 random characters in order to ensure uniqueness.
"""
if request.external_req_id is not None:
raise ValueError(
"The external_req_id field should not be set on EngineCoreRequests"
" passed to vLLM; use the request_id field."
)
request.external_req_id = request.request_id
request.request_id = f"{request.external_req_id}-{random_uuid():.8}"
def process_inputs(
self,
request_id: str,

View File

@ -213,10 +213,10 @@ class LLMEngine:
def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
return self.engine_core.get_supported_tasks()
def abort_request(self, request_ids: list[str]) -> None:
def abort_request(self, request_ids: list[str], internal: bool = False) -> None:
"""Remove request_ids from EngineCore and Detokenizer."""
request_ids = self.output_processor.abort_requests(request_ids)
request_ids = self.output_processor.abort_requests(request_ids, internal)
self.engine_core.abort_requests(request_ids)
def add_request(
@ -238,6 +238,12 @@ class LLMEngine:
# Process raw inputs into the request.
if isinstance(prompt, EngineCoreRequest):
request = prompt
if request_id != request.request_id:
logger.warning_once(
"AsyncLLM.add_request() was passed a request_id parameter that "
"does not match the EngineCoreRequest.request_id attribute. The "
"latter will be used, and the former will be ignored."
)
else:
assert prompt_text is None
request = self.input_processor.process_inputs(
@ -255,6 +261,8 @@ class LLMEngine:
elif isinstance(prompt, Mapping):
prompt_text = cast(str | None, prompt.get("prompt"))
self.input_processor.assign_request_id(request)
# Use cloned params that may have been updated in process_inputs()
params = request.params
@ -268,7 +276,7 @@ class LLMEngine:
return
# Fan out child requests (for n>1).
parent_req = ParentRequest(request_id, params)
parent_req = ParentRequest(request)
for idx in range(n):
request_id, child_params = parent_req.get_child_info(idx)
child_request = request if idx == n - 1 else copy(request)

View File

@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
from collections import defaultdict
from collections.abc import Iterable
from dataclasses import dataclass
from typing import Any, cast
@ -40,8 +41,9 @@ class RequestOutputCollector:
producer gets ahead of the consumer.
"""
def __init__(self, output_kind: RequestOutputKind):
def __init__(self, output_kind: RequestOutputKind, request_id: str):
self.aggregate = output_kind == RequestOutputKind.DELTA
self.request_id = request_id
self.output: RequestOutput | PoolingRequestOutput | Exception | None = None
self.ready = asyncio.Event()
@ -92,6 +94,7 @@ class RequestState:
def __init__(
self,
request_id: str,
external_req_id: str,
parent_req: ParentRequest | None,
request_index: int,
lora_request: LoRARequest | None,
@ -111,6 +114,7 @@ class RequestState:
temperature: float | None = None,
):
self.request_id = request_id
self.external_req_id = external_req_id
self.parent_req = parent_req
self.request_index = request_index
self.lora_request = lora_request
@ -176,8 +180,10 @@ class RequestState:
assert request.pooling_params is not None
output_kind = request.pooling_params.output_kind
assert request.external_req_id is not None
return cls(
request_id=request.request_id,
external_req_id=request.external_req_id,
parent_req=parent_req,
request_index=request_index,
lora_request=request.lora_request,
@ -235,10 +241,13 @@ class RequestState:
]
self.sent_tokens_offset = len(self.detokenizer.output_token_ids)
request_id = self.request_id
external_req_id = self.external_req_id
if pooling_output is not None:
return self._new_request_output(
request_id, [self._new_pooling_output(pooling_output)], finished
external_req_id,
[self._new_pooling_output(pooling_output)],
finished,
)
output = self._new_completion_output(new_token_ids, finish_reason, stop_reason)
@ -246,19 +255,18 @@ class RequestState:
if self.parent_req is None:
outputs = [output]
else:
request_id, outputs, finished = self.parent_req.get_outputs(
request_id, output
)
outputs, finished = self.parent_req.get_outputs(self.request_id, output)
if not outputs:
return None
external_req_id = self.parent_req.external_req_id
return self._new_request_output(
request_id, outputs, finished, kv_transfer_params
external_req_id, outputs, finished, kv_transfer_params
)
def _new_request_output(
self,
request_id: str,
external_req_id: str,
outputs: list[CompletionOutput] | list[PoolingOutput],
finished: bool,
kv_transfer_params: dict[str, Any] | None = None,
@ -269,7 +277,7 @@ class RequestState:
# Prompt embeddings are currently not supported by pooling requests.
assert self.prompt_token_ids is not None
return PoolingRequestOutput(
request_id=request_id,
request_id=external_req_id,
outputs=first_output,
num_cached_tokens=self.num_cached_tokens,
prompt_token_ids=self.prompt_token_ids,
@ -288,7 +296,7 @@ class RequestState:
prompt_token_ids = [0] * len(self.prompt_embeds)
return RequestOutput(
request_id=request_id,
request_id=external_req_id, # request_id is what was provided externally
lora_request=self.lora_request,
prompt=self.prompt,
prompt_token_ids=prompt_token_ids,
@ -352,6 +360,7 @@ class OutputProcessor:
self.stream_interval = stream_interval
self.request_states: dict[str, RequestState] = {}
self.parent_requests: dict[str, ParentRequest] = {}
self.external_req_ids: defaultdict[str, list[str]] = defaultdict(list)
self.lora_states = LoRARequestStates(log_stats)
self.tracer: Tracer | None = None
self._requests_drained = asyncio.Event()
@ -375,12 +384,41 @@ class OutputProcessor:
assert state.queue is not None
state.queue.put(e)
def abort_requests(
self,
request_ids: Iterable[str],
) -> list[str]:
request_ids_to_abort = []
def abort_requests(self, request_ids: Iterable[str], internal: bool) -> list[str]:
"""Abort a list of requests.
The request_ids may be either external request IDs (those passed to
InputProcessor.process_inputs()) or internal request IDs (those assigned
by InputProcessor.assign_request_id() when the request enters the engine).
If an external request ID is provided, and that external request ID
was used for multiple requests, all requests associated with that external
request ID are aborted.
In the case of parallel sampling, a request ID may identify a parent
request, in which case the associated child requests are also aborted.
"""
internal_req_ids = []
for request_id in request_ids:
if internal:
# Internal ID - this may be a parent request
internal_req_ids.append(request_id)
# Remove internal ID from the external->internal mapping
if req_state := self.request_states.get(request_id):
external_req_id = req_state.external_req_id
internal_ids = self.external_req_ids[external_req_id]
internal_ids.remove(request_id)
if not internal_ids:
del self.external_req_ids[external_req_id]
elif internal_ids := self.external_req_ids.pop(request_id, []):
# External ID - abort all requests in the external->internal mapping
internal_req_ids.extend(internal_ids)
request_ids_to_abort = []
for request_id in internal_req_ids:
req_state = self.request_states.pop(request_id, None)
if req_state is not None:
self.lora_states.request_finished(request_id, req_state.lora_name)
@ -404,7 +442,7 @@ class OutputProcessor:
# Abort children prior to removing the parent.
if parent.child_requests:
child_reqs = list(parent.child_requests)
child_reqs = self.abort_requests(child_reqs)
child_reqs = self.abort_requests(child_reqs, internal=True)
request_ids_to_abort.extend(child_reqs)
self.parent_requests.pop(request_id, None)
if not self.request_states:
@ -439,6 +477,9 @@ class OutputProcessor:
if parent_req:
self.parent_requests[parent_req.request_id] = parent_req
# Track the external_req_id -> [internal_req_id, ...] mapping
self.external_req_ids[req_state.external_req_id].append(request_id)
def process_outputs(
self,
engine_core_outputs: list[EngineCoreOutput],
@ -522,6 +563,12 @@ class OutputProcessor:
# Free completed requests.
if finish_reason is not None:
self.request_states.pop(req_id)
internal_ids = self.external_req_ids[req_state.external_req_id]
internal_ids.remove(req_id)
if not internal_ids:
del self.external_req_ids[req_state.external_req_id]
# Remove parent request if applicable.
parent_req = req_state.parent_req
if parent_req and not parent_req.child_requests:
@ -597,7 +644,9 @@ class OutputProcessor:
)
# meta
span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID, req_state.request_id)
span.set_attribute(
SpanAttributes.GEN_AI_REQUEST_ID, req_state.external_req_id
)
if req_state.top_p:
span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, req_state.top_p)
if req_state.max_tokens_param:
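
To make the abort bookkeeping above concrete, here is a small hedged sketch (a toy model, not the vLLM implementation; the class and helper names are hypothetical) of how an external ID that was reused for several requests fans out to all of its internal IDs, while an internal ID aborts only itself:

from collections import defaultdict

class AbortIndex:
    """Toy model of the OutputProcessor's external_req_ids mapping."""

    def __init__(self) -> None:
        self.external_to_internal: defaultdict[str, list[str]] = defaultdict(list)

    def add(self, internal_id: str, external_id: str) -> None:
        self.external_to_internal[external_id].append(internal_id)

    def resolve_abort(self, request_id: str, internal: bool) -> list[str]:
        if internal:
            # Internal IDs are unique, so abort exactly that request.
            return [request_id]
        # An external ID may map to several internal requests; abort them all.
        return self.external_to_internal.pop(request_id, [])

index = AbortIndex()
index.add("req-a-1b2c3d4e", "req-a")
index.add("req-a-9f8e7d6c", "req-a")  # the same frontend ID, reused
assert index.resolve_abort("req-a", internal=False) == [
    "req-a-1b2c3d4e",
    "req-a-9f8e7d6c",
]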

View File

@ -6,6 +6,7 @@ from typing import Optional, cast
from vllm.outputs import CompletionOutput
from vllm.sampling_params import RequestOutputKind, SamplingParams
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.metrics.stats import IterationStats
@ -17,6 +18,7 @@ class ParentRequest:
"""
request_id: str
external_req_id: str
sampling_params: SamplingParams
# To track the completion of child requests
@ -31,8 +33,11 @@ class ParentRequest:
# To efficiently obtain child sampling params
cached_child_sampling_params: SamplingParams | None
def __init__(self, request_id: str, sampling_params: SamplingParams) -> None:
self.request_id = request_id
def __init__(self, request: EngineCoreRequest) -> None:
assert request.external_req_id is not None
sampling_params = request.params
self.request_id = request.request_id
self.external_req_id = request.external_req_id
self.sampling_params = sampling_params
self.child_requests = set()
@ -96,7 +101,7 @@ class ParentRequest:
self,
child_request_id: str,
completion_output: CompletionOutput,
) -> tuple[str, list[CompletionOutput], bool]:
) -> tuple[list[CompletionOutput], bool]:
already_finished_and_returned: bool = False
if completion_output.finished():
if child_request_id in self.child_requests:
@ -118,7 +123,7 @@ class ParentRequest:
outputs = [] if self.child_requests else self.output_aggregator
finished = not self.child_requests
return self.request_id, outputs, finished
return outputs, finished
def observe_num_generation_tokens(self, num_generation_tokens: int):
self.max_num_generation_tokens = max(