Mirror of https://git.datalinker.icu/vllm-project/vllm.git
[V1][Minor] Simplify rejection sampler's parse_output (#15741)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
parent c67abd614f
commit 2bc4be4e32
@@ -107,7 +107,6 @@ class RejectionSampler(nn.Module):
     @staticmethod
     def parse_output(
         output_token_ids: torch.Tensor,
-        ignored_req_idxs: list[int],
         vocab_size: int,
     ) -> list[list[int]]:
         """Parse the output of the rejection sampler.
@@ -117,9 +116,6 @@ class RejectionSampler(nn.Module):
                 [batch_size, max_spec_len + 1]. The rejected tokens are
                 replaced with `PLACEHOLDER_TOKEN_ID` by the rejection sampler
                 and will be filtered out in this function.
-            ignored_req_idxs: The indices of the requests that should not be
-                sampled. This is usually because the request is still in the
-                prefill phase.
             vocab_size: The size of the vocabulary.
 
         Returns:
@@ -129,11 +125,8 @@ class RejectionSampler(nn.Module):
         # Create mask for valid tokens.
         valid_mask = ((output_token_ids_np != PLACEHOLDER_TOKEN_ID) &
                       (output_token_ids_np < vocab_size))
-
-        ignored_req_idx_set = set(ignored_req_idxs)
         outputs = [
             row[valid_mask[i]].tolist()
-            if i not in ignored_req_idx_set else []
             for i, row in enumerate(output_token_ids_np)
         ]
         return outputs
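The net effect of the three hunks above is that parse_output no longer takes ignored_req_idxs: it only filters out placeholder and out-of-vocabulary tokens, and the per-request discard logic moves to the caller (see the GPUModelRunner hunk below). The following is a minimal, self-contained sketch of the simplified behavior; the PLACEHOLDER_TOKEN_ID value and the toy inputs are assumptions for illustration, not taken from the diff.

import torch

PLACEHOLDER_TOKEN_ID = -1  # assumed value, for illustration only


def parse_output(
    output_token_ids: torch.Tensor,
    vocab_size: int,
) -> list[list[int]]:
    # Move to CPU and drop placeholder / out-of-vocab entries per row.
    output_token_ids_np = output_token_ids.cpu().numpy()
    valid_mask = ((output_token_ids_np != PLACEHOLDER_TOKEN_ID) &
                  (output_token_ids_np < vocab_size))
    return [
        row[valid_mask[i]].tolist()
        for i, row in enumerate(output_token_ids_np)
    ]


# Toy batch of two requests, max_spec_len + 1 == 3 slots each:
# request 0 accepted all tokens; request 1 had two tokens rejected.
out = torch.tensor([[5, 7, 9],
                    [3, PLACEHOLDER_TOKEN_ID, PLACEHOLDER_TOKEN_ID]])
print(parse_output(out, vocab_size=32_000))  # [[5, 7, 9], [3]]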
@@ -1121,16 +1121,15 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         if max_gen_len == 1:
             # No spec decode tokens.
             valid_sampled_token_ids = sampled_token_ids.tolist()
-            # Mask out the sampled tokens that should not be sampled.
-            for i in discard_sampled_tokens_req_indices:
-                valid_sampled_token_ids[i].clear()
         else:
             # Includes spec decode tokens.
             valid_sampled_token_ids = self.rejection_sampler.parse_output(
                 sampled_token_ids,
-                discard_sampled_tokens_req_indices,
                 self.input_batch.vocab_size,
             )
+        # Mask out the sampled tokens that should not be sampled.
+        for i in discard_sampled_tokens_req_indices:
+            valid_sampled_token_ids[i].clear()
 
         if not self.use_spec_decode:
             spec_token_ids = None
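After this hunk, the "mask out discarded requests" loop runs exactly once, after whichever branch produced valid_sampled_token_ids, instead of being duplicated in the no-spec-decode branch and buried inside parse_output. Below is a hedged sketch of the resulting control flow; the names follow the diff, but the toy values stand in for the runner's real state.

import torch

# Toy stand-ins for the runner's state (invented for illustration).
sampled_token_ids = torch.tensor([[11], [22], [33]])
discard_sampled_tokens_req_indices = [1]  # e.g. request 1 is still prefilling
max_gen_len = sampled_token_ids.shape[-1]

if max_gen_len == 1:
    # No spec decode tokens.
    valid_sampled_token_ids = sampled_token_ids.tolist()
else:
    # Includes spec decode tokens; the real runner instead calls
    # self.rejection_sampler.parse_output(sampled_token_ids, vocab_size) here.
    valid_sampled_token_ids = sampled_token_ids.tolist()

# Mask out the sampled tokens that should not be sampled (now in one place).
for i in discard_sampled_tokens_req_indices:
    valid_sampled_token_ids[i].clear()

print(valid_sampled_token_ids)  # [[11], [], [33]]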