Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-10 02:55:40 +08:00)
[Chore][V0 Deprecation] Move LogProb to a separate file (#24055)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
parent a0e0efd6bd
commit 5685370271
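
In practical terms, the commit is a pure import move: the Logprob dataclass and the PromptLogprobs / SampleLogprobs aliases now live in a new vllm/logprobs.py module, and the call sites in the hunks below switch from vllm.sequence to the new path. A minimal caller-side sketch (illustrative only, assuming vLLM is installed at this commit):

# Old import path, removed from the call sites in this diff:
#   from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
# New import path introduced by this commit:
from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs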

@@ -4,8 +4,8 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Optional, Union
 
+from vllm.logprobs import Logprob
 from vllm.lora.request import LoRARequest
-from vllm.sequence import Logprob
 
 if TYPE_CHECKING:
     from vllm.multimodal import MultiModalDataDict

@@ -43,10 +43,10 @@ from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
 from vllm.entrypoints.score_utils import (ScoreContentPartParam,
                                           ScoreMultiModalParam)
 from vllm.logger import init_logger
+from vllm.logprobs import Logprob
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
                                   RequestOutputKind, SamplingParams)
-from vllm.sequence import Logprob
 from vllm.utils import random_uuid, resolve_obj_by_qualname
 
 logger = init_logger(__name__)

@@ -43,10 +43,10 @@ from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
 from vllm.entrypoints.utils import get_max_tokens
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
+from vllm.logprobs import Logprob
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import BeamSearchParams, SamplingParams
-from vllm.sequence import Logprob
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 from vllm.transformers_utils.tokenizers import (maybe_serialize_tool_calls,
                                                 truncate_tool_call_ids,

@@ -38,9 +38,9 @@ from vllm.entrypoints.utils import get_max_tokens
 from vllm.inputs.data import (EmbedsPrompt, TokensPrompt, is_embeds_prompt,
                               is_tokens_prompt)
 from vllm.logger import init_logger
+from vllm.logprobs import Logprob
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams, SamplingParams
-from vllm.sequence import Logprob
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import as_list, merge_async_iterators
 

@@ -67,13 +67,13 @@ from vllm.inputs.data import EmbedsPrompt as EngineEmbedsPrompt
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.inputs.parse import parse_and_batch_prompt
 from vllm.logger import init_logger
+from vllm.logprobs import Logprob, PromptLogprobs
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import (  # noqa: F401 - Required to resolve Pydantic error in RequestProcessingMixin
     MultiModalDataDict, MultiModalUUIDDict)
 from vllm.outputs import PoolingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import BeamSearchParams, SamplingParams
-from vllm.sequence import Logprob, PromptLogprobs
 from vllm.tracing import (contains_trace_headers, extract_trace_headers,
                           log_tracing_disabled_warning)
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer

@@ -58,11 +58,11 @@ from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.tool_server import MCPToolServer, ToolServer
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
+from vllm.logprobs import Logprob as SampleLogprob
+from vllm.logprobs import SampleLogprobs
 from vllm.outputs import CompletionOutput
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import Logprob as SampleLogprob
-from vllm.sequence import SampleLogprobs
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import random_uuid
 

vllm/logprobs.py  (new file, 28 lines added)
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from dataclasses import dataclass
+from typing import Optional
+
+
+# We use dataclass for now because it is used for
+# openai server output, and msgspec is not serializable.
+# TODO(sang): Fix it.
+@dataclass
+class Logprob:
+    """Infos for supporting OpenAI compatible logprobs and token ranks.
+
+    Attributes:
+        logprob: The logprob of chosen token
+        rank: The vocab rank of chosen token (>=1)
+        decoded_token: The decoded chosen token index
+    """
+    logprob: float
+    rank: Optional[int] = None
+    decoded_token: Optional[str] = None
+
+
+# {token_id -> logprob} per each sequence group. None if the corresponding
+# sequence group doesn't require prompt logprob.
+PromptLogprobs = list[Optional[dict[int, Logprob]]]
+# {token_id -> logprob} for each sequence group.
+SampleLogprobs = list[dict[int, Logprob]]
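
Usage sketch for the new module (not part of the diff; assumes vLLM is installed at this commit, and the token ids and strings below are made up for illustration):

from vllm.logprobs import Logprob, SampleLogprobs

# One decoding step maps candidate token ids to their Logprob entries,
# which is exactly the shape the SampleLogprobs alias describes.
step: dict[int, Logprob] = {
    1234: Logprob(logprob=-0.05, rank=1, decoded_token="Hello"),
    5678: Logprob(logprob=-3.20, rank=7, decoded_token="Hi"),
}
sample_logprobs: SampleLogprobs = [step]
print(sample_logprobs[0][1234].decoded_token)  # -> Hello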

@@ -13,14 +13,14 @@ import torch
 import torch.nn as nn
 
 import vllm.envs as envs
+from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
 from vllm.model_executor.layers.utils import apply_penalties
 from vllm.model_executor.sampling_metadata import (SamplingMetadata,
                                                     SamplingTensors,
                                                     SequenceGroupToSample)
 from vllm.sampling_params import SamplingType
 from vllm.sequence import (VLLM_INVALID_TOKEN_ID,
-                           CompletionSequenceGroupOutput, Logprob,
-                           PromptLogprobs, SampleLogprobs, SequenceOutput)
+                           CompletionSequenceGroupOutput, SequenceOutput)
 
 if envs.VLLM_USE_FLASHINFER_SAMPLER and find_spec("flashinfer"):
     # yapf: disable

@@ -14,12 +14,12 @@ from transformers import PretrainedConfig
 
 from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig,
                          SpeculativeConfig)
+from vllm.logprobs import Logprob
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import get_quantization_config
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
-                           SequenceOutput)
+from vllm.sequence import CompletionSequenceGroupOutput, SequenceOutput
 
 TORCH_DTYPE_TO_NEURON_AMP = {
     "auto": "f32",

@@ -27,11 +27,11 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig
 from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig,
                          SpeculativeConfig)
 from vllm.logger import init_logger
+from vllm.logprobs import Logprob
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
-                           SequenceOutput)
+from vllm.sequence import CompletionSequenceGroupOutput, SequenceOutput
 
 # yapf: enable
 logger = init_logger(__name__)

@@ -11,11 +11,12 @@ import torch
 from typing_extensions import TypeVar
 
 from vllm.logger import init_logger
+from vllm.logprobs import PromptLogprobs, SampleLogprobs
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.inputs import MultiModalPlaceholderDict
 from vllm.sampling_params import RequestOutputKind
-from vllm.sequence import (PromptLogprobs, RequestMetrics, SampleLogprobs,
-                           SequenceGroup, SequenceGroupBase, SequenceStatus)
+from vllm.sequence import (RequestMetrics, SequenceGroup, SequenceGroupBase,
+                           SequenceStatus)
 
 logger = init_logger(__name__)
 

@@ -16,6 +16,7 @@ import msgspec
 import torch
 
 from vllm.inputs import SingletonInputs
+from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
 from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import RequestOutputKind, SamplingParams
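
Because vllm/sequence.py now re-imports the three names from vllm.logprobs (the hunk above), the old import path should continue to resolve to the same objects. A small check, assuming vLLM is installed at this commit:

from vllm.logprobs import Logprob as NewLogprob
from vllm.sequence import Logprob as OldPathLogprob

# Both paths are expected to point at the single dataclass defined in vllm/logprobs.py.
assert NewLogprob is OldPathLogprob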

@@ -38,30 +39,6 @@ def array_full(token_id: int, count: int):
     return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count
 
 
-# We use dataclass for now because it is used for
-# openai server output, and msgspec is not serializable.
-# TODO(sang): Fix it.
-@dataclass
-class Logprob:
-    """Infos for supporting OpenAI compatible logprobs and token ranks.
-
-    Attributes:
-        logprob: The logprob of chosen token
-        rank: The vocab rank of chosen token (>=1)
-        decoded_token: The decoded chosen token index
-    """
-    logprob: float
-    rank: Optional[int] = None
-    decoded_token: Optional[str] = None
-
-
-# {token_id -> logprob} per each sequence group. None if the corresponding
-# sequence group doesn't require prompt logprob.
-PromptLogprobs = list[Optional[dict[int, Logprob]]]
-# {token_id -> logprob} for each sequence group.
-SampleLogprobs = list[dict[int, Logprob]]
-
-
 class SequenceStatus(enum.IntEnum):
     """Status of a sequence."""
     WAITING = 0

@@ -3,8 +3,9 @@
 
 from typing import Optional
 
-from vllm.sequence import (VLLM_INVALID_TOKEN_ID, Logprob, SamplingParams,
-                           Sequence, SequenceGroup)
+from vllm.logprobs import Logprob
+from vllm.sequence import (VLLM_INVALID_TOKEN_ID, SamplingParams, Sequence,
+                           SequenceGroup)
 
 from .detokenizer_utils import (convert_prompt_ids_to_tokens,
                                 detokenize_incrementally)
@ -7,7 +7,7 @@ from dataclasses import dataclass
|
|||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
|
from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
|
||||||
from vllm.transformers_utils.detokenizer_utils import (
|
from vllm.transformers_utils.detokenizer_utils import (
|
||||||
AnyTokenizer, convert_ids_list_to_tokens)
|
AnyTokenizer, convert_ids_list_to_tokens)
|
||||||
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
|
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
|
||||||
|
|||||||