mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-16 15:47:22 +08:00
Merge branch 'main' into v1-blocktable-opt
This commit is contained in:
commit
a6e5d7b5b7
@ -73,7 +73,7 @@ steps:
|
||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||
agents:
|
||||
queue: H100
|
||||
depends_on: block-h100
|
||||
depends_on: ~
|
||||
plugins:
|
||||
- docker#v5.12.0:
|
||||
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
|
||||
|
||||
@ -106,14 +106,12 @@ steps:
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
commands:
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
|
||||
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
|
||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
@ -333,8 +331,6 @@ steps:
|
||||
- vllm/
|
||||
- tests/models
|
||||
commands:
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models/test_registry.py
|
||||
- pytest -v -s models/test_initialization.py
|
||||
|
||||
@ -360,7 +356,7 @@ steps:
|
||||
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
|
||||
- pytest -v -s models/embedding/language -m 'not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Standard) # 28min
|
||||
- label: Multi-Modal Models Test (Standard) # 40min
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@ -376,7 +372,7 @@ steps:
|
||||
- pytest -v -s models/encoder_decoder/language -m core_model
|
||||
- pytest -v -s models/encoder_decoder/vision_language -m core_model
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 1 # 1h16m
|
||||
- label: Multi-Modal Models Test (Extended) 1 # 48m
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@ -469,11 +465,28 @@ steps:
|
||||
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pytest -v -s distributed/test_distributed_oot.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
|
||||
|
||||
- label: Plugin Tests (2 GPUs) # 40min
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
fast_check: true
|
||||
source_file_dependencies:
|
||||
- vllm/plugins/
|
||||
- tests/plugins/
|
||||
commands:
|
||||
# begin platform plugin tests, all the code in-between runs on dummy platform
|
||||
- pip install -e ./plugins/vllm_add_dummy_platform
|
||||
- pytest -v -s plugins_tests/test_platform_plugins.py
|
||||
- pip uninstall vllm_add_dummy_platform -y
|
||||
# end platform plugin tests
|
||||
# other tests continue here:
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pytest -v -s distributed/test_distributed_oot.py
|
||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
|
||||
- label: Multi-step Tests (4 GPUs) # 36min
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
|
||||
@ -225,13 +225,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
FetchContent_Declare(
|
||||
cutlass
|
||||
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
|
||||
GIT_TAG 8aa95dbb888be6d81c6fbf7169718c5244b53227
|
||||
GIT_TAG v3.6.0
|
||||
GIT_PROGRESS TRUE
|
||||
|
||||
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
|
||||
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
|
||||
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
|
||||
GIT_SHALLOW FALSE
|
||||
GIT_SHALLOW TRUE
|
||||
)
|
||||
endif()
|
||||
FetchContent_MakeAvailable(cutlass)
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# default base image
|
||||
# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
|
||||
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04"
|
||||
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"
|
||||
|
||||
FROM $BASE_IMAGE
|
||||
|
||||
@ -22,9 +22,9 @@ WORKDIR ${APP_MOUNT}/vllm
|
||||
|
||||
RUN python3 -m pip install --upgrade pip
|
||||
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
|
||||
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
|
||||
RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
|
||||
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
|
||||
RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
|
||||
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
|
||||
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK=0
|
||||
|
||||
184
benchmarks/benchmark_long_document_qa_throughput.py
Normal file
184
benchmarks/benchmark_long_document_qa_throughput.py
Normal file
@ -0,0 +1,184 @@
|
||||
"""
|
||||
Offline benchmark to test the long document QA throughput.
|
||||
|
||||
Example usage:
|
||||
# This command runs vLLM with 50GB CPU memory for offloading
|
||||
# The workload samples 8 different prompts with a default input
|
||||
# length of 20000 tokens, then replicates each prompt 2 times
|
||||
# in random order.
|
||||
python benchmark_long_document_qa_throughput.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--enable-prefix-caching \
|
||||
--num-documents 8 \
|
||||
--repeat-count 2
|
||||
|
||||
Commandline arguments:
|
||||
--num-documents: The number of documents to sample prompts from.
|
||||
|
||||
--document-length: The length of each document in tokens.
|
||||
(Optional, default: 20000)
|
||||
|
||||
--output-len: The number of tokens to generate for each prompt.
|
||||
(Optional, default: 10)
|
||||
|
||||
--repeat-count: The number of times to repeat each prompt.
|
||||
(Optional, default: 2)
|
||||
|
||||
--repeat-mode: The mode to repeat prompts. The supported modes are:
|
||||
- 'random': shuffle the prompts randomly. (Default)
|
||||
- 'tile': the entire prompt list is repeated in sequence. (Potentially
|
||||
lowest cache hit)
|
||||
- 'interleave': each prompt is repeated consecutively before
|
||||
moving to the next element. (Highest cache hit)
|
||||
|
||||
--shuffle-seed: Random seed when the repeat mode is "random".
|
||||
(Optional, default: 0)
|
||||
|
||||
In the meantime, it also supports all the vLLM engine args to initialize the
|
||||
LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more
|
||||
details.
|
||||
"""
|
||||
|
||||
import dataclasses
|
||||
import random
|
||||
import time
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
|
||||
"""
|
||||
Test long document QA with the given prompts and sampling parameters.
|
||||
Print the time spent in processing all the prompts.
|
||||
|
||||
Args:
|
||||
llm: The language model used for generating responses.
|
||||
sampling_params: Sampling parameter used to generate the response.
|
||||
prompts: A list of prompt strings to be processed by the LLM.
|
||||
"""
|
||||
start_time = time.time()
|
||||
llm.generate(prompts, sampling_params=sampling_params)
|
||||
end_time = time.time()
|
||||
print(f"Time to execute all requests: {end_time - start_time:.4f} secs")
|
||||
|
||||
|
||||
def repeat_prompts(prompts, repeat_count, mode: str):
    """
    Build a list containing each prompt ``repeat_count`` times, ordered
    according to ``mode``.

    Args:
        prompts: Prompts to replicate.
        repeat_count: How many copies of each prompt to produce.
        mode: Ordering of the output list:
            - 'random': all copies shuffled together.
            - 'tile': whole list repeated back to back,
              e.g. [1, 2, 3] -> [1, 2, 3, 1, 2, 3].
            - 'interleave': copies of one prompt kept adjacent,
              e.g. [1, 2, 3] -> [1, 1, 2, 2, 3, 3].

    Returns:
        The repeated prompts in the requested order.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    print("Repeat mode: ", mode)
    if mode == 'tile':
        return prompts * repeat_count
    if mode == 'random':
        shuffled = prompts * repeat_count
        random.shuffle(shuffled)
        return shuffled
    if mode == 'interleave':
        # Keep every copy of a prompt adjacent before moving on.
        return [prompt for prompt in prompts for _ in range(repeat_count)]
    raise ValueError(f"Invalid mode: {mode}, only support "
                     "'random', 'tile', 'interleave'")
|
||||
|
||||
|
||||
def main(args):
    """Run the long-document QA throughput benchmark described by *args*."""
    random.seed(args.shuffle_seed)

    # Prefix each document with its index so that no document is a prefix
    # of another document.
    documents = [
        str(idx) + ' '.join(['hi'] * args.document_length)
        for idx in range(args.num_documents)
    ]

    benchmark_prompts = repeat_prompts(documents,
                                       args.repeat_count,
                                       mode=args.repeat_mode)

    warmup_prompts = [
        "This is warm up request " + str(idx) +
        ' '.join(['hi'] * args.document_length)
        for idx in range(args.num_documents)
    ]

    # Build the LLM engine from the remaining CLI arguments.
    engine_args = EngineArgs.from_cli_args(args)
    llm = LLM(**dataclasses.asdict(engine_args))
    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)

    print("------warm up------")
    test_long_document_qa(
        llm=llm,
        prompts=warmup_prompts,
        sampling_params=sampling_params,
    )

    print("------start generating------")
    test_long_document_qa(
        llm=llm,
        prompts=benchmark_prompts,
        sampling_params=sampling_params,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description=
        'Benchmark the performance with or without automatic prefix caching.')

    parser.add_argument(
        '--document-length',
        type=int,
        # Roughly the number of tokens for a system paper,
        # excluding images
        default=20000,
        # Fixed: the previous help text ("Range of input lengths ...
        # specified as min:max") was copy-pasted from another benchmark
        # and did not describe this integer flag.
        help='Length of each sampled document in tokens '
        '(default: 20000).')

    parser.add_argument('--num-documents',
                        type=int,
                        default=8,
                        help='Number of documents to sample prompts from '
                        '(default: 8).')

    parser.add_argument('--output-len',
                        type=int,
                        default=10,
                        help='Number of tokens to generate for each prompt '
                        '(default: 10).')

    parser.add_argument('--repeat-count',
                        type=int,
                        default=2,
                        help='Number of times to repeat each prompt')

    parser.add_argument("--repeat-mode",
                        type=str,
                        default='random',
                        help='The mode to repeat prompts. The supported '
                        'modes are "random", "tile", and "interleave". '
                        'See repeat_prompts() in the source code for details.')

    parser.add_argument("--shuffle-seed",
                        type=int,
                        default=0,
                        help='Random seed when the repeat mode is "random"')

    # Also accept every vLLM engine argument (model, caching options, ...).
    parser = EngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    main(args)
|
||||
@ -14,9 +14,9 @@ class VLLMDataType(enum.Enum):
|
||||
|
||||
|
||||
class MixedInputKernelScheduleType(enum.Enum):
|
||||
TmaWarpSpecializedMixedInput = enum_auto()
|
||||
TmaWarpSpecializedPingpongMixedInput = enum_auto()
|
||||
TmaWarpSpecializedCooperativeMixedInput = enum_auto()
|
||||
TmaWarpSpecialized = enum_auto()
|
||||
TmaWarpSpecializedPingpong = enum_auto()
|
||||
TmaWarpSpecializedCooperative = enum_auto()
|
||||
|
||||
|
||||
VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = {
|
||||
@ -68,11 +68,11 @@ VLLMKernelScheduleTag: Dict[Union[
|
||||
MixedInputKernelScheduleType, KernelScheduleType], str] = {
|
||||
**KernelScheduleTag, # type: ignore
|
||||
**{
|
||||
MixedInputKernelScheduleType.TmaWarpSpecializedMixedInput:
|
||||
"cutlass::gemm::KernelTmaWarpSpecializedMixedInput",
|
||||
MixedInputKernelScheduleType.TmaWarpSpecializedPingpongMixedInput:
|
||||
"cutlass::gemm::KernelTmaWarpSpecializedPingpongMixedInput",
|
||||
MixedInputKernelScheduleType.TmaWarpSpecializedCooperativeMixedInput:
|
||||
"cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput",
|
||||
MixedInputKernelScheduleType.TmaWarpSpecialized:
|
||||
"cutlass::gemm::KernelTmaWarpSpecialized",
|
||||
MixedInputKernelScheduleType.TmaWarpSpecializedPingpong:
|
||||
"cutlass::gemm::KernelTmaWarpSpecializedPingpong",
|
||||
MixedInputKernelScheduleType.TmaWarpSpecializedCooperative:
|
||||
"cutlass::gemm::KernelTmaWarpSpecializedCooperative",
|
||||
}
|
||||
}
|
||||
|
||||
@ -189,7 +189,7 @@ using Kernel_{{type_sig}} = MacheteKernelTemplate<
|
||||
{{DataTypeTag[t.b_group_zeropoint]}}, // GroupZeroT
|
||||
{{DataTypeTag[t.b_channel_scale]}}, // ChannelScaleT
|
||||
{{DataTypeTag[t.a_token_scale]}}, // TokenScaleT
|
||||
cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput,
|
||||
cutlass::gemm::KernelTmaWarpSpecializedCooperative,
|
||||
Sch>;
|
||||
|
||||
{% for sch in schs %}
|
||||
@ -223,7 +223,7 @@ torch::Tensor prepack_B_dispatch(PrepackBArgs args) {
|
||||
{{DataTypeTag[t.convert]}}, // ElementConvert
|
||||
{{DataTypeTag[t.accumulator]}}, // Accumulator
|
||||
cutlass::layout::ColumnMajor,
|
||||
cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput>
|
||||
cutlass::gemm::KernelTmaWarpSpecializedCooperative>
|
||||
>(args.B);
|
||||
}
|
||||
{%- endfor %}
|
||||
@ -239,7 +239,7 @@ torch::Tensor prepack_B_dispatch(PrepackBArgs args) {
|
||||
}; // namespace machete
|
||||
"""
|
||||
|
||||
TmaMI = MixedInputKernelScheduleType.TmaWarpSpecializedCooperativeMixedInput
|
||||
TmaMI = MixedInputKernelScheduleType.TmaWarpSpecializedCooperative
|
||||
TmaCoop = EpilogueScheduleType.TmaWarpSpecializedCooperative
|
||||
|
||||
|
||||
@ -300,7 +300,7 @@ def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
|
||||
# mostly unique shorter sch_sig
|
||||
def generate_terse_sch_sig(schedule_config: ScheduleConfig) -> str:
|
||||
kernel_terse_names_replace = {
|
||||
"KernelTmaWarpSpecializedCooperativeMixedInput_": "TmaMI_",
|
||||
"KernelTmaWarpSpecializedCooperative": "TmaMI_",
|
||||
"TmaWarpSpecializedCooperative_": "TmaCoop_",
|
||||
"StreamKScheduler": "streamK",
|
||||
}
|
||||
|
||||
@ -18,16 +18,14 @@ struct VLLMCollectiveBuilder<
|
||||
ElementAccumulator, TileShape_MNK, ClusterShape_MNK, StageCountType,
|
||||
KernelScheduleType,
|
||||
cute::enable_if_t<(
|
||||
cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecialized> ||
|
||||
cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedPingpong> ||
|
||||
cute::is_same_v<KernelScheduleType,
|
||||
KernelTmaWarpSpecializedMixedInput> ||
|
||||
cute::is_same_v<KernelScheduleType,
|
||||
KernelTmaWarpSpecializedPingpongMixedInput> ||
|
||||
cute::is_same_v<KernelScheduleType,
|
||||
KernelTmaWarpSpecializedCooperativeMixedInput>)>> {
|
||||
KernelTmaWarpSpecializedCooperative>)>> {
|
||||
using CollectiveOp = machete::MacheteCollectiveMma<
|
||||
ElementPairA_, GmemLayoutA_, AlignmentA, ElementPairB_, GmemLayoutB_,
|
||||
AlignmentB, ElementAccumulator, TileShape_MNK, ClusterShape_MNK,
|
||||
StageCountType, KernelScheduleType>;
|
||||
};
|
||||
|
||||
}; // namespace cutlass::gemm::collective
|
||||
}; // namespace cutlass::gemm::collective
|
||||
|
||||
@ -66,13 +66,11 @@ struct MacheteCollectiveMma {
|
||||
using Schedule = KernelScheduleType;
|
||||
static_assert(
|
||||
cute::is_same_v<Schedule, KernelTmaWarpSpecialized> ||
|
||||
cute::is_same_v<Schedule, KernelTmaWarpSpecializedMixedInput> ||
|
||||
cute::is_same_v<Schedule, KernelTmaWarpSpecialized> ||
|
||||
cute::is_same_v<Schedule, KernelTmaWarpSpecializedPingpong> ||
|
||||
cute::is_same_v<Schedule, KernelTmaWarpSpecializedPingpong> ||
|
||||
cute::is_same_v<Schedule,
|
||||
KernelTmaWarpSpecializedPingpongMixedInput> ||
|
||||
cute::is_same_v<Schedule, KernelTmaWarpSpecializedCooperative> ||
|
||||
cute::is_same_v<Schedule,
|
||||
KernelTmaWarpSpecializedCooperativeMixedInput>,
|
||||
cute::is_same_v<Schedule, KernelTmaWarpSpecializedCooperative>,
|
||||
"KernelSchedule must be one of the warp specialized policies");
|
||||
|
||||
public:
|
||||
@ -113,8 +111,7 @@ struct MacheteCollectiveMma {
|
||||
// For coop schedules we have two warp groups cooperatively issuing wgmma
|
||||
// instructions so we use 2 atoms along the M dim (one for each warpgroup)
|
||||
using AtomLayoutMNK = cute::conditional_t<
|
||||
cute::is_same_v<KernelScheduleType,
|
||||
KernelTmaWarpSpecializedCooperativeMixedInput>,
|
||||
cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative>,
|
||||
Layout<Shape<_2, _1, _1>>, Layout<Shape<_1, _1, _1>>>;
|
||||
|
||||
using TiledMma = decltype(cute::make_tiled_mma(
|
||||
|
||||
@ -98,8 +98,7 @@ struct PrepackedLayoutBTemplate {
|
||||
// For coop schedules we have two warp groups cooperatively issuing wgmma
|
||||
// instructions so we use 2 atoms along the M dim (one for each warpgroup)
|
||||
using AtomLayoutMNK = cute::conditional_t<
|
||||
cute::is_same_v<KernelSchedule,
|
||||
KernelTmaWarpSpecializedCooperativeMixedInput>,
|
||||
cute::is_same_v<KernelSchedule, KernelTmaWarpSpecializedCooperative>,
|
||||
Layout<Shape<_2, _1, _1>>, Layout<Shape<_1, _1, _1>>>;
|
||||
|
||||
using TiledMma = decltype(cute::make_tiled_mma(
|
||||
@ -247,4 +246,4 @@ struct PrepackedLayoutBTemplate {
|
||||
}
|
||||
};
|
||||
|
||||
}; // namespace machete
|
||||
}; // namespace machete
|
||||
|
||||
@ -19,3 +19,4 @@ openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entr
|
||||
fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
|
||||
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
|
||||
requests
|
||||
zmq
|
||||
|
||||
@ -191,6 +191,7 @@ def linkcode_resolve(domain, info):
|
||||
|
||||
# Mock out external dependencies here, otherwise the autodoc pages may be blank.
|
||||
autodoc_mock_imports = [
|
||||
"blake3",
|
||||
"compressed_tensors",
|
||||
"cpuinfo",
|
||||
"cv2",
|
||||
@ -207,7 +208,7 @@ autodoc_mock_imports = [
|
||||
"tensorizer",
|
||||
"pynvml",
|
||||
"outlines",
|
||||
"xgrammar,"
|
||||
"xgrammar",
|
||||
"librosa",
|
||||
"soundfile",
|
||||
"gguf",
|
||||
|
||||
@ -11,11 +11,11 @@ Below is a visual representation of the multi-stage Dockerfile. The build graph
|
||||
|
||||
The edges of the build graph represent:
|
||||
|
||||
- FROM ... dependencies (with a solid line and a full arrow head)
|
||||
- `FROM ...` dependencies (with a solid line and a full arrow head)
|
||||
|
||||
- COPY --from=... dependencies (with a dashed line and an empty arrow head)
|
||||
- `COPY --from=...` dependencies (with a dashed line and an empty arrow head)
|
||||
|
||||
- RUN --mount=(.\*)from=... dependencies (with a dotted line and an empty diamond arrow head)
|
||||
- `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head)
|
||||
|
||||
> ```{figure} ../../assets/dev/dockerfile-stages-dependency.png
|
||||
> :align: center
|
||||
|
||||
@ -34,7 +34,7 @@ pytest tests/
|
||||
```
|
||||
|
||||
```{note}
|
||||
Currently, the repository does not pass the `mypy` tests.
|
||||
Currently, the repository is not fully checked by `mypy`.
|
||||
```
|
||||
|
||||
# Contribution Guidelines
|
||||
|
||||
@ -45,31 +45,23 @@ adding_multimodal_plugin
|
||||
### Base Classes
|
||||
|
||||
```{eval-rst}
|
||||
.. autodata:: vllm.multimodal.NestedTensors
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. autodata:: vllm.multimodal.BatchedTensorInputs
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: vllm.multimodal.MultiModalDataBuiltins
|
||||
.. automodule:: vllm.multimodal.base
|
||||
:members:
|
||||
:show-inheritance:
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. autodata:: vllm.multimodal.MultiModalDataDict
|
||||
```
|
||||
### Input Classes
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: vllm.multimodal.MultiModalKwargs
|
||||
.. automodule:: vllm.multimodal.inputs
|
||||
:members:
|
||||
:show-inheritance:
|
||||
```
|
||||
|
||||
### Audio Classes
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: vllm.multimodal.MultiModalPlugin
|
||||
.. automodule:: vllm.multimodal.audio
|
||||
:members:
|
||||
:show-inheritance:
|
||||
```
|
||||
@ -81,3 +73,11 @@ adding_multimodal_plugin
|
||||
:members:
|
||||
:show-inheritance:
|
||||
```
|
||||
|
||||
### Video Classes
|
||||
|
||||
```{eval-rst}
|
||||
.. automodule:: vllm.multimodal.video
|
||||
:members:
|
||||
:show-inheritance:
|
||||
```
|
||||
|
||||
@ -41,9 +41,11 @@ Every plugin has three parts:
|
||||
2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the `entry_points` dictionary. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by their names using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name.
|
||||
3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module.
|
||||
|
||||
## What Can Plugins Do?
|
||||
## Types of supported plugins
|
||||
|
||||
Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM.
|
||||
- **General plugins** (with group name `vllm.general_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model inside the plugin function.
|
||||
|
||||
- **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported.
|
||||
|
||||
## Guidelines for Writing Plugins
|
||||
|
||||
|
||||
@ -20,7 +20,7 @@ Contents:
|
||||
## Requirements
|
||||
|
||||
- **Operating System**: Linux or macOS
|
||||
- **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended)
|
||||
- **Compiler**: `gcc/g++ >= 12.3.0` (optional, but recommended)
|
||||
- **Instruction Set Architecture (ISA)**: NEON support is required
|
||||
|
||||
(arm-backend-quick-start-dockerfile)=
|
||||
|
||||
@ -24,7 +24,7 @@ Table of contents:
|
||||
## Requirements
|
||||
|
||||
- OS: Linux
|
||||
- Compiler: gcc/g++>=12.3.0 (optional, recommended)
|
||||
- Compiler: `gcc/g++>=12.3.0` (optional, recommended)
|
||||
- Instruction set architecture (ISA) requirement: AVX512 (optional, recommended)
|
||||
|
||||
(cpu-backend-quick-start-dockerfile)=
|
||||
@ -69,7 +69,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install
|
||||
|
||||
```{note}
|
||||
- AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
|
||||
- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building.
|
||||
- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building.
|
||||
```
|
||||
|
||||
(env-intro)=
|
||||
|
||||
@ -197,4 +197,4 @@ if __name__ == '__main__':
|
||||
## Known Issues
|
||||
|
||||
- In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759).
|
||||
- To circumvent a NCCL [bug](https://github.com/NVIDIA/nccl/issues/1234) , all vLLM processes will set an environment variable ``NCCL_CUMEM_ENABLE=0`` to disable NCCL's ``cuMem`` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up a NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in the [RLHF integration](https://github.com/OpenRLHF/OpenRLHF/pull/604) and the [discussion](gh-issue:5723#issuecomment-2554389656) .
|
||||
- To circumvent a NCCL [bug](https://github.com/NVIDIA/nccl/issues/1234) , all vLLM processes will set an environment variable `NCCL_CUMEM_ENABLE=0` to disable NCCL's `cuMem` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up a NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in the [RLHF integration](https://github.com/OpenRLHF/OpenRLHF/pull/604) and the [discussion](gh-issue:5723#issuecomment-2554389656) .
|
||||
|
||||
@ -141,32 +141,33 @@ Gaudi2 devices. Configurations that are not listed may or may not work.
|
||||
|
||||
Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag.
|
||||
|
||||
```{eval-rst}
|
||||
.. list-table:: vLLM execution modes
|
||||
:widths: 25 25 50
|
||||
:header-rows: 1
|
||||
```{list-table} vLLM execution modes
|
||||
:widths: 25 25 50
|
||||
:header-rows: 1
|
||||
|
||||
* - ``PT_HPU_LAZY_MODE``
|
||||
- ``enforce_eager``
|
||||
- execution mode
|
||||
* - 0
|
||||
- 0
|
||||
- torch.compile
|
||||
* - 0
|
||||
- 1
|
||||
- PyTorch eager mode
|
||||
* - 1
|
||||
- 0
|
||||
- HPU Graphs
|
||||
* - 1
|
||||
- 1
|
||||
- PyTorch lazy mode
|
||||
* - `PT_HPU_LAZY_MODE`
|
||||
- `enforce_eager`
|
||||
- execution mode
|
||||
* - 0
|
||||
- 0
|
||||
- torch.compile
|
||||
* - 0
|
||||
- 1
|
||||
- PyTorch eager mode
|
||||
* - 1
|
||||
- 0
|
||||
- HPU Graphs
|
||||
* - 1
|
||||
- 1
|
||||
- PyTorch lazy mode
|
||||
```
|
||||
|
||||
```{warning}
|
||||
In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode.
|
||||
```
|
||||
|
||||
(gaudi-bucketing-mechanism)=
|
||||
|
||||
### Bucketing mechanism
|
||||
|
||||
Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution.
|
||||
@ -185,7 +186,7 @@ INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, ma
|
||||
INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
|
||||
```
|
||||
|
||||
`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling - `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
|
||||
`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling -- `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
|
||||
|
||||
Example (with ramp-up)
|
||||
|
||||
@ -214,7 +215,7 @@ If a request exceeds maximum bucket size in any dimension, it will be processed
|
||||
As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket.
|
||||
|
||||
```{note}
|
||||
Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
|
||||
Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
|
||||
```
|
||||
|
||||
### Warmup
|
||||
@ -235,7 +236,7 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size
|
||||
INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
|
||||
```
|
||||
|
||||
This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
|
||||
This example uses the same buckets as in the [Bucketing Mechanism](#gaudi-bucketing-mechanism) section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
|
||||
|
||||
```{tip}
|
||||
Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment.
|
||||
|
||||
@ -26,7 +26,7 @@ Installation steps:
|
||||
(build-from-source-neuron)=
|
||||
|
||||
```{note}
|
||||
The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with vLLM >= 0.5.3. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
|
||||
The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
|
||||
```
|
||||
|
||||
## Build from source
|
||||
|
||||
@ -114,7 +114,7 @@ $ "temperature": 0
|
||||
$ }'
|
||||
```
|
||||
|
||||
Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` python package:
|
||||
Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package:
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
@ -151,7 +151,7 @@ $ ]
|
||||
$ }'
|
||||
```
|
||||
|
||||
Alternatively, you can use the `openai` python package:
|
||||
Alternatively, you can use the `openai` Python package:
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
|
||||
@ -68,33 +68,32 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
|
||||
--service-account SERVICE_ACCOUNT
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. list-table:: Parameter descriptions
|
||||
:header-rows: 1
|
||||
```{list-table} Parameter descriptions
|
||||
:header-rows: 1
|
||||
|
||||
* - Parameter name
|
||||
- Description
|
||||
* - QUEUED_RESOURCE_ID
|
||||
- The user-assigned ID of the queued resource request.
|
||||
* - TPU_NAME
|
||||
- The user-assigned name of the TPU which is created when the queued
|
||||
resource request is allocated.
|
||||
* - PROJECT_ID
|
||||
- Your Google Cloud project
|
||||
* - ZONE
|
||||
- The GCP zone where you want to create your Cloud TPU. The value you use
|
||||
depends on the version of TPUs you are using. For more information, see
|
||||
`TPU regions and zones <https://cloud.google.com/tpu/docs/regions-zones>`_
|
||||
* - ACCELERATOR_TYPE
|
||||
- The TPU version you want to use. Specify the TPU version, for example
|
||||
`v5litepod-4` specifies a v5e TPU with 4 cores. For more information,
|
||||
see `TPU versions <https://cloud.devsite.corp.google.com/tpu/docs/system-architecture-tpu-vm#versions>`_.
|
||||
* - RUNTIME_VERSION
|
||||
- The TPU VM runtime version to use. For more information see `TPU VM images <https://cloud.google.com/tpu/docs/runtimes>`_.
|
||||
* - SERVICE_ACCOUNT
|
||||
- The email address for your service account. You can find it in the IAM
|
||||
Cloud Console under *Service Accounts*. For example:
|
||||
`tpu-service-account@<your_project_ID>.iam.gserviceaccount.com`
|
||||
* - Parameter name
|
||||
- Description
|
||||
* - QUEUED_RESOURCE_ID
|
||||
- The user-assigned ID of the queued resource request.
|
||||
* - TPU_NAME
|
||||
- The user-assigned name of the TPU which is created when the queued
|
||||
resource request is allocated.
|
||||
* - PROJECT_ID
|
||||
- Your Google Cloud project
|
||||
* - ZONE
|
||||
- The GCP zone where you want to create your Cloud TPU. The value you use
|
||||
depends on the version of TPUs you are using. For more information, see
|
||||
[TPU regions and zones](https://cloud.google.com/tpu/docs/regions-zones)
|
||||
* - ACCELERATOR_TYPE
|
||||
- The TPU version you want to use. Specify the TPU version, for example
|
||||
`v5litepod-4` specifies a v5e TPU with 4 cores. For more information,
|
||||
see [TPU versions](https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#versions).
|
||||
* - RUNTIME_VERSION
|
||||
- The TPU VM runtime version to use. For more information see [TPU VM images](https://cloud.google.com/tpu/docs/runtimes).
|
||||
* - SERVICE_ACCOUNT
|
||||
- The email address for your service account. You can find it in the IAM
|
||||
Cloud Console under *Service Accounts*. For example:
|
||||
`tpu-service-account@<your_project_ID>.iam.gserviceaccount.com`
|
||||
```
|
||||
|
||||
Connect to your TPU using SSH:
|
||||
@ -103,7 +102,7 @@ Connect to your TPU using SSH:
|
||||
gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE
|
||||
```
|
||||
|
||||
Install Miniconda
|
||||
Install Miniconda:
|
||||
|
||||
```bash
|
||||
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -4,121 +4,120 @@
|
||||
|
||||
The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
|
||||
|
||||
```{eval-rst}
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:widths: 20 8 8 8 8 8 8 8 8 8 8
|
||||
```{list-table}
|
||||
:header-rows: 1
|
||||
:widths: 20 8 8 8 8 8 8 8 8 8 8
|
||||
|
||||
* - Implementation
|
||||
- Volta
|
||||
- Turing
|
||||
- Ampere
|
||||
- Ada
|
||||
- Hopper
|
||||
- AMD GPU
|
||||
- Intel GPU
|
||||
- x86 CPU
|
||||
- AWS Inferentia
|
||||
- Google TPU
|
||||
* - AWQ
|
||||
- ✗
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✗
|
||||
* - GPTQ
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✗
|
||||
* - Marlin (GPTQ/AWQ/FP8)
|
||||
- ✗
|
||||
- ✗
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
* - INT8 (W8A8)
|
||||
- ✗
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✗
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✗
|
||||
* - FP8 (W8A8)
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
* - AQLM
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
* - bitsandbytes
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
* - DeepSpeedFP
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
* - GGUF
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
* - Implementation
|
||||
- Volta
|
||||
- Turing
|
||||
- Ampere
|
||||
- Ada
|
||||
- Hopper
|
||||
- AMD GPU
|
||||
- Intel GPU
|
||||
- x86 CPU
|
||||
- AWS Inferentia
|
||||
- Google TPU
|
||||
* - AWQ
|
||||
- ✗
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✗
|
||||
* - GPTQ
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✗
|
||||
* - Marlin (GPTQ/AWQ/FP8)
|
||||
- ✗
|
||||
- ✗
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
* - INT8 (W8A8)
|
||||
- ✗
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✗
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✗
|
||||
* - FP8 (W8A8)
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
* - AQLM
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
* - bitsandbytes
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
* - DeepSpeedFP
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
* - GGUF
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✅︎
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
- ✗
|
||||
```
|
||||
|
||||
## Notes:
|
||||
|
||||
@ -33,7 +33,7 @@ docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
|
||||
vllm = "latest"
|
||||
```
|
||||
|
||||
Next, let us add our code to handle inference for the LLM of your choice(`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your main.py\`:
|
||||
Next, let us add our code to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your `main.py`:
|
||||
|
||||
```python
|
||||
from vllm import LLM, SamplingParams
|
||||
@ -55,13 +55,13 @@ def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95):
|
||||
return {"results": results}
|
||||
```
|
||||
|
||||
Then, run the following code to deploy it to the cloud
|
||||
Then, run the following code to deploy it to the cloud:
|
||||
|
||||
```console
|
||||
$ cerebrium deploy
|
||||
```
|
||||
|
||||
If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case /run)
|
||||
If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case `/run`)
|
||||
|
||||
```python
|
||||
curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
|
||||
|
||||
@ -25,7 +25,7 @@ $ cd vllm-dstack
|
||||
$ dstack init
|
||||
```
|
||||
|
||||
Next, to provision a VM instance with LLM of your choice(`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
|
||||
Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:
|
||||
|
||||
```yaml
|
||||
type: service
|
||||
|
||||
@ -43,209 +43,208 @@ chart **including persistent volumes** and deletes the release.
|
||||
|
||||
## Values
|
||||
|
||||
```{eval-rst}
|
||||
.. list-table:: Values
|
||||
:widths: 25 25 25 25
|
||||
:header-rows: 1
|
||||
```{list-table}
|
||||
:widths: 25 25 25 25
|
||||
:header-rows: 1
|
||||
|
||||
* - Key
|
||||
- Type
|
||||
- Default
|
||||
- Description
|
||||
* - autoscaling
|
||||
- object
|
||||
- {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}
|
||||
- Autoscaling configuration
|
||||
* - autoscaling.enabled
|
||||
- bool
|
||||
- false
|
||||
- Enable autoscaling
|
||||
* - autoscaling.maxReplicas
|
||||
- int
|
||||
- 100
|
||||
- Maximum replicas
|
||||
* - autoscaling.minReplicas
|
||||
- int
|
||||
- 1
|
||||
- Minimum replicas
|
||||
* - autoscaling.targetCPUUtilizationPercentage
|
||||
- int
|
||||
- 80
|
||||
- Target CPU utilization for autoscaling
|
||||
* - configs
|
||||
- object
|
||||
- {}
|
||||
- Configmap
|
||||
* - containerPort
|
||||
- int
|
||||
- 8000
|
||||
- Container port
|
||||
* - customObjects
|
||||
- list
|
||||
- []
|
||||
- Custom Objects configuration
|
||||
* - deploymentStrategy
|
||||
- object
|
||||
- {}
|
||||
- Deployment strategy configuration
|
||||
* - externalConfigs
|
||||
- list
|
||||
- []
|
||||
- External configuration
|
||||
* - extraContainers
|
||||
- list
|
||||
- []
|
||||
- Additional containers configuration
|
||||
* - extraInit
|
||||
- object
|
||||
- {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true}
|
||||
- Additional configuration for the init container
|
||||
* - extraInit.pvcStorage
|
||||
- string
|
||||
- "50Gi"
|
||||
- Storage size of the s3
|
||||
* - extraInit.s3modelpath
|
||||
- string
|
||||
- "relative_s3_model_path/opt-125m"
|
||||
- Path of the model on the s3 which hosts model weights and config files
|
||||
* - extraInit.awsEc2MetadataDisabled
|
||||
- boolean
|
||||
- true
|
||||
- Disables the use of the Amazon EC2 instance metadata service
|
||||
* - extraPorts
|
||||
- list
|
||||
- []
|
||||
- Additional ports configuration
|
||||
* - gpuModels
|
||||
- list
|
||||
- ["TYPE_GPU_USED"]
|
||||
- Type of gpu used
|
||||
* - image
|
||||
- object
|
||||
- {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"}
|
||||
- Image configuration
|
||||
* - image.command
|
||||
- list
|
||||
- ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"]
|
||||
- Container launch command
|
||||
* - image.repository
|
||||
- string
|
||||
- "vllm/vllm-openai"
|
||||
- Image repository
|
||||
* - image.tag
|
||||
- string
|
||||
- "latest"
|
||||
- Image tag
|
||||
* - livenessProbe
|
||||
- object
|
||||
- {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10}
|
||||
- Liveness probe configuration
|
||||
* - livenessProbe.failureThreshold
|
||||
- int
|
||||
- 3
|
||||
- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
|
||||
* - livenessProbe.httpGet
|
||||
- object
|
||||
- {"path":"/health","port":8000}
|
||||
- Configuration of the Kubelet http request on the server
|
||||
* - livenessProbe.httpGet.path
|
||||
- string
|
||||
- "/health"
|
||||
- Path to access on the HTTP server
|
||||
* - livenessProbe.httpGet.port
|
||||
- int
|
||||
- 8000
|
||||
- Name or number of the port to access on the container, on which the server is listening
|
||||
* - livenessProbe.initialDelaySeconds
|
||||
- int
|
||||
- 15
|
||||
- Number of seconds after the container has started before liveness probe is initiated
|
||||
* - livenessProbe.periodSeconds
|
||||
- int
|
||||
- 10
|
||||
- How often (in seconds) to perform the liveness probe
|
||||
* - maxUnavailablePodDisruptionBudget
|
||||
- string
|
||||
- ""
|
||||
- Disruption Budget Configuration
|
||||
* - readinessProbe
|
||||
- object
|
||||
- {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5}
|
||||
- Readiness probe configuration
|
||||
* - readinessProbe.failureThreshold
|
||||
- int
|
||||
- 3
|
||||
- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
|
||||
* - readinessProbe.httpGet
|
||||
- object
|
||||
- {"path":"/health","port":8000}
|
||||
- Configuration of the Kubelet http request on the server
|
||||
* - readinessProbe.httpGet.path
|
||||
- string
|
||||
- "/health"
|
||||
- Path to access on the HTTP server
|
||||
* - readinessProbe.httpGet.port
|
||||
- int
|
||||
- 8000
|
||||
- Name or number of the port to access on the container, on which the server is listening
|
||||
* - readinessProbe.initialDelaySeconds
|
||||
- int
|
||||
- 5
|
||||
- Number of seconds after the container has started before readiness probe is initiated
|
||||
* - readinessProbe.periodSeconds
|
||||
- int
|
||||
- 5
|
||||
- How often (in seconds) to perform the readiness probe
|
||||
* - replicaCount
|
||||
- int
|
||||
- 1
|
||||
- Number of replicas
|
||||
* - resources
|
||||
- object
|
||||
- {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}}
|
||||
- Resource configuration
|
||||
* - resources.limits."nvidia.com/gpu"
|
||||
- int
|
||||
- 1
|
||||
- Number of gpus used
|
||||
* - resources.limits.cpu
|
||||
- int
|
||||
- 4
|
||||
- Number of CPUs
|
||||
* - resources.limits.memory
|
||||
- string
|
||||
- "16Gi"
|
||||
- CPU memory configuration
|
||||
* - resources.requests."nvidia.com/gpu"
|
||||
- int
|
||||
- 1
|
||||
- Number of gpus used
|
||||
* - resources.requests.cpu
|
||||
- int
|
||||
- 4
|
||||
- Number of CPUs
|
||||
* - resources.requests.memory
|
||||
- string
|
||||
- "16Gi"
|
||||
- CPU memory configuration
|
||||
* - secrets
|
||||
- object
|
||||
- {}
|
||||
- Secrets configuration
|
||||
* - serviceName
|
||||
- string
|
||||
-
|
||||
- Service name
|
||||
* - servicePort
|
||||
- int
|
||||
- 80
|
||||
- Service port
|
||||
* - labels.environment
|
||||
- string
|
||||
- test
|
||||
- Environment name
|
||||
* - labels.release
|
||||
- string
|
||||
- test
|
||||
- Release name
|
||||
* - Key
|
||||
- Type
|
||||
- Default
|
||||
- Description
|
||||
* - autoscaling
|
||||
- object
|
||||
- {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}
|
||||
- Autoscaling configuration
|
||||
* - autoscaling.enabled
|
||||
- bool
|
||||
- false
|
||||
- Enable autoscaling
|
||||
* - autoscaling.maxReplicas
|
||||
- int
|
||||
- 100
|
||||
- Maximum replicas
|
||||
* - autoscaling.minReplicas
|
||||
- int
|
||||
- 1
|
||||
- Minimum replicas
|
||||
* - autoscaling.targetCPUUtilizationPercentage
|
||||
- int
|
||||
- 80
|
||||
- Target CPU utilization for autoscaling
|
||||
* - configs
|
||||
- object
|
||||
- {}
|
||||
- Configmap
|
||||
* - containerPort
|
||||
- int
|
||||
- 8000
|
||||
- Container port
|
||||
* - customObjects
|
||||
- list
|
||||
- []
|
||||
- Custom Objects configuration
|
||||
* - deploymentStrategy
|
||||
- object
|
||||
- {}
|
||||
- Deployment strategy configuration
|
||||
* - externalConfigs
|
||||
- list
|
||||
- []
|
||||
- External configuration
|
||||
* - extraContainers
|
||||
- list
|
||||
- []
|
||||
- Additional containers configuration
|
||||
* - extraInit
|
||||
- object
|
||||
- {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true}
|
||||
- Additional configuration for the init container
|
||||
* - extraInit.pvcStorage
|
||||
- string
|
||||
- "50Gi"
|
||||
- Storage size of the s3
|
||||
* - extraInit.s3modelpath
|
||||
- string
|
||||
- "relative_s3_model_path/opt-125m"
|
||||
- Path of the model on the s3 which hosts model weights and config files
|
||||
* - extraInit.awsEc2MetadataDisabled
|
||||
- boolean
|
||||
- true
|
||||
- Disables the use of the Amazon EC2 instance metadata service
|
||||
* - extraPorts
|
||||
- list
|
||||
- []
|
||||
- Additional ports configuration
|
||||
* - gpuModels
|
||||
- list
|
||||
- ["TYPE_GPU_USED"]
|
||||
- Type of gpu used
|
||||
* - image
|
||||
- object
|
||||
- {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"}
|
||||
- Image configuration
|
||||
* - image.command
|
||||
- list
|
||||
- ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"]
|
||||
- Container launch command
|
||||
* - image.repository
|
||||
- string
|
||||
- "vllm/vllm-openai"
|
||||
- Image repository
|
||||
* - image.tag
|
||||
- string
|
||||
- "latest"
|
||||
- Image tag
|
||||
* - livenessProbe
|
||||
- object
|
||||
- {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10}
|
||||
- Liveness probe configuration
|
||||
* - livenessProbe.failureThreshold
|
||||
- int
|
||||
- 3
|
||||
- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
|
||||
* - livenessProbe.httpGet
|
||||
- object
|
||||
- {"path":"/health","port":8000}
|
||||
- Configuration of the Kubelet http request on the server
|
||||
* - livenessProbe.httpGet.path
|
||||
- string
|
||||
- "/health"
|
||||
- Path to access on the HTTP server
|
||||
* - livenessProbe.httpGet.port
|
||||
- int
|
||||
- 8000
|
||||
- Name or number of the port to access on the container, on which the server is listening
|
||||
* - livenessProbe.initialDelaySeconds
|
||||
- int
|
||||
- 15
|
||||
- Number of seconds after the container has started before liveness probe is initiated
|
||||
* - livenessProbe.periodSeconds
|
||||
- int
|
||||
- 10
|
||||
- How often (in seconds) to perform the liveness probe
|
||||
* - maxUnavailablePodDisruptionBudget
|
||||
- string
|
||||
- ""
|
||||
- Disruption Budget Configuration
|
||||
* - readinessProbe
|
||||
- object
|
||||
- {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5}
|
||||
- Readiness probe configuration
|
||||
* - readinessProbe.failureThreshold
|
||||
- int
|
||||
- 3
|
||||
- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
|
||||
* - readinessProbe.httpGet
|
||||
- object
|
||||
- {"path":"/health","port":8000}
|
||||
- Configuration of the Kubelet http request on the server
|
||||
* - readinessProbe.httpGet.path
|
||||
- string
|
||||
- "/health"
|
||||
- Path to access on the HTTP server
|
||||
* - readinessProbe.httpGet.port
|
||||
- int
|
||||
- 8000
|
||||
- Name or number of the port to access on the container, on which the server is listening
|
||||
* - readinessProbe.initialDelaySeconds
|
||||
- int
|
||||
- 5
|
||||
- Number of seconds after the container has started before readiness probe is initiated
|
||||
* - readinessProbe.periodSeconds
|
||||
- int
|
||||
- 5
|
||||
- How often (in seconds) to perform the readiness probe
|
||||
* - replicaCount
|
||||
- int
|
||||
- 1
|
||||
- Number of replicas
|
||||
* - resources
|
||||
- object
|
||||
- {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}}
|
||||
- Resource configuration
|
||||
* - resources.limits."nvidia.com/gpu"
|
||||
- int
|
||||
- 1
|
||||
- Number of gpus used
|
||||
* - resources.limits.cpu
|
||||
- int
|
||||
- 4
|
||||
- Number of CPUs
|
||||
* - resources.limits.memory
|
||||
- string
|
||||
- "16Gi"
|
||||
- CPU memory configuration
|
||||
* - resources.requests."nvidia.com/gpu"
|
||||
- int
|
||||
- 1
|
||||
- Number of gpus used
|
||||
* - resources.requests.cpu
|
||||
- int
|
||||
- 4
|
||||
- Number of CPUs
|
||||
* - resources.requests.memory
|
||||
- string
|
||||
- "16Gi"
|
||||
- CPU memory configuration
|
||||
* - secrets
|
||||
- object
|
||||
- {}
|
||||
- Secrets configuration
|
||||
* - serviceName
|
||||
- string
|
||||
-
|
||||
- Service name
|
||||
* - servicePort
|
||||
- int
|
||||
- 80
|
||||
- Service port
|
||||
* - labels.environment
|
||||
- string
|
||||
- test
|
||||
- Environment name
|
||||
* - labels.release
|
||||
- string
|
||||
- test
|
||||
- Release name
|
||||
```
|
||||
|
||||
@ -47,7 +47,11 @@ data:
|
||||
token: "REPLACE_WITH_TOKEN"
|
||||
```
|
||||
|
||||
Create a deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model:
|
||||
Next, create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model.
|
||||
|
||||
Here are two examples for using NVIDIA GPU and AMD GPU.
|
||||
|
||||
- NVIDIA GPU
|
||||
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
@ -119,6 +123,79 @@ spec:
|
||||
periodSeconds: 5
|
||||
```
|
||||
|
||||
- AMD GPU
|
||||
|
||||
You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X.
|
||||
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: mistral-7b
|
||||
namespace: default
|
||||
labels:
|
||||
app: mistral-7b
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: mistral-7b
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: mistral-7b
|
||||
spec:
|
||||
volumes:
|
||||
# PVC
|
||||
- name: cache-volume
|
||||
persistentVolumeClaim:
|
||||
claimName: mistral-7b
|
||||
# vLLM needs to access the host's shared memory for tensor parallel inference.
|
||||
- name: shm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
sizeLimit: "8Gi"
|
||||
hostNetwork: true
|
||||
hostIPC: true
|
||||
containers:
|
||||
- name: mistral-7b
|
||||
image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
|
||||
securityContext:
|
||||
seccompProfile:
|
||||
type: Unconfined
|
||||
runAsGroup: 44
|
||||
capabilities:
|
||||
add:
|
||||
- SYS_PTRACE
|
||||
command: ["/bin/sh", "-c"]
|
||||
args: [
|
||||
"vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
|
||||
]
|
||||
env:
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
resources:
|
||||
limits:
|
||||
cpu: "10"
|
||||
memory: 20G
|
||||
amd.com/gpu: "1"
|
||||
requests:
|
||||
cpu: "6"
|
||||
memory: 6G
|
||||
amd.com/gpu: "1"
|
||||
volumeMounts:
|
||||
- name: cache-volume
|
||||
mountPath: /root/.cache/huggingface
|
||||
- name: shm
|
||||
mountPath: /dev/shm
|
||||
```
|
||||
You can get the full example with steps and sample yaml files from <https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve>.
|
||||
|
||||
2. **Create a Kubernetes Service for vLLM**
|
||||
|
||||
Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:
|
||||
|
||||
@ -8,7 +8,7 @@ Before going into the details of distributed inference and serving, let's first
|
||||
|
||||
- **Single GPU (no distributed inference)**: If your model fits in a single GPU, you probably don't need to use distributed inference. Just use the single GPU to run the inference.
|
||||
- **Single-Node Multi-GPU (tensor parallel inference)**: If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you have 4 GPUs in a single node, you can set the tensor parallel size to 4.
|
||||
- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2.
|
||||
- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2.
|
||||
|
||||
In short, you should increase the number of GPUs and the number of nodes until you have enough GPU memory to hold the model. The tensor parallel size should be the number of GPUs in each node, and the pipeline parallel size should be the number of nodes.
|
||||
|
||||
@ -77,7 +77,7 @@ Then you get a ray cluster of containers. Note that you need to keep the shells
|
||||
|
||||
Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` to check the status of the Ray cluster. You should see the right number of nodes and GPUs.
|
||||
|
||||
After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
|
||||
After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
|
||||
|
||||
```console
|
||||
$ vllm serve /path/to/the/model/in/the/container \
|
||||
@ -85,7 +85,7 @@ $ --tensor-parallel-size 8 \
|
||||
$ --pipeline-parallel-size 2
|
||||
```
|
||||
|
||||
You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 16:
|
||||
You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16:
|
||||
|
||||
```console
|
||||
$ vllm serve /path/to/the/model/in/the/container \
|
||||
|
||||
@ -41,7 +41,7 @@ For reading from S3, it will be the number of client instances the host is openi
|
||||
$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}'
|
||||
```
|
||||
|
||||
You can controls the size of the CPU Memory buffer to which tensors are read from the file, and limit this size.
|
||||
You can control the size of the CPU Memory buffer to which tensors are read from the file, and limit this size.
|
||||
You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit).
|
||||
|
||||
```console
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
# Structured Outputs
|
||||
|
||||
vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines) or [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer) as backends for the guided decoding.
|
||||
vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines), [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer), or [xgrammar](https://github.com/mlc-ai/xgrammar) as backends for the guided decoding.
|
||||
This document shows you some examples of the different options that are available to generate structured outputs.
|
||||
|
||||
## Online Inference (OpenAI API)
|
||||
|
||||
@ -24,10 +24,13 @@ def run_aria(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
model_name = "rhymes-ai/Aria"
|
||||
|
||||
# NOTE: Need L40 (or equivalent) to avoid OOM
|
||||
llm = LLM(model=model_name,
|
||||
tokenizer_mode="slow",
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
trust_remote_code=True,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
|
||||
prompt = (f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>\n{question}"
|
||||
@ -57,6 +60,7 @@ def run_chameleon(question: str, modality: str):
|
||||
prompt = f"{question}<image>"
|
||||
llm = LLM(model="facebook/chameleon-7b",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
@ -257,7 +261,7 @@ def run_minicpmv(question: str, modality: str):
|
||||
# 2.5
|
||||
# model_name = "openbmb/MiniCPM-Llama3-V-2_5"
|
||||
|
||||
#2.6
|
||||
# 2.6
|
||||
model_name = "openbmb/MiniCPM-V-2_6"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||
trust_remote_code=True)
|
||||
@ -308,7 +312,20 @@ def run_mllama(question: str, modality: str):
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
prompt = f"<|image|><|begin_of_text|>{question}"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type": "image"
|
||||
}, {
|
||||
"type": "text",
|
||||
"text": f"{question}"
|
||||
}]
|
||||
}]
|
||||
prompt = tokenizer.apply_chat_template(messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=False)
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
@ -417,9 +434,11 @@ def run_pixtral_hf(question: str, modality: str):
|
||||
|
||||
model_name = "mistral-community/pixtral-12b"
|
||||
|
||||
# NOTE: Need L40 (or equivalent) to avoid OOM
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
|
||||
@ -2,6 +2,6 @@
|
||||
-r requirements-common.txt
|
||||
|
||||
# Dependencies for Neuron devices
|
||||
transformers-neuronx >= 0.12.0
|
||||
torch-neuronx >= 2.1.2
|
||||
transformers-neuronx >= 0.13.0
|
||||
torch-neuronx >= 2.5.0
|
||||
neuronx-cc
|
||||
|
||||
52
setup.py
52
setup.py
@ -1,3 +1,4 @@
|
||||
import ctypes
|
||||
import importlib.util
|
||||
import logging
|
||||
import os
|
||||
@ -13,7 +14,7 @@ from packaging.version import Version, parse
|
||||
from setuptools import Extension, find_packages, setup
|
||||
from setuptools.command.build_ext import build_ext
|
||||
from setuptools_scm import get_version
|
||||
from torch.utils.cpp_extension import CUDA_HOME
|
||||
from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
|
||||
|
||||
|
||||
def load_module_from_path(module_name, path):
|
||||
@ -379,25 +380,31 @@ def _build_custom_ops() -> bool:
|
||||
return _is_cuda() or _is_hip() or _is_cpu()
|
||||
|
||||
|
||||
def get_hipcc_rocm_version():
|
||||
# Run the hipcc --version command
|
||||
result = subprocess.run(['hipcc', '--version'],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
text=True)
|
||||
def get_rocm_version():
|
||||
# Get the Rocm version from the ROCM_HOME/bin/librocm-core.so
|
||||
# see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
|
||||
try:
|
||||
librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
|
||||
if not librocm_core_file.is_file():
|
||||
return None
|
||||
librocm_core = ctypes.CDLL(librocm_core_file)
|
||||
VerErrors = ctypes.c_uint32
|
||||
get_rocm_core_version = librocm_core.getROCmVersion
|
||||
get_rocm_core_version.restype = VerErrors
|
||||
get_rocm_core_version.argtypes = [
|
||||
ctypes.POINTER(ctypes.c_uint32),
|
||||
ctypes.POINTER(ctypes.c_uint32),
|
||||
ctypes.POINTER(ctypes.c_uint32),
|
||||
]
|
||||
major = ctypes.c_uint32()
|
||||
minor = ctypes.c_uint32()
|
||||
patch = ctypes.c_uint32()
|
||||
|
||||
# Check if the command was executed successfully
|
||||
if result.returncode != 0:
|
||||
print("Error running 'hipcc --version'")
|
||||
if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor),
|
||||
ctypes.byref(patch)) == 0):
|
||||
return "%d.%d.%d" % (major.value, minor.value, patch.value)
|
||||
return None
|
||||
|
||||
# Extract the version using a regular expression
|
||||
match = re.search(r'HIP version: (\S+)', result.stdout)
|
||||
if match:
|
||||
# Return the version string
|
||||
return match.group(1)
|
||||
else:
|
||||
print("Could not find HIP version in the output")
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
@ -479,11 +486,10 @@ def get_vllm_version() -> str:
|
||||
if "sdist" not in sys.argv:
|
||||
version += f"{sep}cu{cuda_version_str}"
|
||||
elif _is_hip():
|
||||
# Get the HIP version
|
||||
hipcc_version = get_hipcc_rocm_version()
|
||||
if hipcc_version != MAIN_CUDA_VERSION:
|
||||
rocm_version_str = hipcc_version.replace(".", "")[:3]
|
||||
version += f"{sep}rocm{rocm_version_str}"
|
||||
# Get the Rocm Version
|
||||
rocm_version = get_rocm_version() or torch.version.hip
|
||||
if rocm_version and rocm_version != MAIN_CUDA_VERSION:
|
||||
version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}"
|
||||
elif _is_neuron():
|
||||
# Get the Neuron version
|
||||
neuron_version = str(get_neuronxcc_version())
|
||||
|
||||
@ -7,7 +7,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are
|
||||
initialized randomly with a fixed seed.
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple
|
||||
from typing import Any, List, Optional, Tuple
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -54,6 +54,16 @@ class LlamaConfig:
|
||||
tractable_init: bool = False
|
||||
random_seed: int = 0
|
||||
|
||||
def compute_hash(self) -> str:
|
||||
factors: List[Any] = []
|
||||
for k, v in self.__dict__.items():
|
||||
if k == "random_seed":
|
||||
continue
|
||||
factors.append((k, v))
|
||||
factors.sort()
|
||||
import hashlib
|
||||
return hashlib.md5(str(factors).encode()).hexdigest()
|
||||
|
||||
def __post_init__(self):
|
||||
assert self.mlp_size >= self.hidden_size
|
||||
|
||||
@ -263,7 +273,8 @@ def run_model(llama_config,
|
||||
compilation_config = CompilationConfig(
|
||||
level=CompilationLevel.NO_COMPILATION, )
|
||||
|
||||
vllm_config = VllmConfig(compilation_config=compilation_config)
|
||||
vllm_config = VllmConfig(compilation_config=compilation_config,
|
||||
additional_config=llama_config)
|
||||
with set_current_vllm_config(vllm_config):
|
||||
model = LlamaModel(config=llama_config,
|
||||
vllm_config=vllm_config,
|
||||
|
||||
@ -31,7 +31,6 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
|
||||
to_enc_dec_tuple_list, zip_enc_dec_prompts)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sampling_params import BeamSearchParams
|
||||
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
|
||||
identity)
|
||||
@ -242,6 +241,7 @@ _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
|
||||
class HfRunner:
|
||||
|
||||
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
|
||||
from vllm.platforms import current_platform
|
||||
if x is None or isinstance(x, (bool, )):
|
||||
return x
|
||||
|
||||
|
||||
@ -4,7 +4,7 @@ import pytest
|
||||
|
||||
from vllm.entrypoints.openai.cli_args import (make_arg_parser,
|
||||
validate_parsed_serve_args)
|
||||
from vllm.entrypoints.openai.serving_engine import LoRAModulePath
|
||||
from vllm.entrypoints.openai.serving_models import LoRAModulePath
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
from ...utils import VLLM_PATH
|
||||
|
||||
@ -28,6 +28,8 @@ PA_NAME = "swapnilbp/llama_tweet_ptune"
|
||||
# need to change to match the prompt adapter
|
||||
PA_NUM_VIRTUAL_TOKENS = 8
|
||||
|
||||
GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def zephyr_lora_files():
|
||||
@ -635,8 +637,7 @@ async def test_allowed_token_ids(client: openai.AsyncOpenAI):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend",
|
||||
["outlines", "lm-format-enforcer"])
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_guided_json_completion(client: openai.AsyncOpenAI,
|
||||
guided_decoding_backend: str,
|
||||
sample_json_schema):
|
||||
@ -658,8 +659,7 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI,
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend",
|
||||
["outlines", "lm-format-enforcer"])
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_guided_regex_completion(client: openai.AsyncOpenAI,
|
||||
guided_decoding_backend: str,
|
||||
sample_regex):
|
||||
@ -680,8 +680,7 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI,
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend",
|
||||
["outlines", "lm-format-enforcer"])
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_guided_choice_completion(client: openai.AsyncOpenAI,
|
||||
guided_decoding_backend: str,
|
||||
sample_guided_choice):
|
||||
@ -761,8 +760,7 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend",
|
||||
["outlines", "lm-format-enforcer"])
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
|
||||
guided_decoding_backend: str,
|
||||
sample_json_schema, sample_regex):
|
||||
|
||||
@ -55,7 +55,10 @@ def server_with_lora_modules_json(zephyr_lora_files):
|
||||
"64",
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
# Enable the /v1/load_lora_adapter endpoint
|
||||
envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"}
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -67,8 +70,8 @@ async def client_for_lora_lineage(server_with_lora_modules_json):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI,
|
||||
zephyr_lora_files):
|
||||
async def test_static_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI,
|
||||
zephyr_lora_files):
|
||||
models = await client_for_lora_lineage.models.list()
|
||||
models = models.data
|
||||
served_model = models[0]
|
||||
@ -81,3 +84,26 @@ async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI,
|
||||
assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
|
||||
assert lora_models[0].id == "zephyr-lora"
|
||||
assert lora_models[1].id == "zephyr-lora2"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_dynamic_lora_lineage(
|
||||
client_for_lora_lineage: openai.AsyncOpenAI, zephyr_lora_files):
|
||||
|
||||
response = await client_for_lora_lineage.post("load_lora_adapter",
|
||||
cast_to=str,
|
||||
body={
|
||||
"lora_name":
|
||||
"zephyr-lora-3",
|
||||
"lora_path":
|
||||
zephyr_lora_files
|
||||
})
|
||||
# Ensure adapter loads before querying /models
|
||||
assert "success" in response
|
||||
|
||||
models = await client_for_lora_lineage.models.list()
|
||||
models = models.data
|
||||
dynamic_lora_model = models[-1]
|
||||
assert dynamic_lora_model.root == zephyr_lora_files
|
||||
assert dynamic_lora_model.parent == MODEL_NAME
|
||||
assert dynamic_lora_model.id == "zephyr-lora-3"
|
||||
|
||||
@ -8,7 +8,8 @@ from vllm.config import MultiModalConfig
|
||||
from vllm.engine.multiprocessing.client import MQLLMEngineClient
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
from vllm.entrypoints.openai.serving_engine import BaseModelPath
|
||||
from vllm.entrypoints.openai.serving_models import (BaseModelPath,
|
||||
OpenAIServingModels)
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
MODEL_NAME = "openai-community/gpt2"
|
||||
@ -33,6 +34,7 @@ class MockModelConfig:
|
||||
hf_config = MockHFConfig()
|
||||
logits_processor_pattern = None
|
||||
diff_sampling_param: Optional[dict] = None
|
||||
allowed_local_media_path: str = ""
|
||||
|
||||
def get_diff_sampling_param(self):
|
||||
return self.diff_sampling_param or {}
|
||||
@ -49,14 +51,13 @@ async def _async_serving_chat_init():
|
||||
engine = MockEngine()
|
||||
model_config = await engine.get_model_config()
|
||||
|
||||
models = OpenAIServingModels(model_config, BASE_MODEL_PATHS)
|
||||
serving_completion = OpenAIServingChat(engine,
|
||||
model_config,
|
||||
BASE_MODEL_PATHS,
|
||||
models,
|
||||
response_role="assistant",
|
||||
chat_template=CHAT_TEMPLATE,
|
||||
chat_template_content_format="auto",
|
||||
lora_modules=None,
|
||||
prompt_adapters=None,
|
||||
request_logger=None)
|
||||
return serving_completion
|
||||
|
||||
@ -71,14 +72,14 @@ def test_serving_chat_should_set_correct_max_tokens():
|
||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||
mock_engine.errored = False
|
||||
|
||||
models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
|
||||
model_config=MockModelConfig())
|
||||
serving_chat = OpenAIServingChat(mock_engine,
|
||||
MockModelConfig(),
|
||||
BASE_MODEL_PATHS,
|
||||
models,
|
||||
response_role="assistant",
|
||||
chat_template=CHAT_TEMPLATE,
|
||||
chat_template_content_format="auto",
|
||||
lora_modules=None,
|
||||
prompt_adapters=None,
|
||||
request_logger=None)
|
||||
req = ChatCompletionRequest(
|
||||
model=MODEL_NAME,
|
||||
@ -114,14 +115,14 @@ def test_serving_chat_could_load_correct_generation_config():
|
||||
mock_engine.errored = False
|
||||
|
||||
# Initialize the serving chat
|
||||
models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
|
||||
model_config=mock_model_config)
|
||||
serving_chat = OpenAIServingChat(mock_engine,
|
||||
mock_model_config,
|
||||
BASE_MODEL_PATHS,
|
||||
models,
|
||||
response_role="assistant",
|
||||
chat_template=CHAT_TEMPLATE,
|
||||
chat_template_content_format="auto",
|
||||
lora_modules=None,
|
||||
prompt_adapters=None,
|
||||
request_logger=None)
|
||||
req = ChatCompletionRequest(
|
||||
model=MODEL_NAME,
|
||||
|
||||
@ -4,11 +4,11 @@ from unittest.mock import MagicMock
|
||||
import pytest
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.openai.protocol import (ErrorResponse,
|
||||
LoadLoraAdapterRequest,
|
||||
UnloadLoraAdapterRequest)
|
||||
from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_models import (BaseModelPath,
|
||||
OpenAIServingModels)
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
MODEL_NAME = "meta-llama/Llama-2-7b"
|
||||
@ -19,47 +19,45 @@ LORA_UNLOADING_SUCCESS_MESSAGE = (
|
||||
"Success: LoRA adapter '{lora_name}' removed successfully.")
|
||||
|
||||
|
||||
async def _async_serving_engine_init():
|
||||
mock_engine_client = MagicMock(spec=EngineClient)
|
||||
async def _async_serving_models_init() -> OpenAIServingModels:
|
||||
mock_model_config = MagicMock(spec=ModelConfig)
|
||||
# Set the max_model_len attribute to avoid missing attribute
|
||||
mock_model_config.max_model_len = 2048
|
||||
|
||||
serving_engine = OpenAIServing(mock_engine_client,
|
||||
mock_model_config,
|
||||
BASE_MODEL_PATHS,
|
||||
lora_modules=None,
|
||||
prompt_adapters=None,
|
||||
request_logger=None)
|
||||
return serving_engine
|
||||
serving_models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
|
||||
model_config=mock_model_config,
|
||||
lora_modules=None,
|
||||
prompt_adapters=None)
|
||||
|
||||
return serving_models
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_serving_model_name():
|
||||
serving_engine = await _async_serving_engine_init()
|
||||
assert serving_engine._get_model_name(None) == MODEL_NAME
|
||||
serving_models = await _async_serving_models_init()
|
||||
assert serving_models.model_name(None) == MODEL_NAME
|
||||
request = LoRARequest(lora_name="adapter",
|
||||
lora_path="/path/to/adapter2",
|
||||
lora_int_id=1)
|
||||
assert serving_engine._get_model_name(request) == request.lora_name
|
||||
assert serving_models.model_name(request) == request.lora_name
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_load_lora_adapter_success():
|
||||
serving_engine = await _async_serving_engine_init()
|
||||
serving_models = await _async_serving_models_init()
|
||||
request = LoadLoraAdapterRequest(lora_name="adapter",
|
||||
lora_path="/path/to/adapter2")
|
||||
response = await serving_engine.load_lora_adapter(request)
|
||||
response = await serving_models.load_lora_adapter(request)
|
||||
assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
|
||||
assert len(serving_engine.lora_requests) == 1
|
||||
assert serving_engine.lora_requests[0].lora_name == "adapter"
|
||||
assert len(serving_models.lora_requests) == 1
|
||||
assert serving_models.lora_requests[0].lora_name == "adapter"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_load_lora_adapter_missing_fields():
|
||||
serving_engine = await _async_serving_engine_init()
|
||||
serving_models = await _async_serving_models_init()
|
||||
request = LoadLoraAdapterRequest(lora_name="", lora_path="")
|
||||
response = await serving_engine.load_lora_adapter(request)
|
||||
response = await serving_models.load_lora_adapter(request)
|
||||
assert isinstance(response, ErrorResponse)
|
||||
assert response.type == "InvalidUserInput"
|
||||
assert response.code == HTTPStatus.BAD_REQUEST
|
||||
@ -67,43 +65,43 @@ async def test_load_lora_adapter_missing_fields():
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_load_lora_adapter_duplicate():
|
||||
serving_engine = await _async_serving_engine_init()
|
||||
serving_models = await _async_serving_models_init()
|
||||
request = LoadLoraAdapterRequest(lora_name="adapter1",
|
||||
lora_path="/path/to/adapter1")
|
||||
response = await serving_engine.load_lora_adapter(request)
|
||||
response = await serving_models.load_lora_adapter(request)
|
||||
assert response == LORA_LOADING_SUCCESS_MESSAGE.format(
|
||||
lora_name='adapter1')
|
||||
assert len(serving_engine.lora_requests) == 1
|
||||
assert len(serving_models.lora_requests) == 1
|
||||
|
||||
request = LoadLoraAdapterRequest(lora_name="adapter1",
|
||||
lora_path="/path/to/adapter1")
|
||||
response = await serving_engine.load_lora_adapter(request)
|
||||
response = await serving_models.load_lora_adapter(request)
|
||||
assert isinstance(response, ErrorResponse)
|
||||
assert response.type == "InvalidUserInput"
|
||||
assert response.code == HTTPStatus.BAD_REQUEST
|
||||
assert len(serving_engine.lora_requests) == 1
|
||||
assert len(serving_models.lora_requests) == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_unload_lora_adapter_success():
|
||||
serving_engine = await _async_serving_engine_init()
|
||||
serving_models = await _async_serving_models_init()
|
||||
request = LoadLoraAdapterRequest(lora_name="adapter1",
|
||||
lora_path="/path/to/adapter1")
|
||||
response = await serving_engine.load_lora_adapter(request)
|
||||
assert len(serving_engine.lora_requests) == 1
|
||||
response = await serving_models.load_lora_adapter(request)
|
||||
assert len(serving_models.lora_requests) == 1
|
||||
|
||||
request = UnloadLoraAdapterRequest(lora_name="adapter1")
|
||||
response = await serving_engine.unload_lora_adapter(request)
|
||||
response = await serving_models.unload_lora_adapter(request)
|
||||
assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(
|
||||
lora_name='adapter1')
|
||||
assert len(serving_engine.lora_requests) == 0
|
||||
assert len(serving_models.lora_requests) == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_unload_lora_adapter_missing_fields():
|
||||
serving_engine = await _async_serving_engine_init()
|
||||
serving_models = await _async_serving_models_init()
|
||||
request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None)
|
||||
response = await serving_engine.unload_lora_adapter(request)
|
||||
response = await serving_models.unload_lora_adapter(request)
|
||||
assert isinstance(response, ErrorResponse)
|
||||
assert response.type == "InvalidUserInput"
|
||||
assert response.code == HTTPStatus.BAD_REQUEST
|
||||
@ -111,9 +109,9 @@ async def test_unload_lora_adapter_missing_fields():
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_unload_lora_adapter_not_found():
|
||||
serving_engine = await _async_serving_engine_init()
|
||||
serving_models = await _async_serving_models_init()
|
||||
request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter")
|
||||
response = await serving_engine.unload_lora_adapter(request)
|
||||
response = await serving_models.unload_lora_adapter(request)
|
||||
assert isinstance(response, ErrorResponse)
|
||||
assert response.type == "InvalidUserInput"
|
||||
assert response.code == HTTPStatus.BAD_REQUEST
|
||||
@ -91,5 +91,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
|
||||
assert len(embeddings.data) == 1
|
||||
assert len(embeddings.data[0].embedding) == 3072
|
||||
assert embeddings.usage.completion_tokens == 0
|
||||
assert embeddings.usage.prompt_tokens == 765
|
||||
assert embeddings.usage.total_tokens == 765
|
||||
assert embeddings.usage.prompt_tokens == 764
|
||||
assert embeddings.usage.total_tokens == 764
|
||||
|
||||
@ -2,7 +2,6 @@ import warnings
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.config import ModelConfig
|
||||
@ -91,10 +90,7 @@ def _assert_mm_data_is_image_input(
|
||||
image_data = mm_data.get("image")
|
||||
assert image_data is not None
|
||||
|
||||
if image_count == 1:
|
||||
assert isinstance(image_data, Image.Image)
|
||||
else:
|
||||
assert isinstance(image_data, list) and len(image_data) == image_count
|
||||
assert isinstance(image_data, list) and len(image_data) == image_count
|
||||
|
||||
|
||||
def test_parse_chat_messages_single_image(
|
||||
|
||||
@ -5,7 +5,10 @@ import torch
|
||||
|
||||
from tests.kernels.utils import override_backend_env_variable
|
||||
from vllm.attention.selector import which_attn_to_use
|
||||
from vllm.platforms import cpu, cuda, openvino, rocm
|
||||
from vllm.platforms.cpu import CpuPlatform
|
||||
from vllm.platforms.cuda import CudaPlatform
|
||||
from vllm.platforms.openvino import OpenVinoPlatform
|
||||
from vllm.platforms.rocm import RocmPlatform
|
||||
from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL
|
||||
|
||||
|
||||
@ -20,26 +23,23 @@ def test_env(name: str, device: str, monkeypatch):
|
||||
override_backend_env_variable(monkeypatch, name)
|
||||
|
||||
if device == "cpu":
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
cpu.CpuPlatform()):
|
||||
with patch("vllm.attention.selector.current_platform", CpuPlatform()):
|
||||
backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
|
||||
False)
|
||||
assert backend.name == "TORCH_SDPA"
|
||||
elif device == "hip":
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
rocm.RocmPlatform()):
|
||||
with patch("vllm.attention.selector.current_platform", RocmPlatform()):
|
||||
backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
|
||||
False)
|
||||
assert backend.name == "ROCM_FLASH"
|
||||
elif device == "openvino":
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
openvino.OpenVinoPlatform()):
|
||||
OpenVinoPlatform()):
|
||||
backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
|
||||
False)
|
||||
assert backend.name == "OPENVINO"
|
||||
else:
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
cuda.CudaPlatform()):
|
||||
with patch("vllm.attention.selector.current_platform", CudaPlatform()):
|
||||
backend = which_attn_to_use(16, torch.float16, torch.float16, 16,
|
||||
False)
|
||||
assert backend.name == name
|
||||
|
||||
@ -4,6 +4,7 @@ from typing import Dict, List, TypedDict
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
import safetensors
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from huggingface_hub import snapshot_download
|
||||
@ -169,6 +170,29 @@ def mixtral_lora_files_all_target_modules():
|
||||
return snapshot_download(repo_id="dyang415/mixtral-lora-v0")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def jamba_lora_files():
|
||||
# some of the adapters have unnecessary weights for serving,
|
||||
# hence we remove them
|
||||
def remove_unnecessary_weights(path):
|
||||
lora_path = f"{adapter_path}/adapter_model.safetensors"
|
||||
tensors = safetensors.torch.load_file(lora_path)
|
||||
nonlora_keys = []
|
||||
for k in list(tensors.keys()):
|
||||
if "lora" not in k:
|
||||
nonlora_keys.append(k)
|
||||
for k in nonlora_keys:
|
||||
del tensors[k]
|
||||
safetensors.torch.save_file(tensors, lora_path)
|
||||
|
||||
adapter_path = snapshot_download(
|
||||
repo_id=
|
||||
"hf-100/Jamba-1.5-mini-Spellbound-StoryWriter-0.1-6583896-ckpt53-lora")
|
||||
|
||||
remove_unnecessary_weights(adapter_path)
|
||||
return adapter_path
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def gemma_lora_files():
|
||||
return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
|
||||
|
||||
54
tests/lora/test_jamba.py
Normal file
54
tests/lora/test_jamba.py
Normal file
@ -0,0 +1,54 @@
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
MODEL_PATH = "ai21labs/AI21-Jamba-1.5-Mini"
|
||||
|
||||
MAX_TOKENS = 40
|
||||
|
||||
|
||||
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
|
||||
prompts: List[str]) -> List[str]:
|
||||
|
||||
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=MAX_TOKENS)
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
|
||||
if lora_id else None)
|
||||
# Print the outputs.
|
||||
generated_texts: List[str] = []
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text.strip()
|
||||
generated_texts.append(generated_text)
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
return generated_texts
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp_size", [4])
|
||||
def test_jamba_lora(jamba_lora_files, tp_size):
|
||||
"""Original test, the LoRA model has the common target modules, not all"""
|
||||
if torch.cuda.device_count() < tp_size:
|
||||
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
|
||||
|
||||
prompts = ["Write a story about a sheep and a goat."]
|
||||
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
distributed_executor_backend="ray",
|
||||
tensor_parallel_size=tp_size,
|
||||
)
|
||||
|
||||
expected_jamba_output = [
|
||||
"""Once upon a time, in a lush green meadow, there lived a sheep named Clara and a goat named Billy. Clara was a gentle creature, always nibbling on the soft grass and humming""" # noqa: E501
|
||||
]
|
||||
assert do_sample(llm, jamba_lora_files, lora_id=1,
|
||||
prompts=prompts) == expected_jamba_output
|
||||
@ -1,4 +1,5 @@
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
from typing import Dict, List
|
||||
|
||||
@ -50,6 +51,18 @@ def test_peft_helper(sql_lora_files):
|
||||
"embed_tokens",
|
||||
"lm_head",
|
||||
]
|
||||
scaling = peft_helper.lora_alpha / peft_helper.r
|
||||
assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3
|
||||
|
||||
# test RSLoRA
|
||||
config = dict(r=8,
|
||||
lora_alpha=16,
|
||||
target_modules=["gate_proj"],
|
||||
use_rslora=True)
|
||||
peft_helper = PEFTHelper.from_dict(config)
|
||||
|
||||
scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r)
|
||||
assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3
|
||||
|
||||
expected_error = "vLLM only supports modules_to_save being None."
|
||||
with pytest.raises(ValueError, match=expected_error):
|
||||
@ -60,13 +73,6 @@ def test_peft_helper(sql_lora_files):
|
||||
modules_to_save=["lm_head"],
|
||||
)
|
||||
PEFTHelper.from_dict(config)
|
||||
expected_error = "vLLM does not yet support RSLoRA."
|
||||
with pytest.raises(ValueError, match=expected_error):
|
||||
config = dict(r=8,
|
||||
lora_alpha=16,
|
||||
target_modules=["gate_proj"],
|
||||
use_rslora=True)
|
||||
PEFTHelper.from_dict(config)
|
||||
|
||||
expected_error = "vLLM does not yet support DoRA."
|
||||
with pytest.raises(ValueError, match=expected_error):
|
||||
|
||||
@ -7,7 +7,7 @@ from vllm.assets.image import ImageAsset
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct"
|
||||
MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
|
||||
|
||||
PROMPT_TEMPLATE = (
|
||||
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
|
||||
@ -49,10 +49,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
|
||||
# Print the outputs.
|
||||
generated_texts: List[str] = []
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text.strip()
|
||||
generated_texts.append(generated_text)
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
print(f"Generated text: {generated_text!r}")
|
||||
return generated_texts
|
||||
|
||||
|
||||
|
||||
@ -30,7 +30,7 @@ def get_max_qwen2_vl_image_tokens():
|
||||
|
||||
|
||||
@pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [
|
||||
({}, 1225),
|
||||
({}, 16384),
|
||||
({
|
||||
MIN_PIXELS: 64**2,
|
||||
MAX_PIXELS: 512**2
|
||||
|
||||
@ -140,10 +140,7 @@ VLM_TEST_SETTINGS = {
|
||||
"aria": VLMTestInfo(
|
||||
models=["rhymes-ai/Aria"],
|
||||
tokenizer_mode="slow",
|
||||
test_type=(
|
||||
VLMTestType.IMAGE,
|
||||
VLMTestType.MULTI_IMAGE,
|
||||
),
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
dtype="bfloat16",
|
||||
prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
|
||||
@ -179,6 +176,7 @@ VLM_TEST_SETTINGS = {
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
postprocess_inputs=model_utils.cast_dtype_post_processor(
|
||||
"pixel_values"
|
||||
@ -212,7 +210,7 @@ VLM_TEST_SETTINGS = {
|
||||
dtype="bfloat16",
|
||||
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
|
||||
patch_hf_runner=model_utils.glm_patch_hf_runner,
|
||||
marks=[large_gpu_mark(min_gb=48)],
|
||||
marks=[large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
"h2ovl": VLMTestInfo(
|
||||
models = [
|
||||
@ -261,6 +259,7 @@ VLM_TEST_SETTINGS = {
|
||||
dtype="bfloat16",
|
||||
use_tokenizer_eos=True,
|
||||
patch_hf_runner=model_utils.internvl_patch_hf_runner,
|
||||
marks=[large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
"llava_next": VLMTestInfo(
|
||||
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
|
||||
|
||||
@ -140,6 +140,8 @@ _EMBEDDING_EXAMPLE_MODELS = {
|
||||
"BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"),
|
||||
"Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"),
|
||||
"GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"),
|
||||
"InternLM2ForRewardModel": _HfExamplesInfo("internlm/internlm2-1_8b-reward",
|
||||
trust_remote_code=True),
|
||||
"JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"), # noqa: E501
|
||||
"LlamaModel": _HfExamplesInfo("llama", is_available_online=False),
|
||||
"MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"),
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
import transformers
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm import LLM
|
||||
@ -12,9 +11,6 @@ from .registry import HF_EXAMPLE_MODELS
|
||||
@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
|
||||
def test_can_initialize(model_arch):
|
||||
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
|
||||
if (model_arch == "Cohere2ForCausalLM"
|
||||
and transformers.__version__ < "4.48.0"):
|
||||
pytest.skip(reason="Model introduced in HF >= 4.48.0")
|
||||
if not model_info.is_available_online:
|
||||
pytest.skip("Model is not available online")
|
||||
|
||||
|
||||
@ -1,12 +1,20 @@
|
||||
from functools import partial
|
||||
from typing import cast
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.multimodal.processing import (PromptReplacement, _PlaceholderInfo,
|
||||
find_text_matches, find_token_matches,
|
||||
iter_placeholders, iter_token_matches,
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.inputs import InputProcessingContext
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.processing import (ProcessingCache, PromptReplacement,
|
||||
_PlaceholderInfo, find_text_matches,
|
||||
find_token_matches, iter_placeholders,
|
||||
iter_token_matches,
|
||||
replace_text_matches,
|
||||
replace_token_matches)
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
from vllm.utils import full_groupby
|
||||
|
||||
@ -457,6 +465,7 @@ def test_find_replace_tokens(
|
||||
),
|
||||
]
|
||||
)
|
||||
# yapf: enable
|
||||
def test_iter_placeholders(
|
||||
repl_by_key,
|
||||
prompt,
|
||||
@ -475,11 +484,203 @@ def test_iter_placeholders(
|
||||
prompt_repls,
|
||||
prompt,
|
||||
# Effectively match all occurrences in the prompt
|
||||
{key: 3 for key in repl_by_key},
|
||||
))
|
||||
{key: 3
|
||||
for key in repl_by_key},
|
||||
))
|
||||
|
||||
# Only displayed on error
|
||||
print("result:", result)
|
||||
|
||||
# Manually constructed results
|
||||
assert result == expected
|
||||
|
||||
|
||||
def _rand_img(rng: np.random.RandomState, min_wh: int, max_wh: int):
|
||||
w, h = rng.randint(min_wh, max_wh, size=(2, ))
|
||||
arr = rng.randint(0, 255, size=(w, h, 3), dtype=np.uint8)
|
||||
return Image.fromarray(arr)
|
||||
|
||||
|
||||
def _rand_video(
|
||||
rng: np.random.RandomState,
|
||||
min_frames: int,
|
||||
max_frames: int,
|
||||
min_wh: int,
|
||||
max_wh: int,
|
||||
):
|
||||
# Temporary workaround for https://github.com/huggingface/transformers/issues/35412
|
||||
num_frames = rng.randint(min_frames, max_frames)
|
||||
num_frames = (num_frames // 2) * 2
|
||||
|
||||
w, h = rng.randint(min_wh, max_wh, size=(2, ))
|
||||
return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8)
|
||||
|
||||
|
||||
def _rand_audio(
|
||||
rng: np.random.RandomState,
|
||||
min_len: int,
|
||||
max_len: int,
|
||||
sr: int,
|
||||
):
|
||||
audio_len = rng.randint(min_len, max_len)
|
||||
return rng.rand(audio_len), sr
|
||||
|
||||
|
||||
def _test_processing_cache_correctness(
|
||||
model_id: str,
|
||||
modalities: dict[str, bool],
|
||||
hit_rate: float,
|
||||
num_batches: int,
|
||||
simplify_rate: float,
|
||||
):
|
||||
if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3":
|
||||
hf_overrides = {"architectures": ["MantisForConditionalGeneration"]}
|
||||
else:
|
||||
hf_overrides = {}
|
||||
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
task="auto",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
revision=None,
|
||||
hf_overrides=hf_overrides,
|
||||
)
|
||||
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
|
||||
|
||||
processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls]
|
||||
ctx = InputProcessingContext(
|
||||
model_config,
|
||||
tokenizer=cached_get_tokenizer(model_config.tokenizer),
|
||||
)
|
||||
# Ensure that it can fit all of the data
|
||||
cache = ProcessingCache(capacity=1 << 30)
|
||||
|
||||
baseline_processor = processor_factory(ctx, cache=None)
|
||||
cached_processor = processor_factory(ctx, cache=cache)
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
|
||||
input_to_hit = {
|
||||
"image": Image.new("RGB", size=(128, 128)),
|
||||
"video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
|
||||
"audio": (np.zeros((512, )), 16000),
|
||||
}
|
||||
input_factory = {
|
||||
"image":
|
||||
partial(_rand_img, rng, min_wh=128, max_wh=256),
|
||||
"video":
|
||||
partial(_rand_video,
|
||||
rng,
|
||||
min_frames=2,
|
||||
max_frames=8,
|
||||
min_wh=128,
|
||||
max_wh=256),
|
||||
"audio":
|
||||
partial(_rand_audio, rng, min_len=256, max_len=512, sr=16000),
|
||||
}
|
||||
input_max_count = {
|
||||
modality: 3 if supports_multi else 1
|
||||
for modality, supports_multi in modalities.items()
|
||||
}
|
||||
|
||||
for batch_idx in range(num_batches):
|
||||
mm_data = {
|
||||
k:
|
||||
[(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
|
||||
for _ in range(rng.randint(input_max_count[k]))]
|
||||
for k in modalities
|
||||
}
|
||||
|
||||
mm_counts = {k: len(vs) for k, vs in mm_data.items()}
|
||||
prompt = baseline_processor._get_dummy_mm_inputs(mm_counts).prompt_text
|
||||
|
||||
# Drop unnecessary keys and test single -> multi conversion
|
||||
if rng.rand() < simplify_rate:
|
||||
for k in list(mm_data.keys()):
|
||||
if not mm_data[k]:
|
||||
del mm_data[k]
|
||||
elif len(mm_data[k]) == 1:
|
||||
mm_data[k] = mm_data[k][0]
|
||||
|
||||
baseline_result = baseline_processor.apply(
|
||||
prompt,
|
||||
mm_data=mm_data,
|
||||
hf_processor_mm_kwargs={},
|
||||
)
|
||||
cached_result = cached_processor.apply(
|
||||
prompt,
|
||||
mm_data=mm_data,
|
||||
hf_processor_mm_kwargs={},
|
||||
)
|
||||
|
||||
assert baseline_result == cached_result, (
|
||||
f"Failed ({batch_idx=}, {mm_data=})")
|
||||
|
||||
|
||||
# yapf: disable
|
||||
# True if the model supports multiple data items of the modality per request
|
||||
@pytest.mark.parametrize(("model_id", "modalities"), [
|
||||
("rhymes-ai/Aria", {"image": True}),
|
||||
("Salesforce/blip2-opt-2.7b", {"image": False}),
|
||||
("facebook/chameleon-7b", {"image": False}),
|
||||
("adept/fuyu-8b", {"image": False}),
|
||||
("llava-hf/llava-1.5-7b-hf", {"image": True}),
|
||||
("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
|
||||
("mistral-community/pixtral-12b", {"image": True}),
|
||||
("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}),
|
||||
("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}),
|
||||
("fixie-ai/ultravox-v0_3", {"audio": True}),
|
||||
])
|
||||
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
|
||||
@pytest.mark.parametrize("num_batches", [32])
|
||||
@pytest.mark.parametrize("simplify_rate", [1.0])
|
||||
# yapf: enable
|
||||
def test_processing_cache_correctness(
|
||||
model_id: str,
|
||||
modalities: dict[str, bool],
|
||||
hit_rate: float,
|
||||
num_batches: int,
|
||||
simplify_rate: float,
|
||||
):
|
||||
_test_processing_cache_correctness(
|
||||
model_id,
|
||||
modalities,
|
||||
hit_rate=hit_rate,
|
||||
num_batches=num_batches,
|
||||
simplify_rate=simplify_rate,
|
||||
)
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(("model_id", "modalities"), [
|
||||
("microsoft/Phi-3-vision-128k-instruct", {"image": True}),
|
||||
])
|
||||
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
|
||||
@pytest.mark.parametrize("num_batches", [32])
|
||||
@pytest.mark.parametrize("simplify_rate", [1.0])
|
||||
# yapf: enable
|
||||
def test_processing_cache_correctness_phi3v(
|
||||
model_id: str,
|
||||
modalities: dict[str, bool],
|
||||
hit_rate: float,
|
||||
num_batches: int,
|
||||
simplify_rate: float,
|
||||
):
|
||||
# HACK - this is an attempted workaround for the following bug
|
||||
# https://github.com/huggingface/transformers/issues/34307
|
||||
from transformers import AutoImageProcessor # noqa: F401
|
||||
from transformers import AutoProcessor # noqa: F401
|
||||
|
||||
AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True)
|
||||
|
||||
_test_processing_cache_correctness(
|
||||
model_id,
|
||||
modalities,
|
||||
hit_rate=hit_rate,
|
||||
num_batches=num_batches,
|
||||
simplify_rate=simplify_rate,
|
||||
)
|
||||
|
||||
@ -9,7 +9,7 @@ import pytest
|
||||
from PIL import Image, ImageChops
|
||||
from transformers import AutoConfig, AutoTokenizer
|
||||
|
||||
from vllm.multimodal.utils import (async_fetch_image, fetch_image,
|
||||
from vllm.multimodal.utils import (MediaConnector,
|
||||
repeat_and_pad_placeholder_tokens)
|
||||
|
||||
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
|
||||
@ -23,7 +23,12 @@ TEST_IMAGE_URLS = [
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def url_images() -> Dict[str, Image.Image]:
|
||||
return {image_url: fetch_image(image_url) for image_url in TEST_IMAGE_URLS}
|
||||
connector = MediaConnector()
|
||||
|
||||
return {
|
||||
image_url: connector.fetch_image(image_url)
|
||||
for image_url in TEST_IMAGE_URLS
|
||||
}
|
||||
|
||||
|
||||
def get_supported_suffixes() -> Tuple[str, ...]:
|
||||
@ -43,8 +48,10 @@ def _image_equals(a: Image.Image, b: Image.Image) -> bool:
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
|
||||
async def test_fetch_image_http(image_url: str):
|
||||
image_sync = fetch_image(image_url)
|
||||
image_async = await async_fetch_image(image_url)
|
||||
connector = MediaConnector()
|
||||
|
||||
image_sync = connector.fetch_image(image_url)
|
||||
image_async = await connector.fetch_image_async(image_url)
|
||||
assert _image_equals(image_sync, image_async)
|
||||
|
||||
|
||||
@ -53,6 +60,7 @@ async def test_fetch_image_http(image_url: str):
|
||||
@pytest.mark.parametrize("suffix", get_supported_suffixes())
|
||||
async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
|
||||
image_url: str, suffix: str):
|
||||
connector = MediaConnector()
|
||||
url_image = url_images[image_url]
|
||||
|
||||
try:
|
||||
@ -75,48 +83,49 @@ async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
|
||||
base64_image = base64.b64encode(f.read()).decode("utf-8")
|
||||
data_url = f"data:{mime_type};base64,{base64_image}"
|
||||
|
||||
data_image_sync = fetch_image(data_url)
|
||||
data_image_sync = connector.fetch_image(data_url)
|
||||
if _image_equals(url_image, Image.open(f)):
|
||||
assert _image_equals(url_image, data_image_sync)
|
||||
else:
|
||||
pass # Lossy format; only check that image can be opened
|
||||
|
||||
data_image_async = await async_fetch_image(data_url)
|
||||
data_image_async = await connector.fetch_image_async(data_url)
|
||||
assert _image_equals(data_image_sync, data_image_async)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
|
||||
async def test_fetch_image_local_files(image_url: str):
|
||||
connector = MediaConnector()
|
||||
|
||||
with TemporaryDirectory() as temp_dir:
|
||||
origin_image = fetch_image(image_url)
|
||||
local_connector = MediaConnector(allowed_local_media_path=temp_dir)
|
||||
|
||||
origin_image = connector.fetch_image(image_url)
|
||||
origin_image.save(os.path.join(temp_dir, os.path.basename(image_url)),
|
||||
quality=100,
|
||||
icc_profile=origin_image.info.get('icc_profile'))
|
||||
|
||||
image_async = await async_fetch_image(
|
||||
f"file://{temp_dir}/{os.path.basename(image_url)}",
|
||||
allowed_local_media_path=temp_dir)
|
||||
|
||||
image_sync = fetch_image(
|
||||
f"file://{temp_dir}/{os.path.basename(image_url)}",
|
||||
allowed_local_media_path=temp_dir)
|
||||
image_async = await local_connector.fetch_image_async(
|
||||
f"file://{temp_dir}/{os.path.basename(image_url)}")
|
||||
image_sync = local_connector.fetch_image(
|
||||
f"file://{temp_dir}/{os.path.basename(image_url)}")
|
||||
# Check that the images are equal
|
||||
assert not ImageChops.difference(image_sync, image_async).getbbox()
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
await async_fetch_image(
|
||||
f"file://{temp_dir}/../{os.path.basename(image_url)}",
|
||||
allowed_local_media_path=temp_dir)
|
||||
with pytest.raises(ValueError):
|
||||
await async_fetch_image(
|
||||
with pytest.raises(ValueError, match="must be a subpath"):
|
||||
await local_connector.fetch_image_async(
|
||||
f"file://{temp_dir}/../{os.path.basename(image_url)}")
|
||||
with pytest.raises(RuntimeError, match="Cannot load local files"):
|
||||
await connector.fetch_image_async(
|
||||
f"file://{temp_dir}/../{os.path.basename(image_url)}")
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}",
|
||||
allowed_local_media_path=temp_dir)
|
||||
with pytest.raises(ValueError):
|
||||
fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}")
|
||||
with pytest.raises(ValueError, match="must be a subpath"):
|
||||
local_connector.fetch_image(
|
||||
f"file://{temp_dir}/../{os.path.basename(image_url)}")
|
||||
with pytest.raises(RuntimeError, match="Cannot load local files"):
|
||||
connector.fetch_image(
|
||||
f"file://{temp_dir}/../{os.path.basename(image_url)}")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["llava-hf/llava-v1.6-mistral-7b-hf"])
|
||||
|
||||
11
tests/plugins/vllm_add_dummy_platform/setup.py
Normal file
11
tests/plugins/vllm_add_dummy_platform/setup.py
Normal file
@ -0,0 +1,11 @@
|
||||
from setuptools import setup
|
||||
|
||||
setup(
|
||||
name='vllm_add_dummy_platform',
|
||||
version='0.1',
|
||||
packages=['vllm_add_dummy_platform'],
|
||||
entry_points={
|
||||
'vllm.platform_plugins': [
|
||||
"dummy_platform_plugin = vllm_add_dummy_platform:dummy_platform_plugin" # noqa
|
||||
]
|
||||
})
|
||||
@ -0,0 +1,5 @@
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def dummy_platform_plugin() -> Optional[str]:
|
||||
return "vllm_add_dummy_platform.dummy_platform.DummyPlatform"
|
||||
@ -0,0 +1,5 @@
|
||||
from vllm.platforms.cuda import CudaPlatform
|
||||
|
||||
|
||||
class DummyPlatform(CudaPlatform):
|
||||
device_name = "DummyDevice"
|
||||
16
tests/plugins_tests/test_platform_plugins.py
Normal file
16
tests/plugins_tests/test_platform_plugins.py
Normal file
@ -0,0 +1,16 @@
|
||||
def test_platform_plugins():
|
||||
# simulate workload by running an example
|
||||
import runpy
|
||||
current_file = __file__
|
||||
import os
|
||||
example_file = os.path.join(
|
||||
os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
|
||||
"examples", "offline_inference.py")
|
||||
runpy.run_path(example_file)
|
||||
|
||||
# check if the plugin is loaded correctly
|
||||
from vllm.platforms import _init_trace, current_platform
|
||||
assert current_platform.device_name == "DummyDevice", (
|
||||
f"Expected DummyDevice, got {current_platform.device_name}, "
|
||||
"possibly because current_platform is imported before the plugin"
|
||||
f" is loaded. The first import:\n{_init_trace}")
|
||||
@ -98,9 +98,9 @@ def test_prefill():
|
||||
# Incomplete 1 block (6 tokens)
|
||||
unique_token_ids = [3] * 6
|
||||
req2 = make_request("2", common_token_ids + unique_token_ids)
|
||||
computed_block = manager.get_computed_blocks(req2)
|
||||
computed_blocks = manager.get_computed_blocks(req2)
|
||||
assert len(req2.kv_block_hashes) == 3
|
||||
assert [b.block_id for b in computed_block] == [0, 1, 2]
|
||||
assert [b.block_id for b in computed_blocks] == [0, 1, 2]
|
||||
num_new_tokens = 53 - 3 * 16
|
||||
blocks = manager.allocate_slots(req2, num_new_tokens, computed_blocks)
|
||||
assert [b.block_id for b in blocks] == [7, 8]
|
||||
@ -469,9 +469,9 @@ def test_mm_prefix_caching():
|
||||
# Completed block should have hashes with extra keys.
|
||||
assert not computed_blocks
|
||||
assert len(req0.kv_block_hashes) == 3
|
||||
assert req0.kv_block_hashes[0].extra_keys == (("aaa", 0), )
|
||||
assert req0.kv_block_hashes[1].extra_keys == (("aaa", 5), ("bbb", 0))
|
||||
assert req0.kv_block_hashes[2].extra_keys == (("bbb", 2), )
|
||||
assert req0.kv_block_hashes[0].extra_keys == ("aaa", )
|
||||
assert req0.kv_block_hashes[1].extra_keys == ("aaa", "bbb")
|
||||
assert req0.kv_block_hashes[2].extra_keys == ("bbb", )
|
||||
|
||||
blocks = manager.allocate_slots(req0, 59, computed_blocks)
|
||||
assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
|
||||
@ -485,7 +485,7 @@ def test_mm_prefix_caching():
|
||||
|
||||
# The just completed block should have hashes with extra keys.
|
||||
assert len(req0.kv_block_hashes) == 4
|
||||
assert req0.kv_block_hashes[3].extra_keys == (("ccc", 0), )
|
||||
assert req0.kv_block_hashes[3].extra_keys == ("ccc", )
|
||||
|
||||
# Cache hit.
|
||||
unique_token_ids = [-1] * 7 + [200] * 5
|
||||
@ -500,3 +500,62 @@ def test_mm_prefix_caching():
|
||||
mm_hashes=mm_hashes)
|
||||
computed_blocks = manager.get_computed_blocks(req1)
|
||||
assert len(computed_blocks) == 3
|
||||
|
||||
|
||||
def test_prefill_not_enough_free_blocks_with_computed_blocks():
|
||||
"""
|
||||
This is a unit test that tests the correctness of the allocate_slots
|
||||
when there is not enough free blocks. Specifically, when a request
|
||||
has computed blocks but cannot be allocated due to not enough free blocks,
|
||||
the computed blocks should not be touched.
|
||||
"""
|
||||
block_size = 16
|
||||
manager = KVCacheManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=10,
|
||||
max_model_len=8192,
|
||||
sliding_window=None,
|
||||
enable_caching=True,
|
||||
num_preallocate_tokens=0,
|
||||
)
|
||||
# Complete 3 blocks (48 tokens)
|
||||
# | Common-0 | Common-1 | Common-2 | ... |
|
||||
common_token_ids = [i for i in range(3) for _ in range(16)]
|
||||
req0 = make_request("0", common_token_ids)
|
||||
computed_blocks = manager.get_computed_blocks(req0)
|
||||
assert not computed_blocks
|
||||
manager.allocate_slots(req0, 48, computed_blocks)
|
||||
block_part0 = manager.req_to_blocks[req0.request_id]
|
||||
|
||||
# | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... |
|
||||
req1 = make_request("1", common_token_ids * 2)
|
||||
computed_blocks = manager.get_computed_blocks(req1)
|
||||
assert computed_blocks == block_part0
|
||||
manager.allocate_slots(req1, 48, computed_blocks)
|
||||
block_part1 = manager.req_to_blocks[req1.request_id]
|
||||
# | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
|
||||
# | Req1-5(F)| ... |
|
||||
manager.free(req1)
|
||||
assert {block.ref_cnt for block in block_part1[:3]} == {1}
|
||||
assert {block.ref_cnt for block in block_part1[3:]} == {0}
|
||||
|
||||
# | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
|
||||
# | Req1-5(F)| Req2-0 | Req2-1 | ... |
|
||||
req2 = make_request("2", [7] * block_size * 2)
|
||||
computed_blocks = manager.get_computed_blocks(req2)
|
||||
assert not computed_blocks
|
||||
manager.allocate_slots(req2, block_size * 2, computed_blocks)
|
||||
|
||||
# Req3 is Req2 + 3 new blocks, so the first 6 blocks are computed,
|
||||
# but it cannot be allocated due to insufficient free blocks (2).
|
||||
# In this case, the ref_cnt of the computed blocks should not be changed.
|
||||
assert manager.free_block_queue.num_free_blocks == 5
|
||||
req3 = make_request("3", common_token_ids * 3)
|
||||
computed_blocks = manager.get_computed_blocks(req3)
|
||||
assert computed_blocks == block_part1
|
||||
# Req3 cannot be allocated.
|
||||
assert manager.allocate_slots(req3, 48, computed_blocks) is None
|
||||
# Block 0-2 are used by Req 1.
|
||||
assert {block.ref_cnt for block in block_part1[:3]} == {1}
|
||||
# Block 3-5 are free.
|
||||
assert {block.ref_cnt for block in block_part1[3:]} == {0}
|
||||
|
||||
@ -3,9 +3,9 @@ from typing import List
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.sampling_params import RequestOutputKind
|
||||
from vllm.v1.engine import EngineCoreOutput
|
||||
from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerRequest
|
||||
from vllm.sampling_params import RequestOutputKind, SamplingParams
|
||||
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
|
||||
from vllm.v1.engine.detokenizer import Detokenizer
|
||||
|
||||
TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
|
||||
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
|
||||
@ -71,16 +71,22 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
|
||||
|
||||
# Make N requests.
|
||||
requests = [
|
||||
DetokenizerRequest(
|
||||
request_id=f"request-{idx}",
|
||||
prompt=prompt,
|
||||
prompt_token_ids=prompt_tokens,
|
||||
skip_special_tokens=False,
|
||||
spaces_between_special_tokens=False,
|
||||
output_kind=request_output_kind,
|
||||
stop=[],
|
||||
include_stop_str_in_output=False,
|
||||
) for idx, (
|
||||
EngineCoreRequest(request_id=f"request-{idx}",
|
||||
prompt=prompt,
|
||||
prompt_token_ids=prompt_tokens,
|
||||
arrival_time=0,
|
||||
mm_inputs=None,
|
||||
mm_hashes=None,
|
||||
mm_placeholders=None,
|
||||
eos_token_id=None,
|
||||
lora_request=None,
|
||||
sampling_params=SamplingParams(
|
||||
skip_special_tokens=False,
|
||||
spaces_between_special_tokens=False,
|
||||
output_kind=request_output_kind,
|
||||
stop=[],
|
||||
include_stop_str_in_output=False))
|
||||
for idx, (
|
||||
prompt,
|
||||
prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
|
||||
]
|
||||
@ -133,18 +139,25 @@ def test_stop_string(include_stop_str_in_output: bool):
|
||||
|
||||
# Make N requests.
|
||||
requests = [
|
||||
DetokenizerRequest(
|
||||
EngineCoreRequest(
|
||||
request_id=f"request-{idx}",
|
||||
prompt=prompt,
|
||||
prompt_token_ids=prompt_tokens,
|
||||
skip_special_tokens=False,
|
||||
spaces_between_special_tokens=False,
|
||||
output_kind=RequestOutputKind.DELTA,
|
||||
stop=STOP_STRINGS,
|
||||
include_stop_str_in_output=include_stop_str_in_output,
|
||||
) for idx, (
|
||||
prompt,
|
||||
prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
|
||||
arrival_time=0,
|
||||
mm_inputs=None,
|
||||
mm_hashes=None,
|
||||
mm_placeholders=None,
|
||||
eos_token_id=None,
|
||||
lora_request=None,
|
||||
sampling_params=SamplingParams(
|
||||
skip_special_tokens=False,
|
||||
spaces_between_special_tokens=False,
|
||||
output_kind=RequestOutputKind.DELTA,
|
||||
stop=STOP_STRINGS,
|
||||
include_stop_str_in_output=include_stop_str_in_output,
|
||||
)) for idx, (
|
||||
prompt,
|
||||
prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
|
||||
]
|
||||
|
||||
# Add requests to the detokenizer.
|
||||
|
||||
@ -7,7 +7,6 @@ from transformers import AutoTokenizer
|
||||
from vllm import SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.engine.async_llm import AsyncLLM
|
||||
from vllm.v1.engine.core import EngineCore
|
||||
@ -43,13 +42,11 @@ def test_engine_core(monkeypatch):
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
"""Setup the EngineCore."""
|
||||
engine_args = EngineArgs(model=MODEL_NAME)
|
||||
vllm_config = engine_args.create_engine_config(
|
||||
usage_context=UsageContext.UNKNOWN_CONTEXT)
|
||||
vllm_config = engine_args.create_engine_config()
|
||||
executor_class = AsyncLLM._get_executor_cls(vllm_config)
|
||||
|
||||
engine_core = EngineCore(vllm_config=vllm_config,
|
||||
executor_class=executor_class,
|
||||
usage_context=UsageContext.UNKNOWN_CONTEXT)
|
||||
executor_class=executor_class)
|
||||
"""Test basic request lifecycle."""
|
||||
|
||||
# First request.
|
||||
@ -151,13 +148,11 @@ def test_engine_core_advanced_sampling(monkeypatch):
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
"""Setup the EngineCore."""
|
||||
engine_args = EngineArgs(model=MODEL_NAME)
|
||||
vllm_config = engine_args.create_engine_config(
|
||||
usage_context=UsageContext.UNKNOWN_CONTEXT)
|
||||
vllm_config = engine_args.create_engine_config()
|
||||
executor_class = AsyncLLM._get_executor_cls(vllm_config)
|
||||
|
||||
engine_core = EngineCore(vllm_config=vllm_config,
|
||||
executor_class=executor_class,
|
||||
usage_context=UsageContext.UNKNOWN_CONTEXT)
|
||||
executor_class=executor_class)
|
||||
"""Test basic request lifecycle."""
|
||||
# First request.
|
||||
request: EngineCoreRequest = make_request()
|
||||
|
||||
@ -86,11 +86,10 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
|
||||
UsageContext.UNKNOWN_CONTEXT)
|
||||
executor_class = AsyncLLM._get_executor_cls(vllm_config)
|
||||
client = EngineCoreClient.make_client(
|
||||
vllm_config,
|
||||
executor_class,
|
||||
UsageContext.UNKNOWN_CONTEXT,
|
||||
multiprocess_mode=multiprocessing_mode,
|
||||
asyncio_mode=False,
|
||||
vllm_config=vllm_config,
|
||||
executor_class=executor_class,
|
||||
)
|
||||
|
||||
MAX_TOKENS = 20
|
||||
@ -158,11 +157,10 @@ async def test_engine_core_client_asyncio(monkeypatch):
|
||||
usage_context=UsageContext.UNKNOWN_CONTEXT)
|
||||
executor_class = AsyncLLM._get_executor_cls(vllm_config)
|
||||
client = EngineCoreClient.make_client(
|
||||
vllm_config,
|
||||
executor_class,
|
||||
UsageContext.UNKNOWN_CONTEXT,
|
||||
multiprocess_mode=True,
|
||||
asyncio_mode=True,
|
||||
vllm_config=vllm_config,
|
||||
executor_class=executor_class,
|
||||
)
|
||||
|
||||
MAX_TOKENS = 20
|
||||
|
||||
@ -23,8 +23,7 @@ with contextlib.suppress(ImportError):
|
||||
import vllm._moe_C # noqa: F401
|
||||
supports_moe_ops = True
|
||||
|
||||
# neuron has torch version that doesn't even have impl_abstract
|
||||
if TYPE_CHECKING or current_platform.is_neuron():
|
||||
if TYPE_CHECKING:
|
||||
|
||||
def register_fake(fn):
|
||||
return lambda name: fn
|
||||
|
||||
@ -21,12 +21,10 @@ class AudioAsset:
|
||||
name: Literal["winning_call", "mary_had_lamb"]
|
||||
|
||||
@property
|
||||
def audio_and_sample_rate(self) -> tuple[npt.NDArray, int]:
|
||||
def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]:
|
||||
audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg",
|
||||
s3_prefix=ASSET_DIR)
|
||||
y, sr = librosa.load(audio_path, sr=None)
|
||||
assert isinstance(sr, int)
|
||||
return y, sr
|
||||
return librosa.load(audio_path, sr=None)
|
||||
|
||||
@property
|
||||
def url(self) -> str:
|
||||
|
||||
@ -208,8 +208,8 @@ def wrap_inductor(graph: fx.GraphModule,
|
||||
from torch._inductor.compile_fx import graph_returns_tuple
|
||||
returns_tuple = graph_returns_tuple(graph)
|
||||
|
||||
# this is the graph we return to Dynamo to run
|
||||
def compiled_graph(*args) -> Optional[fx.CompiledFxGraph]:
|
||||
# this is the callable we return to Dynamo to run
|
||||
def compiled_graph(*args):
|
||||
# convert args to list
|
||||
list_args = list(args)
|
||||
graph_output = inductor_compiled_graph(list_args)
|
||||
@ -537,7 +537,8 @@ class VllmBackend:
|
||||
example_inputs[x].clone() for x in self.sym_tensor_indices
|
||||
]
|
||||
|
||||
def copy_and_call(*args) -> fx.GraphModule:
|
||||
# this is the callable we return to Dynamo to run
|
||||
def copy_and_call(*args):
|
||||
list_args = list(args)
|
||||
for i, index in enumerate(self.sym_tensor_indices):
|
||||
runtime_tensor = list_args[index]
|
||||
@ -618,8 +619,10 @@ class PiecewiseBackend:
|
||||
# the entries for different shapes that we need to either
|
||||
# compile or capture cudagraph
|
||||
self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {}
|
||||
self.to_be_compiled_sizes: Set[int] = self.compile_sizes.union(
|
||||
self.capture_sizes)
|
||||
|
||||
# to_be_compiled_sizes tracks the remaining sizes to compile,
|
||||
# and updates during the compilation process, so we need to copy it
|
||||
self.to_be_compiled_sizes: Set[int] = self.compile_sizes.copy()
|
||||
for shape in self.compile_sizes.union(self.capture_sizes):
|
||||
self.concrete_size_entries[shape] = ConcreteSizeEntry(
|
||||
runtime_shape=shape,
|
||||
@ -627,12 +630,17 @@ class PiecewiseBackend:
|
||||
use_cudagraph=shape in self.capture_sizes,
|
||||
)
|
||||
|
||||
def check_for_ending_compilation(self):
|
||||
if self.is_last_graph and not self.to_be_compiled_sizes:
|
||||
# no specific sizes to compile
|
||||
# save the hash of the inductor graph for the next run
|
||||
self.compilation_config.inductor_hash_cache.save_to_file()
|
||||
end_monitoring_torch_compile(self.vllm_config)
|
||||
|
||||
def __call__(self, *args) -> Any:
|
||||
if not self.first_run_finished:
|
||||
self.first_run_finished = True
|
||||
# no specific sizes to compile
|
||||
if self.is_last_graph and not self.to_be_compiled_sizes:
|
||||
end_monitoring_torch_compile(self.vllm_config)
|
||||
self.check_for_ending_compilation()
|
||||
return self.compiled_graph_for_general_shape(*args)
|
||||
|
||||
runtime_shape = args[self.sym_shape_indices[0]]
|
||||
@ -661,10 +669,7 @@ class PiecewiseBackend:
|
||||
|
||||
# finished compilations for all required shapes
|
||||
if self.is_last_graph and not self.to_be_compiled_sizes:
|
||||
|
||||
# save the hash of the inductor graph for the next run
|
||||
self.compilation_config.inductor_hash_cache.save_to_file()
|
||||
end_monitoring_torch_compile(self.vllm_config)
|
||||
self.check_for_ending_compilation()
|
||||
|
||||
if not entry.use_cudagraph:
|
||||
return entry.runnable(*args)
|
||||
|
||||
@ -28,11 +28,12 @@ class TorchCompileWrapperWithCustomDispatcher:
|
||||
compiled_callable: Optional[Callable] = None,
|
||||
compilation_level: int = 0):
|
||||
|
||||
vllm_config = get_current_vllm_config()
|
||||
self.vllm_config = vllm_config
|
||||
if compiled_callable is None:
|
||||
# default compilation settings
|
||||
# compiling the forward method
|
||||
|
||||
vllm_config = get_current_vllm_config()
|
||||
backend = vllm_config.compilation_config.init_backend(vllm_config)
|
||||
|
||||
compiled_callable = torch.compile(
|
||||
@ -82,6 +83,13 @@ class TorchCompileWrapperWithCustomDispatcher:
|
||||
|
||||
self.compiled_codes.append(new_code)
|
||||
|
||||
if self.vllm_config.compilation_config.use_cudagraph and \
|
||||
"update" in new_code.co_names:
|
||||
import depyf
|
||||
src = depyf.decompile(new_code)
|
||||
msg = "Assigning / modifying buffers of nn.Module during forward pass is not allowed when using cudagraph inside the compiler because it will cause silent errors. Please use eager mode or fix the code. The following code contains clues about which buffer is being modified (please search for the usage of the function `update`):\n" + src # noqa
|
||||
raise RuntimeError(msg)
|
||||
|
||||
@contextmanager
|
||||
def dispatch_to_code(self, index: int):
|
||||
"""Context manager to dispatch to the compiled code.
|
||||
|
||||
@ -9,8 +9,8 @@ from contextlib import contextmanager
|
||||
from dataclasses import dataclass, field, replace
|
||||
from pathlib import Path
|
||||
from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Counter, Dict,
|
||||
Final, List, Literal, Mapping, Optional, Set, Tuple, Type,
|
||||
Union)
|
||||
Final, List, Literal, Mapping, Optional, Protocol, Set,
|
||||
Tuple, Type, Union)
|
||||
|
||||
import torch
|
||||
from pydantic import BaseModel, Field, PrivateAttr
|
||||
@ -22,7 +22,7 @@ from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
|
||||
get_quantization_config)
|
||||
from vllm.model_executor.models import ModelRegistry
|
||||
from vllm.platforms import current_platform, interface
|
||||
from vllm.platforms import CpuArchEnum
|
||||
from vllm.tracing import is_otel_available, otel_import_error_traceback
|
||||
from vllm.transformers_utils.config import (
|
||||
ConfigFormat, get_config, get_hf_image_processor_config,
|
||||
@ -75,6 +75,12 @@ HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig],
|
||||
PretrainedConfig]]
|
||||
|
||||
|
||||
class SupportsHash(Protocol):
|
||||
|
||||
def compute_hash(self) -> str:
|
||||
...
|
||||
|
||||
|
||||
class ModelConfig:
|
||||
"""Configuration for the model.
|
||||
|
||||
@ -301,7 +307,7 @@ class ModelConfig:
|
||||
sliding_window = getattr(self.hf_text_config, "sliding_window", None)
|
||||
has_interleaved_attention = (sliding_window is not None) and (
|
||||
isinstance(sliding_window, list) or
|
||||
(self.hf_text_config.model_type in ["gemma2"]))
|
||||
(self.hf_text_config.model_type in ["gemma2", "cohere2"]))
|
||||
|
||||
if (not self.disable_sliding_window and has_interleaved_attention):
|
||||
if envs.VLLM_ATTENTION_BACKEND == "XFORMERS":
|
||||
@ -343,6 +349,7 @@ class ModelConfig:
|
||||
self.is_hybrid = self._init_is_hybrid()
|
||||
self.has_inner_state = self._init_has_inner_state()
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
if current_platform.is_neuron():
|
||||
self.override_neuron_config = override_neuron_config
|
||||
else:
|
||||
@ -583,6 +590,7 @@ class ModelConfig:
|
||||
raise ValueError(
|
||||
f"Unknown quantization method: {self.quantization}. Must "
|
||||
f"be one of {supported_quantization}.")
|
||||
from vllm.platforms import current_platform
|
||||
current_platform.verify_quantization(self.quantization)
|
||||
if self.quantization not in optimized_quantization_methods:
|
||||
logger.warning(
|
||||
@ -638,6 +646,7 @@ class ModelConfig:
|
||||
|
||||
# Reminder: Please update docs/source/usage/compatibility_matrix.md
|
||||
# If the feature combo become valid
|
||||
from vllm.platforms import current_platform
|
||||
if not current_platform.is_async_output_supported(self.enforce_eager):
|
||||
logger.warning(
|
||||
"Async output processing is not supported on the "
|
||||
@ -1006,6 +1015,7 @@ class CacheConfig:
|
||||
raise ValueError(
|
||||
"GPU memory utilization must be less than 1.0. Got "
|
||||
f"{self.gpu_memory_utilization}.")
|
||||
from vllm.platforms import current_platform
|
||||
if (current_platform.is_cuda() and self.block_size is not None
|
||||
and self.block_size > 32):
|
||||
raise ValueError("CUDA Paged Attention kernel only supports "
|
||||
@ -1273,6 +1283,7 @@ class ParallelConfig:
|
||||
f"distributed executor backend "
|
||||
f"'{self.distributed_executor_backend}'.")
|
||||
ray_only_devices = ["tpu", "hpu"]
|
||||
from vllm.platforms import current_platform
|
||||
if (current_platform.device_type in ray_only_devices
|
||||
and self.world_size > 1):
|
||||
if self.distributed_executor_backend is None:
|
||||
@ -1321,7 +1332,7 @@ class ParallelConfig:
|
||||
def _verify_args(self) -> None:
|
||||
# Lazy import to avoid circular import
|
||||
from vllm.executor.executor_base import ExecutorBase
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
if self.distributed_executor_backend not in (
|
||||
"ray", "mp", None) and not (isinstance(
|
||||
self.distributed_executor_backend, type) and issubclass(
|
||||
@ -1522,6 +1533,7 @@ class DeviceConfig:
|
||||
def __init__(self, device: str = "auto") -> None:
|
||||
if device == "auto":
|
||||
# Automated device type detection
|
||||
from vllm.platforms import current_platform
|
||||
self.device_type = current_platform.device_type
|
||||
if not self.device_type:
|
||||
raise RuntimeError("Failed to infer device type")
|
||||
@ -2235,9 +2247,10 @@ def _get_and_verify_dtype(
|
||||
else:
|
||||
torch_dtype = config_dtype
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
if (current_platform.is_cpu()
|
||||
and current_platform.get_cpu_architecture()
|
||||
== interface.CpuArchEnum.POWERPC
|
||||
== CpuArchEnum.POWERPC
|
||||
and (config_dtype == torch.float16
|
||||
or config_dtype == torch.float32)):
|
||||
logger.info(
|
||||
@ -2559,14 +2572,6 @@ class KVTransferConfig(BaseModel):
|
||||
return KVTransferConfig.model_validate_json(cli_value)
|
||||
|
||||
def model_post_init(self, __context: Any) -> None:
|
||||
supported_kv_connector = ["PyNcclConnector", "MooncakeConnector"]
|
||||
if all([
|
||||
self.kv_connector is not None, self.kv_connector
|
||||
not in supported_kv_connector
|
||||
]):
|
||||
raise ValueError(f"Unsupported kv_connector: {self.kv_connector}. "
|
||||
f"Supported connectors are "
|
||||
f"{supported_kv_connector}.")
|
||||
|
||||
if self.kv_role is not None and self.kv_role not in [
|
||||
"kv_producer", "kv_consumer", "kv_both"
|
||||
@ -2977,6 +2982,10 @@ class VllmConfig:
|
||||
init=True) # type: ignore
|
||||
kv_transfer_config: KVTransferConfig = field(default=None,
|
||||
init=True) # type: ignore
|
||||
# some opaque config, only used to provide additional information
|
||||
# for the hash computation, mainly used for testing and debugging.
|
||||
additional_config: SupportsHash = field(default=None,
|
||||
init=True) # type: ignore
|
||||
instance_id: str = ""
|
||||
|
||||
def compute_hash(self) -> str:
|
||||
@ -3008,33 +3017,62 @@ class VllmConfig:
|
||||
vllm_factors.append(__version__)
|
||||
if self.model_config:
|
||||
vllm_factors.append(self.model_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.cache_config:
|
||||
vllm_factors.append(self.cache_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.parallel_config:
|
||||
vllm_factors.append(self.parallel_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.scheduler_config:
|
||||
vllm_factors.append(self.scheduler_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.device_config:
|
||||
vllm_factors.append(self.device_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.load_config:
|
||||
vllm_factors.append(self.load_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.lora_config:
|
||||
vllm_factors.append(self.lora_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.speculative_config:
|
||||
vllm_factors.append(self.speculative_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.decoding_config:
|
||||
vllm_factors.append(self.decoding_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.observability_config:
|
||||
vllm_factors.append(self.observability_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.prompt_adapter_config:
|
||||
vllm_factors.append(self.prompt_adapter_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.quant_config:
|
||||
pass # should be captured by model_config.quantization
|
||||
if self.compilation_config:
|
||||
vllm_factors.append(self.compilation_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.kv_transfer_config:
|
||||
vllm_factors.append(self.kv_transfer_config.compute_hash())
|
||||
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.additional_config:
|
||||
vllm_factors.append(self.additional_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
factors.append(vllm_factors)
|
||||
|
||||
hash_str = hashlib.md5(str(factors).encode()).hexdigest()[:10]
|
||||
@ -3052,6 +3090,7 @@ class VllmConfig:
|
||||
model_config: ModelConfig,
|
||||
load_config: LoadConfig) -> Optional[QuantizationConfig]:
|
||||
"""Get the quantization config."""
|
||||
from vllm.platforms import current_platform
|
||||
if model_config.quantization is not None:
|
||||
from vllm.model_executor.model_loader.weight_utils import (
|
||||
get_quant_config)
|
||||
@ -3114,6 +3153,7 @@ class VllmConfig:
|
||||
self.quant_config = VllmConfig._get_quantization_config(
|
||||
self.model_config, self.load_config)
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
if self.scheduler_config is not None and \
|
||||
self.model_config is not None and \
|
||||
self.scheduler_config.chunked_prefill_enabled and \
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
from typing import TYPE_CHECKING
|
||||
import importlib
|
||||
from typing import TYPE_CHECKING, Callable, Dict, Type
|
||||
|
||||
from .base import KVConnectorBase
|
||||
|
||||
@ -7,14 +8,41 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
class KVConnectorFactory:
|
||||
_registry: Dict[str, Callable[[], Type[KVConnectorBase]]] = {}
|
||||
|
||||
@staticmethod
|
||||
def create_connector(rank: int, local_rank: int,
|
||||
@classmethod
|
||||
def register_connector(cls, name: str, module_path: str,
|
||||
class_name: str) -> None:
|
||||
"""Register a connector with a lazy-loading module and class name."""
|
||||
if name in cls._registry:
|
||||
raise ValueError(f"Connector '{name}' is already registered.")
|
||||
|
||||
def loader() -> Type[KVConnectorBase]:
|
||||
module = importlib.import_module(module_path)
|
||||
return getattr(module, class_name)
|
||||
|
||||
cls._registry[name] = loader
|
||||
|
||||
@classmethod
|
||||
def create_connector(cls, rank: int, local_rank: int,
|
||||
config: "VllmConfig") -> KVConnectorBase:
|
||||
supported_kv_connector = ["PyNcclConnector", "MooncakeConnector"]
|
||||
if config.kv_transfer_config.kv_connector in supported_kv_connector:
|
||||
from .simple_connector import SimpleConnector
|
||||
return SimpleConnector(rank, local_rank, config)
|
||||
else:
|
||||
raise ValueError(f"Unsupported connector type: "
|
||||
f"{config.kv_connector}")
|
||||
connector_name = config.kv_transfer_config.kv_connector
|
||||
if connector_name not in cls._registry:
|
||||
raise ValueError(f"Unsupported connector type: {connector_name}")
|
||||
|
||||
connector_cls = cls._registry[connector_name]()
|
||||
return connector_cls(rank, local_rank, config)
|
||||
|
||||
|
||||
# Register various connectors here.
|
||||
# The registration should not be done in each individual file, as we want to
|
||||
# only load the files corresponding to the current connector.
|
||||
KVConnectorFactory.register_connector(
|
||||
"PyNcclConnector",
|
||||
"vllm.distributed.kv_transfer.kv_connector.simple_connector",
|
||||
"SimpleConnector")
|
||||
|
||||
KVConnectorFactory.register_connector(
|
||||
"MooncakeConnector",
|
||||
"vllm.distributed.kv_transfer.kv_connector.simple_connector",
|
||||
"SimpleConnector")
|
||||
|
||||
@ -39,7 +39,6 @@ import vllm.distributed.kv_transfer.kv_transfer_agent as kv_transfer
|
||||
import vllm.envs as envs
|
||||
from vllm.distributed.utils import StatelessProcessGroup
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import direct_register_custom_op, supports_custom_op
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -194,6 +193,7 @@ class GroupCoordinator:
|
||||
assert self.cpu_group is not None
|
||||
assert self.device_group is not None
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
if current_platform.is_cuda_alike():
|
||||
self.device = torch.device(f"cuda:{local_rank}")
|
||||
else:
|
||||
@ -1188,6 +1188,7 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
|
||||
import ray # Lazy import Ray
|
||||
ray.shutdown()
|
||||
gc.collect()
|
||||
from vllm.platforms import current_platform
|
||||
if not current_platform.is_cpu():
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
|
||||
@ -18,7 +18,6 @@ from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat,
|
||||
from vllm.executor.executor_base import ExecutorBase
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.transformers_utils.utils import check_gguf_file
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils import FlexibleArgumentParser, StoreBoolean
|
||||
@ -1094,6 +1093,7 @@ class EngineArgs:
|
||||
use_sliding_window = (model_config.get_sliding_window()
|
||||
is not None)
|
||||
use_spec_decode = self.speculative_model is not None
|
||||
from vllm.platforms import current_platform
|
||||
if (is_gpu and not use_sliding_window and not use_spec_decode
|
||||
and not self.enable_lora
|
||||
and not self.enable_prompt_adapter
|
||||
|
||||
@ -1124,6 +1124,8 @@ class LLMEngine:
|
||||
|
||||
seq_group = scheduled_seq_group.seq_group
|
||||
seq_group.maybe_set_first_token_time(now)
|
||||
if not seq_group.is_prefill():
|
||||
seq_group.set_last_token_time(now)
|
||||
request_output = RequestOutputFactory.create(
|
||||
seq_group,
|
||||
self.seq_id_to_seq_group,
|
||||
@ -1166,6 +1168,8 @@ class LLMEngine:
|
||||
|
||||
seq_group = scheduled_seq_group.seq_group
|
||||
seq_group.maybe_set_first_token_time(now)
|
||||
if not seq_group.is_prefill():
|
||||
seq_group.set_last_token_time(now)
|
||||
request_output = RequestOutputFactory.create(
|
||||
seq_group,
|
||||
self.seq_id_to_seq_group,
|
||||
@ -1686,7 +1690,7 @@ class LLMEngine:
|
||||
# If the seq_group just finished the prefill state
|
||||
# get TTFT.
|
||||
if not seq_group.is_prefill():
|
||||
latency = seq_group.get_last_latency(now)
|
||||
latency = seq_group.get_last_token_latency()
|
||||
time_to_first_tokens_iter.append(latency)
|
||||
|
||||
# One generation token per finished prefill.
|
||||
@ -1694,7 +1698,7 @@ class LLMEngine:
|
||||
seq_group.num_seqs())
|
||||
else:
|
||||
# TPOTs.
|
||||
latency = seq_group.get_last_latency(now)
|
||||
latency = seq_group.get_last_token_latency()
|
||||
time_per_output_tokens_iter.append(latency)
|
||||
if seq_group.state.current_step == 0:
|
||||
# For async_output_proc, the do_log_stats()
|
||||
|
||||
@ -6,7 +6,7 @@ from collections import defaultdict, deque
|
||||
from functools import lru_cache, partial
|
||||
from pathlib import Path
|
||||
from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List,
|
||||
Literal, Mapping, Optional, Tuple, TypeVar, Union, cast)
|
||||
Literal, Optional, Tuple, TypeVar, Union, cast)
|
||||
|
||||
import jinja2.nodes
|
||||
import transformers.utils.chat_template_utils as hf_chat_utils
|
||||
@ -23,6 +23,8 @@ from openai.types.chat import (
|
||||
ChatCompletionMessageParam as OpenAIChatCompletionMessageParam)
|
||||
from openai.types.chat import (ChatCompletionMessageToolCallParam,
|
||||
ChatCompletionToolMessageParam)
|
||||
from openai.types.chat.chat_completion_content_part_input_audio_param import (
|
||||
InputAudio)
|
||||
# yapf: enable
|
||||
# pydantic needs the TypedDict from typing_extensions
|
||||
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
|
||||
@ -31,11 +33,7 @@ from typing_extensions import Required, TypeAlias, TypedDict
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.multimodal import MultiModalDataDict
|
||||
from vllm.multimodal.utils import (async_get_and_parse_audio,
|
||||
async_get_and_parse_image,
|
||||
async_get_and_parse_video,
|
||||
get_and_parse_audio, get_and_parse_image,
|
||||
get_and_parse_video)
|
||||
from vllm.multimodal.utils import MediaConnector
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
|
||||
from vllm.utils import print_warning_once
|
||||
|
||||
@ -368,14 +366,17 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
||||
self._tokenizer = tokenizer
|
||||
self._allowed_items = (model_config.multimodal_config.limit_per_prompt
|
||||
if model_config.multimodal_config else {})
|
||||
self._consumed_items = {k: 0 for k in self._allowed_items}
|
||||
|
||||
self._items: List[_T] = []
|
||||
self._items_by_modality = defaultdict[str, list[_T]](list)
|
||||
|
||||
@property
|
||||
def model_config(self) -> ModelConfig:
|
||||
return self._model_config
|
||||
|
||||
@property
|
||||
def allowed_local_media_path(self):
|
||||
return self._model_config.allowed_local_media_path
|
||||
|
||||
@staticmethod
|
||||
@lru_cache(maxsize=None)
|
||||
def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str:
|
||||
@ -435,38 +436,19 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
||||
else:
|
||||
raise TypeError(f"Unknown modality: {modality}")
|
||||
|
||||
@staticmethod
|
||||
def _combine(items: List[MultiModalDataDict]) -> MultiModalDataDict:
|
||||
mm_lists: Mapping[str, List[object]] = defaultdict(list)
|
||||
|
||||
# Merge all the multi-modal items
|
||||
for single_mm_data in items:
|
||||
for mm_key, mm_item in single_mm_data.items():
|
||||
if isinstance(mm_item, list):
|
||||
mm_lists[mm_key].extend(mm_item)
|
||||
else:
|
||||
mm_lists[mm_key].append(mm_item)
|
||||
|
||||
# Unpack any single item lists for models that don't expect multiple.
|
||||
return {
|
||||
mm_key: mm_list[0] if len(mm_list) == 1 else mm_list
|
||||
for mm_key, mm_list in mm_lists.items()
|
||||
}
|
||||
|
||||
def add(self, modality: ModalityStr, item: _T) -> Optional[str]:
|
||||
"""
|
||||
Add a multi-modal item to the current prompt and returns the
|
||||
placeholder string to use, if any.
|
||||
"""
|
||||
allowed_count = self._allowed_items.get(modality, 1)
|
||||
current_count = self._consumed_items.get(modality, 0) + 1
|
||||
current_count = len(self._items_by_modality[modality]) + 1
|
||||
if current_count > allowed_count:
|
||||
raise ValueError(
|
||||
f"At most {allowed_count} {modality}(s) may be provided in "
|
||||
"one request.")
|
||||
|
||||
self._consumed_items[modality] = current_count
|
||||
self._items.append(item)
|
||||
self._items_by_modality[modality].append(item)
|
||||
|
||||
return self._placeholder_str(modality, current_count)
|
||||
|
||||
@ -475,22 +457,26 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class MultiModalItemTracker(BaseMultiModalItemTracker[MultiModalDataDict]):
|
||||
class MultiModalItemTracker(BaseMultiModalItemTracker[object]):
|
||||
|
||||
def all_mm_data(self) -> Optional[MultiModalDataDict]:
|
||||
return self._combine(self._items) if self._items else None
|
||||
if self._items_by_modality:
|
||||
return dict(self._items_by_modality)
|
||||
|
||||
return None
|
||||
|
||||
def create_parser(self) -> "BaseMultiModalContentParser":
|
||||
return MultiModalContentParser(self)
|
||||
|
||||
|
||||
class AsyncMultiModalItemTracker(
|
||||
BaseMultiModalItemTracker[Awaitable[MultiModalDataDict]]):
|
||||
class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]):
|
||||
|
||||
async def all_mm_data(self) -> Optional[MultiModalDataDict]:
|
||||
if self._items:
|
||||
items = await asyncio.gather(*self._items)
|
||||
return self._combine(items)
|
||||
if self._items_by_modality:
|
||||
return {
|
||||
modality: await asyncio.gather(*items)
|
||||
for modality, items in self._items_by_modality.items()
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
@ -522,7 +508,7 @@ class BaseMultiModalContentParser(ABC):
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def parse_input_audio(self, input_audio: Dict[str, str]) -> None:
|
||||
def parse_input_audio(self, input_audio: InputAudio) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
@ -537,31 +523,31 @@ class MultiModalContentParser(BaseMultiModalContentParser):
|
||||
|
||||
self._tracker = tracker
|
||||
|
||||
self._connector = MediaConnector(
|
||||
allowed_local_media_path=tracker.allowed_local_media_path,
|
||||
)
|
||||
|
||||
def parse_image(self, image_url: str) -> None:
|
||||
image = get_and_parse_image(image_url,
|
||||
allowed_local_media_path=self._tracker.
|
||||
_model_config.allowed_local_media_path)
|
||||
image = self._connector.fetch_image(image_url)
|
||||
|
||||
placeholder = self._tracker.add("image", image)
|
||||
self._add_placeholder(placeholder)
|
||||
|
||||
def parse_audio(self, audio_url: str) -> None:
|
||||
audio = get_and_parse_audio(audio_url)
|
||||
audio = self._connector.fetch_audio(audio_url)
|
||||
|
||||
placeholder = self._tracker.add("audio", audio)
|
||||
self._add_placeholder(placeholder)
|
||||
|
||||
def parse_input_audio(self, input_audio: Dict[str, str]) -> None:
|
||||
input_audio_data = input_audio.get("data","")
|
||||
input_audio_format = input_audio.get("format","")
|
||||
audio_url = f"data:audio/{input_audio_format};base64,{input_audio_data}"
|
||||
audio = get_and_parse_audio(audio_url)
|
||||
def parse_input_audio(self, input_audio: InputAudio) -> None:
|
||||
audio_data = input_audio.get("data", "")
|
||||
audio_format = input_audio.get("format", "")
|
||||
audio_url = f"data:audio/{audio_format};base64,{audio_data}"
|
||||
|
||||
placeholder = self._tracker.add("audio", audio)
|
||||
self._add_placeholder(placeholder)
|
||||
return self.parse_audio(audio_url)
|
||||
|
||||
def parse_video(self, video_url: str) -> None:
|
||||
video = get_and_parse_video(video_url)
|
||||
video = self._connector.fetch_video(video_url)
|
||||
|
||||
placeholder = self._tracker.add("video", video)
|
||||
self._add_placeholder(placeholder)
|
||||
@ -573,33 +559,31 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
|
||||
super().__init__()
|
||||
|
||||
self._tracker = tracker
|
||||
self._connector = MediaConnector(
|
||||
allowed_local_media_path=tracker.allowed_local_media_path,
|
||||
)
|
||||
|
||||
def parse_image(self, image_url: str) -> None:
|
||||
image_coro = async_get_and_parse_image(
|
||||
image_url,
|
||||
allowed_local_media_path=self._tracker._model_config.
|
||||
allowed_local_media_path)
|
||||
image_coro = self._connector.fetch_image_async(image_url)
|
||||
|
||||
placeholder = self._tracker.add("image", image_coro)
|
||||
self._add_placeholder(placeholder)
|
||||
|
||||
def parse_audio(self, audio_url: str) -> None:
|
||||
audio_coro = async_get_and_parse_audio(audio_url)
|
||||
audio_coro = self._connector.fetch_audio_async(audio_url)
|
||||
|
||||
placeholder = self._tracker.add("audio", audio_coro)
|
||||
self._add_placeholder(placeholder)
|
||||
|
||||
def parse_input_audio(self, input_audio: Dict[str, str]) -> None:
|
||||
input_audio_data = input_audio.get("data","")
|
||||
input_audio_format = input_audio.get("format","")
|
||||
audio_url = f"data:audio/{input_audio_format};base64,{input_audio_data}"
|
||||
audio_coro = async_get_and_parse_audio(audio_url)
|
||||
def parse_input_audio(self, input_audio: InputAudio) -> None:
|
||||
audio_data = input_audio.get("data", "")
|
||||
audio_format = input_audio.get("format", "")
|
||||
audio_url = f"data:audio/{audio_format};base64,{audio_data}"
|
||||
|
||||
placeholder = self._tracker.add("audio", audio_coro)
|
||||
self._add_placeholder(placeholder)
|
||||
return self.parse_audio(audio_url)
|
||||
|
||||
def parse_video(self, video_url: str) -> None:
|
||||
video = async_get_and_parse_video(video_url)
|
||||
video = self._connector.fetch_video_async(video_url)
|
||||
|
||||
placeholder = self._tracker.add("video", video)
|
||||
self._add_placeholder(placeholder)
|
||||
@ -695,10 +679,13 @@ _InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
|
||||
_RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
|
||||
_VideoParser = partial(cast, ChatCompletionContentPartVideoParam)
|
||||
|
||||
_ContentPart: TypeAlias = Union[str, Dict[str, str], InputAudio]
|
||||
|
||||
# Define a mapping from part types to their corresponding parsing functions.
|
||||
MM_PARSER_MAP: Dict[str,
|
||||
Callable[[ChatCompletionContentPartParam],
|
||||
Union[str, Dict[str,str]]]] = {
|
||||
MM_PARSER_MAP: Dict[
|
||||
str,
|
||||
Callable[[ChatCompletionContentPartParam], _ContentPart],
|
||||
] = {
|
||||
"text":
|
||||
lambda part: _TextParser(part).get("text", ""),
|
||||
"image_url":
|
||||
@ -715,8 +702,7 @@ MM_PARSER_MAP: Dict[str,
|
||||
|
||||
|
||||
def _parse_chat_message_content_mm_part(
|
||||
part: ChatCompletionContentPartParam) -> Tuple[str,
|
||||
Union[str, Dict[str, str]]]:
|
||||
part: ChatCompletionContentPartParam) -> tuple[str, _ContentPart]:
|
||||
"""
|
||||
Parses a given multi-modal content part based on its type.
|
||||
|
||||
@ -783,7 +769,7 @@ def _parse_chat_message_content_parts(
|
||||
*,
|
||||
wrap_dicts: bool,
|
||||
) -> List[ConversationMessage]:
|
||||
content: List[Union[str, Dict[str, str]]] = []
|
||||
content = list[_ContentPart]()
|
||||
|
||||
mm_parser = mm_tracker.create_parser()
|
||||
|
||||
@ -814,7 +800,7 @@ def _parse_chat_message_content_part(
|
||||
mm_parser: BaseMultiModalContentParser,
|
||||
*,
|
||||
wrap_dicts: bool,
|
||||
) -> Optional[Union[str, Dict[str, str]]]:
|
||||
) -> Optional[_ContentPart]:
|
||||
"""Parses a single part of a conversation. If wrap_dicts is True,
|
||||
structured dictionary pieces for texts and images will be
|
||||
wrapped in dictionaries, i.e., {"type": "text", "text", ...} and
|
||||
@ -823,8 +809,7 @@ def _parse_chat_message_content_part(
|
||||
with multimodal placeholders.
|
||||
"""
|
||||
if isinstance(part, str): # Handle plain text parts
|
||||
text = _TextParser(part)
|
||||
return text
|
||||
return part
|
||||
|
||||
# Handle structured dictionary parts
|
||||
part_type, content = _parse_chat_message_content_mm_part(part)
|
||||
@ -855,7 +840,7 @@ def _parse_chat_message_content_part(
|
||||
return {'type': 'audio'} if wrap_dicts else None
|
||||
|
||||
if part_type == "input_audio":
|
||||
dict_content = cast(Dict[str, str], content)
|
||||
dict_content = cast(InputAudio, content)
|
||||
mm_parser.parse_input_audio(dict_content)
|
||||
return {'type': 'audio'} if wrap_dicts else None
|
||||
|
||||
|
||||
@ -58,7 +58,9 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
|
||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
|
||||
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
|
||||
from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_models import (BaseModelPath,
|
||||
OpenAIServingModels)
|
||||
from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling
|
||||
from vllm.entrypoints.openai.serving_score import OpenAIServingScores
|
||||
from vllm.entrypoints.openai.serving_tokenization import (
|
||||
@ -133,32 +135,21 @@ async def build_async_engine_client_from_engine_args(
|
||||
Returns the Client or None if the creation failed.
|
||||
"""
|
||||
|
||||
# Fall back
|
||||
# TODO: fill out feature matrix.
|
||||
# AsyncLLMEngine.
|
||||
if (MQLLMEngineClient.is_unsupported_config(engine_args)
|
||||
or envs.VLLM_USE_V1 or disable_frontend_multiprocessing):
|
||||
engine_config = engine_args.create_engine_config(
|
||||
UsageContext.OPENAI_API_SERVER)
|
||||
uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config),
|
||||
"uses_ray", False)
|
||||
|
||||
build_engine = partial(AsyncLLMEngine.from_engine_args,
|
||||
engine_args=engine_args,
|
||||
engine_config=engine_config,
|
||||
usage_context=UsageContext.OPENAI_API_SERVER)
|
||||
if uses_ray:
|
||||
# Must run in main thread with ray for its signal handlers to work
|
||||
engine_client = build_engine()
|
||||
else:
|
||||
engine_client = await asyncio.get_running_loop().run_in_executor(
|
||||
None, build_engine)
|
||||
engine_client: Optional[EngineClient] = None
|
||||
try:
|
||||
engine_client = AsyncLLMEngine.from_engine_args(
|
||||
engine_args=engine_args,
|
||||
usage_context=UsageContext.OPENAI_API_SERVER)
|
||||
yield engine_client
|
||||
finally:
|
||||
if engine_client and hasattr(engine_client, "shutdown"):
|
||||
engine_client.shutdown()
|
||||
|
||||
yield engine_client
|
||||
if hasattr(engine_client, "shutdown"):
|
||||
engine_client.shutdown()
|
||||
return
|
||||
|
||||
# Otherwise, use the multiprocessing AsyncLLMEngine.
|
||||
# MQLLMEngine.
|
||||
else:
|
||||
if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
|
||||
# Make TemporaryDirectory for prometheus multiprocessing
|
||||
@ -280,6 +271,10 @@ def base(request: Request) -> OpenAIServing:
|
||||
return tokenization(request)
|
||||
|
||||
|
||||
def models(request: Request) -> OpenAIServingModels:
|
||||
return request.app.state.openai_serving_models
|
||||
|
||||
|
||||
def chat(request: Request) -> Optional[OpenAIServingChat]:
|
||||
return request.app.state.openai_serving_chat
|
||||
|
||||
@ -347,10 +342,10 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):
|
||||
|
||||
@router.get("/v1/models")
|
||||
async def show_available_models(raw_request: Request):
|
||||
handler = base(raw_request)
|
||||
handler = models(raw_request)
|
||||
|
||||
models = await handler.show_available_models()
|
||||
return JSONResponse(content=models.model_dump())
|
||||
models_ = await handler.show_available_models()
|
||||
return JSONResponse(content=models_.model_dump())
|
||||
|
||||
|
||||
@router.get("/version")
|
||||
@ -516,26 +511,22 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
|
||||
@router.post("/v1/load_lora_adapter")
|
||||
async def load_lora_adapter(request: LoadLoraAdapterRequest,
|
||||
raw_request: Request):
|
||||
for route in [chat, completion, embedding]:
|
||||
handler = route(raw_request)
|
||||
if handler is not None:
|
||||
response = await handler.load_lora_adapter(request)
|
||||
if isinstance(response, ErrorResponse):
|
||||
return JSONResponse(content=response.model_dump(),
|
||||
status_code=response.code)
|
||||
handler = models(raw_request)
|
||||
response = await handler.load_lora_adapter(request)
|
||||
if isinstance(response, ErrorResponse):
|
||||
return JSONResponse(content=response.model_dump(),
|
||||
status_code=response.code)
|
||||
|
||||
return Response(status_code=200, content=response)
|
||||
|
||||
@router.post("/v1/unload_lora_adapter")
|
||||
async def unload_lora_adapter(request: UnloadLoraAdapterRequest,
|
||||
raw_request: Request):
|
||||
for route in [chat, completion, embedding]:
|
||||
handler = route(raw_request)
|
||||
if handler is not None:
|
||||
response = await handler.unload_lora_adapter(request)
|
||||
if isinstance(response, ErrorResponse):
|
||||
return JSONResponse(content=response.model_dump(),
|
||||
status_code=response.code)
|
||||
handler = models(raw_request)
|
||||
response = await handler.unload_lora_adapter(request)
|
||||
if isinstance(response, ErrorResponse):
|
||||
return JSONResponse(content=response.model_dump(),
|
||||
status_code=response.code)
|
||||
|
||||
return Response(status_code=200, content=response)
|
||||
|
||||
@ -639,13 +630,18 @@ def init_app_state(
|
||||
resolved_chat_template = load_chat_template(args.chat_template)
|
||||
logger.info("Using supplied chat template:\n%s", resolved_chat_template)
|
||||
|
||||
state.openai_serving_models = OpenAIServingModels(
|
||||
model_config=model_config,
|
||||
base_model_paths=base_model_paths,
|
||||
lora_modules=args.lora_modules,
|
||||
prompt_adapters=args.prompt_adapters,
|
||||
)
|
||||
# TODO: The chat template is now broken for lora adapters :(
|
||||
state.openai_serving_chat = OpenAIServingChat(
|
||||
engine_client,
|
||||
model_config,
|
||||
base_model_paths,
|
||||
state.openai_serving_models,
|
||||
args.response_role,
|
||||
lora_modules=args.lora_modules,
|
||||
prompt_adapters=args.prompt_adapters,
|
||||
request_logger=request_logger,
|
||||
chat_template=resolved_chat_template,
|
||||
chat_template_content_format=args.chat_template_content_format,
|
||||
@ -657,16 +653,14 @@ def init_app_state(
|
||||
state.openai_serving_completion = OpenAIServingCompletion(
|
||||
engine_client,
|
||||
model_config,
|
||||
base_model_paths,
|
||||
lora_modules=args.lora_modules,
|
||||
prompt_adapters=args.prompt_adapters,
|
||||
state.openai_serving_models,
|
||||
request_logger=request_logger,
|
||||
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
|
||||
) if model_config.runner_type == "generate" else None
|
||||
state.openai_serving_pooling = OpenAIServingPooling(
|
||||
engine_client,
|
||||
model_config,
|
||||
base_model_paths,
|
||||
state.openai_serving_models,
|
||||
request_logger=request_logger,
|
||||
chat_template=resolved_chat_template,
|
||||
chat_template_content_format=args.chat_template_content_format,
|
||||
@ -674,7 +668,7 @@ def init_app_state(
|
||||
state.openai_serving_embedding = OpenAIServingEmbedding(
|
||||
engine_client,
|
||||
model_config,
|
||||
base_model_paths,
|
||||
state.openai_serving_models,
|
||||
request_logger=request_logger,
|
||||
chat_template=resolved_chat_template,
|
||||
chat_template_content_format=args.chat_template_content_format,
|
||||
@ -682,14 +676,13 @@ def init_app_state(
|
||||
state.openai_serving_scores = OpenAIServingScores(
|
||||
engine_client,
|
||||
model_config,
|
||||
base_model_paths,
|
||||
state.openai_serving_models,
|
||||
request_logger=request_logger
|
||||
) if model_config.task == "score" else None
|
||||
state.openai_serving_tokenization = OpenAIServingTokenization(
|
||||
engine_client,
|
||||
model_config,
|
||||
base_model_paths,
|
||||
lora_modules=args.lora_modules,
|
||||
state.openai_serving_models,
|
||||
request_logger=request_logger,
|
||||
chat_template=resolved_chat_template,
|
||||
chat_template_content_format=args.chat_template_content_format,
|
||||
|
||||
@ -12,7 +12,7 @@ from typing import List, Optional, Sequence, Union, get_args
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
|
||||
from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
|
||||
validate_chat_template)
|
||||
from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
|
||||
from vllm.entrypoints.openai.serving_models import (LoRAModulePath,
|
||||
PromptAdapterPath)
|
||||
from vllm.entrypoints.openai.tool_parsers import ToolParserManager
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
@ -20,7 +20,8 @@ from vllm.entrypoints.openai.protocol import (BatchRequestInput,
|
||||
# yapf: enable
|
||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
|
||||
from vllm.entrypoints.openai.serving_engine import BaseModelPath
|
||||
from vllm.entrypoints.openai.serving_models import (BaseModelPath,
|
||||
OpenAIServingModels)
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils import FlexibleArgumentParser, random_uuid
|
||||
from vllm.version import __version__ as VLLM_VERSION
|
||||
@ -213,13 +214,17 @@ async def main(args):
|
||||
request_logger = RequestLogger(max_log_len=args.max_log_len)
|
||||
|
||||
# Create the openai serving objects.
|
||||
openai_serving_models = OpenAIServingModels(
|
||||
model_config=model_config,
|
||||
base_model_paths=base_model_paths,
|
||||
lora_modules=None,
|
||||
prompt_adapters=None,
|
||||
)
|
||||
openai_serving_chat = OpenAIServingChat(
|
||||
engine,
|
||||
model_config,
|
||||
base_model_paths,
|
||||
openai_serving_models,
|
||||
args.response_role,
|
||||
lora_modules=None,
|
||||
prompt_adapters=None,
|
||||
request_logger=request_logger,
|
||||
chat_template=None,
|
||||
chat_template_content_format="auto",
|
||||
@ -228,7 +233,7 @@ async def main(args):
|
||||
openai_serving_embedding = OpenAIServingEmbedding(
|
||||
engine,
|
||||
model_config,
|
||||
base_model_paths,
|
||||
openai_serving_models,
|
||||
request_logger=request_logger,
|
||||
chat_template=None,
|
||||
chat_template_content_format="auto",
|
||||
|
||||
@ -21,10 +21,8 @@ from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage,
|
||||
DeltaToolCall, ErrorResponse, FunctionCall, PromptTokenUsageInfo,
|
||||
RequestResponseMetadata, ToolCall, UsageInfo)
|
||||
from vllm.entrypoints.openai.serving_engine import (BaseModelPath,
|
||||
LoRAModulePath,
|
||||
OpenAIServing,
|
||||
PromptAdapterPath)
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
|
||||
from vllm.logger import init_logger
|
||||
from vllm.outputs import CompletionOutput, RequestOutput
|
||||
@ -42,11 +40,9 @@ class OpenAIServingChat(OpenAIServing):
|
||||
self,
|
||||
engine_client: EngineClient,
|
||||
model_config: ModelConfig,
|
||||
base_model_paths: List[BaseModelPath],
|
||||
models: OpenAIServingModels,
|
||||
response_role: str,
|
||||
*,
|
||||
lora_modules: Optional[List[LoRAModulePath]],
|
||||
prompt_adapters: Optional[List[PromptAdapterPath]],
|
||||
request_logger: Optional[RequestLogger],
|
||||
chat_template: Optional[str],
|
||||
chat_template_content_format: ChatTemplateContentFormatOption,
|
||||
@ -57,9 +53,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||
) -> None:
|
||||
super().__init__(engine_client=engine_client,
|
||||
model_config=model_config,
|
||||
base_model_paths=base_model_paths,
|
||||
lora_modules=lora_modules,
|
||||
prompt_adapters=prompt_adapters,
|
||||
models=models,
|
||||
request_logger=request_logger,
|
||||
return_tokens_as_token_ids=return_tokens_as_token_ids)
|
||||
|
||||
@ -126,7 +120,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||
prompt_adapter_request,
|
||||
) = self._maybe_get_adapters(request)
|
||||
|
||||
model_name = self._get_model_name(lora_request)
|
||||
model_name = self.models.model_name(lora_request)
|
||||
|
||||
tokenizer = await self.engine_client.get_tokenizer(lora_request)
|
||||
|
||||
|
||||
@ -21,10 +21,8 @@ from vllm.entrypoints.openai.protocol import (CompletionLogProbs,
|
||||
RequestResponseMetadata,
|
||||
UsageInfo)
|
||||
# yapf: enable
|
||||
from vllm.entrypoints.openai.serving_engine import (BaseModelPath,
|
||||
LoRAModulePath,
|
||||
OpenAIServing,
|
||||
PromptAdapterPath)
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.logger import init_logger
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import BeamSearchParams, SamplingParams
|
||||
@ -41,18 +39,14 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
self,
|
||||
engine_client: EngineClient,
|
||||
model_config: ModelConfig,
|
||||
base_model_paths: List[BaseModelPath],
|
||||
models: OpenAIServingModels,
|
||||
*,
|
||||
lora_modules: Optional[List[LoRAModulePath]],
|
||||
prompt_adapters: Optional[List[PromptAdapterPath]],
|
||||
request_logger: Optional[RequestLogger],
|
||||
return_tokens_as_token_ids: bool = False,
|
||||
):
|
||||
super().__init__(engine_client=engine_client,
|
||||
model_config=model_config,
|
||||
base_model_paths=base_model_paths,
|
||||
lora_modules=lora_modules,
|
||||
prompt_adapters=prompt_adapters,
|
||||
models=models,
|
||||
request_logger=request_logger,
|
||||
return_tokens_as_token_ids=return_tokens_as_token_ids)
|
||||
diff_sampling_param = self.model_config.get_diff_sampling_param()
|
||||
@ -170,7 +164,7 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
|
||||
result_generator = merge_async_iterators(*generators)
|
||||
|
||||
model_name = self._get_model_name(lora_request)
|
||||
model_name = self.models.model_name(lora_request)
|
||||
num_prompts = len(engine_prompts)
|
||||
|
||||
# Similar to the OpenAI API, when n != best_of, we do not stream the
|
||||
|
||||
@ -16,7 +16,8 @@ from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest,
|
||||
EmbeddingResponse,
|
||||
EmbeddingResponseData,
|
||||
ErrorResponse, UsageInfo)
|
||||
from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.logger import init_logger
|
||||
from vllm.outputs import (EmbeddingOutput, EmbeddingRequestOutput,
|
||||
PoolingRequestOutput)
|
||||
@ -46,7 +47,7 @@ class OpenAIServingEmbedding(OpenAIServing):
|
||||
self,
|
||||
engine_client: EngineClient,
|
||||
model_config: ModelConfig,
|
||||
base_model_paths: List[BaseModelPath],
|
||||
models: OpenAIServingModels,
|
||||
*,
|
||||
request_logger: Optional[RequestLogger],
|
||||
chat_template: Optional[str],
|
||||
@ -54,9 +55,7 @@ class OpenAIServingEmbedding(OpenAIServing):
|
||||
) -> None:
|
||||
super().__init__(engine_client=engine_client,
|
||||
model_config=model_config,
|
||||
base_model_paths=base_model_paths,
|
||||
lora_modules=None,
|
||||
prompt_adapters=None,
|
||||
models=models,
|
||||
request_logger=request_logger)
|
||||
|
||||
self.chat_template = chat_template
|
||||
|
||||
@ -1,7 +1,5 @@
|
||||
import json
|
||||
import pathlib
|
||||
from concurrent.futures.thread import ThreadPoolExecutor
|
||||
from dataclasses import dataclass
|
||||
from http import HTTPStatus
|
||||
from typing import (Any, Callable, Dict, Iterable, Iterator, List, Mapping,
|
||||
Optional, Sequence, Tuple, TypedDict, Union)
|
||||
@ -28,13 +26,10 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
|
||||
DetokenizeRequest,
|
||||
EmbeddingChatRequest,
|
||||
EmbeddingCompletionRequest,
|
||||
ErrorResponse,
|
||||
LoadLoraAdapterRequest,
|
||||
ModelCard, ModelList,
|
||||
ModelPermission, ScoreRequest,
|
||||
ErrorResponse, ScoreRequest,
|
||||
TokenizeChatRequest,
|
||||
TokenizeCompletionRequest,
|
||||
UnloadLoraAdapterRequest)
|
||||
TokenizeCompletionRequest)
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.entrypoints.openai.tool_parsers import ToolParser
|
||||
# yapf: enable
|
||||
from vllm.inputs import TokensPrompt
|
||||
@ -48,30 +43,10 @@ from vllm.sequence import Logprob
|
||||
from vllm.tracing import (contains_trace_headers, extract_trace_headers,
|
||||
log_tracing_disabled_warning)
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
|
||||
from vllm.utils import AtomicCounter, is_list_of, make_async, random_uuid
|
||||
from vllm.utils import is_list_of, make_async, random_uuid
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class BaseModelPath:
|
||||
name: str
|
||||
model_path: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class PromptAdapterPath:
|
||||
name: str
|
||||
local_path: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class LoRAModulePath:
|
||||
name: str
|
||||
path: str
|
||||
base_model_name: Optional[str] = None
|
||||
|
||||
|
||||
CompletionLikeRequest = Union[CompletionRequest, DetokenizeRequest,
|
||||
EmbeddingCompletionRequest, ScoreRequest,
|
||||
TokenizeCompletionRequest]
|
||||
@ -96,10 +71,8 @@ class OpenAIServing:
|
||||
self,
|
||||
engine_client: EngineClient,
|
||||
model_config: ModelConfig,
|
||||
base_model_paths: List[BaseModelPath],
|
||||
models: OpenAIServingModels,
|
||||
*,
|
||||
lora_modules: Optional[List[LoRAModulePath]],
|
||||
prompt_adapters: Optional[List[PromptAdapterPath]],
|
||||
request_logger: Optional[RequestLogger],
|
||||
return_tokens_as_token_ids: bool = False,
|
||||
):
|
||||
@ -109,35 +82,7 @@ class OpenAIServing:
|
||||
self.model_config = model_config
|
||||
self.max_model_len = model_config.max_model_len
|
||||
|
||||
self.base_model_paths = base_model_paths
|
||||
|
||||
self.lora_id_counter = AtomicCounter(0)
|
||||
self.lora_requests = []
|
||||
if lora_modules is not None:
|
||||
self.lora_requests = [
|
||||
LoRARequest(lora_name=lora.name,
|
||||
lora_int_id=i,
|
||||
lora_path=lora.path,
|
||||
base_model_name=lora.base_model_name
|
||||
if lora.base_model_name
|
||||
and self._is_model_supported(lora.base_model_name)
|
||||
else self.base_model_paths[0].name)
|
||||
for i, lora in enumerate(lora_modules, start=1)
|
||||
]
|
||||
|
||||
self.prompt_adapter_requests = []
|
||||
if prompt_adapters is not None:
|
||||
for i, prompt_adapter in enumerate(prompt_adapters, start=1):
|
||||
with pathlib.Path(prompt_adapter.local_path,
|
||||
"adapter_config.json").open() as f:
|
||||
adapter_config = json.load(f)
|
||||
num_virtual_tokens = adapter_config["num_virtual_tokens"]
|
||||
self.prompt_adapter_requests.append(
|
||||
PromptAdapterRequest(
|
||||
prompt_adapter_name=prompt_adapter.name,
|
||||
prompt_adapter_id=i,
|
||||
prompt_adapter_local_path=prompt_adapter.local_path,
|
||||
prompt_adapter_num_virtual_tokens=num_virtual_tokens))
|
||||
self.models = models
|
||||
|
||||
self.request_logger = request_logger
|
||||
self.return_tokens_as_token_ids = return_tokens_as_token_ids
|
||||
@ -150,33 +95,6 @@ class OpenAIServing:
|
||||
self._tokenize_prompt_input_or_inputs,
|
||||
executor=self._tokenizer_executor)
|
||||
|
||||
async def show_available_models(self) -> ModelList:
|
||||
"""Show available models. Right now we only have one model."""
|
||||
model_cards = [
|
||||
ModelCard(id=base_model.name,
|
||||
max_model_len=self.max_model_len,
|
||||
root=base_model.model_path,
|
||||
permission=[ModelPermission()])
|
||||
for base_model in self.base_model_paths
|
||||
]
|
||||
lora_cards = [
|
||||
ModelCard(id=lora.lora_name,
|
||||
root=lora.local_path,
|
||||
parent=lora.base_model_name if lora.base_model_name else
|
||||
self.base_model_paths[0].name,
|
||||
permission=[ModelPermission()])
|
||||
for lora in self.lora_requests
|
||||
]
|
||||
prompt_adapter_cards = [
|
||||
ModelCard(id=prompt_adapter.prompt_adapter_name,
|
||||
root=self.base_model_paths[0].name,
|
||||
permission=[ModelPermission()])
|
||||
for prompt_adapter in self.prompt_adapter_requests
|
||||
]
|
||||
model_cards.extend(lora_cards)
|
||||
model_cards.extend(prompt_adapter_cards)
|
||||
return ModelList(data=model_cards)
|
||||
|
||||
def create_error_response(
|
||||
self,
|
||||
message: str,
|
||||
@ -205,11 +123,13 @@ class OpenAIServing:
|
||||
) -> Optional[ErrorResponse]:
|
||||
if self._is_model_supported(request.model):
|
||||
return None
|
||||
if request.model in [lora.lora_name for lora in self.lora_requests]:
|
||||
if request.model in [
|
||||
lora.lora_name for lora in self.models.lora_requests
|
||||
]:
|
||||
return None
|
||||
if request.model in [
|
||||
prompt_adapter.prompt_adapter_name
|
||||
for prompt_adapter in self.prompt_adapter_requests
|
||||
for prompt_adapter in self.models.prompt_adapter_requests
|
||||
]:
|
||||
return None
|
||||
return self.create_error_response(
|
||||
@ -223,10 +143,10 @@ class OpenAIServing:
|
||||
None, PromptAdapterRequest]]:
|
||||
if self._is_model_supported(request.model):
|
||||
return None, None
|
||||
for lora in self.lora_requests:
|
||||
for lora in self.models.lora_requests:
|
||||
if request.model == lora.lora_name:
|
||||
return lora, None
|
||||
for prompt_adapter in self.prompt_adapter_requests:
|
||||
for prompt_adapter in self.models.prompt_adapter_requests:
|
||||
if request.model == prompt_adapter.prompt_adapter_name:
|
||||
return None, prompt_adapter
|
||||
# if _check_model has been called earlier, this will be unreachable
|
||||
@ -588,91 +508,5 @@ class OpenAIServing:
|
||||
return logprob.decoded_token
|
||||
return tokenizer.decode(token_id)
|
||||
|
||||
async def _check_load_lora_adapter_request(
|
||||
self, request: LoadLoraAdapterRequest) -> Optional[ErrorResponse]:
|
||||
# Check if both 'lora_name' and 'lora_path' are provided
|
||||
if not request.lora_name or not request.lora_path:
|
||||
return self.create_error_response(
|
||||
message="Both 'lora_name' and 'lora_path' must be provided.",
|
||||
err_type="InvalidUserInput",
|
||||
status_code=HTTPStatus.BAD_REQUEST)
|
||||
|
||||
# Check if the lora adapter with the given name already exists
|
||||
if any(lora_request.lora_name == request.lora_name
|
||||
for lora_request in self.lora_requests):
|
||||
return self.create_error_response(
|
||||
message=
|
||||
f"The lora adapter '{request.lora_name}' has already been"
|
||||
"loaded.",
|
||||
err_type="InvalidUserInput",
|
||||
status_code=HTTPStatus.BAD_REQUEST)
|
||||
|
||||
return None
|
||||
|
||||
async def _check_unload_lora_adapter_request(
|
||||
self,
|
||||
request: UnloadLoraAdapterRequest) -> Optional[ErrorResponse]:
|
||||
# Check if either 'lora_name' or 'lora_int_id' is provided
|
||||
if not request.lora_name and not request.lora_int_id:
|
||||
return self.create_error_response(
|
||||
message=
|
||||
"either 'lora_name' and 'lora_int_id' needs to be provided.",
|
||||
err_type="InvalidUserInput",
|
||||
status_code=HTTPStatus.BAD_REQUEST)
|
||||
|
||||
# Check if the lora adapter with the given name exists
|
||||
if not any(lora_request.lora_name == request.lora_name
|
||||
for lora_request in self.lora_requests):
|
||||
return self.create_error_response(
|
||||
message=
|
||||
f"The lora adapter '{request.lora_name}' cannot be found.",
|
||||
err_type="InvalidUserInput",
|
||||
status_code=HTTPStatus.BAD_REQUEST)
|
||||
|
||||
return None
|
||||
|
||||
async def load_lora_adapter(
|
||||
self,
|
||||
request: LoadLoraAdapterRequest) -> Union[ErrorResponse, str]:
|
||||
error_check_ret = await self._check_load_lora_adapter_request(request)
|
||||
if error_check_ret is not None:
|
||||
return error_check_ret
|
||||
|
||||
lora_name, lora_path = request.lora_name, request.lora_path
|
||||
unique_id = self.lora_id_counter.inc(1)
|
||||
self.lora_requests.append(
|
||||
LoRARequest(lora_name=lora_name,
|
||||
lora_int_id=unique_id,
|
||||
lora_path=lora_path))
|
||||
return f"Success: LoRA adapter '{lora_name}' added successfully."
|
||||
|
||||
async def unload_lora_adapter(
|
||||
self,
|
||||
request: UnloadLoraAdapterRequest) -> Union[ErrorResponse, str]:
|
||||
error_check_ret = await self._check_unload_lora_adapter_request(request
|
||||
)
|
||||
if error_check_ret is not None:
|
||||
return error_check_ret
|
||||
|
||||
lora_name = request.lora_name
|
||||
self.lora_requests = [
|
||||
lora_request for lora_request in self.lora_requests
|
||||
if lora_request.lora_name != lora_name
|
||||
]
|
||||
return f"Success: LoRA adapter '{lora_name}' removed successfully."
|
||||
|
||||
def _is_model_supported(self, model_name):
|
||||
return any(model.name == model_name for model in self.base_model_paths)
|
||||
|
||||
def _get_model_name(self, lora: Optional[LoRARequest]):
|
||||
"""
|
||||
Returns the appropriate model name depending on the availability
|
||||
and support of the LoRA or base model.
|
||||
Parameters:
|
||||
- lora: LoRARequest that contain a base_model_name.
|
||||
Returns:
|
||||
- str: The name of the base model or the first available model path.
|
||||
"""
|
||||
if lora is not None:
|
||||
return lora.lora_name
|
||||
return self.base_model_paths[0].name
|
||||
return self.models.is_base_model(model_name)
|
||||
|
||||
210
vllm/entrypoints/openai/serving_models.py
Normal file
210
vllm/entrypoints/openai/serving_models.py
Normal file
@ -0,0 +1,210 @@
|
||||
import json
|
||||
import pathlib
|
||||
from dataclasses import dataclass
|
||||
from http import HTTPStatus
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.openai.protocol import (ErrorResponse,
|
||||
LoadLoraAdapterRequest,
|
||||
ModelCard, ModelList,
|
||||
ModelPermission,
|
||||
UnloadLoraAdapterRequest)
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.prompt_adapter.request import PromptAdapterRequest
|
||||
from vllm.utils import AtomicCounter
|
||||
|
||||
|
||||
@dataclass
|
||||
class BaseModelPath:
|
||||
name: str
|
||||
model_path: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class PromptAdapterPath:
|
||||
name: str
|
||||
local_path: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class LoRAModulePath:
|
||||
name: str
|
||||
path: str
|
||||
base_model_name: Optional[str] = None
|
||||
|
||||
|
||||
class OpenAIServingModels:
|
||||
"""Shared instance to hold data about the loaded base model(s) and adapters.
|
||||
|
||||
Handles the routes:
|
||||
- /v1/models
|
||||
- /v1/load_lora_adapter
|
||||
- /v1/unload_lora_adapter
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_config: ModelConfig,
|
||||
base_model_paths: List[BaseModelPath],
|
||||
*,
|
||||
lora_modules: Optional[List[LoRAModulePath]] = None,
|
||||
prompt_adapters: Optional[List[PromptAdapterPath]] = None,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.base_model_paths = base_model_paths
|
||||
self.max_model_len = model_config.max_model_len
|
||||
|
||||
self.lora_id_counter = AtomicCounter(0)
|
||||
self.lora_requests = []
|
||||
if lora_modules is not None:
|
||||
self.lora_requests = [
|
||||
LoRARequest(lora_name=lora.name,
|
||||
lora_int_id=i,
|
||||
lora_path=lora.path,
|
||||
base_model_name=lora.base_model_name
|
||||
if lora.base_model_name
|
||||
and self.is_base_model(lora.base_model_name) else
|
||||
self.base_model_paths[0].name)
|
||||
for i, lora in enumerate(lora_modules, start=1)
|
||||
]
|
||||
|
||||
self.prompt_adapter_requests = []
|
||||
if prompt_adapters is not None:
|
||||
for i, prompt_adapter in enumerate(prompt_adapters, start=1):
|
||||
with pathlib.Path(prompt_adapter.local_path,
|
||||
"adapter_config.json").open() as f:
|
||||
adapter_config = json.load(f)
|
||||
num_virtual_tokens = adapter_config["num_virtual_tokens"]
|
||||
self.prompt_adapter_requests.append(
|
||||
PromptAdapterRequest(
|
||||
prompt_adapter_name=prompt_adapter.name,
|
||||
prompt_adapter_id=i,
|
||||
prompt_adapter_local_path=prompt_adapter.local_path,
|
||||
prompt_adapter_num_virtual_tokens=num_virtual_tokens))
|
||||
|
||||
def is_base_model(self, model_name):
|
||||
return any(model.name == model_name for model in self.base_model_paths)
|
||||
|
||||
def model_name(self, lora_request: Optional[LoRARequest] = None) -> str:
|
||||
"""Returns the appropriate model name depending on the availability
|
||||
and support of the LoRA or base model.
|
||||
Parameters:
|
||||
- lora: LoRARequest that contain a base_model_name.
|
||||
Returns:
|
||||
- str: The name of the base model or the first available model path.
|
||||
"""
|
||||
if lora_request is not None:
|
||||
return lora_request.lora_name
|
||||
return self.base_model_paths[0].name
|
||||
|
||||
async def show_available_models(self) -> ModelList:
|
||||
"""Show available models. This includes the base model and all
|
||||
adapters"""
|
||||
model_cards = [
|
||||
ModelCard(id=base_model.name,
|
||||
max_model_len=self.max_model_len,
|
||||
root=base_model.model_path,
|
||||
permission=[ModelPermission()])
|
||||
for base_model in self.base_model_paths
|
||||
]
|
||||
lora_cards = [
|
||||
ModelCard(id=lora.lora_name,
|
||||
root=lora.local_path,
|
||||
parent=lora.base_model_name if lora.base_model_name else
|
||||
self.base_model_paths[0].name,
|
||||
permission=[ModelPermission()])
|
||||
for lora in self.lora_requests
|
||||
]
|
||||
prompt_adapter_cards = [
|
||||
ModelCard(id=prompt_adapter.prompt_adapter_name,
|
||||
root=self.base_model_paths[0].name,
|
||||
permission=[ModelPermission()])
|
||||
for prompt_adapter in self.prompt_adapter_requests
|
||||
]
|
||||
model_cards.extend(lora_cards)
|
||||
model_cards.extend(prompt_adapter_cards)
|
||||
return ModelList(data=model_cards)
|
||||
|
||||
async def load_lora_adapter(
|
||||
self,
|
||||
request: LoadLoraAdapterRequest) -> Union[ErrorResponse, str]:
|
||||
error_check_ret = await self._check_load_lora_adapter_request(request)
|
||||
if error_check_ret is not None:
|
||||
return error_check_ret
|
||||
|
||||
lora_name, lora_path = request.lora_name, request.lora_path
|
||||
unique_id = self.lora_id_counter.inc(1)
|
||||
self.lora_requests.append(
|
||||
LoRARequest(lora_name=lora_name,
|
||||
lora_int_id=unique_id,
|
||||
lora_path=lora_path))
|
||||
return f"Success: LoRA adapter '{lora_name}' added successfully."
|
||||
|
||||
async def unload_lora_adapter(
|
||||
self,
|
||||
request: UnloadLoraAdapterRequest) -> Union[ErrorResponse, str]:
|
||||
error_check_ret = await self._check_unload_lora_adapter_request(request
|
||||
)
|
||||
if error_check_ret is not None:
|
||||
return error_check_ret
|
||||
|
||||
lora_name = request.lora_name
|
||||
self.lora_requests = [
|
||||
lora_request for lora_request in self.lora_requests
|
||||
if lora_request.lora_name != lora_name
|
||||
]
|
||||
return f"Success: LoRA adapter '{lora_name}' removed successfully."
|
||||
|
||||
async def _check_load_lora_adapter_request(
|
||||
self, request: LoadLoraAdapterRequest) -> Optional[ErrorResponse]:
|
||||
# Check if both 'lora_name' and 'lora_path' are provided
|
||||
if not request.lora_name or not request.lora_path:
|
||||
return create_error_response(
|
||||
message="Both 'lora_name' and 'lora_path' must be provided.",
|
||||
err_type="InvalidUserInput",
|
||||
status_code=HTTPStatus.BAD_REQUEST)
|
||||
|
||||
# Check if the lora adapter with the given name already exists
|
||||
if any(lora_request.lora_name == request.lora_name
|
||||
for lora_request in self.lora_requests):
|
||||
return create_error_response(
|
||||
message=
|
||||
f"The lora adapter '{request.lora_name}' has already been"
|
||||
"loaded.",
|
||||
err_type="InvalidUserInput",
|
||||
status_code=HTTPStatus.BAD_REQUEST)
|
||||
|
||||
return None
|
||||
|
||||
async def _check_unload_lora_adapter_request(
|
||||
self,
|
||||
request: UnloadLoraAdapterRequest) -> Optional[ErrorResponse]:
|
||||
# Check if either 'lora_name' or 'lora_int_id' is provided
|
||||
if not request.lora_name and not request.lora_int_id:
|
||||
return create_error_response(
|
||||
message=
|
||||
"either 'lora_name' and 'lora_int_id' needs to be provided.",
|
||||
err_type="InvalidUserInput",
|
||||
status_code=HTTPStatus.BAD_REQUEST)
|
||||
|
||||
# Check if the lora adapter with the given name exists
|
||||
if not any(lora_request.lora_name == request.lora_name
|
||||
for lora_request in self.lora_requests):
|
||||
return create_error_response(
|
||||
message=
|
||||
f"The lora adapter '{request.lora_name}' cannot be found.",
|
||||
err_type="InvalidUserInput",
|
||||
status_code=HTTPStatus.BAD_REQUEST)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def create_error_response(
|
||||
message: str,
|
||||
err_type: str = "BadRequestError",
|
||||
status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse:
|
||||
return ErrorResponse(message=message,
|
||||
type=err_type,
|
||||
code=status_code.value)
|
||||
@ -15,7 +15,8 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse,
|
||||
PoolingChatRequest,
|
||||
PoolingRequest, PoolingResponse,
|
||||
PoolingResponseData, UsageInfo)
|
||||
from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.logger import init_logger
|
||||
from vllm.outputs import PoolingOutput, PoolingRequestOutput
|
||||
from vllm.utils import merge_async_iterators
|
||||
@ -44,7 +45,7 @@ class OpenAIServingPooling(OpenAIServing):
|
||||
self,
|
||||
engine_client: EngineClient,
|
||||
model_config: ModelConfig,
|
||||
base_model_paths: List[BaseModelPath],
|
||||
models: OpenAIServingModels,
|
||||
*,
|
||||
request_logger: Optional[RequestLogger],
|
||||
chat_template: Optional[str],
|
||||
@ -52,9 +53,7 @@ class OpenAIServingPooling(OpenAIServing):
|
||||
) -> None:
|
||||
super().__init__(engine_client=engine_client,
|
||||
model_config=model_config,
|
||||
base_model_paths=base_model_paths,
|
||||
lora_modules=None,
|
||||
prompt_adapters=None,
|
||||
models=models,
|
||||
request_logger=request_logger)
|
||||
|
||||
self.chat_template = chat_template
|
||||
|
||||
@ -10,7 +10,8 @@ from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.protocol import (ErrorResponse, ScoreRequest,
|
||||
ScoreResponse, ScoreResponseData,
|
||||
UsageInfo)
|
||||
from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.inputs.data import TokensPrompt
|
||||
from vllm.logger import init_logger
|
||||
from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
|
||||
@ -50,15 +51,13 @@ class OpenAIServingScores(OpenAIServing):
|
||||
self,
|
||||
engine_client: EngineClient,
|
||||
model_config: ModelConfig,
|
||||
base_model_paths: List[BaseModelPath],
|
||||
models: OpenAIServingModels,
|
||||
*,
|
||||
request_logger: Optional[RequestLogger],
|
||||
) -> None:
|
||||
super().__init__(engine_client=engine_client,
|
||||
model_config=model_config,
|
||||
base_model_paths=base_model_paths,
|
||||
lora_modules=None,
|
||||
prompt_adapters=None,
|
||||
models=models,
|
||||
request_logger=request_logger)
|
||||
|
||||
async def create_score(
|
||||
|
||||
@ -15,9 +15,8 @@ from vllm.entrypoints.openai.protocol import (DetokenizeRequest,
|
||||
TokenizeRequest,
|
||||
TokenizeResponse)
|
||||
# yapf: enable
|
||||
from vllm.entrypoints.openai.serving_engine import (BaseModelPath,
|
||||
LoRAModulePath,
|
||||
OpenAIServing)
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@ -29,18 +28,15 @@ class OpenAIServingTokenization(OpenAIServing):
|
||||
self,
|
||||
engine_client: EngineClient,
|
||||
model_config: ModelConfig,
|
||||
base_model_paths: List[BaseModelPath],
|
||||
models: OpenAIServingModels,
|
||||
*,
|
||||
lora_modules: Optional[List[LoRAModulePath]],
|
||||
request_logger: Optional[RequestLogger],
|
||||
chat_template: Optional[str],
|
||||
chat_template_content_format: ChatTemplateContentFormatOption,
|
||||
) -> None:
|
||||
super().__init__(engine_client=engine_client,
|
||||
model_config=model_config,
|
||||
base_model_paths=base_model_paths,
|
||||
lora_modules=lora_modules,
|
||||
prompt_adapters=None,
|
||||
models=models,
|
||||
request_logger=request_logger)
|
||||
|
||||
self.chat_template = chat_template
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
import asyncio
|
||||
import multiprocessing
|
||||
import os
|
||||
import sys
|
||||
import threading
|
||||
@ -13,10 +12,9 @@ from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO,
|
||||
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.triton_utils.importing import HAS_TRITON
|
||||
from vllm.utils import cuda_is_initialized
|
||||
from vllm.utils import _check_multiproc_method, get_mp_context
|
||||
|
||||
if HAS_TRITON:
|
||||
from vllm.triton_utils import maybe_set_triton_cache_manager
|
||||
@ -274,24 +272,6 @@ def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None:
|
||||
file.write = write_with_prefix # type: ignore[method-assign]
|
||||
|
||||
|
||||
def _check_multiproc_method():
|
||||
if (cuda_is_initialized()
|
||||
and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"):
|
||||
logger.warning("CUDA was previously initialized. We must use "
|
||||
"the `spawn` multiprocessing start method. Setting "
|
||||
"VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
|
||||
"See https://docs.vllm.ai/en/latest/getting_started/"
|
||||
"debugging.html#python-multiprocessing "
|
||||
"for more information.")
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
|
||||
def get_mp_context():
|
||||
_check_multiproc_method()
|
||||
mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
|
||||
return multiprocessing.get_context(mp_method)
|
||||
|
||||
|
||||
def set_multiprocessing_worker_envs(parallel_config):
|
||||
""" Set up environment variables that should be used when there are workers
|
||||
in a multiprocessing environment. This should be called by the parent
|
||||
|
||||
@ -8,7 +8,6 @@ import msgspec
|
||||
from vllm.config import ParallelConfig
|
||||
from vllm.executor.msgspec_utils import decode_hook, encode_hook
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sequence import ExecuteModelRequest, IntermediateTensors
|
||||
from vllm.utils import get_ip
|
||||
from vllm.worker.worker_base import WorkerWrapperBase
|
||||
@ -229,6 +228,7 @@ def initialize_ray_cluster(
|
||||
the default Ray cluster address.
|
||||
"""
|
||||
assert_ray_available()
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# Connect to a ray cluster.
|
||||
if current_platform.is_rocm() or current_platform.is_xpu():
|
||||
|
||||
@ -99,6 +99,9 @@ class InputContext:
|
||||
|
||||
merged_kwargs = {**base_kwargs, **kwargs}
|
||||
|
||||
if isinstance(typ, type):
|
||||
merged_kwargs["processor_cls"] = typ
|
||||
|
||||
hf_processor = cached_get_processor(
|
||||
self.model_config.model,
|
||||
trust_remote_code=self.model_config.trust_remote_code,
|
||||
@ -132,10 +135,13 @@ class InputProcessingContext(InputContext):
|
||||
def call_hf_processor(
|
||||
self,
|
||||
hf_processor: ProcessorMixin,
|
||||
prompt: str,
|
||||
processor_data: Mapping[str, object],
|
||||
inference_kwargs: Mapping[str, object],
|
||||
data: Mapping[str, object],
|
||||
kwargs: Mapping[str, object] = {},
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Call :code:`hf_processor` on the prompt :code:`data`
|
||||
(text, image, audio...) with configurable options :code:`kwargs`.
|
||||
"""
|
||||
assert callable(hf_processor)
|
||||
|
||||
base_kwargs = self.model_config.mm_processor_kwargs
|
||||
@ -144,21 +150,15 @@ class InputProcessingContext(InputContext):
|
||||
|
||||
merged_kwargs = resolve_mm_processor_kwargs(
|
||||
base_kwargs,
|
||||
inference_kwargs,
|
||||
kwargs,
|
||||
hf_processor,
|
||||
requires_kw_only=False,
|
||||
allow_var_kwargs=True,
|
||||
)
|
||||
|
||||
try:
|
||||
return hf_processor(
|
||||
text=prompt,
|
||||
**processor_data,
|
||||
**merged_kwargs,
|
||||
return_tensors="pt",
|
||||
)
|
||||
return hf_processor(**data, **merged_kwargs, return_tensors="pt")
|
||||
except Exception as exc:
|
||||
data = dict(text=prompt, **processor_data)
|
||||
msg = (f"Failed to apply {type(hf_processor).__name__} "
|
||||
f"on data={data} with kwargs={merged_kwargs}")
|
||||
|
||||
|
||||
@ -67,15 +67,9 @@ class LoRALayerWeights:
|
||||
peft_helper: PEFTHelper,
|
||||
embeddings_tensor: Optional[torch.Tensor] = None,
|
||||
) -> "LoRALayerWeights":
|
||||
return cls(
|
||||
module_name,
|
||||
peft_helper.r,
|
||||
peft_helper.lora_alpha,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
embeddings_tensor,
|
||||
)
|
||||
return cls(module_name, peft_helper.r, peft_helper.lora_alpha, None,
|
||||
None, None, embeddings_tensor,
|
||||
peft_helper.vllm_lora_scaling_factor)
|
||||
|
||||
@classmethod
|
||||
def create_dummy_lora_weights(
|
||||
|
||||
@ -173,7 +173,7 @@ class LoRAModel(AdapterModel):
|
||||
return cls(lora_model_id,
|
||||
peft_helper.r,
|
||||
loras,
|
||||
scaling_factor=peft_helper.vllm_scaling_factor)
|
||||
scaling_factor=peft_helper.vllm_long_context_scaling_factor)
|
||||
|
||||
@classmethod
|
||||
def from_local_checkpoint(
|
||||
|
||||
@ -4,6 +4,8 @@ import math
|
||||
from dataclasses import MISSING, dataclass, field, fields
|
||||
from typing import Literal, Optional, Union
|
||||
|
||||
from vllm.utils import print_info_once
|
||||
|
||||
|
||||
@dataclass
|
||||
class PEFTHelper:
|
||||
@ -14,21 +16,22 @@ class PEFTHelper:
|
||||
|
||||
bias: Literal["none", "all", "lora_only"] = field(default="none")
|
||||
modules_to_save: Optional[list[str]] = field(default=None)
|
||||
# True to use Rank-Stabilized LoRA (rsLoRA, see: https://arxiv.org/abs/2312.03732)
|
||||
use_rslora: bool = field(default=False)
|
||||
# True to use Weight-Decomposed Low-Rank Adaptation (DoRA, see: https://arxiv.org/abs/2402.09353)
|
||||
use_dora: bool = field(default=False)
|
||||
# long lora field
|
||||
# long context lora field
|
||||
context_length: int = field(default=0)
|
||||
# Extra vllm field, start with 'vllm_' to avoid conflict
|
||||
vllm_lora_scaling_factor: float = field(default=1.0)
|
||||
vllm_max_position_embeddings: Optional[int] = field(default=False)
|
||||
vllm_scaling_factor: Optional[float] = field(default=None)
|
||||
vllm_long_context_scaling_factor: Optional[float] = field(default=None)
|
||||
|
||||
def _validate_features(self):
|
||||
error_msg = []
|
||||
|
||||
if self.modules_to_save:
|
||||
error_msg.append("vLLM only supports modules_to_save being None.")
|
||||
if self.use_rslora:
|
||||
error_msg.append("vLLM does not yet support RSLoRA.")
|
||||
|
||||
if self.use_dora:
|
||||
error_msg.append("vLLM does not yet support DoRA.")
|
||||
@ -38,10 +41,15 @@ class PEFTHelper:
|
||||
|
||||
def __post_init__(self):
|
||||
self._validate_features()
|
||||
if self.use_rslora:
|
||||
print_info_once("Loading LoRA weights trained with rsLoRA.")
|
||||
self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r)
|
||||
else:
|
||||
self.vllm_lora_scaling_factor = self.lora_alpha / self.r
|
||||
if self.context_length:
|
||||
if self.vllm_max_position_embeddings is None:
|
||||
self.vllm_max_position_embeddings = self.context_length
|
||||
self.vllm_scaling_factor = float(
|
||||
self.vllm_long_context_scaling_factor = float(
|
||||
math.ceil(self.context_length /
|
||||
self.vllm_max_position_embeddings))
|
||||
|
||||
|
||||
@ -6,7 +6,7 @@ from vllm.logger import init_logger
|
||||
from vllm.model_executor.guided_decoding.utils import (
|
||||
convert_lark_to_gbnf, grammar_is_likely_lark,
|
||||
has_lmf_unsupported_json_features, has_xgrammar_unsupported_json_features)
|
||||
from vllm.platforms import CpuArchEnum, current_platform
|
||||
from vllm.platforms import CpuArchEnum
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from transformers import PreTrainedTokenizer
|
||||
@ -39,6 +39,7 @@ def maybe_backend_fallback(
|
||||
|
||||
if guided_params.backend == "xgrammar":
|
||||
# xgrammar only has x86 wheels for linux, fallback to outlines
|
||||
from vllm.platforms import current_platform
|
||||
if current_platform.get_cpu_architecture() is not CpuArchEnum.X86:
|
||||
logger.warning("xgrammar is only supported on x86 CPUs. "
|
||||
"Falling back to use outlines instead.")
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# noqa: UP007
|
||||
from __future__ import annotations
|
||||
|
||||
import copy
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Any
|
||||
@ -309,3 +310,7 @@ class XGrammarLogitsProcessor:
|
||||
scores = scores.to(device_type).squeeze()
|
||||
|
||||
return scores
|
||||
|
||||
def clone(self) -> XGrammarLogitsProcessor:
|
||||
"""Deepcopy due to per-sequence state in the matchers"""
|
||||
return copy.deepcopy(self)
|
||||
|
||||
@ -41,9 +41,20 @@ class FusedMoEMethodBase(QuantizeMethodBase):
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def apply(self, layer: torch.nn.Module, x: torch.Tensor,
|
||||
router_logits: torch.Tensor, top_k: int, renormalize: bool,
|
||||
use_grouped_topk: bool) -> torch.Tensor:
|
||||
def apply(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
renormalize: bool,
|
||||
use_grouped_topk: bool = False,
|
||||
topk_group: Optional[int] = None,
|
||||
num_expert_group: Optional[int] = None,
|
||||
custom_routing_function: Optional[Callable] = None,
|
||||
scoring_func: str = "softmax",
|
||||
e_score_correction_bias: Optional[torch.Tensor] = None
|
||||
) -> torch.Tensor:
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
@ -79,7 +90,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
renormalize: bool,
|
||||
use_grouped_topk: bool,
|
||||
use_grouped_topk: bool = False,
|
||||
topk_group: Optional[int] = None,
|
||||
num_expert_group: Optional[int] = None,
|
||||
custom_routing_function: Optional[Callable] = None,
|
||||
|
||||
@ -42,12 +42,14 @@ class MambaMixer(CustomOp):
|
||||
use_rms_norm: bool,
|
||||
rms_norm_has_weight: bool = True,
|
||||
rms_norm_eps: float = 1e-5,
|
||||
activation="silu"):
|
||||
activation="silu",
|
||||
is_lora_enabled: bool = False):
|
||||
super().__init__()
|
||||
self.time_step_rank = time_step_rank
|
||||
self.ssm_state_size = ssm_state_size
|
||||
self.use_rms_norm = use_rms_norm
|
||||
self.activation = activation
|
||||
self.is_lora_enabled = is_lora_enabled
|
||||
|
||||
self.conv1d = ColumnParallelLinear(
|
||||
input_size=conv_kernel_size,
|
||||
@ -63,6 +65,7 @@ class MambaMixer(CustomOp):
|
||||
self.in_proj = MergedColumnParallelLinear(hidden_size,
|
||||
[intermediate_size] * 2,
|
||||
bias=use_bias)
|
||||
|
||||
# selective projection used to make dt, B and C input dependent
|
||||
self.x_proj = RowParallelLinear(
|
||||
intermediate_size,
|
||||
@ -170,7 +173,13 @@ class MambaMixer(CustomOp):
|
||||
|
||||
# 3. State Space Model sequence transformation
|
||||
# 3.a. input varying initialization of time_step, B and C
|
||||
ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0]
|
||||
|
||||
if self.is_lora_enabled:
|
||||
# lora kernel requires contiguous tensor
|
||||
ssm_parameters = self.x_proj(
|
||||
hidden_states.transpose(-2, -1).contiguous())[0]
|
||||
else:
|
||||
ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0]
|
||||
|
||||
time_step, B, C = torch.split(
|
||||
ssm_parameters,
|
||||
@ -222,6 +231,11 @@ class MambaMixer(CustomOp):
|
||||
scan_outputs = scan_outputs.transpose(0, 1)
|
||||
|
||||
# 4. Final linear projection
|
||||
contextualized_states = self.out_proj(scan_outputs.transpose(-2,
|
||||
-1))[0]
|
||||
if self.is_lora_enabled:
|
||||
# lora kernel requires contiguous tensor
|
||||
contextualized_states = self.out_proj(
|
||||
scan_outputs.transpose(-2, -1).contiguous())[0]
|
||||
else:
|
||||
contextualized_states = self.out_proj(
|
||||
scan_outputs.transpose(-2, -1))[0]
|
||||
return contextualized_states
|
||||
|
||||
@ -440,11 +440,13 @@ class AWQMoEMethod(FusedMoEMethodBase):
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
top_k: int,
|
||||
renormalize: bool = True,
|
||||
renormalize: bool,
|
||||
use_grouped_topk: bool = False,
|
||||
num_expert_group: Optional[int] = None,
|
||||
topk_group: Optional[int] = None,
|
||||
num_expert_group: Optional[int] = None,
|
||||
custom_routing_function: Optional[Callable] = None,
|
||||
scoring_func: str = "softmax",
|
||||
e_score_correction_bias: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
topk_weights, topk_ids = FusedMoE.select_experts(
|
||||
hidden_states=x,
|
||||
@ -454,7 +456,9 @@ class AWQMoEMethod(FusedMoEMethodBase):
|
||||
renormalize=renormalize,
|
||||
topk_group=topk_group,
|
||||
num_expert_group=num_expert_group,
|
||||
custom_routing_function=custom_routing_function)
|
||||
custom_routing_function=custom_routing_function,
|
||||
scoring_func=scoring_func,
|
||||
e_score_correction_bias=e_score_correction_bias)
|
||||
|
||||
return torch.ops.vllm.fused_marlin_moe(
|
||||
x,
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user