diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml index 708e548727cf5..868b8e95db01d 100644 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -73,7 +73,7 @@ steps: # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" agents: queue: H100 - depends_on: block-h100 + depends_on: ~ plugins: - docker#v5.12.0: image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b563c96343f92..c6f8316412e2f 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -106,14 +106,12 @@ steps: source_file_dependencies: - vllm/ commands: - - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py - - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests @@ -333,8 +331,6 @@ steps: - vllm/ - tests/models commands: - - pip install -e ./plugins/vllm_add_dummy_model - - pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/test_registry.py - pytest -v -s models/test_initialization.py @@ -360,7 +356,7 @@ steps: - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' - pytest -v -s models/embedding/language -m 'not core_model' -- label: Multi-Modal Models Test (Standard) # 28min +- label: Multi-Modal Models Test (Standard) # 40min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -376,7 +372,7 @@ steps: - pytest -v -s models/encoder_decoder/language -m core_model - pytest -v -s models/encoder_decoder/vision_language -m core_model -- label: Multi-Modal Models Test (Extended) 1 # 1h16m +- label: Multi-Modal Models Test (Extended) 1 # 48m optional: true source_file_dependencies: - vllm/ @@ -469,11 +465,28 @@ steps: - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - - pip install -e ./plugins/vllm_add_dummy_model - - pytest -v -s distributed/test_distributed_oot.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py +- label: Plugin Tests (2 GPUs) # 40min + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + fast_check: true + source_file_dependencies: + - vllm/plugins/ + - tests/plugins/ + commands: + # begin platform plugin tests, all the code in-between runs on dummy platform + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins_tests/test_platform_plugins.py + 
- pip uninstall vllm_add_dummy_platform -y + # end platform plugin tests + # other tests continue here: + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s distributed/test_distributed_oot.py + - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process + - pytest -v -s models/test_oot_registration.py # it needs a clean process + - label: Multi-step Tests (4 GPUs) # 36min working_dir: "/vllm-workspace/tests" num_gpus: 4 diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml similarity index 100% rename from .github/ISSUE_TEMPLATE/400-bug report.yml rename to .github/ISSUE_TEMPLATE/400-bug-report.yml diff --git a/.github/ISSUE_TEMPLATE/500-feature request.yml b/.github/ISSUE_TEMPLATE/500-feature-request.yml similarity index 100% rename from .github/ISSUE_TEMPLATE/500-feature request.yml rename to .github/ISSUE_TEMPLATE/500-feature-request.yml diff --git a/.github/ISSUE_TEMPLATE/600-new model.yml b/.github/ISSUE_TEMPLATE/600-new-model.yml similarity index 100% rename from .github/ISSUE_TEMPLATE/600-new model.yml rename to .github/ISSUE_TEMPLATE/600-new-model.yml diff --git a/.github/ISSUE_TEMPLATE/700-performance discussion.yml b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml similarity index 100% rename from .github/ISSUE_TEMPLATE/700-performance discussion.yml rename to .github/ISSUE_TEMPLATE/700-performance-discussion.yml diff --git a/.github/ISSUE_TEMPLATE/800-misc discussion.yml b/.github/ISSUE_TEMPLATE/800-misc-discussion.yml similarity index 100% rename from .github/ISSUE_TEMPLATE/800-misc discussion.yml rename to .github/ISSUE_TEMPLATE/800-misc-discussion.yml diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e1f4296bca5c..9188707f5e6cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -225,13 +225,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") FetchContent_Declare( cutlass GIT_REPOSITORY https://github.com/nvidia/cutlass.git - GIT_TAG 8aa95dbb888be6d81c6fbf7169718c5244b53227 + GIT_TAG v3.6.0 GIT_PROGRESS TRUE # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history. # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags. # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE - GIT_SHALLOW FALSE + GIT_SHALLOW TRUE ) endif() FetchContent_MakeAvailable(cutlass) diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 77162bc82de62..269139fe90f0b 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -1,6 +1,6 @@ # default base image # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx -ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04" +ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04" FROM $BASE_IMAGE @@ -22,9 +22,9 @@ WORKDIR ${APP_MOUNT}/vllm RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas -RUN python3 -m pip install sentencepiece transformers==4.36.2 -U +RUN python3 -m pip install sentencepiece transformers==4.45.2 -U RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U -RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U +RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U COPY . . 
ARG GIT_REPO_CHECK=0 diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py new file mode 100644 index 0000000000000..13477ef535e86 --- /dev/null +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -0,0 +1,184 @@ +""" +Offline benchmark to test the long document QA throughput. + +Example usage: + # This command run the vllm with 50GB CPU memory for offloading + # The workload samples 8 different prompts with a default input + # length of 20000 tokens, then replicates each prompt 2 times + # in random order. + python benchmark_long_document_qa_throughput.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-documents 8 \ + --repeat-count 2 + +Commandline arguments: + --num-documents: The number of documents to sample prompts from. + + --document-length: The length of each document in tokens. + (Optional, default: 20000) + + --output-len: The number of tokens to generate for each prompt. + (Optional, default: 10) + + --repeat-count: The number of times to repeat each prompt. + (Optional, default: 2) + + --repeat-mode: The mode to repeat prompts. The supported modes are: + - 'random': shuffle the prompts randomly. (Default) + - 'tile': the entire prompt list is repeated in sequence. (Potentially + lowest cache hit) + - 'interleave': each prompt is repeated consecutively before + moving to the next element. (Highest cache hit) + + --shuffle-seed: Random seed when the repeat mode is "random". + (Optional, default: 0) + +In the meantime, it also supports all the vLLM engine args to initialize the +LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more +details. +""" + +import dataclasses +import random +import time + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def test_long_document_qa(llm=None, sampling_params=None, prompts=None): + """ + Test long document QA with the given prompts and sampling parameters. + Print the time spent in processing all the prompts. + + Args: + llm: The language model used for generating responses. + sampling_params: Sampling parameter used to generate the response. + prompts: A list of prompt strings to be processed by the LLM. + """ + start_time = time.time() + llm.generate(prompts, sampling_params=sampling_params) + end_time = time.time() + print(f"Time to execute all requests: {end_time - start_time:.4f} secs") + + +def repeat_prompts(prompts, repeat_count, mode: str): + """ + Repeat each prompt in the list for a specified number of times. + The order of prompts in the output list depends on the mode. + + Args: + prompts: A list of prompts to be repeated. + repeat_count: The number of times each prompt is repeated. + mode: The mode of repetition. Supported modes are: + - 'random': Shuffle the prompts randomly after repetition. + - 'tile': Repeat the entire prompt list in sequence. + Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3]. + - 'interleave': Repeat each prompt consecutively before moving to + the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3]. + + Returns: + A list of repeated prompts in the specified order. + + Raises: + ValueError: If an invalid mode is provided. 
+ """ + print("Repeat mode: ", mode) + if mode == 'random': + repeated_prompts = prompts * repeat_count + random.shuffle(repeated_prompts) + return repeated_prompts + elif mode == 'tile': + return prompts * repeat_count + elif mode == 'interleave': + repeated_prompts = [] + for prompt in prompts: + repeated_prompts.extend([prompt] * repeat_count) + return repeated_prompts + else: + raise ValueError(f"Invalid mode: {mode}, only support " + "'random', 'tile', 'interleave'") + + +def main(args): + random.seed(args.shuffle_seed) + + # Prepare the prompts: + # we append the document id at the beginning to avoid any of the document + # being the prefix of other documents + prompts = [ + str(i) + ' '.join(['hi'] * args.document_length) + for i in range(args.num_documents) + ] + + prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode) + + warmup_prompts = [ + "This is warm up request " + str(i) + \ + ' '.join(['hi'] * args.document_length) + for i in range(args.num_documents)] + + # Create the LLM engine + engine_args = EngineArgs.from_cli_args(args) + llm = LLM(**dataclasses.asdict(engine_args)) + sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) + + print("------warm up------") + test_long_document_qa( + llm=llm, + prompts=warmup_prompts, + sampling_params=sampling_params, + ) + + print("------start generating------") + test_long_document_qa( + llm=llm, + prompts=prompts, + sampling_params=sampling_params, + ) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description= + 'Benchmark the performance with or without automatic prefix caching.') + + parser.add_argument( + '--document-length', + type=int, + # Roughly the number of tokens for a system paper, + # excluding images + default=20000, + help='Range of input lengths for sampling prompts,' + 'specified as "min:max" (e.g., "128:256").') + + parser.add_argument('--num-documents', + type=int, + default=8, + help='Range of input lengths for sampling prompts,' + 'specified as "min:max" (e.g., "128:256").') + + parser.add_argument('--output-len', type=int, default=10) + + parser.add_argument('--repeat-count', + type=int, + default=2, + help='Number of times to repeat each prompt') + + parser.add_argument("--repeat-mode", + type=str, + default='random', + help='The mode to repeat prompts. The supported ' + 'modes are "random", "tile", and "interleave". 
' + 'See repeat_prompts() in the source code for details.') + + parser.add_argument("--shuffle-seed", + type=int, + default=0, + help='Random seed when the repeat mode is "random"') + + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + main(args) diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py index a5beea1a35e49..b401736c9824b 100644 --- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py +++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py @@ -14,9 +14,9 @@ class VLLMDataType(enum.Enum): class MixedInputKernelScheduleType(enum.Enum): - TmaWarpSpecializedMixedInput = enum_auto() - TmaWarpSpecializedPingpongMixedInput = enum_auto() - TmaWarpSpecializedCooperativeMixedInput = enum_auto() + TmaWarpSpecialized = enum_auto() + TmaWarpSpecializedPingpong = enum_auto() + TmaWarpSpecializedCooperative = enum_auto() VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = { @@ -68,11 +68,11 @@ VLLMKernelScheduleTag: Dict[Union[ MixedInputKernelScheduleType, KernelScheduleType], str] = { **KernelScheduleTag, # type: ignore **{ - MixedInputKernelScheduleType.TmaWarpSpecializedMixedInput: - "cutlass::gemm::KernelTmaWarpSpecializedMixedInput", - MixedInputKernelScheduleType.TmaWarpSpecializedPingpongMixedInput: - "cutlass::gemm::KernelTmaWarpSpecializedPingpongMixedInput", - MixedInputKernelScheduleType.TmaWarpSpecializedCooperativeMixedInput: - "cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput", + MixedInputKernelScheduleType.TmaWarpSpecialized: + "cutlass::gemm::KernelTmaWarpSpecialized", + MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: + "cutlass::gemm::KernelTmaWarpSpecializedPingpong", + MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: + "cutlass::gemm::KernelTmaWarpSpecializedCooperative", } } diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index ac63afe79a255..2df4d181902f8 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -189,7 +189,7 @@ using Kernel_{{type_sig}} = MacheteKernelTemplate< {{DataTypeTag[t.b_group_zeropoint]}}, // GroupZeroT {{DataTypeTag[t.b_channel_scale]}}, // ChannelScaleT {{DataTypeTag[t.a_token_scale]}}, // TokenScaleT - cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput, + cutlass::gemm::KernelTmaWarpSpecializedCooperative, Sch>; {% for sch in schs %} @@ -223,7 +223,7 @@ torch::Tensor prepack_B_dispatch(PrepackBArgs args) { {{DataTypeTag[t.convert]}}, // ElementConvert {{DataTypeTag[t.accumulator]}}, // Accumulator cutlass::layout::ColumnMajor, - cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput> + cutlass::gemm::KernelTmaWarpSpecializedCooperative> >(args.B); } {%- endfor %} @@ -239,7 +239,7 @@ torch::Tensor prepack_B_dispatch(PrepackBArgs args) { }; // namespace machete """ -TmaMI = MixedInputKernelScheduleType.TmaWarpSpecializedCooperativeMixedInput +TmaMI = MixedInputKernelScheduleType.TmaWarpSpecializedCooperative TmaCoop = EpilogueScheduleType.TmaWarpSpecializedCooperative @@ -300,7 +300,7 @@ def generate_sch_sig(schedule_config: ScheduleConfig) -> str: # mostly unique shorter sch_sig def generate_terse_sch_sig(schedule_config: ScheduleConfig) -> str: kernel_terse_names_replace = { - "KernelTmaWarpSpecializedCooperativeMixedInput_": "TmaMI_", + "KernelTmaWarpSpecializedCooperative": "TmaMI_", "TmaWarpSpecializedCooperative_": "TmaCoop_", "StreamKScheduler": "streamK", } diff --git 
a/csrc/quantization/machete/machete_collective_builder.cuh b/csrc/quantization/machete/machete_collective_builder.cuh index a74cf8b2dd455..ee825583dee1a 100644 --- a/csrc/quantization/machete/machete_collective_builder.cuh +++ b/csrc/quantization/machete/machete_collective_builder.cuh @@ -18,16 +18,14 @@ struct VLLMCollectiveBuilder< ElementAccumulator, TileShape_MNK, ClusterShape_MNK, StageCountType, KernelScheduleType, cute::enable_if_t<( + cute::is_same_v || + cute::is_same_v || cute::is_same_v || - cute::is_same_v || - cute::is_same_v)>> { + KernelTmaWarpSpecializedCooperative>)>> { using CollectiveOp = machete::MacheteCollectiveMma< ElementPairA_, GmemLayoutA_, AlignmentA, ElementPairB_, GmemLayoutB_, AlignmentB, ElementAccumulator, TileShape_MNK, ClusterShape_MNK, StageCountType, KernelScheduleType>; }; -}; // namespace cutlass::gemm::collective \ No newline at end of file +}; // namespace cutlass::gemm::collective diff --git a/csrc/quantization/machete/machete_mainloop.cuh b/csrc/quantization/machete/machete_mainloop.cuh index 816f33a1078e5..4071b19a3564d 100644 --- a/csrc/quantization/machete/machete_mainloop.cuh +++ b/csrc/quantization/machete/machete_mainloop.cuh @@ -66,13 +66,11 @@ struct MacheteCollectiveMma { using Schedule = KernelScheduleType; static_assert( cute::is_same_v || - cute::is_same_v || + cute::is_same_v || + cute::is_same_v || cute::is_same_v || - cute::is_same_v || cute::is_same_v || - cute::is_same_v, + cute::is_same_v, "KernelSchedule must be one of the warp specialized policies"); public: @@ -113,8 +111,7 @@ struct MacheteCollectiveMma { // For coop schedules we have two warp groups cooperatively issuing wgmma // instructions so we use 2 atoms along the M dim (one for each warpgroup) using AtomLayoutMNK = cute::conditional_t< - cute::is_same_v, + cute::is_same_v, Layout>, Layout>>; using TiledMma = decltype(cute::make_tiled_mma( diff --git a/csrc/quantization/machete/machete_prepacked_layout.cuh b/csrc/quantization/machete/machete_prepacked_layout.cuh index 680a858a893c1..81aaa6c4f3a28 100644 --- a/csrc/quantization/machete/machete_prepacked_layout.cuh +++ b/csrc/quantization/machete/machete_prepacked_layout.cuh @@ -98,8 +98,7 @@ struct PrepackedLayoutBTemplate { // For coop schedules we have two warp groups cooperatively issuing wgmma // instructions so we use 2 atoms along the M dim (one for each warpgroup) using AtomLayoutMNK = cute::conditional_t< - cute::is_same_v, + cute::is_same_v, Layout>, Layout>>; using TiledMma = decltype(cute::make_tiled_mma( @@ -247,4 +246,4 @@ struct PrepackedLayoutBTemplate { } }; -}; // namespace machete \ No newline at end of file +}; // namespace machete diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 4859c8ac08bea..25a700033cc9e 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -19,3 +19,4 @@ openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entr fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args requests +zmq diff --git a/docs/source/conf.py b/docs/source/conf.py index 1fe0474631140..71394c5302a39 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -191,6 +191,7 @@ def linkcode_resolve(domain, info): # Mock out external dependencies here, otherwise the autodoc pages may be blank. 
autodoc_mock_imports = [ + "blake3", "compressed_tensors", "cpuinfo", "cv2", @@ -207,7 +208,7 @@ autodoc_mock_imports = [ "tensorizer", "pynvml", "outlines", - "xgrammar," + "xgrammar", "librosa", "soundfile", "gguf", diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md index 6535414a7dca4..7ffec83333d7d 100644 --- a/docs/source/contributing/dockerfile/dockerfile.md +++ b/docs/source/contributing/dockerfile/dockerfile.md @@ -11,11 +11,11 @@ Below is a visual representation of the multi-stage Dockerfile. The build graph The edges of the build graph represent: -- FROM ... dependencies (with a solid line and a full arrow head) +- `FROM ...` dependencies (with a solid line and a full arrow head) -- COPY --from=... dependencies (with a dashed line and an empty arrow head) +- `COPY --from=...` dependencies (with a dashed line and an empty arrow head) -- RUN --mount=(.\*)from=... dependencies (with a dotted line and an empty diamond arrow head) +- `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head) > ```{figure} ../../assets/dev/dockerfile-stages-dependency.png > :align: center diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index 9dac41cff0bcb..c960790f47a13 100644 --- a/docs/source/contributing/overview.md +++ b/docs/source/contributing/overview.md @@ -34,7 +34,7 @@ pytest tests/ ``` ```{note} -Currently, the repository does not pass the `mypy` tests. +Currently, the repository is not fully checked by `mypy`. ``` # Contribution Guidelines diff --git a/docs/source/design/multimodal/multimodal_index.md b/docs/source/design/multimodal/multimodal_index.md index 88af07afc7018..e4f2171e84ff7 100644 --- a/docs/source/design/multimodal/multimodal_index.md +++ b/docs/source/design/multimodal/multimodal_index.md @@ -45,31 +45,23 @@ adding_multimodal_plugin ### Base Classes ```{eval-rst} -.. autodata:: vllm.multimodal.NestedTensors -``` - -```{eval-rst} -.. autodata:: vllm.multimodal.BatchedTensorInputs -``` - -```{eval-rst} -.. autoclass:: vllm.multimodal.MultiModalDataBuiltins +.. automodule:: vllm.multimodal.base :members: :show-inheritance: ``` -```{eval-rst} -.. autodata:: vllm.multimodal.MultiModalDataDict -``` +### Input Classes ```{eval-rst} -.. autoclass:: vllm.multimodal.MultiModalKwargs +.. automodule:: vllm.multimodal.inputs :members: :show-inheritance: ``` +### Audio Classes + ```{eval-rst} -.. autoclass:: vllm.multimodal.MultiModalPlugin +.. automodule:: vllm.multimodal.audio :members: :show-inheritance: ``` @@ -81,3 +73,11 @@ adding_multimodal_plugin :members: :show-inheritance: ``` + +### Video Classes + +```{eval-rst} +.. automodule:: vllm.multimodal.video + :members: + :show-inheritance: +``` diff --git a/docs/source/design/plugin_system.md b/docs/source/design/plugin_system.md index 79aff757518f2..225030885f629 100644 --- a/docs/source/design/plugin_system.md +++ b/docs/source/design/plugin_system.md @@ -41,9 +41,11 @@ Every plugin has three parts: 2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the `entry_points` dictionary. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by their names using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name. 3. **Plugin value**: The fully qualified name of the function to register in the plugin system. 
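As an illustration of how these three parts fit together, here is a minimal sketch of such a plugin package; the package, module, and model names are placeholders for illustration, not code from this change:

```python
# setup.py of a hypothetical out-of-tree plugin package.
from setuptools import setup

setup(
    name="vllm_add_dummy_model",
    version="0.1",
    packages=["vllm_add_dummy_model"],
    entry_points={
        # group name: "plugin name = plugin value (module:function)"
        "vllm.general_plugins":
        ["register_dummy_model = vllm_add_dummy_model:register"],
    },
)


# vllm_add_dummy_model/__init__.py
def register():
    from vllm import ModelRegistry

    # Register an out-of-tree model class by its fully qualified name,
    # guarding against double registration if the plugin is loaded twice.
    if "MyLlava" not in ModelRegistry.get_supported_archs():
        ModelRegistry.register_model(
            "MyLlava",
            "vllm_add_dummy_model.my_llava:MyLlavaForConditionalGeneration")
```

Once the package is installed, vLLM discovers the entry point in the `vllm.general_plugins` group and calls `register()` at startup, unless the plugin is filtered out via `VLLM_PLUGINS`.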
In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module. -## What Can Plugins Do? +## Types of supported plugins -Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM. +- **General plugins** (with group name `vllm.general_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model inside the plugin function. + +- **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported. ## Guidelines for Writing Plugins diff --git a/docs/source/getting_started/arm-installation.md b/docs/source/getting_started/arm-installation.md index de807e198b4f6..799b597b3ad5d 100644 --- a/docs/source/getting_started/arm-installation.md +++ b/docs/source/getting_started/arm-installation.md @@ -20,7 +20,7 @@ Contents: ## Requirements - **Operating System**: Linux or macOS -- **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended) +- **Compiler**: `gcc/g++ >= 12.3.0` (optional, but recommended) - **Instruction Set Architecture (ISA)**: NEON support is required (arm-backend-quick-start-dockerfile)= diff --git a/docs/source/getting_started/cpu-installation.md b/docs/source/getting_started/cpu-installation.md index b6f181ace6274..c3d3f715ed804 100644 --- a/docs/source/getting_started/cpu-installation.md +++ b/docs/source/getting_started/cpu-installation.md @@ -24,7 +24,7 @@ Table of contents: ## Requirements - OS: Linux -- Compiler: gcc/g++>=12.3.0 (optional, recommended) +- Compiler: `gcc/g++>=12.3.0` (optional, recommended) - Instruction set architecture (ISA) requirement: AVX512 (optional, recommended) (cpu-backend-quick-start-dockerfile)= @@ -69,7 +69,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install ```{note} - AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. -- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building. +- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building. ``` (env-intro)= diff --git a/docs/source/getting_started/debugging.md b/docs/source/getting_started/debugging.md index 3b0029f2e88ce..19eb699572a08 100644 --- a/docs/source/getting_started/debugging.md +++ b/docs/source/getting_started/debugging.md @@ -197,4 +197,4 @@ if __name__ == '__main__': ## Known Issues - In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. 
The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759). -- To circumvent a NCCL [bug](https://github.com/NVIDIA/nccl/issues/1234) , all vLLM processes will set an environment variable ``NCCL_CUMEM_ENABLE=0`` to disable NCCL's ``cuMem`` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up a NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in the [RLHF integration](https://github.com/OpenRLHF/OpenRLHF/pull/604) and the [discussion](gh-issue:5723#issuecomment-2554389656) . +- To circumvent a NCCL [bug](https://github.com/NVIDIA/nccl/issues/1234) , all vLLM processes will set an environment variable `NCCL_CUMEM_ENABLE=0` to disable NCCL's `cuMem` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up a NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in the [RLHF integration](https://github.com/OpenRLHF/OpenRLHF/pull/604) and the [discussion](gh-issue:5723#issuecomment-2554389656) . diff --git a/docs/source/getting_started/gaudi-installation.md b/docs/source/getting_started/gaudi-installation.md index acf42f210dffb..1f2ee62860dec 100644 --- a/docs/source/getting_started/gaudi-installation.md +++ b/docs/source/getting_started/gaudi-installation.md @@ -141,32 +141,33 @@ Gaudi2 devices. Configurations that are not listed may or may not work. Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag. -```{eval-rst} -.. list-table:: vLLM execution modes - :widths: 25 25 50 - :header-rows: 1 +```{list-table} vLLM execution modes +:widths: 25 25 50 +:header-rows: 1 - * - ``PT_HPU_LAZY_MODE`` - - ``enforce_eager`` - - execution mode - * - 0 - - 0 - - torch.compile - * - 0 - - 1 - - PyTorch eager mode - * - 1 - - 0 - - HPU Graphs - * - 1 - - 1 - - PyTorch lazy mode +* - `PT_HPU_LAZY_MODE` + - `enforce_eager` + - execution mode +* - 0 + - 0 + - torch.compile +* - 0 + - 1 + - PyTorch eager mode +* - 1 + - 0 + - HPU Graphs +* - 1 + - 1 + - PyTorch lazy mode ``` ```{warning} In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. ``` +(gaudi-bucketing-mechanism)= + ### Bucketing mechanism Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. 
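Referring back to the execution-mode table above, the two knobs compose as shown in this illustrative sketch (the model name is a placeholder, not part of this change):

```python
import os

from vllm import LLM

# PT_HPU_LAZY_MODE selects the HPU PyTorch Bridge backend, so set it before
# starting vLLM. Per the table: lazy mode 1 with enforce_eager=False selects
# HPU Graphs, the recommended combination for best performance in 1.18.0.
os.environ["PT_HPU_LAZY_MODE"] = "1"

llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", enforce_eager=False)
```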
@@ -185,7 +186,7 @@ INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, ma INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] ``` -`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling - `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. +`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling -- `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. Example (with ramp-up) @@ -214,7 +215,7 @@ If a request exceeds maximum bucket size in any dimension, it will be processed As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket. ```{note} -Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. +Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. ``` ### Warmup @@ -235,7 +236,7 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB ``` -This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. +This example uses the same buckets as in the [Bucketing Mechanism](#gaudi-bucketing-mechanism) section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. 
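To make the `min`/`step`/`max` rule and the padding behaviour concrete, here is a small illustrative sketch (not vLLM code; the bucket configuration values are arbitrary examples):

```python
def generate_buckets(bmin, bstep, bmax):
    """Buckets per the rule above: a ramp-up phase that doubles `min` until
    `step` is reached, followed by a stable phase in increments of `step`."""
    ramp_up, value = [], bmin
    while value < bstep:
        ramp_up.append(value)
        value *= 2
    stable = list(range(bstep, bmax + 1, bstep))
    return ramp_up + stable


def pad_to_bucket(value, buckets):
    """Smallest bucket that fits `value` (e.g. a batch of 3 sequences with a
    longest prompt of 412 tokens lands in the (4, 512) prefill bucket)."""
    return min(b for b in buckets if b >= value)


batch_buckets = generate_buckets(2, 32, 64)      # [2, 4, 8, 16, 32, 64]
seq_buckets = generate_buckets(128, 128, 2048)   # [128, 256, ..., 2048]
print(pad_to_bucket(3, batch_buckets), pad_to_bucket(412, seq_buckets))  # 4 512
```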
```{tip} Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. diff --git a/docs/source/getting_started/neuron-installation.md b/docs/source/getting_started/neuron-installation.md index d6de5760cc82c..baaeeb9f53a10 100644 --- a/docs/source/getting_started/neuron-installation.md +++ b/docs/source/getting_started/neuron-installation.md @@ -26,7 +26,7 @@ Installation steps: (build-from-source-neuron)= ```{note} -The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with vLLM >= 0.5.3. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. +The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. ``` ## Build from source diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index 165e5df146dcd..9c8b7e4f592c9 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -114,7 +114,7 @@ $ "temperature": 0 $ }' ``` -Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` python package: +Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package: ```python from openai import OpenAI @@ -151,7 +151,7 @@ $ ] $ }' ``` -Alternatively, you can use the `openai` python package: +Alternatively, you can use the `openai` Python package: ```python from openai import OpenAI diff --git a/docs/source/getting_started/tpu-installation.md b/docs/source/getting_started/tpu-installation.md index f2a949e7247d8..4d3ac541c90ce 100644 --- a/docs/source/getting_started/tpu-installation.md +++ b/docs/source/getting_started/tpu-installation.md @@ -68,33 +68,32 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ --service-account SERVICE_ACCOUNT ``` -```{eval-rst} -.. list-table:: Parameter descriptions - :header-rows: 1 +```{list-table} Parameter descriptions +:header-rows: 1 - * - Parameter name - - Description - * - QUEUED_RESOURCE_ID - - The user-assigned ID of the queued resource request. - * - TPU_NAME - - The user-assigned name of the TPU which is created when the queued - resource request is allocated. - * - PROJECT_ID - - Your Google Cloud project - * - ZONE - - The GCP zone where you want to create your Cloud TPU. The value you use - depends on the version of TPUs you are using. For more information, see - `TPU regions and zones `_ - * - ACCELERATOR_TYPE - - The TPU version you want to use. Specify the TPU version, for example - `v5litepod-4` specifies a v5e TPU with 4 cores. For more information, - see `TPU versions `_. - * - RUNTIME_VERSION - - The TPU VM runtime version to use. For more information see `TPU VM images `_. 
- * - SERVICE_ACCOUNT - - The email address for your service account. You can find it in the IAM - Cloud Console under *Service Accounts*. For example: - `tpu-service-account@.iam.gserviceaccount.com` +* - Parameter name + - Description +* - QUEUED_RESOURCE_ID + - The user-assigned ID of the queued resource request. +* - TPU_NAME + - The user-assigned name of the TPU which is created when the queued + resource request is allocated. +* - PROJECT_ID + - Your Google Cloud project +* - ZONE + - The GCP zone where you want to create your Cloud TPU. The value you use + depends on the version of TPUs you are using. For more information, see + `TPU regions and zones `_ +* - ACCELERATOR_TYPE + - The TPU version you want to use. Specify the TPU version, for example + `v5litepod-4` specifies a v5e TPU with 4 cores. For more information, + see `TPU versions `_. +* - RUNTIME_VERSION + - The TPU VM runtime version to use. For more information see `TPU VM images `_. +* - SERVICE_ACCOUNT + - The email address for your service account. You can find it in the IAM + Cloud Console under *Service Accounts*. For example: + `tpu-service-account@.iam.gserviceaccount.com` ``` Connect to your TPU using SSH: @@ -103,7 +102,7 @@ Connect to your TPU using SSH: gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE ``` -Install Miniconda +Install Miniconda: ```bash wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 95add0d71bbab..7682ed104b8c5 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -72,291 +72,290 @@ See [this page](#generative-models) for more information on how to use generativ #### Text Generation (`--task generate`) -```{eval-rst} -.. list-table:: - :widths: 25 25 50 5 5 - :header-rows: 1 +```{list-table} +:widths: 25 25 50 5 5 +:header-rows: 1 - * - Architecture - - Models - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - * - :code:`AquilaForCausalLM` - - Aquila, Aquila2 - - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. - - ✅︎ - - ✅︎ - * - :code:`ArcticForCausalLM` - - Arctic - - :code:`Snowflake/snowflake-arctic-base`, :code:`Snowflake/snowflake-arctic-instruct`, etc. - - - - ✅︎ - * - :code:`BaiChuanForCausalLM` - - Baichuan2, Baichuan - - :code:`baichuan-inc/Baichuan2-13B-Chat`, :code:`baichuan-inc/Baichuan-7B`, etc. - - ✅︎ - - ✅︎ - * - :code:`BloomForCausalLM` - - BLOOM, BLOOMZ, BLOOMChat - - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. - - - - ✅︎ - * - :code:`BartForConditionalGeneration` - - BART - - :code:`facebook/bart-base`, :code:`facebook/bart-large-cnn`, etc. - - - - - * - :code:`ChatGLMModel` - - ChatGLM - - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. - - ✅︎ - - ✅︎ - * - :code:`CohereForCausalLM`,:code:`Cohere2ForCausalLM` - - Command-R - - :code:`CohereForAI/c4ai-command-r-v01`, :code:`CohereForAI/c4ai-command-r7b-12-2024`, etc. - - ✅︎ - - ✅︎ - * - :code:`DbrxForCausalLM` - - DBRX - - :code:`databricks/dbrx-base`, :code:`databricks/dbrx-instruct`, etc. - - - - ✅︎ - * - :code:`DeciLMForCausalLM` - - DeciLM - - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc. - - - - ✅︎ - * - :code:`DeepseekForCausalLM` - - DeepSeek - - :code:`deepseek-ai/deepseek-llm-67b-base`, :code:`deepseek-ai/deepseek-llm-7b-chat` etc. - - - - ✅︎ - * - :code:`DeepseekV2ForCausalLM` - - DeepSeek-V2 - - :code:`deepseek-ai/DeepSeek-V2`, :code:`deepseek-ai/DeepSeek-V2-Chat` etc. 
- - - - ✅︎ - * - :code:`DeepseekV3ForCausalLM` - - DeepSeek-V3 - - :code:`deepseek-ai/DeepSeek-V3-Base`, :code:`deepseek-ai/DeepSeek-V3` etc. - - - - ✅︎ - * - :code:`ExaoneForCausalLM` - - EXAONE-3 - - :code:`LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`FalconForCausalLM` - - Falcon - - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. - - - - ✅︎ - * - :code:`FalconMambaForCausalLM` - - FalconMamba - - :code:`tiiuae/falcon-mamba-7b`, :code:`tiiuae/falcon-mamba-7b-instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`GemmaForCausalLM` - - Gemma - - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc. - - ✅︎ - - ✅︎ - * - :code:`Gemma2ForCausalLM` - - Gemma2 - - :code:`google/gemma-2-9b`, :code:`google/gemma-2-27b`, etc. - - ✅︎ - - ✅︎ - * - :code:`GlmForCausalLM` - - GLM-4 - - :code:`THUDM/glm-4-9b-chat-hf`, etc. - - ✅︎ - - ✅︎ - * - :code:`GPT2LMHeadModel` - - GPT-2 - - :code:`gpt2`, :code:`gpt2-xl`, etc. - - - - ✅︎ - * - :code:`GPTBigCodeForCausalLM` - - StarCoder, SantaCoder, WizardCoder - - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. - - ✅︎ - - ✅︎ - * - :code:`GPTJForCausalLM` - - GPT-J - - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. - - - - ✅︎ - * - :code:`GPTNeoXForCausalLM` - - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM - - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc. - - - - ✅︎ - * - :code:`GraniteForCausalLM` - - Granite 3.0, Granite 3.1, PowerLM - - :code:`ibm-granite/granite-3.0-2b-base`, :code:`ibm-granite/granite-3.1-8b-instruct`, :code:`ibm/PowerLM-3b`, etc. - - ✅︎ - - ✅︎ - * - :code:`GraniteMoeForCausalLM` - - Granite 3.0 MoE, PowerMoE - - :code:`ibm-granite/granite-3.0-1b-a400m-base`, :code:`ibm-granite/granite-3.0-3b-a800m-instruct`, :code:`ibm/PowerMoE-3b`, etc. - - ✅︎ - - ✅︎ - * - :code:`GritLM` - - GritLM - - :code:`parasail-ai/GritLM-7B-vllm`. - - ✅︎ - - ✅︎ - * - :code:`InternLMForCausalLM` - - InternLM - - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc. - - ✅︎ - - ✅︎ - * - :code:`InternLM2ForCausalLM` - - InternLM2 - - :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc. - - ✅︎ - - ✅︎ - * - :code:`JAISLMHeadModel` - - Jais - - :code:`inceptionai/jais-13b`, :code:`inceptionai/jais-13b-chat`, :code:`inceptionai/jais-30b-v3`, :code:`inceptionai/jais-30b-chat-v3`, etc. - - - - ✅︎ - * - :code:`JambaForCausalLM` - - Jamba - - :code:`ai21labs/AI21-Jamba-1.5-Large`, :code:`ai21labs/AI21-Jamba-1.5-Mini`, :code:`ai21labs/Jamba-v0.1`, etc. - - ✅︎ - - ✅︎ - * - :code:`LlamaForCausalLM` - - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi - - :code:`meta-llama/Meta-Llama-3.1-405B-Instruct`, :code:`meta-llama/Meta-Llama-3.1-70B`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-70b-hf`, :code:`01-ai/Yi-34B`, etc. - - ✅︎ - - ✅︎ - * - :code:`MambaForCausalLM` - - Mamba - - :code:`state-spaces/mamba-130m-hf`, :code:`state-spaces/mamba-790m-hf`, :code:`state-spaces/mamba-2.8b-hf`, etc. - - - - ✅︎ - * - :code:`MiniCPMForCausalLM` - - MiniCPM - - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, :code:`openbmb/MiniCPM-S-1B-sft`, etc. - - ✅︎ - - ✅︎ - * - :code:`MiniCPM3ForCausalLM` - - MiniCPM3 - - :code:`openbmb/MiniCPM3-4B`, etc. 
- - ✅︎ - - ✅︎ - * - :code:`MistralForCausalLM` - - Mistral, Mistral-Instruct - - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc. - - ✅︎ - - ✅︎ - * - :code:`MixtralForCausalLM` - - Mixtral-8x7B, Mixtral-8x7B-Instruct - - :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, :code:`mistral-community/Mixtral-8x22B-v0.1`, etc. - - ✅︎ - - ✅︎ - * - :code:`MPTForCausalLM` - - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter - - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. - - - - ✅︎ - * - :code:`NemotronForCausalLM` - - Nemotron-3, Nemotron-4, Minitron - - :code:`nvidia/Minitron-8B-Base`, :code:`mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. - - ✅︎ - - ✅︎ - * - :code:`OLMoForCausalLM` - - OLMo - - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc. - - - - ✅︎ - * - :code:`OLMo2ForCausalLM` - - OLMo2 - - :code:`allenai/OLMo2-7B-1124`, etc. - - - - ✅︎ - * - :code:`OLMoEForCausalLM` - - OLMoE - - :code:`allenai/OLMoE-1B-7B-0924`, :code:`allenai/OLMoE-1B-7B-0924-Instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`OPTForCausalLM` - - OPT, OPT-IML - - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. - - - - ✅︎ - * - :code:`OrionForCausalLM` - - Orion - - :code:`OrionStarAI/Orion-14B-Base`, :code:`OrionStarAI/Orion-14B-Chat`, etc. - - - - ✅︎ - * - :code:`PhiForCausalLM` - - Phi - - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc. - - ✅︎ - - ✅︎ - * - :code:`Phi3ForCausalLM` - - Phi-3 - - :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, :code:`microsoft/Phi-3-medium-128k-instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`Phi3SmallForCausalLM` - - Phi-3-Small - - :code:`microsoft/Phi-3-small-8k-instruct`, :code:`microsoft/Phi-3-small-128k-instruct`, etc. - - - - ✅︎ - * - :code:`PhiMoEForCausalLM` - - Phi-3.5-MoE - - :code:`microsoft/Phi-3.5-MoE-instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`PersimmonForCausalLM` - - Persimmon - - :code:`adept/persimmon-8b-base`, :code:`adept/persimmon-8b-chat`, etc. - - - - ✅︎ - * - :code:`QWenLMHeadModel` - - Qwen - - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. - - ✅︎ - - ✅︎ - * - :code:`Qwen2ForCausalLM` - - Qwen2 - - :code:`Qwen/QwQ-32B-Preview`, :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc. - - ✅︎ - - ✅︎ - * - :code:`Qwen2MoeForCausalLM` - - Qwen2MoE - - :code:`Qwen/Qwen1.5-MoE-A2.7B`, :code:`Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. - - - - ✅︎ - * - :code:`StableLmForCausalLM` - - StableLM - - :code:`stabilityai/stablelm-3b-4e1t`, :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc. - - - - ✅︎ - * - :code:`Starcoder2ForCausalLM` - - Starcoder2 - - :code:`bigcode/starcoder2-3b`, :code:`bigcode/starcoder2-7b`, :code:`bigcode/starcoder2-15b`, etc. - - - - ✅︎ - * - :code:`SolarForCausalLM` - - Solar Pro - - :code:`upstage/solar-pro-preview-instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`TeleChat2ForCausalLM` - - TeleChat2 - - :code:`TeleAI/TeleChat2-3B`, :code:`TeleAI/TeleChat2-7B`, :code:`TeleAI/TeleChat2-35B`, etc. - - ✅︎ - - ✅︎ - * - :code:`XverseForCausalLM` - - XVERSE - - :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc. - - ✅︎ - - ✅︎ +* - Architecture + - Models + - Example HF Models + - [LoRA](#lora-adapter) + - [PP](#distributed-serving) +* - `AquilaForCausalLM` + - Aquila, Aquila2 + - `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. 
+ - ✅︎ + - ✅︎ +* - `ArcticForCausalLM` + - Arctic + - `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. + - + - ✅︎ +* - `BaiChuanForCausalLM` + - Baichuan2, Baichuan + - `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. + - ✅︎ + - ✅︎ +* - `BloomForCausalLM` + - BLOOM, BLOOMZ, BLOOMChat + - `bigscience/bloom`, `bigscience/bloomz`, etc. + - + - ✅︎ +* - `BartForConditionalGeneration` + - BART + - `facebook/bart-base`, `facebook/bart-large-cnn`, etc. + - + - +* - `ChatGLMModel` + - ChatGLM + - `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc. + - ✅︎ + - ✅︎ +* - `CohereForCausalLM`, `Cohere2ForCausalLM` + - Command-R + - `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. + - ✅︎ + - ✅︎ +* - `DbrxForCausalLM` + - DBRX + - `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. + - + - ✅︎ +* - `DeciLMForCausalLM` + - DeciLM + - `Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc. + - + - ✅︎ +* - `DeepseekForCausalLM` + - DeepSeek + - `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. + - + - ✅︎ +* - `DeepseekV2ForCausalLM` + - DeepSeek-V2 + - `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. + - + - ✅︎ +* - `DeepseekV3ForCausalLM` + - DeepSeek-V3 + - `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. + - + - ✅︎ +* - `ExaoneForCausalLM` + - EXAONE-3 + - `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. + - ✅︎ + - ✅︎ +* - `FalconForCausalLM` + - Falcon + - `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. + - + - ✅︎ +* - `FalconMambaForCausalLM` + - FalconMamba + - `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. + - ✅︎ + - ✅︎ +* - `GemmaForCausalLM` + - Gemma + - `google/gemma-2b`, `google/gemma-7b`, etc. + - ✅︎ + - ✅︎ +* - `Gemma2ForCausalLM` + - Gemma2 + - `google/gemma-2-9b`, `google/gemma-2-27b`, etc. + - ✅︎ + - ✅︎ +* - `GlmForCausalLM` + - GLM-4 + - `THUDM/glm-4-9b-chat-hf`, etc. + - ✅︎ + - ✅︎ +* - `GPT2LMHeadModel` + - GPT-2 + - `gpt2`, `gpt2-xl`, etc. + - + - ✅︎ +* - `GPTBigCodeForCausalLM` + - StarCoder, SantaCoder, WizardCoder + - `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. + - ✅︎ + - ✅︎ +* - `GPTJForCausalLM` + - GPT-J + - `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. + - + - ✅︎ +* - `GPTNeoXForCausalLM` + - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM + - `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. + - + - ✅︎ +* - `GraniteForCausalLM` + - Granite 3.0, Granite 3.1, PowerLM + - `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. + - ✅︎ + - ✅︎ +* - `GraniteMoeForCausalLM` + - Granite 3.0 MoE, PowerMoE + - `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. + - ✅︎ + - ✅︎ +* - `GritLM` + - GritLM + - `parasail-ai/GritLM-7B-vllm`. + - ✅︎ + - ✅︎ +* - `InternLMForCausalLM` + - InternLM + - `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. + - ✅︎ + - ✅︎ +* - `InternLM2ForCausalLM` + - InternLM2 + - `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. + - ✅︎ + - ✅︎ +* - `JAISLMHeadModel` + - Jais + - `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. 
+ - + - ✅︎ +* - `JambaForCausalLM` + - Jamba + - `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. + - ✅︎ + - ✅︎ +* - `LlamaForCausalLM` + - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi + - `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. + - ✅︎ + - ✅︎ +* - `MambaForCausalLM` + - Mamba + - `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. + - + - ✅︎ +* - `MiniCPMForCausalLM` + - MiniCPM + - `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. + - ✅︎ + - ✅︎ +* - `MiniCPM3ForCausalLM` + - MiniCPM3 + - `openbmb/MiniCPM3-4B`, etc. + - ✅︎ + - ✅︎ +* - `MistralForCausalLM` + - Mistral, Mistral-Instruct + - `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. + - ✅︎ + - ✅︎ +* - `MixtralForCausalLM` + - Mixtral-8x7B, Mixtral-8x7B-Instruct + - `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. + - ✅︎ + - ✅︎ +* - `MPTForCausalLM` + - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter + - `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. + - + - ✅︎ +* - `NemotronForCausalLM` + - Nemotron-3, Nemotron-4, Minitron + - `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. + - ✅︎ + - ✅︎ +* - `OLMoForCausalLM` + - OLMo + - `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. + - + - ✅︎ +* - `OLMo2ForCausalLM` + - OLMo2 + - `allenai/OLMo2-7B-1124`, etc. + - + - ✅︎ +* - `OLMoEForCausalLM` + - OLMoE + - `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. + - ✅︎ + - ✅︎ +* - `OPTForCausalLM` + - OPT, OPT-IML + - `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. + - + - ✅︎ +* - `OrionForCausalLM` + - Orion + - `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. + - + - ✅︎ +* - `PhiForCausalLM` + - Phi + - `microsoft/phi-1_5`, `microsoft/phi-2`, etc. + - ✅︎ + - ✅︎ +* - `Phi3ForCausalLM` + - Phi-3 + - `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. + - ✅︎ + - ✅︎ +* - `Phi3SmallForCausalLM` + - Phi-3-Small + - `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. + - + - ✅︎ +* - `PhiMoEForCausalLM` + - Phi-3.5-MoE + - `microsoft/Phi-3.5-MoE-instruct`, etc. + - ✅︎ + - ✅︎ +* - `PersimmonForCausalLM` + - Persimmon + - `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. + - + - ✅︎ +* - `QWenLMHeadModel` + - Qwen + - `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. + - ✅︎ + - ✅︎ +* - `Qwen2ForCausalLM` + - Qwen2 + - `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. + - ✅︎ + - ✅︎ +* - `Qwen2MoeForCausalLM` + - Qwen2MoE + - `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. + - + - ✅︎ +* - `StableLmForCausalLM` + - StableLM + - `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. + - + - ✅︎ +* - `Starcoder2ForCausalLM` + - Starcoder2 + - `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. + - + - ✅︎ +* - `SolarForCausalLM` + - Solar Pro + - `upstage/solar-pro-preview-instruct`, etc. + - ✅︎ + - ✅︎ +* - `TeleChat2ForCausalLM` + - TeleChat2 + - `TeleAI/TeleChat2-3B`, `TeleAI/TeleChat2-7B`, `TeleAI/TeleChat2-35B`, etc. + - ✅︎ + - ✅︎ +* - `XverseForCausalLM` + - XVERSE + - `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. 
+ - ✅︎ + - ✅︎ ``` ```{note} @@ -374,51 +373,50 @@ you should explicitly specify the task type to ensure that the model is used in #### Text Embedding (`--task embed`) -```{eval-rst} -.. list-table:: - :widths: 25 25 50 5 5 - :header-rows: 1 +```{list-table} +:widths: 25 25 50 5 5 +:header-rows: 1 - * - Architecture - - Models - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - * - :code:`BertModel` - - BERT-based - - :code:`BAAI/bge-base-en-v1.5`, etc. - - - - - * - :code:`Gemma2Model` - - Gemma2-based - - :code:`BAAI/bge-multilingual-gemma2`, etc. - - - - ✅︎ - * - :code:`GritLM` - - GritLM - - :code:`parasail-ai/GritLM-7B-vllm`. - - ✅︎ - - ✅︎ - * - :code:`LlamaModel`, :code:`LlamaForCausalLM`, :code:`MistralModel`, etc. - - Llama-based - - :code:`intfloat/e5-mistral-7b-instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`Qwen2Model`, :code:`Qwen2ForCausalLM` - - Qwen2-based - - :code:`ssmits/Qwen2-7B-Instruct-embed-base` (see note), :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. - - ✅︎ - - ✅︎ - * - :code:`RobertaModel`, :code:`RobertaForMaskedLM` - - RoBERTa-based - - :code:`sentence-transformers/all-roberta-large-v1`, :code:`sentence-transformers/all-roberta-large-v1`, etc. - - - - - * - :code:`XLMRobertaModel` - - XLM-RoBERTa-based - - :code:`intfloat/multilingual-e5-large`, etc. - - - - +* - Architecture + - Models + - Example HF Models + - [LoRA](#lora-adapter) + - [PP](#distributed-serving) +* - `BertModel` + - BERT-based + - `BAAI/bge-base-en-v1.5`, etc. + - + - +* - `Gemma2Model` + - Gemma2-based + - `BAAI/bge-multilingual-gemma2`, etc. + - + - ✅︎ +* - `GritLM` + - GritLM + - `parasail-ai/GritLM-7B-vllm`. + - ✅︎ + - ✅︎ +* - `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. + - Llama-based + - `intfloat/e5-mistral-7b-instruct`, etc. + - ✅︎ + - ✅︎ +* - `Qwen2Model`, `Qwen2ForCausalLM` + - Qwen2-based + - `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. + - ✅︎ + - ✅︎ +* - `RobertaModel`, `RobertaForMaskedLM` + - RoBERTa-based + - `sentence-transformers/all-roberta-large-v1`, `sentence-transformers/all-roberta-large-v1`, etc. + - + - +* - `XLMRobertaModel` + - XLM-RoBERTa-based + - `intfloat/multilingual-e5-large`, etc. + - + - ``` ```{note} @@ -435,35 +433,39 @@ despite being described otherwise on its model card. ``` If your model is not in the above list, we will try to automatically convert the model using -:func:`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings +{func}`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings of the whole prompt are extracted from the normalized hidden state corresponding to the last token. #### Reward Modeling (`--task reward`) -```{eval-rst} -.. list-table:: - :widths: 25 25 50 5 5 - :header-rows: 1 +```{list-table} +:widths: 25 25 50 5 5 +:header-rows: 1 - * - Architecture - - Models - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - * - :code:`LlamaForCausalLM` - - Llama-based - - :code:`peiyi9979/math-shepherd-mistral-7b-prm`, etc. - - ✅︎ - - ✅︎ - * - :code:`Qwen2ForRewardModel` - - Qwen2-based - - :code:`Qwen/Qwen2.5-Math-RM-72B`, etc. - - ✅︎ - - ✅︎ +* - Architecture + - Models + - Example HF Models + - [LoRA](#lora-adapter) + - [PP](#distributed-serving) +* - `InternLM2ForRewardModel` + - InternLM2-based + - `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. + - ✅︎ + - ✅︎ +* - `LlamaForCausalLM` + - Llama-based + - `peiyi9979/math-shepherd-mistral-7b-prm`, etc. 
+ - ✅︎ + - ✅︎ +* - `Qwen2ForRewardModel` + - Qwen2-based + - `Qwen/Qwen2.5-Math-RM-72B`, etc. + - ✅︎ + - ✅︎ ``` If your model is not in the above list, we will try to automatically convert the model using -:func:`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly. +{func}`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly. ```{important} For process-supervised reward models such as {code}`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, @@ -472,58 +474,56 @@ e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 1 #### Classification (`--task classify`) -```{eval-rst} -.. list-table:: - :widths: 25 25 50 5 5 - :header-rows: 1 +```{list-table} +:widths: 25 25 50 5 5 +:header-rows: 1 - * - Architecture - - Models - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - * - :code:`JambaForSequenceClassification` - - Jamba - - :code:`ai21labs/Jamba-tiny-reward-dev`, etc. - - ✅︎ - - ✅︎ - * - :code:`Qwen2ForSequenceClassification` - - Qwen2-based - - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc. - - ✅︎ - - ✅︎ +* - Architecture + - Models + - Example HF Models + - [LoRA](#lora-adapter) + - [PP](#distributed-serving) +* - `JambaForSequenceClassification` + - Jamba + - `ai21labs/Jamba-tiny-reward-dev`, etc. + - ✅︎ + - ✅︎ +* - `Qwen2ForSequenceClassification` + - Qwen2-based + - `jason9693/Qwen2.5-1.5B-apeach`, etc. + - ✅︎ + - ✅︎ ``` If your model is not in the above list, we will try to automatically convert the model using -:func:`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. +{func}`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. #### Sentence Pair Scoring (`--task score`) -```{eval-rst} -.. list-table:: - :widths: 25 25 50 5 5 - :header-rows: 1 +```{list-table} +:widths: 25 25 50 5 5 +:header-rows: 1 - * - Architecture - - Models - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - * - :code:`BertForSequenceClassification` - - BERT-based - - :code:`cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. - - - - - * - :code:`RobertaForSequenceClassification` - - RoBERTa-based - - :code:`cross-encoder/quora-roberta-base`, etc. - - - - - * - :code:`XLMRobertaForSequenceClassification` - - XLM-RoBERTa-based - - :code:`BAAI/bge-reranker-v2-m3`, etc. - - - - +* - Architecture + - Models + - Example HF Models + - [LoRA](#lora-adapter) + - [PP](#distributed-serving) +* - `BertForSequenceClassification` + - BERT-based + - `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. + - + - +* - `RobertaForSequenceClassification` + - RoBERTa-based + - `cross-encoder/quora-roberta-base`, etc. + - + - +* - `XLMRobertaForSequenceClassification` + - XLM-RoBERTa-based + - `BAAI/bge-reranker-v2-m3`, etc. + - + - ``` (supported-mm-models)= @@ -553,186 +553,182 @@ See [this page](#generative-models) for more information on how to use generativ #### Text Generation (`--task generate`) -```{eval-rst} -.. 
list-table:: - :widths: 25 25 15 20 5 5 5 - :header-rows: 1 +```{list-table} +:widths: 25 25 15 20 5 5 5 +:header-rows: 1 - * - Architecture - - Models - - Inputs - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - - V1 - * - :code:`AriaForConditionalGeneration` - - Aria - - T + I - - :code:`rhymes-ai/Aria` - - - - ✅︎ - - - * - :code:`Blip2ForConditionalGeneration` - - BLIP-2 - - T + I\ :sup:`E` - - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc. - - - - ✅︎ - - - * - :code:`ChameleonForConditionalGeneration` - - Chameleon - - T + I - - :code:`facebook/chameleon-7b` etc. - - - - ✅︎ - - - * - :code:`FuyuForCausalLM` - - Fuyu - - T + I - - :code:`adept/fuyu-8b` etc. - - - - ✅︎ - - - * - :code:`ChatGLMModel` - - GLM-4V - - T + I - - :code:`THUDM/glm-4v-9b` etc. - - ✅︎ - - ✅︎ - - - * - :code:`H2OVLChatModel` - - H2OVL - - T + I\ :sup:`E+` - - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc. - - - - ✅︎ - - - * - :code:`Idefics3ForConditionalGeneration` - - Idefics3 - - T + I - - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc. - - ✅︎ - - - - - * - :code:`InternVLChatModel` - - InternVL 2.5, Mono-InternVL, InternVL 2.0 - - T + I\ :sup:`E+` - - :code:`OpenGVLab/InternVL2_5-4B`, :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, etc. - - - - ✅︎ - - ✅︎ - * - :code:`LlavaForConditionalGeneration` - - LLaVA-1.5 - - T + I\ :sup:`E+` - - :code:`llava-hf/llava-1.5-7b-hf`, :code:`TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. - - - - ✅︎ - - ✅︎ - * - :code:`LlavaNextForConditionalGeneration` - - LLaVA-NeXT - - T + I\ :sup:`E+` - - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - - - - ✅︎ - - - * - :code:`LlavaNextVideoForConditionalGeneration` - - LLaVA-NeXT-Video - - T + V - - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. - - - - ✅︎ - - - * - :code:`LlavaOnevisionForConditionalGeneration` - - LLaVA-Onevision - - T + I\ :sup:`+` + V\ :sup:`+` - - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. - - - - ✅︎ - - - * - :code:`MiniCPMV` - - MiniCPM-V - - T + I\ :sup:`E+` - - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc. - - ✅︎ - - ✅︎ - - - * - :code:`MllamaForConditionalGeneration` - - Llama 3.2 - - T + I\ :sup:`+` - - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc. - - - - - - - * - :code:`MolmoForCausalLM` - - Molmo - - T + I - - :code:`allenai/Molmo-7B-D-0924`, :code:`allenai/Molmo-72B-0924`, etc. - - - - ✅︎ - - ✅︎ - * - :code:`NVLM_D_Model` - - NVLM-D 1.0 - - T + I\ :sup:`E+` - - :code:`nvidia/NVLM-D-72B`, etc. - - - - ✅︎ - - ✅︎ - * - :code:`PaliGemmaForConditionalGeneration` - - PaliGemma, PaliGemma 2 - - T + I\ :sup:`E` - - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, :code:`google/paligemma2-3b-ft-docci-448`, etc. - - - - ✅︎ - - - * - :code:`Phi3VForCausalLM` - - Phi-3-Vision, Phi-3.5-Vision - - T + I\ :sup:`E+` - - :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc. - - - - ✅︎ - - ✅︎ - * - :code:`PixtralForConditionalGeneration` - - Pixtral - - T + I\ :sup:`+` - - :code:`mistralai/Pixtral-12B-2409`, :code:`mistral-community/pixtral-12b` etc. - - - - ✅︎ - - ✅︎ - * - :code:`QWenLMHeadModel` - - Qwen-VL - - T + I\ :sup:`E+` - - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc. 
- - ✅︎ - - ✅︎ - - - * - :code:`Qwen2AudioForConditionalGeneration` - - Qwen2-Audio - - T + A\ :sup:`+` - - :code:`Qwen/Qwen2-Audio-7B-Instruct` - - - - ✅︎ - - - * - :code:`Qwen2VLForConditionalGeneration` - - Qwen2-VL - - T + I\ :sup:`E+` + V\ :sup:`E+` - - :code:`Qwen/QVQ-72B-Preview`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. - - ✅︎ - - ✅︎ - - - * - :code:`UltravoxModel` - - Ultravox - - T + A\ :sup:`E+` - - :code:`fixie-ai/ultravox-v0_3` - - - - ✅︎ - - +* - Architecture + - Models + - Inputs + - Example HF Models + - [LoRA](#lora-adapter) + - [PP](#distributed-serving) + - [V1](gh-issue:8779) +* - `AriaForConditionalGeneration` + - Aria + - T + I+ + - `rhymes-ai/Aria` + - + - ✅︎ + - ✅︎ +* - `Blip2ForConditionalGeneration` + - BLIP-2 + - T + IE + - `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. + - + - ✅︎ + - ✅︎ +* - `ChameleonForConditionalGeneration` + - Chameleon + - T + I + - `facebook/chameleon-7b` etc. + - + - ✅︎ + - ✅︎ +* - `FuyuForCausalLM` + - Fuyu + - T + I + - `adept/fuyu-8b` etc. + - + - ✅︎ + - ✅︎ +* - `ChatGLMModel` + - GLM-4V + - T + I + - `THUDM/glm-4v-9b` etc. + - ✅︎ + - ✅︎ + - +* - `H2OVLChatModel` + - H2OVL + - T + IE+ + - `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. + - + - ✅︎ + - +* - `Idefics3ForConditionalGeneration` + - Idefics3 + - T + I + - `HuggingFaceM4/Idefics3-8B-Llama3` etc. + - ✅︎ + - + - +* - `InternVLChatModel` + - InternVL 2.5, Mono-InternVL, InternVL 2.0 + - T + IE+ + - `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. + - + - ✅︎ + - ✅︎ +* - `LlavaForConditionalGeneration` + - LLaVA-1.5 + - T + IE+ + - `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. + - + - ✅︎ + - ✅︎ +* - `LlavaNextForConditionalGeneration` + - LLaVA-NeXT + - T + IE+ + - `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. + - + - ✅︎ + - ✅︎ +* - `LlavaNextVideoForConditionalGeneration` + - LLaVA-NeXT-Video + - T + V + - `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. + - + - ✅︎ + - +* - `LlavaOnevisionForConditionalGeneration` + - LLaVA-Onevision + - T + I+ + V+ + - `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. + - + - ✅︎ + - +* - `MiniCPMV` + - MiniCPM-V + - T + IE+ + - `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. + - ✅︎ + - ✅︎ + - +* - `MllamaForConditionalGeneration` + - Llama 3.2 + - T + I+ + - `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. + - + - + - +* - `MolmoForCausalLM` + - Molmo + - T + I + - `allenai/Molmo-7B-D-0924`, `allenai/Molmo-72B-0924`, etc. + - ✅︎ + - ✅︎ + - ✅︎ +* - `NVLM_D_Model` + - NVLM-D 1.0 + - T + IE+ + - `nvidia/NVLM-D-72B`, etc. + - + - ✅︎ + - ✅︎ +* - `PaliGemmaForConditionalGeneration` + - PaliGemma, PaliGemma 2 + - T + IE + - `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. + - + - ✅︎ + - +* - `Phi3VForCausalLM` + - Phi-3-Vision, Phi-3.5-Vision + - T + IE+ + - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct` etc. + - + - ✅︎ + - ✅︎ +* - `PixtralForConditionalGeneration` + - Pixtral + - T + I+ + - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` etc. + - + - ✅︎ + - ✅︎ +* - `QWenLMHeadModel` + - Qwen-VL + - T + IE+ + - `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. 
+ - ✅︎ + - ✅︎ + - +* - `Qwen2AudioForConditionalGeneration` + - Qwen2-Audio + - T + A+ + - `Qwen/Qwen2-Audio-7B-Instruct` + - + - ✅︎ + - +* - `Qwen2VLForConditionalGeneration` + - Qwen2-VL + - T + IE+ + VE+ + - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. + - ✅︎ + - ✅︎ + - +* - `UltravoxModel` + - Ultravox + - T + AE+ + - `fixie-ai/ultravox-v0_3` + - + - ✅︎ + - ``` -```{eval-rst} -:sup:`E` Pre-computed embeddings can be inputted for this modality. - -:sup:`+` Multiple items can be inputted per text prompt for this modality. -``` +E Pre-computed embeddings can be inputted for this modality. ++ Multiple items can be inputted per text prompt for this modality. ````{important} To enable multiple multi-modal items per text prompt, you have to set {code}`limit_mm_per_prompt` (offline inference) @@ -755,8 +751,7 @@ vLLM currently only supports adding LoRA to the language backbone of multimodal ``` ```{note} -To use {code}`TIGER-Lab/Mantis-8B-siglip-llama3`, you have to install their GitHub repo ({code}`pip install git+https://github.com/TIGER-AI-Lab/Mantis.git`) -and pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. +To use {code}`TIGER-Lab/Mantis-8B-siglip-llama3`, you have pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. ``` ```{note} @@ -783,38 +778,37 @@ To get the best results, you should use pooling models that are specifically tra The following table lists those that are tested in vLLM. -```{eval-rst} -.. list-table:: - :widths: 25 25 15 25 5 5 - :header-rows: 1 +```{list-table} +:widths: 25 25 15 25 5 5 +:header-rows: 1 - * - Architecture - - Models - - Inputs - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - * - :code:`LlavaNextForConditionalGeneration` - - LLaVA-NeXT-based - - T / I - - :code:`royokong/e5-v` - - - - ✅︎ - * - :code:`Phi3VForCausalLM` - - Phi-3-Vision-based - - T + I - - :code:`TIGER-Lab/VLM2Vec-Full` - - 🚧 - - ✅︎ - * - :code:`Qwen2VLForConditionalGeneration` - - Qwen2-VL-based - - T + I - - :code:`MrLight/dse-qwen2-2b-mrl-v1` - - - - ✅︎ +* - Architecture + - Models + - Inputs + - Example HF Models + - [LoRA](#lora-adapter) + - [PP](#distributed-serving) +* - `LlavaNextForConditionalGeneration` + - LLaVA-NeXT-based + - T / I + - `royokong/e5-v` + - + - ✅︎ +* - `Phi3VForCausalLM` + - Phi-3-Vision-based + - T + I + - `TIGER-Lab/VLM2Vec-Full` + - 🚧 + - ✅︎ +* - `Qwen2VLForConditionalGeneration` + - Qwen2-VL-based + - T + I + - `MrLight/dse-qwen2-2b-mrl-v1` + - + - ✅︎ ``` -______________________________________________________________________ +_________________ # Model Support Policy diff --git a/docs/source/quantization/supported_hardware.md b/docs/source/quantization/supported_hardware.md index 843ee21627d78..7330c2f8aa194 100644 --- a/docs/source/quantization/supported_hardware.md +++ b/docs/source/quantization/supported_hardware.md @@ -4,121 +4,120 @@ The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: -```{eval-rst} -.. 
list-table:: - :header-rows: 1 - :widths: 20 8 8 8 8 8 8 8 8 8 8 +```{list-table} +:header-rows: 1 +:widths: 20 8 8 8 8 8 8 8 8 8 8 - * - Implementation - - Volta - - Turing - - Ampere - - Ada - - Hopper - - AMD GPU - - Intel GPU - - x86 CPU - - AWS Inferentia - - Google TPU - * - AWQ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - * - GPTQ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - * - Marlin (GPTQ/AWQ/FP8) - - ✗ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - * - INT8 (W8A8) - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✅︎ - - ✗ - - ✗ - * - FP8 (W8A8) - - ✗ - - ✗ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - * - AQLM - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - * - bitsandbytes - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - * - DeepSpeedFP - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - * - GGUF - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ +* - Implementation + - Volta + - Turing + - Ampere + - Ada + - Hopper + - AMD GPU + - Intel GPU + - x86 CPU + - AWS Inferentia + - Google TPU +* - AWQ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✅︎ + - ✅︎ + - ✗ + - ✗ +* - GPTQ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✅︎ + - ✅︎ + - ✗ + - ✗ +* - Marlin (GPTQ/AWQ/FP8) + - ✗ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ +* - INT8 (W8A8) + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✅︎ + - ✗ + - ✗ +* - FP8 (W8A8) + - ✗ + - ✗ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ +* - AQLM + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ +* - bitsandbytes + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ +* - DeepSpeedFP + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ +* - GGUF + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ ``` ## Notes: diff --git a/docs/source/serving/deploying_with_cerebrium.md b/docs/source/serving/deploying_with_cerebrium.md index 4863936236119..950064c8c1b10 100644 --- a/docs/source/serving/deploying_with_cerebrium.md +++ b/docs/source/serving/deploying_with_cerebrium.md @@ -33,7 +33,7 @@ docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04" vllm = "latest" ``` -Next, let us add our code to handle inference for the LLM of your choice(`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your main.py\`: +Next, let us add our code to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your `main.py`: ```python from vllm import LLM, SamplingParams @@ -55,13 +55,13 @@ def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): return {"results": results} ``` -Then, run the following code to deploy it to the cloud +Then, run the following code to deploy it to the cloud: ```console $ cerebrium deploy ``` -If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case /run) +If successful, you should be returned a CURL command that you can call inference against. 
Just remember to end the url with the function name you are calling (in our case` /run`) ```python curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ diff --git a/docs/source/serving/deploying_with_dstack.md b/docs/source/serving/deploying_with_dstack.md index 65ef1c0016208..381f5f786ca2c 100644 --- a/docs/source/serving/deploying_with_dstack.md +++ b/docs/source/serving/deploying_with_dstack.md @@ -25,7 +25,7 @@ $ cd vllm-dstack $ dstack init ``` -Next, to provision a VM instance with LLM of your choice(`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`: +Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`: ```yaml type: service diff --git a/docs/source/serving/deploying_with_helm.md b/docs/source/serving/deploying_with_helm.md index 3b26575827011..7286a0a88968f 100644 --- a/docs/source/serving/deploying_with_helm.md +++ b/docs/source/serving/deploying_with_helm.md @@ -43,209 +43,208 @@ chart **including persistent volumes** and deletes the release. ## Values -```{eval-rst} -.. list-table:: Values - :widths: 25 25 25 25 - :header-rows: 1 +```{list-table} +:widths: 25 25 25 25 +:header-rows: 1 - * - Key - - Type - - Default - - Description - * - autoscaling - - object - - {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} - - Autoscaling configuration - * - autoscaling.enabled - - bool - - false - - Enable autoscaling - * - autoscaling.maxReplicas - - int - - 100 - - Maximum replicas - * - autoscaling.minReplicas - - int - - 1 - - Minimum replicas - * - autoscaling.targetCPUUtilizationPercentage - - int - - 80 - - Target CPU utilization for autoscaling - * - configs - - object - - {} - - Configmap - * - containerPort - - int - - 8000 - - Container port - * - customObjects - - list - - [] - - Custom Objects configuration - * - deploymentStrategy - - object - - {} - - Deployment strategy configuration - * - externalConfigs - - list - - [] - - External configuration - * - extraContainers - - list - - [] - - Additional containers configuration - * - extraInit - - object - - {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} - - Additional configuration for the init container - * - extraInit.pvcStorage - - string - - "50Gi" - - Storage size of the s3 - * - extraInit.s3modelpath - - string - - "relative_s3_model_path/opt-125m" - - Path of the model on the s3 which hosts model weights and config files - * - extraInit.awsEc2MetadataDisabled - - boolean - - true - - Disables the use of the Amazon EC2 instance metadata service - * - extraPorts - - list - - [] - - Additional ports configuration - * - gpuModels - - list - - ["TYPE_GPU_USED"] - - Type of gpu used - * - image - - object - - {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} - - Image configuration - * - image.command - - list - - ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] - - Container launch command - * - image.repository - - string - - "vllm/vllm-openai" - - Image repository - * - image.tag - - string - - "latest" - - Image tag - * - livenessProbe - - object - - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} - - Liveness 
probe configuration - * - livenessProbe.failureThreshold - - int - - 3 - - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive - * - livenessProbe.httpGet - - object - - {"path":"/health","port":8000} - - Configuration of the Kubelet http request on the server - * - livenessProbe.httpGet.path - - string - - "/health" - - Path to access on the HTTP server - * - livenessProbe.httpGet.port - - int - - 8000 - - Name or number of the port to access on the container, on which the server is listening - * - livenessProbe.initialDelaySeconds - - int - - 15 - - Number of seconds after the container has started before liveness probe is initiated - * - livenessProbe.periodSeconds - - int - - 10 - - How often (in seconds) to perform the liveness probe - * - maxUnavailablePodDisruptionBudget - - string - - "" - - Disruption Budget Configuration - * - readinessProbe - - object - - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} - - Readiness probe configuration - * - readinessProbe.failureThreshold - - int - - 3 - - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready - * - readinessProbe.httpGet - - object - - {"path":"/health","port":8000} - - Configuration of the Kubelet http request on the server - * - readinessProbe.httpGet.path - - string - - "/health" - - Path to access on the HTTP server - * - readinessProbe.httpGet.port - - int - - 8000 - - Name or number of the port to access on the container, on which the server is listening - * - readinessProbe.initialDelaySeconds - - int - - 5 - - Number of seconds after the container has started before readiness probe is initiated - * - readinessProbe.periodSeconds - - int - - 5 - - How often (in seconds) to perform the readiness probe - * - replicaCount - - int - - 1 - - Number of replicas - * - resources - - object - - {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} - - Resource configuration - * - resources.limits."nvidia.com/gpu" - - int - - 1 - - Number of gpus used - * - resources.limits.cpu - - int - - 4 - - Number of CPUs - * - resources.limits.memory - - string - - "16Gi" - - CPU memory configuration - * - resources.requests."nvidia.com/gpu" - - int - - 1 - - Number of gpus used - * - resources.requests.cpu - - int - - 4 - - Number of CPUs - * - resources.requests.memory - - string - - "16Gi" - - CPU memory configuration - * - secrets - - object - - {} - - Secrets configuration - * - serviceName - - string - - - - Service name - * - servicePort - - int - - 80 - - Service port - * - labels.environment - - string - - test - - Environment name - * - labels.release - - string - - test - - Release name +* - Key + - Type + - Default + - Description +* - autoscaling + - object + - {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} + - Autoscaling configuration +* - autoscaling.enabled + - bool + - false + - Enable autoscaling +* - autoscaling.maxReplicas + - int + - 100 + - Maximum replicas +* - autoscaling.minReplicas + - int + - 1 + - Minimum replicas +* - autoscaling.targetCPUUtilizationPercentage + - int + - 80 + - Target CPU utilization for autoscaling +* - configs + - object + - {} + - Configmap +* - containerPort + - int + - 8000 + - Container port +* - customObjects + - list + - [] + - Custom Objects configuration +* - 
deploymentStrategy + - object + - {} + - Deployment strategy configuration +* - externalConfigs + - list + - [] + - External configuration +* - extraContainers + - list + - [] + - Additional containers configuration +* - extraInit + - object + - {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} + - Additional configuration for the init container +* - extraInit.pvcStorage + - string + - "50Gi" + - Storage size of the s3 +* - extraInit.s3modelpath + - string + - "relative_s3_model_path/opt-125m" + - Path of the model on the s3 which hosts model weights and config files +* - extraInit.awsEc2MetadataDisabled + - boolean + - true + - Disables the use of the Amazon EC2 instance metadata service +* - extraPorts + - list + - [] + - Additional ports configuration +* - gpuModels + - list + - ["TYPE_GPU_USED"] + - Type of gpu used +* - image + - object + - {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} + - Image configuration +* - image.command + - list + - ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] + - Container launch command +* - image.repository + - string + - "vllm/vllm-openai" + - Image repository +* - image.tag + - string + - "latest" + - Image tag +* - livenessProbe + - object + - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} + - Liveness probe configuration +* - livenessProbe.failureThreshold + - int + - 3 + - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive +* - livenessProbe.httpGet + - object + - {"path":"/health","port":8000} + - Configuration of the Kubelet http request on the server +* - livenessProbe.httpGet.path + - string + - "/health" + - Path to access on the HTTP server +* - livenessProbe.httpGet.port + - int + - 8000 + - Name or number of the port to access on the container, on which the server is listening +* - livenessProbe.initialDelaySeconds + - int + - 15 + - Number of seconds after the container has started before liveness probe is initiated +* - livenessProbe.periodSeconds + - int + - 10 + - How often (in seconds) to perform the liveness probe +* - maxUnavailablePodDisruptionBudget + - string + - "" + - Disruption Budget Configuration +* - readinessProbe + - object + - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} + - Readiness probe configuration +* - readinessProbe.failureThreshold + - int + - 3 + - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready +* - readinessProbe.httpGet + - object + - {"path":"/health","port":8000} + - Configuration of the Kubelet http request on the server +* - readinessProbe.httpGet.path + - string + - "/health" + - Path to access on the HTTP server +* - readinessProbe.httpGet.port + - int + - 8000 + - Name or number of the port to access on the container, on which the server is listening +* - readinessProbe.initialDelaySeconds + - int + - 5 + - Number of seconds after the container has started before readiness probe is initiated +* - readinessProbe.periodSeconds + - int + - 5 + - How often (in seconds) to perform the readiness probe +* - replicaCount + - int + - 1 + - Number of replicas +* - resources + - object + - 
{"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} + - Resource configuration +* - resources.limits."nvidia.com/gpu" + - int + - 1 + - Number of gpus used +* - resources.limits.cpu + - int + - 4 + - Number of CPUs +* - resources.limits.memory + - string + - "16Gi" + - CPU memory configuration +* - resources.requests."nvidia.com/gpu" + - int + - 1 + - Number of gpus used +* - resources.requests.cpu + - int + - 4 + - Number of CPUs +* - resources.requests.memory + - string + - "16Gi" + - CPU memory configuration +* - secrets + - object + - {} + - Secrets configuration +* - serviceName + - string + - + - Service name +* - servicePort + - int + - 80 + - Service port +* - labels.environment + - string + - test + - Environment name +* - labels.release + - string + - test + - Release name ``` diff --git a/docs/source/serving/deploying_with_k8s.md b/docs/source/serving/deploying_with_k8s.md index d27db826cd006..77f848088ea43 100644 --- a/docs/source/serving/deploying_with_k8s.md +++ b/docs/source/serving/deploying_with_k8s.md @@ -47,7 +47,11 @@ data: token: "REPLACE_WITH_TOKEN" ``` -Create a deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model: +Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model. + +Here are two examples for using NVIDIA GPU and AMD GPU. + +- NVIDIA GPU ```yaml apiVersion: apps/v1 @@ -119,6 +123,79 @@ spec: periodSeconds: 5 ``` +- AMD GPU + +You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X. + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mistral-7b + namespace: default + labels: + app: mistral-7b +spec: + replicas: 1 + selector: + matchLabels: + app: mistral-7b + template: + metadata: + labels: + app: mistral-7b + spec: + volumes: + # PVC + - name: cache-volume + persistentVolumeClaim: + claimName: mistral-7b + # vLLM needs to access the host's shared memory for tensor parallel inference. + - name: shm + emptyDir: + medium: Memory + sizeLimit: "8Gi" + hostNetwork: true + hostIPC: true + containers: + - name: mistral-7b + image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 + securityContext: + seccompProfile: + type: Unconfined + runAsGroup: 44 + capabilities: + add: + - SYS_PTRACE + command: ["/bin/sh", "-c"] + args: [ + "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" + ] + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + ports: + - containerPort: 8000 + resources: + limits: + cpu: "10" + memory: 20G + amd.com/gpu: "1" + requests: + cpu: "6" + memory: 6G + amd.com/gpu: "1" + volumeMounts: + - name: cache-volume + mountPath: /root/.cache/huggingface + - name: shm + mountPath: /dev/shm +``` +You can get the full example with steps and sample yaml files from . + 2. 
**Create a Kubernetes Service for vLLM** Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index c0a4b23f6dc70..7446b7c84cf46 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -8,7 +8,7 @@ Before going into the details of distributed inference and serving, let's first - **Single GPU (no distributed inference)**: If your model fits in a single GPU, you probably don't need to use distributed inference. Just use the single GPU to run the inference. - **Single-Node Multi-GPU (tensor parallel inference)**: If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you have 4 GPUs in a single node, you can set the tensor parallel size to 4. -- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2. +- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2. In short, you should increase the number of GPUs and the number of nodes until you have enough GPU memory to hold the model. The tensor parallel size should be the number of GPUs in each node, and the pipeline parallel size should be the number of nodes. @@ -77,7 +77,7 @@ Then you get a ray cluster of containers. Note that you need to keep the shells Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` to check the status of the Ray cluster. You should see the right number of nodes and GPUs. -After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: +After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: ```console $ vllm serve /path/to/the/model/in/the/container \ @@ -85,7 +85,7 @@ $ --tensor-parallel-size 8 \ $ --pipeline-parallel-size 2 ``` -You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. 
For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 16: +You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16: ```console $ vllm serve /path/to/the/model/in/the/container \ diff --git a/docs/source/serving/runai_model_streamer.md b/docs/source/serving/runai_model_streamer.md index 1b5756a95075a..d4269050ff574 100644 --- a/docs/source/serving/runai_model_streamer.md +++ b/docs/source/serving/runai_model_streamer.md @@ -41,7 +41,7 @@ For reading from S3, it will be the number of client instances the host is openi $ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' ``` -You can controls the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. +You can control the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit). ```console diff --git a/docs/source/usage/structured_outputs.md b/docs/source/usage/structured_outputs.md index 3f5d9ffc26278..7292012e36a26 100644 --- a/docs/source/usage/structured_outputs.md +++ b/docs/source/usage/structured_outputs.md @@ -2,7 +2,7 @@ # Structured Outputs -vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines) or [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer) as backends for the guided decoding. +vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines), [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer), or [xgrammar](https://github.com/mlc-ai/xgrammar) as backends for the guided decoding. This document shows you some examples of the different options that are available to generate structured outputs. 
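As a quick illustration before the per-mode sections below, here is a minimal sketch of exercising guided decoding through the OpenAI-compatible server. The model name, port, and backend flag are assumptions for whatever you actually serve; only the vLLM-specific `extra_body` fields are the point of the example.

```python
from openai import OpenAI

# Assumes a vLLM OpenAI-compatible server is already running locally, e.g.:
#   vllm serve Qwen/Qwen2.5-3B-Instruct --guided-decoding-backend xgrammar
# The served model name and port are placeholders for your own deployment.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.chat.completions.create(
    model="Qwen/Qwen2.5-3B-Instruct",
    messages=[{
        "role": "user",
        "content": "Classify this sentiment: vLLM is wonderful!",
    }],
    # vLLM-specific extension: constrain the output to one of these strings.
    extra_body={"guided_choice": ["positive", "negative"]},
)
print(completion.choices[0].message.content)
```

The same request without `guided_choice` may produce free-form text; with it, the chosen backend masks the logits so the reply is guaranteed to be one of the listed options.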
## Online Inference (OpenAI API) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index d5a71862656e7..b51bfae455267 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -24,10 +24,13 @@ def run_aria(question: str, modality: str): assert modality == "image" model_name = "rhymes-ai/Aria" + # NOTE: Need L40 (or equivalent) to avoid OOM llm = LLM(model=model_name, tokenizer_mode="slow", - trust_remote_code=True, dtype="bfloat16", + max_model_len=4096, + max_num_seqs=2, + trust_remote_code=True, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) prompt = (f"<|im_start|>user\n<|img|>\n{question}" @@ -57,6 +60,7 @@ def run_chameleon(question: str, modality: str): prompt = f"{question}" llm = LLM(model="facebook/chameleon-7b", max_model_len=4096, + max_num_seqs=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) stop_token_ids = None return llm, prompt, stop_token_ids @@ -257,7 +261,7 @@ def run_minicpmv(question: str, modality: str): # 2.5 # model_name = "openbmb/MiniCPM-Llama3-V-2_5" - #2.6 + # 2.6 model_name = "openbmb/MiniCPM-V-2_6" tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -308,7 +312,20 @@ def run_mllama(question: str, modality: str): disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) - prompt = f"<|image|><|begin_of_text|>{question}" + tokenizer = AutoTokenizer.from_pretrained(model_name) + messages = [{ + "role": + "user", + "content": [{ + "type": "image" + }, { + "type": "text", + "text": f"{question}" + }] + }] + prompt = tokenizer.apply_chat_template(messages, + add_generation_prompt=True, + tokenize=False) stop_token_ids = None return llm, prompt, stop_token_ids @@ -417,9 +434,11 @@ def run_pixtral_hf(question: str, modality: str): model_name = "mistral-community/pixtral-12b" + # NOTE: Need L40 (or equivalent) to avoid OOM llm = LLM( model=model_name, max_model_len=8192, + max_num_seqs=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) diff --git a/requirements-neuron.txt b/requirements-neuron.txt index 148fdbe0d6310..5e08d101fcd61 100644 --- a/requirements-neuron.txt +++ b/requirements-neuron.txt @@ -2,6 +2,6 @@ -r requirements-common.txt # Dependencies for Neuron devices -transformers-neuronx >= 0.12.0 -torch-neuronx >= 2.1.2 +transformers-neuronx >= 0.13.0 +torch-neuronx >= 2.5.0 neuronx-cc diff --git a/setup.py b/setup.py index 61d2d710aa20e..ba6953dbdc174 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,4 @@ +import ctypes import importlib.util import logging import os @@ -13,7 +14,7 @@ from packaging.version import Version, parse from setuptools import Extension, find_packages, setup from setuptools.command.build_ext import build_ext from setuptools_scm import get_version -from torch.utils.cpp_extension import CUDA_HOME +from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME def load_module_from_path(module_name, path): @@ -379,25 +380,31 @@ def _build_custom_ops() -> bool: return _is_cuda() or _is_hip() or _is_cpu() -def get_hipcc_rocm_version(): - # Run the hipcc --version command - result = subprocess.run(['hipcc', '--version'], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True) +def get_rocm_version(): + # Get the Rocm version from the ROCM_HOME/bin/librocm-core.so + # see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21 + try: + librocm_core_file = Path(ROCM_HOME) / "lib" / 
"librocm-core.so" + if not librocm_core_file.is_file(): + return None + librocm_core = ctypes.CDLL(librocm_core_file) + VerErrors = ctypes.c_uint32 + get_rocm_core_version = librocm_core.getROCmVersion + get_rocm_core_version.restype = VerErrors + get_rocm_core_version.argtypes = [ + ctypes.POINTER(ctypes.c_uint32), + ctypes.POINTER(ctypes.c_uint32), + ctypes.POINTER(ctypes.c_uint32), + ] + major = ctypes.c_uint32() + minor = ctypes.c_uint32() + patch = ctypes.c_uint32() - # Check if the command was executed successfully - if result.returncode != 0: - print("Error running 'hipcc --version'") + if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor), + ctypes.byref(patch)) == 0): + return "%d.%d.%d" % (major.value, minor.value, patch.value) return None - - # Extract the version using a regular expression - match = re.search(r'HIP version: (\S+)', result.stdout) - if match: - # Return the version string - return match.group(1) - else: - print("Could not find HIP version in the output") + except Exception: return None @@ -479,11 +486,10 @@ def get_vllm_version() -> str: if "sdist" not in sys.argv: version += f"{sep}cu{cuda_version_str}" elif _is_hip(): - # Get the HIP version - hipcc_version = get_hipcc_rocm_version() - if hipcc_version != MAIN_CUDA_VERSION: - rocm_version_str = hipcc_version.replace(".", "")[:3] - version += f"{sep}rocm{rocm_version_str}" + # Get the Rocm Version + rocm_version = get_rocm_version() or torch.version.hip + if rocm_version and rocm_version != MAIN_CUDA_VERSION: + version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}" elif _is_neuron(): # Get the Neuron version neuron_version = str(get_neuronxcc_version()) diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 07c10a3a18c55..d4ede4d2320a7 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -7,7 +7,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are initialized randomly with a fixed seed. 
""" from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Any, List, Optional, Tuple import torch from torch import nn @@ -54,6 +54,16 @@ class LlamaConfig: tractable_init: bool = False random_seed: int = 0 + def compute_hash(self) -> str: + factors: List[Any] = [] + for k, v in self.__dict__.items(): + if k == "random_seed": + continue + factors.append((k, v)) + factors.sort() + import hashlib + return hashlib.md5(str(factors).encode()).hexdigest() + def __post_init__(self): assert self.mlp_size >= self.hidden_size @@ -263,7 +273,8 @@ def run_model(llama_config, compilation_config = CompilationConfig( level=CompilationLevel.NO_COMPILATION, ) - vllm_config = VllmConfig(compilation_config=compilation_config) + vllm_config = VllmConfig(compilation_config=compilation_config, + additional_config=llama_config) with set_current_vllm_config(vllm_config): model = LlamaModel(config=llama_config, vllm_config=vllm_config, diff --git a/tests/conftest.py b/tests/conftest.py index 4e939221329cd..6e2f75e33654f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -31,7 +31,6 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, to_enc_dec_tuple_list, zip_enc_dec_prompts) from vllm.logger import init_logger from vllm.outputs import RequestOutput -from vllm.platforms import current_platform from vllm.sampling_params import BeamSearchParams from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless, identity) @@ -242,6 +241,7 @@ _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict) class HfRunner: def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: + from vllm.platforms import current_platform if x is None or isinstance(x, (bool, )): return x diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index 45e6980a94630..e49562ad6a21f 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -4,7 +4,7 @@ import pytest from vllm.entrypoints.openai.cli_args import (make_arg_parser, validate_parsed_serve_args) -from vllm.entrypoints.openai.serving_engine import LoRAModulePath +from vllm.entrypoints.openai.serving_models import LoRAModulePath from vllm.utils import FlexibleArgumentParser from ...utils import VLLM_PATH diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index c81cfdbbe5cff..183d900c493e5 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -28,6 +28,8 @@ PA_NAME = "swapnilbp/llama_tweet_ptune" # need to change to match the prompt adapter PA_NUM_VIRTUAL_TOKENS = 8 +GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"] + @pytest.fixture(scope="module") def zephyr_lora_files(): @@ -635,8 +637,7 @@ async def test_allowed_token_ids(client: openai.AsyncOpenAI): @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) +@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_json_completion(client: openai.AsyncOpenAI, guided_decoding_backend: str, sample_json_schema): @@ -658,8 +659,7 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) +@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def 
test_guided_regex_completion(client: openai.AsyncOpenAI, guided_decoding_backend: str, sample_regex): @@ -680,8 +680,7 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) +@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_choice_completion(client: openai.AsyncOpenAI, guided_decoding_backend: str, sample_guided_choice): @@ -761,8 +760,7 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) +@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, guided_decoding_backend: str, sample_json_schema, sample_regex): diff --git a/tests/entrypoints/openai/test_lora_lineage.py b/tests/entrypoints/openai/test_lora_lineage.py index ab39684c2f31a..ce4f85c13fff9 100644 --- a/tests/entrypoints/openai/test_lora_lineage.py +++ b/tests/entrypoints/openai/test_lora_lineage.py @@ -55,7 +55,10 @@ def server_with_lora_modules_json(zephyr_lora_files): "64", ] - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + # Enable the /v1/load_lora_adapter endpoint + envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"} + + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server: yield remote_server @@ -67,8 +70,8 @@ async def client_for_lora_lineage(server_with_lora_modules_json): @pytest.mark.asyncio -async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI, - zephyr_lora_files): +async def test_static_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI, + zephyr_lora_files): models = await client_for_lora_lineage.models.list() models = models.data served_model = models[0] @@ -81,3 +84,26 @@ async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI, assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models) assert lora_models[0].id == "zephyr-lora" assert lora_models[1].id == "zephyr-lora2" + + +@pytest.mark.asyncio +async def test_dynamic_lora_lineage( + client_for_lora_lineage: openai.AsyncOpenAI, zephyr_lora_files): + + response = await client_for_lora_lineage.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": + "zephyr-lora-3", + "lora_path": + zephyr_lora_files + }) + # Ensure adapter loads before querying /models + assert "success" in response + + models = await client_for_lora_lineage.models.list() + models = models.data + dynamic_lora_model = models[-1] + assert dynamic_lora_model.root == zephyr_lora_files + assert dynamic_lora_model.parent == MODEL_NAME + assert dynamic_lora_model.id == "zephyr-lora-3" diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 51b255bb2a6db..97248f1150979 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -8,7 +8,8 @@ from vllm.config import MultiModalConfig from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.entrypoints.openai.serving_chat import OpenAIServingChat -from vllm.entrypoints.openai.serving_engine import BaseModelPath +from vllm.entrypoints.openai.serving_models import (BaseModelPath, + OpenAIServingModels) from vllm.transformers_utils.tokenizer import get_tokenizer 
MODEL_NAME = "openai-community/gpt2" @@ -33,6 +34,7 @@ class MockModelConfig: hf_config = MockHFConfig() logits_processor_pattern = None diff_sampling_param: Optional[dict] = None + allowed_local_media_path: str = "" def get_diff_sampling_param(self): return self.diff_sampling_param or {} @@ -49,14 +51,13 @@ async def _async_serving_chat_init(): engine = MockEngine() model_config = await engine.get_model_config() + models = OpenAIServingModels(model_config, BASE_MODEL_PATHS) serving_completion = OpenAIServingChat(engine, model_config, - BASE_MODEL_PATHS, + models, response_role="assistant", chat_template=CHAT_TEMPLATE, chat_template_content_format="auto", - lora_modules=None, - prompt_adapters=None, request_logger=None) return serving_completion @@ -71,14 +72,14 @@ def test_serving_chat_should_set_correct_max_tokens(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False + models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, + model_config=MockModelConfig()) serving_chat = OpenAIServingChat(mock_engine, MockModelConfig(), - BASE_MODEL_PATHS, + models, response_role="assistant", chat_template=CHAT_TEMPLATE, chat_template_content_format="auto", - lora_modules=None, - prompt_adapters=None, request_logger=None) req = ChatCompletionRequest( model=MODEL_NAME, @@ -114,14 +115,14 @@ def test_serving_chat_could_load_correct_generation_config(): mock_engine.errored = False # Initialize the serving chat + models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config) serving_chat = OpenAIServingChat(mock_engine, mock_model_config, - BASE_MODEL_PATHS, + models, response_role="assistant", chat_template=CHAT_TEMPLATE, chat_template_content_format="auto", - lora_modules=None, - prompt_adapters=None, request_logger=None) req = ChatCompletionRequest( model=MODEL_NAME, diff --git a/tests/entrypoints/openai/test_serving_engine.py b/tests/entrypoints/openai/test_serving_models.py similarity index 61% rename from tests/entrypoints/openai/test_serving_engine.py rename to tests/entrypoints/openai/test_serving_models.py index 096ab6fa0ac09..96897dc730da2 100644 --- a/tests/entrypoints/openai/test_serving_engine.py +++ b/tests/entrypoints/openai/test_serving_models.py @@ -4,11 +4,11 @@ from unittest.mock import MagicMock import pytest from vllm.config import ModelConfig -from vllm.engine.protocol import EngineClient from vllm.entrypoints.openai.protocol import (ErrorResponse, LoadLoraAdapterRequest, UnloadLoraAdapterRequest) -from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing +from vllm.entrypoints.openai.serving_models import (BaseModelPath, + OpenAIServingModels) from vllm.lora.request import LoRARequest MODEL_NAME = "meta-llama/Llama-2-7b" @@ -19,47 +19,45 @@ LORA_UNLOADING_SUCCESS_MESSAGE = ( "Success: LoRA adapter '{lora_name}' removed successfully.") -async def _async_serving_engine_init(): - mock_engine_client = MagicMock(spec=EngineClient) +async def _async_serving_models_init() -> OpenAIServingModels: mock_model_config = MagicMock(spec=ModelConfig) # Set the max_model_len attribute to avoid missing attribute mock_model_config.max_model_len = 2048 - serving_engine = OpenAIServing(mock_engine_client, - mock_model_config, - BASE_MODEL_PATHS, - lora_modules=None, - prompt_adapters=None, - request_logger=None) - return serving_engine + serving_models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config, + lora_modules=None, + prompt_adapters=None) + + return 
serving_models @pytest.mark.asyncio async def test_serving_model_name(): - serving_engine = await _async_serving_engine_init() - assert serving_engine._get_model_name(None) == MODEL_NAME + serving_models = await _async_serving_models_init() + assert serving_models.model_name(None) == MODEL_NAME request = LoRARequest(lora_name="adapter", lora_path="/path/to/adapter2", lora_int_id=1) - assert serving_engine._get_model_name(request) == request.lora_name + assert serving_models.model_name(request) == request.lora_name @pytest.mark.asyncio async def test_load_lora_adapter_success(): - serving_engine = await _async_serving_engine_init() + serving_models = await _async_serving_models_init() request = LoadLoraAdapterRequest(lora_name="adapter", lora_path="/path/to/adapter2") - response = await serving_engine.load_lora_adapter(request) + response = await serving_models.load_lora_adapter(request) assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter') - assert len(serving_engine.lora_requests) == 1 - assert serving_engine.lora_requests[0].lora_name == "adapter" + assert len(serving_models.lora_requests) == 1 + assert serving_models.lora_requests[0].lora_name == "adapter" @pytest.mark.asyncio async def test_load_lora_adapter_missing_fields(): - serving_engine = await _async_serving_engine_init() + serving_models = await _async_serving_models_init() request = LoadLoraAdapterRequest(lora_name="", lora_path="") - response = await serving_engine.load_lora_adapter(request) + response = await serving_models.load_lora_adapter(request) assert isinstance(response, ErrorResponse) assert response.type == "InvalidUserInput" assert response.code == HTTPStatus.BAD_REQUEST @@ -67,43 +65,43 @@ async def test_load_lora_adapter_missing_fields(): @pytest.mark.asyncio async def test_load_lora_adapter_duplicate(): - serving_engine = await _async_serving_engine_init() + serving_models = await _async_serving_models_init() request = LoadLoraAdapterRequest(lora_name="adapter1", lora_path="/path/to/adapter1") - response = await serving_engine.load_lora_adapter(request) + response = await serving_models.load_lora_adapter(request) assert response == LORA_LOADING_SUCCESS_MESSAGE.format( lora_name='adapter1') - assert len(serving_engine.lora_requests) == 1 + assert len(serving_models.lora_requests) == 1 request = LoadLoraAdapterRequest(lora_name="adapter1", lora_path="/path/to/adapter1") - response = await serving_engine.load_lora_adapter(request) + response = await serving_models.load_lora_adapter(request) assert isinstance(response, ErrorResponse) assert response.type == "InvalidUserInput" assert response.code == HTTPStatus.BAD_REQUEST - assert len(serving_engine.lora_requests) == 1 + assert len(serving_models.lora_requests) == 1 @pytest.mark.asyncio async def test_unload_lora_adapter_success(): - serving_engine = await _async_serving_engine_init() + serving_models = await _async_serving_models_init() request = LoadLoraAdapterRequest(lora_name="adapter1", lora_path="/path/to/adapter1") - response = await serving_engine.load_lora_adapter(request) - assert len(serving_engine.lora_requests) == 1 + response = await serving_models.load_lora_adapter(request) + assert len(serving_models.lora_requests) == 1 request = UnloadLoraAdapterRequest(lora_name="adapter1") - response = await serving_engine.unload_lora_adapter(request) + response = await serving_models.unload_lora_adapter(request) assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format( lora_name='adapter1') - assert len(serving_engine.lora_requests) == 0 + 
assert len(serving_models.lora_requests) == 0 @pytest.mark.asyncio async def test_unload_lora_adapter_missing_fields(): - serving_engine = await _async_serving_engine_init() + serving_models = await _async_serving_models_init() request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None) - response = await serving_engine.unload_lora_adapter(request) + response = await serving_models.unload_lora_adapter(request) assert isinstance(response, ErrorResponse) assert response.type == "InvalidUserInput" assert response.code == HTTPStatus.BAD_REQUEST @@ -111,9 +109,9 @@ async def test_unload_lora_adapter_missing_fields(): @pytest.mark.asyncio async def test_unload_lora_adapter_not_found(): - serving_engine = await _async_serving_engine_init() + serving_models = await _async_serving_models_init() request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter") - response = await serving_engine.unload_lora_adapter(request) + response = await serving_models.unload_lora_adapter(request) assert isinstance(response, ErrorResponse) assert response.type == "InvalidUserInput" assert response.code == HTTPStatus.BAD_REQUEST diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index 3731b2dcdeae1..c851539c610ec 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -91,5 +91,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, assert len(embeddings.data) == 1 assert len(embeddings.data[0].embedding) == 3072 assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens == 765 - assert embeddings.usage.total_tokens == 765 + assert embeddings.usage.prompt_tokens == 764 + assert embeddings.usage.total_tokens == 764 diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 996e60bfee592..d63b963522e73 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -2,7 +2,6 @@ import warnings from typing import Optional import pytest -from PIL import Image from vllm.assets.image import ImageAsset from vllm.config import ModelConfig @@ -91,10 +90,7 @@ def _assert_mm_data_is_image_input( image_data = mm_data.get("image") assert image_data is not None - if image_count == 1: - assert isinstance(image_data, Image.Image) - else: - assert isinstance(image_data, list) and len(image_data) == image_count + assert isinstance(image_data, list) and len(image_data) == image_count def test_parse_chat_messages_single_image( diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index d37f95d48d5b2..916cc2efa3895 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -5,7 +5,10 @@ import torch from tests.kernels.utils import override_backend_env_variable from vllm.attention.selector import which_attn_to_use -from vllm.platforms import cpu, cuda, openvino, rocm +from vllm.platforms.cpu import CpuPlatform +from vllm.platforms.cuda import CudaPlatform +from vllm.platforms.openvino import OpenVinoPlatform +from vllm.platforms.rocm import RocmPlatform from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL @@ -20,26 +23,23 @@ def test_env(name: str, device: str, monkeypatch): override_backend_env_variable(monkeypatch, name) if device == "cpu": - with patch("vllm.attention.selector.current_platform", - cpu.CpuPlatform()): + with patch("vllm.attention.selector.current_platform", 
CpuPlatform()): backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == "TORCH_SDPA" elif device == "hip": - with patch("vllm.attention.selector.current_platform", - rocm.RocmPlatform()): + with patch("vllm.attention.selector.current_platform", RocmPlatform()): backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == "ROCM_FLASH" elif device == "openvino": with patch("vllm.attention.selector.current_platform", - openvino.OpenVinoPlatform()): + OpenVinoPlatform()): backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == "OPENVINO" else: - with patch("vllm.attention.selector.current_platform", - cuda.CudaPlatform()): + with patch("vllm.attention.selector.current_platform", CudaPlatform()): backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == name diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 8b247fb9b2388..57ebaa424fc59 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -4,6 +4,7 @@ from typing import Dict, List, TypedDict from unittest.mock import MagicMock, patch import pytest +import safetensors import torch import torch.nn as nn from huggingface_hub import snapshot_download @@ -169,6 +170,29 @@ def mixtral_lora_files_all_target_modules(): return snapshot_download(repo_id="dyang415/mixtral-lora-v0") +@pytest.fixture(scope="session") +def jamba_lora_files(): + # some of the adapters have unnecessary weights for serving, + # hence we remove them + def remove_unnecessary_weights(path): + lora_path = f"{adapter_path}/adapter_model.safetensors" + tensors = safetensors.torch.load_file(lora_path) + nonlora_keys = [] + for k in list(tensors.keys()): + if "lora" not in k: + nonlora_keys.append(k) + for k in nonlora_keys: + del tensors[k] + safetensors.torch.save_file(tensors, lora_path) + + adapter_path = snapshot_download( + repo_id= + "hf-100/Jamba-1.5-mini-Spellbound-StoryWriter-0.1-6583896-ckpt53-lora") + + remove_unnecessary_weights(adapter_path) + return adapter_path + + @pytest.fixture(scope="session") def gemma_lora_files(): return snapshot_download(repo_id="wskwon/gemma-7b-test-lora") diff --git a/tests/lora/test_jamba.py b/tests/lora/test_jamba.py new file mode 100644 index 0000000000000..6aa33926cb6b8 --- /dev/null +++ b/tests/lora/test_jamba.py @@ -0,0 +1,54 @@ +from typing import List + +import pytest +import torch + +import vllm +from vllm.lora.request import LoRARequest + +MODEL_PATH = "ai21labs/AI21-Jamba-1.5-Mini" + +MAX_TOKENS = 40 + + +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, + prompts: List[str]) -> List[str]: + + sampling_params = vllm.SamplingParams(temperature=0, max_tokens=MAX_TOKENS) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + # Print the outputs. 
+ generated_texts: List[str] = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text.strip() + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + return generated_texts + + +@pytest.mark.parametrize("tp_size", [4]) +def test_jamba_lora(jamba_lora_files, tp_size): + """Original test, the LoRA model has the common target modules, not all""" + if torch.cuda.device_count() < tp_size: + pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") + + prompts = ["Write a story about a sheep and a goat."] + + llm = vllm.LLM( + MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + distributed_executor_backend="ray", + tensor_parallel_size=tp_size, + ) + + expected_jamba_output = [ + """Once upon a time, in a lush green meadow, there lived a sheep named Clara and a goat named Billy. Clara was a gentle creature, always nibbling on the soft grass and humming""" # noqa: E501 + ] + assert do_sample(llm, jamba_lora_files, lora_id=1, + prompts=prompts) == expected_jamba_output diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 0b76f466702fc..a099f36b0a465 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -1,4 +1,5 @@ import json +import math import os from typing import Dict, List @@ -50,6 +51,18 @@ def test_peft_helper(sql_lora_files): "embed_tokens", "lm_head", ] + scaling = peft_helper.lora_alpha / peft_helper.r + assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3 + + # test RSLoRA + config = dict(r=8, + lora_alpha=16, + target_modules=["gate_proj"], + use_rslora=True) + peft_helper = PEFTHelper.from_dict(config) + + scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r) + assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3 expected_error = "vLLM only supports modules_to_save being None." with pytest.raises(ValueError, match=expected_error): @@ -60,13 +73,6 @@ def test_peft_helper(sql_lora_files): modules_to_save=["lm_head"], ) PEFTHelper.from_dict(config) - expected_error = "vLLM does not yet support RSLoRA." - with pytest.raises(ValueError, match=expected_error): - config = dict(r=8, - lora_alpha=16, - target_modules=["gate_proj"], - use_rslora=True) - PEFTHelper.from_dict(config) expected_error = "vLLM does not yet support DoRA." with pytest.raises(ValueError, match=expected_error): diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index c9f48402b0268..ebdd129db5f6a 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -7,7 +7,7 @@ from vllm.assets.image import ImageAsset from vllm.lora.request import LoRARequest from vllm.platforms import current_platform -MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct" +MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct" PROMPT_TEMPLATE = ( "<|im_start|>system\nYou are a helpful assistant.<|im_end|>" @@ -49,10 +49,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: # Print the outputs. 
generated_texts: List[str] = [] for output in outputs: - prompt = output.prompt generated_text = output.outputs[0].text.strip() generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + print(f"Generated text: {generated_text!r}") return generated_texts diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py index cd8954ffc48c2..5897c04c89e19 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py @@ -30,7 +30,7 @@ def get_max_qwen2_vl_image_tokens(): @pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [ - ({}, 1225), + ({}, 16384), ({ MIN_PIXELS: 64**2, MAX_PIXELS: 512**2 diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 3101d1d2ea831..7db08166826eb 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -140,10 +140,7 @@ VLM_TEST_SETTINGS = { "aria": VLMTestInfo( models=["rhymes-ai/Aria"], tokenizer_mode="slow", - test_type=( - VLMTestType.IMAGE, - VLMTestType.MULTI_IMAGE, - ), + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), dtype="bfloat16", prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501 img_idx_to_prompt=lambda idx: "<|img|>\n", @@ -179,6 +176,7 @@ VLM_TEST_SETTINGS = { test_type=VLMTestType.IMAGE, prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", max_model_len=4096, + max_num_seqs=2, auto_cls=AutoModelForVision2Seq, postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values" @@ -212,7 +210,7 @@ VLM_TEST_SETTINGS = { dtype="bfloat16", get_stop_token_ids=lambda tok: [151329, 151336, 151338], patch_hf_runner=model_utils.glm_patch_hf_runner, - marks=[large_gpu_mark(min_gb=48)], + marks=[large_gpu_mark(min_gb=32)], ), "h2ovl": VLMTestInfo( models = [ @@ -261,6 +259,7 @@ VLM_TEST_SETTINGS = { dtype="bfloat16", use_tokenizer_eos=True, patch_hf_runner=model_utils.internvl_patch_hf_runner, + marks=[large_gpu_mark(min_gb=32)], ), "llava_next": VLMTestInfo( models=["llava-hf/llava-v1.6-mistral-7b-hf"], diff --git a/tests/models/registry.py b/tests/models/registry.py index f5a37420a2909..e5dfb2822745d 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -140,6 +140,8 @@ _EMBEDDING_EXAMPLE_MODELS = { "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"), "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"), "GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"), + "InternLM2ForRewardModel": _HfExamplesInfo("internlm/internlm2-1_8b-reward", + trust_remote_code=True), "JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"), # noqa: E501 "LlamaModel": _HfExamplesInfo("llama", is_available_online=False), "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index a4eea7f035c91..3b728f2744fca 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -1,7 +1,6 @@ from unittest.mock import patch import pytest -import transformers from transformers import PretrainedConfig from vllm import LLM @@ -12,9 +11,6 @@ from .registry import 
HF_EXAMPLE_MODELS @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) def test_can_initialize(model_arch): model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) - if (model_arch == "Cohere2ForCausalLM" - and transformers.__version__ < "4.48.0"): - pytest.skip(reason="Model introduced in HF >= 4.48.0") if not model_info.is_available_online: pytest.skip("Model is not available online") diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index d22d778f81fa8..1850ca46ccc8f 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -1,12 +1,20 @@ +from functools import partial from typing import cast +import numpy as np import pytest +from PIL import Image -from vllm.multimodal.processing import (PromptReplacement, _PlaceholderInfo, - find_text_matches, find_token_matches, - iter_placeholders, iter_token_matches, +from vllm.config import ModelConfig +from vllm.inputs import InputProcessingContext +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.processing import (ProcessingCache, PromptReplacement, + _PlaceholderInfo, find_text_matches, + find_token_matches, iter_placeholders, + iter_token_matches, replace_text_matches, replace_token_matches) +from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import full_groupby @@ -457,6 +465,7 @@ def test_find_replace_tokens( ), ] ) +# yapf: enable def test_iter_placeholders( repl_by_key, prompt, @@ -475,11 +484,203 @@ def test_iter_placeholders( prompt_repls, prompt, # Effectively match all occurrences in the prompt - {key: 3 for key in repl_by_key}, - )) + {key: 3 + for key in repl_by_key}, + )) # Only displayed on error print("result:", result) # Manually constructed results assert result == expected + + +def _rand_img(rng: np.random.RandomState, min_wh: int, max_wh: int): + w, h = rng.randint(min_wh, max_wh, size=(2, )) + arr = rng.randint(0, 255, size=(w, h, 3), dtype=np.uint8) + return Image.fromarray(arr) + + +def _rand_video( + rng: np.random.RandomState, + min_frames: int, + max_frames: int, + min_wh: int, + max_wh: int, +): + # Temporary workaround for https://github.com/huggingface/transformers/issues/35412 + num_frames = rng.randint(min_frames, max_frames) + num_frames = (num_frames // 2) * 2 + + w, h = rng.randint(min_wh, max_wh, size=(2, )) + return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8) + + +def _rand_audio( + rng: np.random.RandomState, + min_len: int, + max_len: int, + sr: int, +): + audio_len = rng.randint(min_len, max_len) + return rng.rand(audio_len), sr + + +def _test_processing_cache_correctness( + model_id: str, + modalities: dict[str, bool], + hit_rate: float, + num_batches: int, + simplify_rate: float, +): + if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3": + hf_overrides = {"architectures": ["MantisForConditionalGeneration"]} + else: + hf_overrides = {} + + model_config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=True, + seed=0, + dtype="float16", + revision=None, + hf_overrides=hf_overrides, + ) + model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) + + processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] + ctx = InputProcessingContext( + model_config, + tokenizer=cached_get_tokenizer(model_config.tokenizer), + ) + # Ensure that it can fit all of the data + cache = ProcessingCache(capacity=1 << 30) + + 
baseline_processor = processor_factory(ctx, cache=None) + cached_processor = processor_factory(ctx, cache=cache) + + rng = np.random.RandomState(0) + + input_to_hit = { + "image": Image.new("RGB", size=(128, 128)), + "video": np.zeros((4, 128, 128, 3), dtype=np.uint8), + "audio": (np.zeros((512, )), 16000), + } + input_factory = { + "image": + partial(_rand_img, rng, min_wh=128, max_wh=256), + "video": + partial(_rand_video, + rng, + min_frames=2, + max_frames=8, + min_wh=128, + max_wh=256), + "audio": + partial(_rand_audio, rng, min_len=256, max_len=512, sr=16000), + } + input_max_count = { + modality: 3 if supports_multi else 1 + for modality, supports_multi in modalities.items() + } + + for batch_idx in range(num_batches): + mm_data = { + k: + [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()) + for _ in range(rng.randint(input_max_count[k]))] + for k in modalities + } + + mm_counts = {k: len(vs) for k, vs in mm_data.items()} + prompt = baseline_processor._get_dummy_mm_inputs(mm_counts).prompt_text + + # Drop unnecessary keys and test single -> multi conversion + if rng.rand() < simplify_rate: + for k in list(mm_data.keys()): + if not mm_data[k]: + del mm_data[k] + elif len(mm_data[k]) == 1: + mm_data[k] = mm_data[k][0] + + baseline_result = baseline_processor.apply( + prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + cached_result = cached_processor.apply( + prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + assert baseline_result == cached_result, ( + f"Failed ({batch_idx=}, {mm_data=})") + + +# yapf: disable +# True if the model supports multiple data items of the modality per request +@pytest.mark.parametrize(("model_id", "modalities"), [ + ("rhymes-ai/Aria", {"image": True}), + ("Salesforce/blip2-opt-2.7b", {"image": False}), + ("facebook/chameleon-7b", {"image": False}), + ("adept/fuyu-8b", {"image": False}), + ("llava-hf/llava-1.5-7b-hf", {"image": True}), + ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), + ("mistral-community/pixtral-12b", {"image": True}), + ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}), + ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}), + ("fixie-ai/ultravox-v0_3", {"audio": True}), +]) +@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) +@pytest.mark.parametrize("num_batches", [32]) +@pytest.mark.parametrize("simplify_rate", [1.0]) +# yapf: enable +def test_processing_cache_correctness( + model_id: str, + modalities: dict[str, bool], + hit_rate: float, + num_batches: int, + simplify_rate: float, +): + _test_processing_cache_correctness( + model_id, + modalities, + hit_rate=hit_rate, + num_batches=num_batches, + simplify_rate=simplify_rate, + ) + + +# yapf: disable +@pytest.mark.parametrize(("model_id", "modalities"), [ + ("microsoft/Phi-3-vision-128k-instruct", {"image": True}), +]) +@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) +@pytest.mark.parametrize("num_batches", [32]) +@pytest.mark.parametrize("simplify_rate", [1.0]) +# yapf: enable +def test_processing_cache_correctness_phi3v( + model_id: str, + modalities: dict[str, bool], + hit_rate: float, + num_batches: int, + simplify_rate: float, +): + # HACK - this is an attempted workaround for the following bug + # https://github.com/huggingface/transformers/issues/34307 + from transformers import AutoImageProcessor # noqa: F401 + from transformers import AutoProcessor # noqa: F401 + + AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True) + + _test_processing_cache_correctness( + model_id, + modalities, + 
hit_rate=hit_rate, + num_batches=num_batches, + simplify_rate=simplify_rate, + ) diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index fd82fb0c55fd7..6029f2e514772 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -9,7 +9,7 @@ import pytest from PIL import Image, ImageChops from transformers import AutoConfig, AutoTokenizer -from vllm.multimodal.utils import (async_fetch_image, fetch_image, +from vllm.multimodal.utils import (MediaConnector, repeat_and_pad_placeholder_tokens) # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) @@ -23,7 +23,12 @@ TEST_IMAGE_URLS = [ @pytest.fixture(scope="module") def url_images() -> Dict[str, Image.Image]: - return {image_url: fetch_image(image_url) for image_url in TEST_IMAGE_URLS} + connector = MediaConnector() + + return { + image_url: connector.fetch_image(image_url) + for image_url in TEST_IMAGE_URLS + } def get_supported_suffixes() -> Tuple[str, ...]: @@ -43,8 +48,10 @@ def _image_equals(a: Image.Image, b: Image.Image) -> bool: @pytest.mark.asyncio @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) async def test_fetch_image_http(image_url: str): - image_sync = fetch_image(image_url) - image_async = await async_fetch_image(image_url) + connector = MediaConnector() + + image_sync = connector.fetch_image(image_url) + image_async = await connector.fetch_image_async(image_url) assert _image_equals(image_sync, image_async) @@ -53,6 +60,7 @@ async def test_fetch_image_http(image_url: str): @pytest.mark.parametrize("suffix", get_supported_suffixes()) async def test_fetch_image_base64(url_images: Dict[str, Image.Image], image_url: str, suffix: str): + connector = MediaConnector() url_image = url_images[image_url] try: @@ -75,48 +83,49 @@ async def test_fetch_image_base64(url_images: Dict[str, Image.Image], base64_image = base64.b64encode(f.read()).decode("utf-8") data_url = f"data:{mime_type};base64,{base64_image}" - data_image_sync = fetch_image(data_url) + data_image_sync = connector.fetch_image(data_url) if _image_equals(url_image, Image.open(f)): assert _image_equals(url_image, data_image_sync) else: pass # Lossy format; only check that image can be opened - data_image_async = await async_fetch_image(data_url) + data_image_async = await connector.fetch_image_async(data_url) assert _image_equals(data_image_sync, data_image_async) @pytest.mark.asyncio @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) async def test_fetch_image_local_files(image_url: str): + connector = MediaConnector() + with TemporaryDirectory() as temp_dir: - origin_image = fetch_image(image_url) + local_connector = MediaConnector(allowed_local_media_path=temp_dir) + + origin_image = connector.fetch_image(image_url) origin_image.save(os.path.join(temp_dir, os.path.basename(image_url)), quality=100, icc_profile=origin_image.info.get('icc_profile')) - image_async = await async_fetch_image( - f"file://{temp_dir}/{os.path.basename(image_url)}", - allowed_local_media_path=temp_dir) - - image_sync = fetch_image( - f"file://{temp_dir}/{os.path.basename(image_url)}", - allowed_local_media_path=temp_dir) + image_async = await local_connector.fetch_image_async( + f"file://{temp_dir}/{os.path.basename(image_url)}") + image_sync = local_connector.fetch_image( + f"file://{temp_dir}/{os.path.basename(image_url)}") # Check that the images are equal assert not ImageChops.difference(image_sync, image_async).getbbox() - with pytest.raises(ValueError): - await async_fetch_image( - 
f"file://{temp_dir}/../{os.path.basename(image_url)}", - allowed_local_media_path=temp_dir) - with pytest.raises(ValueError): - await async_fetch_image( + with pytest.raises(ValueError, match="must be a subpath"): + await local_connector.fetch_image_async( + f"file://{temp_dir}/../{os.path.basename(image_url)}") + with pytest.raises(RuntimeError, match="Cannot load local files"): + await connector.fetch_image_async( f"file://{temp_dir}/../{os.path.basename(image_url)}") - with pytest.raises(ValueError): - fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}", - allowed_local_media_path=temp_dir) - with pytest.raises(ValueError): - fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}") + with pytest.raises(ValueError, match="must be a subpath"): + local_connector.fetch_image( + f"file://{temp_dir}/../{os.path.basename(image_url)}") + with pytest.raises(RuntimeError, match="Cannot load local files"): + connector.fetch_image( + f"file://{temp_dir}/../{os.path.basename(image_url)}") @pytest.mark.parametrize("model", ["llava-hf/llava-v1.6-mistral-7b-hf"]) diff --git a/tests/plugins/vllm_add_dummy_platform/setup.py b/tests/plugins/vllm_add_dummy_platform/setup.py new file mode 100644 index 0000000000000..31639906898db --- /dev/null +++ b/tests/plugins/vllm_add_dummy_platform/setup.py @@ -0,0 +1,11 @@ +from setuptools import setup + +setup( + name='vllm_add_dummy_platform', + version='0.1', + packages=['vllm_add_dummy_platform'], + entry_points={ + 'vllm.platform_plugins': [ + "dummy_platform_plugin = vllm_add_dummy_platform:dummy_platform_plugin" # noqa + ] + }) diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py new file mode 100644 index 0000000000000..594cef520a7de --- /dev/null +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py @@ -0,0 +1,5 @@ +from typing import Optional + + +def dummy_platform_plugin() -> Optional[str]: + return "vllm_add_dummy_platform.dummy_platform.DummyPlatform" diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py new file mode 100644 index 0000000000000..fde93142f1103 --- /dev/null +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py @@ -0,0 +1,5 @@ +from vllm.platforms.cuda import CudaPlatform + + +class DummyPlatform(CudaPlatform): + device_name = "DummyDevice" diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py new file mode 100644 index 0000000000000..0d27cf9f152e0 --- /dev/null +++ b/tests/plugins_tests/test_platform_plugins.py @@ -0,0 +1,16 @@ +def test_platform_plugins(): + # simulate workload by running an example + import runpy + current_file = __file__ + import os + example_file = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(current_file))), + "examples", "offline_inference.py") + runpy.run_path(example_file) + + # check if the plugin is loaded correctly + from vllm.platforms import _init_trace, current_platform + assert current_platform.device_name == "DummyDevice", ( + f"Expected DummyDevice, got {current_platform.device_name}, " + "possibly because current_platform is imported before the plugin" + f" is loaded. 
The first import:\n{_init_trace}")
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index ed04f0a373c51..35e3a2f972720 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -98,9 +98,9 @@ def test_prefill():
     # Incomplete 1 block (6 tokens)
     unique_token_ids = [3] * 6
     req2 = make_request("2", common_token_ids + unique_token_ids)
-    computed_block = manager.get_computed_blocks(req2)
+    computed_blocks = manager.get_computed_blocks(req2)
     assert len(req2.kv_block_hashes) == 3
-    assert [b.block_id for b in computed_block] == [0, 1, 2]
+    assert [b.block_id for b in computed_blocks] == [0, 1, 2]
     num_new_tokens = 53 - 3 * 16
     blocks = manager.allocate_slots(req2, num_new_tokens, computed_blocks)
     assert [b.block_id for b in blocks] == [7, 8]
@@ -469,9 +469,9 @@ def test_mm_prefix_caching():
     # Completed block should have hashes with extra keys.
     assert not computed_blocks
     assert len(req0.kv_block_hashes) == 3
-    assert req0.kv_block_hashes[0].extra_keys == (("aaa", 0), )
-    assert req0.kv_block_hashes[1].extra_keys == (("aaa", 5), ("bbb", 0))
-    assert req0.kv_block_hashes[2].extra_keys == (("bbb", 2), )
+    assert req0.kv_block_hashes[0].extra_keys == ("aaa", )
+    assert req0.kv_block_hashes[1].extra_keys == ("aaa", "bbb")
+    assert req0.kv_block_hashes[2].extra_keys == ("bbb", )
 
     blocks = manager.allocate_slots(req0, 59, computed_blocks)
     assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
@@ -485,7 +485,7 @@ def test_mm_prefix_caching():
 
     # The just completed block should have hashes with extra keys.
     assert len(req0.kv_block_hashes) == 4
-    assert req0.kv_block_hashes[3].extra_keys == (("ccc", 0), )
+    assert req0.kv_block_hashes[3].extra_keys == ("ccc", )
 
     # Cache hit.
     unique_token_ids = [-1] * 7 + [200] * 5
@@ -500,3 +500,62 @@ def test_mm_prefix_caching():
                         mm_hashes=mm_hashes)
     computed_blocks = manager.get_computed_blocks(req1)
     assert len(computed_blocks) == 3
+
+
+def test_prefill_not_enough_free_blocks_with_computed_blocks():
+    """
+    This is a unit test that tests the correctness of allocate_slots
+    when there are not enough free blocks. Specifically, when a request
+    has computed blocks but cannot be allocated because there are not
+    enough free blocks, the computed blocks should not be touched.
+    """
+    block_size = 16
+    manager = KVCacheManager(
+        block_size=block_size,
+        num_gpu_blocks=10,
+        max_model_len=8192,
+        sliding_window=None,
+        enable_caching=True,
+        num_preallocate_tokens=0,
+    )
+    # Complete 3 blocks (48 tokens)
+    # | Common-0 | Common-1 | Common-2 | ... |
+    common_token_ids = [i for i in range(3) for _ in range(16)]
+    req0 = make_request("0", common_token_ids)
+    computed_blocks = manager.get_computed_blocks(req0)
+    assert not computed_blocks
+    manager.allocate_slots(req0, 48, computed_blocks)
+    block_part0 = manager.req_to_blocks[req0.request_id]
+
+    # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... |
+    req1 = make_request("1", common_token_ids * 2)
+    computed_blocks = manager.get_computed_blocks(req1)
+    assert computed_blocks == block_part0
+    manager.allocate_slots(req1, 48, computed_blocks)
+    block_part1 = manager.req_to_blocks[req1.request_id]
+    # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
+    # | Req1-5(F)| ... |
+    manager.free(req1)
+    assert {block.ref_cnt for block in block_part1[:3]} == {1}
+    assert {block.ref_cnt for block in block_part1[3:]} == {0}
+
+    # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
+    # | Req1-5(F)| Req2-0 | Req2-1 | ...
| + req2 = make_request("2", [7] * block_size * 2) + computed_blocks = manager.get_computed_blocks(req2) + assert not computed_blocks + manager.allocate_slots(req2, block_size * 2, computed_blocks) + + # Req3 is Req2 + 3 new blocks, so the first 6 blocks are computed, + # but it cannot be allocated due to insufficient free blocks (2). + # In this case, the ref_cnt of the computed blocks should not be changed. + assert manager.free_block_queue.num_free_blocks == 5 + req3 = make_request("3", common_token_ids * 3) + computed_blocks = manager.get_computed_blocks(req3) + assert computed_blocks == block_part1 + # Req3 cannot be allocated. + assert manager.allocate_slots(req3, 48, computed_blocks) is None + # Block 0-2 are used by Req 1. + assert {block.ref_cnt for block in block_part1[:3]} == {1} + # Block 3-5 are free. + assert {block.ref_cnt for block in block_part1[3:]} == {0} diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index 07f343666cb5e..aeae697ca32b0 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -3,9 +3,9 @@ from typing import List import pytest from transformers import AutoTokenizer -from vllm.sampling_params import RequestOutputKind -from vllm.v1.engine import EngineCoreOutput -from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerRequest +from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.v1.engine.detokenizer import Detokenizer TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) @@ -71,16 +71,22 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): # Make N requests. requests = [ - DetokenizerRequest( - request_id=f"request-{idx}", - prompt=prompt, - prompt_token_ids=prompt_tokens, - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=request_output_kind, - stop=[], - include_stop_str_in_output=False, - ) for idx, ( + EngineCoreRequest(request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + arrival_time=0, + mm_inputs=None, + mm_hashes=None, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=request_output_kind, + stop=[], + include_stop_str_in_output=False)) + for idx, ( prompt, prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) ] @@ -133,18 +139,25 @@ def test_stop_string(include_stop_str_in_output: bool): # Make N requests. requests = [ - DetokenizerRequest( + EngineCoreRequest( request_id=f"request-{idx}", prompt=prompt, prompt_token_ids=prompt_tokens, - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=RequestOutputKind.DELTA, - stop=STOP_STRINGS, - include_stop_str_in_output=include_stop_str_in_output, - ) for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + arrival_time=0, + mm_inputs=None, + mm_hashes=None, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=RequestOutputKind.DELTA, + stop=STOP_STRINGS, + include_stop_str_in_output=include_stop_str_in_output, + )) for idx, ( + prompt, + prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) ] # Add requests to the detokenizer. 
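For orientation on the detokenizer change above: DetokenizerRequest is gone, and the tests now hand the detokenizer full EngineCoreRequest objects, with the detokenization options folded into SamplingParams. Below is a minimal sketch of that construction, using only the constructor fields visible in the diff; the prompt string is a placeholder.

from transformers import AutoTokenizer

from vllm.sampling_params import RequestOutputKind, SamplingParams
from vllm.v1.engine import EngineCoreRequest

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
prompt = "Hello, my name is"  # placeholder prompt

request = EngineCoreRequest(
    request_id="request-0",
    prompt=prompt,
    prompt_token_ids=tokenizer(prompt).input_ids,
    arrival_time=0,
    mm_inputs=None,
    mm_hashes=None,
    mm_placeholders=None,
    eos_token_id=None,
    lora_request=None,
    # Detokenization options now travel inside SamplingParams.
    sampling_params=SamplingParams(
        skip_special_tokens=False,
        spaces_between_special_tokens=False,
        output_kind=RequestOutputKind.DELTA,
        stop=[],
        include_stop_str_in_output=False,
    ),
)

Requests built this way are then added to the Detokenizer, which is what the incremental-detokenization and stop-string tests exercise.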
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index c529cd21f384b..954cec734b956 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -7,7 +7,6 @@ from transformers import AutoTokenizer from vllm import SamplingParams from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform -from vllm.usage.usage_lib import UsageContext from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.core import EngineCore @@ -43,13 +42,11 @@ def test_engine_core(monkeypatch): m.setenv("VLLM_USE_V1", "1") """Setup the EngineCore.""" engine_args = EngineArgs(model=MODEL_NAME) - vllm_config = engine_args.create_engine_config( - usage_context=UsageContext.UNKNOWN_CONTEXT) + vllm_config = engine_args.create_engine_config() executor_class = AsyncLLM._get_executor_cls(vllm_config) engine_core = EngineCore(vllm_config=vllm_config, - executor_class=executor_class, - usage_context=UsageContext.UNKNOWN_CONTEXT) + executor_class=executor_class) """Test basic request lifecycle.""" # First request. @@ -151,13 +148,11 @@ def test_engine_core_advanced_sampling(monkeypatch): m.setenv("VLLM_USE_V1", "1") """Setup the EngineCore.""" engine_args = EngineArgs(model=MODEL_NAME) - vllm_config = engine_args.create_engine_config( - usage_context=UsageContext.UNKNOWN_CONTEXT) + vllm_config = engine_args.create_engine_config() executor_class = AsyncLLM._get_executor_cls(vllm_config) engine_core = EngineCore(vllm_config=vllm_config, - executor_class=executor_class, - usage_context=UsageContext.UNKNOWN_CONTEXT) + executor_class=executor_class) """Test basic request lifecycle.""" # First request. request: EngineCoreRequest = make_request() diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 2f1cbec607a91..729975e4ea8c4 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -86,11 +86,10 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): UsageContext.UNKNOWN_CONTEXT) executor_class = AsyncLLM._get_executor_cls(vllm_config) client = EngineCoreClient.make_client( - vllm_config, - executor_class, - UsageContext.UNKNOWN_CONTEXT, multiprocess_mode=multiprocessing_mode, asyncio_mode=False, + vllm_config=vllm_config, + executor_class=executor_class, ) MAX_TOKENS = 20 @@ -158,11 +157,10 @@ async def test_engine_core_client_asyncio(monkeypatch): usage_context=UsageContext.UNKNOWN_CONTEXT) executor_class = AsyncLLM._get_executor_cls(vllm_config) client = EngineCoreClient.make_client( - vllm_config, - executor_class, - UsageContext.UNKNOWN_CONTEXT, multiprocess_mode=True, asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, ) MAX_TOKENS = 20 diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 1daf45f9e7cf6..76ec1886b3c74 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -23,8 +23,7 @@ with contextlib.suppress(ImportError): import vllm._moe_C # noqa: F401 supports_moe_ops = True -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING or current_platform.is_neuron(): +if TYPE_CHECKING: def register_fake(fn): return lambda name: fn diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py index 9033644e3264a..a46c67ad7e00e 100644 --- a/vllm/assets/audio.py +++ b/vllm/assets/audio.py @@ -21,12 +21,10 @@ class AudioAsset: name: Literal["winning_call", "mary_had_lamb"] @property - def 
audio_and_sample_rate(self) -> tuple[npt.NDArray, int]: + def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]: audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg", s3_prefix=ASSET_DIR) - y, sr = librosa.load(audio_path, sr=None) - assert isinstance(sr, int) - return y, sr + return librosa.load(audio_path, sr=None) @property def url(self) -> str: diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 826d1744d88a5..a8dd628b9cd6f 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -208,8 +208,8 @@ def wrap_inductor(graph: fx.GraphModule, from torch._inductor.compile_fx import graph_returns_tuple returns_tuple = graph_returns_tuple(graph) - # this is the graph we return to Dynamo to run - def compiled_graph(*args) -> Optional[fx.CompiledFxGraph]: + # this is the callable we return to Dynamo to run + def compiled_graph(*args): # convert args to list list_args = list(args) graph_output = inductor_compiled_graph(list_args) @@ -537,7 +537,8 @@ class VllmBackend: example_inputs[x].clone() for x in self.sym_tensor_indices ] - def copy_and_call(*args) -> fx.GraphModule: + # this is the callable we return to Dynamo to run + def copy_and_call(*args): list_args = list(args) for i, index in enumerate(self.sym_tensor_indices): runtime_tensor = list_args[index] @@ -618,8 +619,10 @@ class PiecewiseBackend: # the entries for different shapes that we need to either # compile or capture cudagraph self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {} - self.to_be_compiled_sizes: Set[int] = self.compile_sizes.union( - self.capture_sizes) + + # to_be_compiled_sizes tracks the remaining sizes to compile, + # and updates during the compilation process, so we need to copy it + self.to_be_compiled_sizes: Set[int] = self.compile_sizes.copy() for shape in self.compile_sizes.union(self.capture_sizes): self.concrete_size_entries[shape] = ConcreteSizeEntry( runtime_shape=shape, @@ -627,12 +630,17 @@ class PiecewiseBackend: use_cudagraph=shape in self.capture_sizes, ) + def check_for_ending_compilation(self): + if self.is_last_graph and not self.to_be_compiled_sizes: + # no specific sizes to compile + # save the hash of the inductor graph for the next run + self.compilation_config.inductor_hash_cache.save_to_file() + end_monitoring_torch_compile(self.vllm_config) + def __call__(self, *args) -> Any: if not self.first_run_finished: self.first_run_finished = True - # no specific sizes to compile - if self.is_last_graph and not self.to_be_compiled_sizes: - end_monitoring_torch_compile(self.vllm_config) + self.check_for_ending_compilation() return self.compiled_graph_for_general_shape(*args) runtime_shape = args[self.sym_shape_indices[0]] @@ -661,10 +669,7 @@ class PiecewiseBackend: # finished compilations for all required shapes if self.is_last_graph and not self.to_be_compiled_sizes: - - # save the hash of the inductor graph for the next run - self.compilation_config.inductor_hash_cache.save_to_file() - end_monitoring_torch_compile(self.vllm_config) + self.check_for_ending_compilation() if not entry.use_cudagraph: return entry.runnable(*args) diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index c10241b483169..e3260a10c02ae 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -28,11 +28,12 @@ class TorchCompileWrapperWithCustomDispatcher: compiled_callable: Optional[Callable] = None, compilation_level: int = 0): + vllm_config = get_current_vllm_config() + self.vllm_config = vllm_config if 
compiled_callable is None: # default compilation settings # compiling the forward method - vllm_config = get_current_vllm_config() backend = vllm_config.compilation_config.init_backend(vllm_config) compiled_callable = torch.compile( @@ -82,6 +83,13 @@ class TorchCompileWrapperWithCustomDispatcher: self.compiled_codes.append(new_code) + if self.vllm_config.compilation_config.use_cudagraph and \ + "update" in new_code.co_names: + import depyf + src = depyf.decompile(new_code) + msg = "Assigning / modifying buffers of nn.Module during forward pass is not allowed when using cudagraph inside the compiler because it will cause silent errors. Please use eager mode or fix the code. The following code contains clues about which buffer is being modified (please search for the usage of the function `update`):\n" + src # noqa + raise RuntimeError(msg) + @contextmanager def dispatch_to_code(self, index: int): """Context manager to dispatch to the compiled code. diff --git a/vllm/config.py b/vllm/config.py index ac767bbe14be4..e72c53b6130d0 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -9,8 +9,8 @@ from contextlib import contextmanager from dataclasses import dataclass, field, replace from pathlib import Path from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Counter, Dict, - Final, List, Literal, Mapping, Optional, Set, Tuple, Type, - Union) + Final, List, Literal, Mapping, Optional, Protocol, Set, + Tuple, Type, Union) import torch from pydantic import BaseModel, Field, PrivateAttr @@ -22,7 +22,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS, get_quantization_config) from vllm.model_executor.models import ModelRegistry -from vllm.platforms import current_platform, interface +from vllm.platforms import CpuArchEnum from vllm.tracing import is_otel_available, otel_import_error_traceback from vllm.transformers_utils.config import ( ConfigFormat, get_config, get_hf_image_processor_config, @@ -75,6 +75,12 @@ HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig], PretrainedConfig]] +class SupportsHash(Protocol): + + def compute_hash(self) -> str: + ... + + class ModelConfig: """Configuration for the model. @@ -301,7 +307,7 @@ class ModelConfig: sliding_window = getattr(self.hf_text_config, "sliding_window", None) has_interleaved_attention = (sliding_window is not None) and ( isinstance(sliding_window, list) or - (self.hf_text_config.model_type in ["gemma2"])) + (self.hf_text_config.model_type in ["gemma2", "cohere2"])) if (not self.disable_sliding_window and has_interleaved_attention): if envs.VLLM_ATTENTION_BACKEND == "XFORMERS": @@ -343,6 +349,7 @@ class ModelConfig: self.is_hybrid = self._init_is_hybrid() self.has_inner_state = self._init_has_inner_state() + from vllm.platforms import current_platform if current_platform.is_neuron(): self.override_neuron_config = override_neuron_config else: @@ -583,6 +590,7 @@ class ModelConfig: raise ValueError( f"Unknown quantization method: {self.quantization}. 
Must " f"be one of {supported_quantization}.") + from vllm.platforms import current_platform current_platform.verify_quantization(self.quantization) if self.quantization not in optimized_quantization_methods: logger.warning( @@ -638,6 +646,7 @@ class ModelConfig: # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid + from vllm.platforms import current_platform if not current_platform.is_async_output_supported(self.enforce_eager): logger.warning( "Async output processing is not supported on the " @@ -1006,6 +1015,7 @@ class CacheConfig: raise ValueError( "GPU memory utilization must be less than 1.0. Got " f"{self.gpu_memory_utilization}.") + from vllm.platforms import current_platform if (current_platform.is_cuda() and self.block_size is not None and self.block_size > 32): raise ValueError("CUDA Paged Attention kernel only supports " @@ -1273,6 +1283,7 @@ class ParallelConfig: f"distributed executor backend " f"'{self.distributed_executor_backend}'.") ray_only_devices = ["tpu", "hpu"] + from vllm.platforms import current_platform if (current_platform.device_type in ray_only_devices and self.world_size > 1): if self.distributed_executor_backend is None: @@ -1321,7 +1332,7 @@ class ParallelConfig: def _verify_args(self) -> None: # Lazy import to avoid circular import from vllm.executor.executor_base import ExecutorBase - + from vllm.platforms import current_platform if self.distributed_executor_backend not in ( "ray", "mp", None) and not (isinstance( self.distributed_executor_backend, type) and issubclass( @@ -1522,6 +1533,7 @@ class DeviceConfig: def __init__(self, device: str = "auto") -> None: if device == "auto": # Automated device type detection + from vllm.platforms import current_platform self.device_type = current_platform.device_type if not self.device_type: raise RuntimeError("Failed to infer device type") @@ -2235,9 +2247,10 @@ def _get_and_verify_dtype( else: torch_dtype = config_dtype + from vllm.platforms import current_platform if (current_platform.is_cpu() and current_platform.get_cpu_architecture() - == interface.CpuArchEnum.POWERPC + == CpuArchEnum.POWERPC and (config_dtype == torch.float16 or config_dtype == torch.float32)): logger.info( @@ -2559,14 +2572,6 @@ class KVTransferConfig(BaseModel): return KVTransferConfig.model_validate_json(cli_value) def model_post_init(self, __context: Any) -> None: - supported_kv_connector = ["PyNcclConnector", "MooncakeConnector"] - if all([ - self.kv_connector is not None, self.kv_connector - not in supported_kv_connector - ]): - raise ValueError(f"Unsupported kv_connector: {self.kv_connector}. " - f"Supported connectors are " - f"{supported_kv_connector}.") if self.kv_role is not None and self.kv_role not in [ "kv_producer", "kv_consumer", "kv_both" @@ -2977,6 +2982,10 @@ class VllmConfig: init=True) # type: ignore kv_transfer_config: KVTransferConfig = field(default=None, init=True) # type: ignore + # some opaque config, only used to provide additional information + # for the hash computation, mainly used for testing and debugging. 
+ additional_config: SupportsHash = field(default=None, + init=True) # type: ignore instance_id: str = "" def compute_hash(self) -> str: @@ -3008,33 +3017,62 @@ class VllmConfig: vllm_factors.append(__version__) if self.model_config: vllm_factors.append(self.model_config.compute_hash()) + else: + vllm_factors.append("None") if self.cache_config: vllm_factors.append(self.cache_config.compute_hash()) + else: + vllm_factors.append("None") if self.parallel_config: vllm_factors.append(self.parallel_config.compute_hash()) + else: + vllm_factors.append("None") if self.scheduler_config: vllm_factors.append(self.scheduler_config.compute_hash()) + else: + vllm_factors.append("None") if self.device_config: vllm_factors.append(self.device_config.compute_hash()) + else: + vllm_factors.append("None") if self.load_config: vllm_factors.append(self.load_config.compute_hash()) + else: + vllm_factors.append("None") if self.lora_config: vllm_factors.append(self.lora_config.compute_hash()) + else: + vllm_factors.append("None") if self.speculative_config: vllm_factors.append(self.speculative_config.compute_hash()) + else: + vllm_factors.append("None") if self.decoding_config: vllm_factors.append(self.decoding_config.compute_hash()) + else: + vllm_factors.append("None") if self.observability_config: vllm_factors.append(self.observability_config.compute_hash()) + else: + vllm_factors.append("None") if self.prompt_adapter_config: vllm_factors.append(self.prompt_adapter_config.compute_hash()) + else: + vllm_factors.append("None") if self.quant_config: pass # should be captured by model_config.quantization if self.compilation_config: vllm_factors.append(self.compilation_config.compute_hash()) + else: + vllm_factors.append("None") if self.kv_transfer_config: vllm_factors.append(self.kv_transfer_config.compute_hash()) - + else: + vllm_factors.append("None") + if self.additional_config: + vllm_factors.append(self.additional_config.compute_hash()) + else: + vllm_factors.append("None") factors.append(vllm_factors) hash_str = hashlib.md5(str(factors).encode()).hexdigest()[:10] @@ -3052,6 +3090,7 @@ class VllmConfig: model_config: ModelConfig, load_config: LoadConfig) -> Optional[QuantizationConfig]: """Get the quantization config.""" + from vllm.platforms import current_platform if model_config.quantization is not None: from vllm.model_executor.model_loader.weight_utils import ( get_quant_config) @@ -3114,6 +3153,7 @@ class VllmConfig: self.quant_config = VllmConfig._get_quantization_config( self.model_config, self.load_config) + from vllm.platforms import current_platform if self.scheduler_config is not None and \ self.model_config is not None and \ self.scheduler_config.chunked_prefill_enabled and \ diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 3e2bb436d24b5..6372dab726086 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -1,4 +1,5 @@ -from typing import TYPE_CHECKING +import importlib +from typing import TYPE_CHECKING, Callable, Dict, Type from .base import KVConnectorBase @@ -7,14 +8,41 @@ if TYPE_CHECKING: class KVConnectorFactory: + _registry: Dict[str, Callable[[], Type[KVConnectorBase]]] = {} - @staticmethod - def create_connector(rank: int, local_rank: int, + @classmethod + def register_connector(cls, name: str, module_path: str, + class_name: str) -> None: + """Register a connector with a lazy-loading module and class name.""" + if name in cls._registry: + 
raise ValueError(f"Connector '{name}' is already registered.") + + def loader() -> Type[KVConnectorBase]: + module = importlib.import_module(module_path) + return getattr(module, class_name) + + cls._registry[name] = loader + + @classmethod + def create_connector(cls, rank: int, local_rank: int, config: "VllmConfig") -> KVConnectorBase: - supported_kv_connector = ["PyNcclConnector", "MooncakeConnector"] - if config.kv_transfer_config.kv_connector in supported_kv_connector: - from .simple_connector import SimpleConnector - return SimpleConnector(rank, local_rank, config) - else: - raise ValueError(f"Unsupported connector type: " - f"{config.kv_connector}") + connector_name = config.kv_transfer_config.kv_connector + if connector_name not in cls._registry: + raise ValueError(f"Unsupported connector type: {connector_name}") + + connector_cls = cls._registry[connector_name]() + return connector_cls(rank, local_rank, config) + + +# Register various connectors here. +# The registration should not be done in each individual file, as we want to +# only load the files corresponding to the current connector. +KVConnectorFactory.register_connector( + "PyNcclConnector", + "vllm.distributed.kv_transfer.kv_connector.simple_connector", + "SimpleConnector") + +KVConnectorFactory.register_connector( + "MooncakeConnector", + "vllm.distributed.kv_transfer.kv_connector.simple_connector", + "SimpleConnector") diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 5b9236f8c56b6..e6768467f4c27 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -39,7 +39,6 @@ import vllm.distributed.kv_transfer.kv_transfer_agent as kv_transfer import vllm.envs as envs from vllm.distributed.utils import StatelessProcessGroup from vllm.logger import init_logger -from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op, supports_custom_op if TYPE_CHECKING: @@ -194,6 +193,7 @@ class GroupCoordinator: assert self.cpu_group is not None assert self.device_group is not None + from vllm.platforms import current_platform if current_platform.is_cuda_alike(): self.device = torch.device(f"cuda:{local_rank}") else: @@ -1188,6 +1188,7 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False): import ray # Lazy import Ray ray.shutdown() gc.collect() + from vllm.platforms import current_platform if not current_platform.is_cpu(): torch.cuda.empty_cache() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 21966d003c7ef..69c7c5077fe32 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -18,7 +18,6 @@ from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat, from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -from vllm.platforms import current_platform from vllm.transformers_utils.utils import check_gguf_file from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, StoreBoolean @@ -1094,6 +1093,7 @@ class EngineArgs: use_sliding_window = (model_config.get_sliding_window() is not None) use_spec_decode = self.speculative_model is not None + from vllm.platforms import current_platform if (is_gpu and not use_sliding_window and not use_spec_decode and not self.enable_lora and not self.enable_prompt_adapter diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 39f59e55da1f7..1db3e59ff3bae 100644 --- a/vllm/engine/llm_engine.py 
+++ b/vllm/engine/llm_engine.py @@ -1124,6 +1124,8 @@ class LLMEngine: seq_group = scheduled_seq_group.seq_group seq_group.maybe_set_first_token_time(now) + if not seq_group.is_prefill(): + seq_group.set_last_token_time(now) request_output = RequestOutputFactory.create( seq_group, self.seq_id_to_seq_group, @@ -1166,6 +1168,8 @@ class LLMEngine: seq_group = scheduled_seq_group.seq_group seq_group.maybe_set_first_token_time(now) + if not seq_group.is_prefill(): + seq_group.set_last_token_time(now) request_output = RequestOutputFactory.create( seq_group, self.seq_id_to_seq_group, @@ -1686,7 +1690,7 @@ class LLMEngine: # If the seq_group just finished the prefill state # get TTFT. if not seq_group.is_prefill(): - latency = seq_group.get_last_latency(now) + latency = seq_group.get_last_token_latency() time_to_first_tokens_iter.append(latency) # One generation token per finished prefill. @@ -1694,7 +1698,7 @@ class LLMEngine: seq_group.num_seqs()) else: # TPOTs. - latency = seq_group.get_last_latency(now) + latency = seq_group.get_last_token_latency() time_per_output_tokens_iter.append(latency) if seq_group.state.current_step == 0: # For async_output_proc, the do_log_stats() diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 3df08c740d65b..a492d5496e025 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -6,7 +6,7 @@ from collections import defaultdict, deque from functools import lru_cache, partial from pathlib import Path from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List, - Literal, Mapping, Optional, Tuple, TypeVar, Union, cast) + Literal, Optional, Tuple, TypeVar, Union, cast) import jinja2.nodes import transformers.utils.chat_template_utils as hf_chat_utils @@ -23,6 +23,8 @@ from openai.types.chat import ( ChatCompletionMessageParam as OpenAIChatCompletionMessageParam) from openai.types.chat import (ChatCompletionMessageToolCallParam, ChatCompletionToolMessageParam) +from openai.types.chat.chat_completion_content_part_input_audio_param import ( + InputAudio) # yapf: enable # pydantic needs the TypedDict from typing_extensions from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast @@ -31,11 +33,7 @@ from typing_extensions import Required, TypeAlias, TypedDict from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.multimodal import MultiModalDataDict -from vllm.multimodal.utils import (async_get_and_parse_audio, - async_get_and_parse_image, - async_get_and_parse_video, - get_and_parse_audio, get_and_parse_image, - get_and_parse_video) +from vllm.multimodal.utils import MediaConnector from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.utils import print_warning_once @@ -368,14 +366,17 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): self._tokenizer = tokenizer self._allowed_items = (model_config.multimodal_config.limit_per_prompt if model_config.multimodal_config else {}) - self._consumed_items = {k: 0 for k in self._allowed_items} - self._items: List[_T] = [] + self._items_by_modality = defaultdict[str, list[_T]](list) @property def model_config(self) -> ModelConfig: return self._model_config + @property + def allowed_local_media_path(self): + return self._model_config.allowed_local_media_path + @staticmethod @lru_cache(maxsize=None) def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str: @@ -435,38 +436,19 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): else: raise TypeError(f"Unknown modality: 
{modality}") - @staticmethod - def _combine(items: List[MultiModalDataDict]) -> MultiModalDataDict: - mm_lists: Mapping[str, List[object]] = defaultdict(list) - - # Merge all the multi-modal items - for single_mm_data in items: - for mm_key, mm_item in single_mm_data.items(): - if isinstance(mm_item, list): - mm_lists[mm_key].extend(mm_item) - else: - mm_lists[mm_key].append(mm_item) - - # Unpack any single item lists for models that don't expect multiple. - return { - mm_key: mm_list[0] if len(mm_list) == 1 else mm_list - for mm_key, mm_list in mm_lists.items() - } - def add(self, modality: ModalityStr, item: _T) -> Optional[str]: """ Add a multi-modal item to the current prompt and returns the placeholder string to use, if any. """ allowed_count = self._allowed_items.get(modality, 1) - current_count = self._consumed_items.get(modality, 0) + 1 + current_count = len(self._items_by_modality[modality]) + 1 if current_count > allowed_count: raise ValueError( f"At most {allowed_count} {modality}(s) may be provided in " "one request.") - self._consumed_items[modality] = current_count - self._items.append(item) + self._items_by_modality[modality].append(item) return self._placeholder_str(modality, current_count) @@ -475,22 +457,26 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): raise NotImplementedError -class MultiModalItemTracker(BaseMultiModalItemTracker[MultiModalDataDict]): +class MultiModalItemTracker(BaseMultiModalItemTracker[object]): def all_mm_data(self) -> Optional[MultiModalDataDict]: - return self._combine(self._items) if self._items else None + if self._items_by_modality: + return dict(self._items_by_modality) + + return None def create_parser(self) -> "BaseMultiModalContentParser": return MultiModalContentParser(self) -class AsyncMultiModalItemTracker( - BaseMultiModalItemTracker[Awaitable[MultiModalDataDict]]): +class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]): async def all_mm_data(self) -> Optional[MultiModalDataDict]: - if self._items: - items = await asyncio.gather(*self._items) - return self._combine(items) + if self._items_by_modality: + return { + modality: await asyncio.gather(*items) + for modality, items in self._items_by_modality.items() + } return None @@ -522,7 +508,7 @@ class BaseMultiModalContentParser(ABC): raise NotImplementedError @abstractmethod - def parse_input_audio(self, input_audio: Dict[str, str]) -> None: + def parse_input_audio(self, input_audio: InputAudio) -> None: raise NotImplementedError @abstractmethod @@ -537,31 +523,31 @@ class MultiModalContentParser(BaseMultiModalContentParser): self._tracker = tracker + self._connector = MediaConnector( + allowed_local_media_path=tracker.allowed_local_media_path, + ) + def parse_image(self, image_url: str) -> None: - image = get_and_parse_image(image_url, - allowed_local_media_path=self._tracker. 
- _model_config.allowed_local_media_path) + image = self._connector.fetch_image(image_url) placeholder = self._tracker.add("image", image) self._add_placeholder(placeholder) def parse_audio(self, audio_url: str) -> None: - audio = get_and_parse_audio(audio_url) + audio = self._connector.fetch_audio(audio_url) placeholder = self._tracker.add("audio", audio) self._add_placeholder(placeholder) - def parse_input_audio(self, input_audio: Dict[str, str]) -> None: - input_audio_data = input_audio.get("data","") - input_audio_format = input_audio.get("format","") - audio_url = f"data:audio/{input_audio_format};base64,{input_audio_data}" - audio = get_and_parse_audio(audio_url) + def parse_input_audio(self, input_audio: InputAudio) -> None: + audio_data = input_audio.get("data", "") + audio_format = input_audio.get("format", "") + audio_url = f"data:audio/{audio_format};base64,{audio_data}" - placeholder = self._tracker.add("audio", audio) - self._add_placeholder(placeholder) + return self.parse_audio(audio_url) def parse_video(self, video_url: str) -> None: - video = get_and_parse_video(video_url) + video = self._connector.fetch_video(video_url) placeholder = self._tracker.add("video", video) self._add_placeholder(placeholder) @@ -573,33 +559,31 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): super().__init__() self._tracker = tracker + self._connector = MediaConnector( + allowed_local_media_path=tracker.allowed_local_media_path, + ) def parse_image(self, image_url: str) -> None: - image_coro = async_get_and_parse_image( - image_url, - allowed_local_media_path=self._tracker._model_config. - allowed_local_media_path) + image_coro = self._connector.fetch_image_async(image_url) placeholder = self._tracker.add("image", image_coro) self._add_placeholder(placeholder) def parse_audio(self, audio_url: str) -> None: - audio_coro = async_get_and_parse_audio(audio_url) + audio_coro = self._connector.fetch_audio_async(audio_url) placeholder = self._tracker.add("audio", audio_coro) self._add_placeholder(placeholder) - def parse_input_audio(self, input_audio: Dict[str, str]) -> None: - input_audio_data = input_audio.get("data","") - input_audio_format = input_audio.get("format","") - audio_url = f"data:audio/{input_audio_format};base64,{input_audio_data}" - audio_coro = async_get_and_parse_audio(audio_url) + def parse_input_audio(self, input_audio: InputAudio) -> None: + audio_data = input_audio.get("data", "") + audio_format = input_audio.get("format", "") + audio_url = f"data:audio/{audio_format};base64,{audio_data}" - placeholder = self._tracker.add("audio", audio_coro) - self._add_placeholder(placeholder) + return self.parse_audio(audio_url) def parse_video(self, video_url: str) -> None: - video = async_get_and_parse_video(video_url) + video = self._connector.fetch_video_async(video_url) placeholder = self._tracker.add("video", video) self._add_placeholder(placeholder) @@ -695,10 +679,13 @@ _InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam) _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) _VideoParser = partial(cast, ChatCompletionContentPartVideoParam) +_ContentPart: TypeAlias = Union[str, Dict[str, str], InputAudio] + # Define a mapping from part types to their corresponding parsing functions. 
-MM_PARSER_MAP: Dict[str, - Callable[[ChatCompletionContentPartParam], - Union[str, Dict[str,str]]]] = { +MM_PARSER_MAP: Dict[ + str, + Callable[[ChatCompletionContentPartParam], _ContentPart], +] = { "text": lambda part: _TextParser(part).get("text", ""), "image_url": @@ -715,8 +702,7 @@ MM_PARSER_MAP: Dict[str, def _parse_chat_message_content_mm_part( - part: ChatCompletionContentPartParam) -> Tuple[str, - Union[str, Dict[str, str]]]: + part: ChatCompletionContentPartParam) -> tuple[str, _ContentPart]: """ Parses a given multi-modal content part based on its type. @@ -783,7 +769,7 @@ def _parse_chat_message_content_parts( *, wrap_dicts: bool, ) -> List[ConversationMessage]: - content: List[Union[str, Dict[str, str]]] = [] + content = list[_ContentPart]() mm_parser = mm_tracker.create_parser() @@ -814,7 +800,7 @@ def _parse_chat_message_content_part( mm_parser: BaseMultiModalContentParser, *, wrap_dicts: bool, -) -> Optional[Union[str, Dict[str, str]]]: +) -> Optional[_ContentPart]: """Parses a single part of a conversation. If wrap_dicts is True, structured dictionary pieces for texts and images will be wrapped in dictionaries, i.e., {"type": "text", "text", ...} and @@ -823,8 +809,7 @@ def _parse_chat_message_content_part( with multimodal placeholders. """ if isinstance(part, str): # Handle plain text parts - text = _TextParser(part) - return text + return part # Handle structured dictionary parts part_type, content = _parse_chat_message_content_mm_part(part) @@ -855,7 +840,7 @@ def _parse_chat_message_content_part( return {'type': 'audio'} if wrap_dicts else None if part_type == "input_audio": - dict_content = cast(Dict[str, str], content) + dict_content = cast(InputAudio, content) mm_parser.parse_input_audio(dict_content) return {'type': 'audio'} if wrap_dicts else None diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 2e45b474237f9..74fe378fdae42 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -58,7 +58,9 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding -from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import (BaseModelPath, + OpenAIServingModels) from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling from vllm.entrypoints.openai.serving_score import OpenAIServingScores from vllm.entrypoints.openai.serving_tokenization import ( @@ -133,32 +135,21 @@ async def build_async_engine_client_from_engine_args( Returns the Client or None if the creation failed. """ - # Fall back - # TODO: fill out feature matrix. + # AsyncLLMEngine. 
if (MQLLMEngineClient.is_unsupported_config(engine_args) or envs.VLLM_USE_V1 or disable_frontend_multiprocessing): - engine_config = engine_args.create_engine_config( - UsageContext.OPENAI_API_SERVER) - uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config), - "uses_ray", False) - build_engine = partial(AsyncLLMEngine.from_engine_args, - engine_args=engine_args, - engine_config=engine_config, - usage_context=UsageContext.OPENAI_API_SERVER) - if uses_ray: - # Must run in main thread with ray for its signal handlers to work - engine_client = build_engine() - else: - engine_client = await asyncio.get_running_loop().run_in_executor( - None, build_engine) + engine_client: Optional[EngineClient] = None + try: + engine_client = AsyncLLMEngine.from_engine_args( + engine_args=engine_args, + usage_context=UsageContext.OPENAI_API_SERVER) + yield engine_client + finally: + if engine_client and hasattr(engine_client, "shutdown"): + engine_client.shutdown() - yield engine_client - if hasattr(engine_client, "shutdown"): - engine_client.shutdown() - return - - # Otherwise, use the multiprocessing AsyncLLMEngine. + # MQLLMEngine. else: if "PROMETHEUS_MULTIPROC_DIR" not in os.environ: # Make TemporaryDirectory for prometheus multiprocessing @@ -280,6 +271,10 @@ def base(request: Request) -> OpenAIServing: return tokenization(request) +def models(request: Request) -> OpenAIServingModels: + return request.app.state.openai_serving_models + + def chat(request: Request) -> Optional[OpenAIServingChat]: return request.app.state.openai_serving_chat @@ -347,10 +342,10 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request): @router.get("/v1/models") async def show_available_models(raw_request: Request): - handler = base(raw_request) + handler = models(raw_request) - models = await handler.show_available_models() - return JSONResponse(content=models.model_dump()) + models_ = await handler.show_available_models() + return JSONResponse(content=models_.model_dump()) @router.get("/version") @@ -516,26 +511,22 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: @router.post("/v1/load_lora_adapter") async def load_lora_adapter(request: LoadLoraAdapterRequest, raw_request: Request): - for route in [chat, completion, embedding]: - handler = route(raw_request) - if handler is not None: - response = await handler.load_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) + handler = models(raw_request) + response = await handler.load_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) return Response(status_code=200, content=response) @router.post("/v1/unload_lora_adapter") async def unload_lora_adapter(request: UnloadLoraAdapterRequest, raw_request: Request): - for route in [chat, completion, embedding]: - handler = route(raw_request) - if handler is not None: - response = await handler.unload_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) + handler = models(raw_request) + response = await handler.unload_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) return Response(status_code=200, content=response) @@ -639,13 +630,18 @@ def init_app_state( resolved_chat_template = load_chat_template(args.chat_template) logger.info("Using 
supplied chat template:\n%s", resolved_chat_template) + state.openai_serving_models = OpenAIServingModels( + model_config=model_config, + base_model_paths=base_model_paths, + lora_modules=args.lora_modules, + prompt_adapters=args.prompt_adapters, + ) + # TODO: The chat template is now broken for lora adapters :( state.openai_serving_chat = OpenAIServingChat( engine_client, model_config, - base_model_paths, + state.openai_serving_models, args.response_role, - lora_modules=args.lora_modules, - prompt_adapters=args.prompt_adapters, request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, @@ -657,16 +653,14 @@ def init_app_state( state.openai_serving_completion = OpenAIServingCompletion( engine_client, model_config, - base_model_paths, - lora_modules=args.lora_modules, - prompt_adapters=args.prompt_adapters, + state.openai_serving_models, request_logger=request_logger, return_tokens_as_token_ids=args.return_tokens_as_token_ids, ) if model_config.runner_type == "generate" else None state.openai_serving_pooling = OpenAIServingPooling( engine_client, model_config, - base_model_paths, + state.openai_serving_models, request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, @@ -674,7 +668,7 @@ def init_app_state( state.openai_serving_embedding = OpenAIServingEmbedding( engine_client, model_config, - base_model_paths, + state.openai_serving_models, request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, @@ -682,14 +676,13 @@ def init_app_state( state.openai_serving_scores = OpenAIServingScores( engine_client, model_config, - base_model_paths, + state.openai_serving_models, request_logger=request_logger ) if model_config.task == "score" else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, model_config, - base_model_paths, - lora_modules=args.lora_modules, + state.openai_serving_models, request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 908f8c3532c9e..22206ef8dbfe6 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -12,7 +12,7 @@ from typing import List, Optional, Sequence, Union, get_args from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, validate_chat_template) -from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, +from vllm.entrypoints.openai.serving_models import (LoRAModulePath, PromptAdapterPath) from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.utils import FlexibleArgumentParser diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 572ed27b39083..822c0f5f7c211 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -20,7 +20,8 @@ from vllm.entrypoints.openai.protocol import (BatchRequestInput, # yapf: enable from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding -from vllm.entrypoints.openai.serving_engine import BaseModelPath +from vllm.entrypoints.openai.serving_models import (BaseModelPath, + OpenAIServingModels) from 
vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, random_uuid from vllm.version import __version__ as VLLM_VERSION @@ -213,13 +214,17 @@ async def main(args): request_logger = RequestLogger(max_log_len=args.max_log_len) # Create the openai serving objects. + openai_serving_models = OpenAIServingModels( + model_config=model_config, + base_model_paths=base_model_paths, + lora_modules=None, + prompt_adapters=None, + ) openai_serving_chat = OpenAIServingChat( engine, model_config, - base_model_paths, + openai_serving_models, args.response_role, - lora_modules=None, - prompt_adapters=None, request_logger=request_logger, chat_template=None, chat_template_content_format="auto", @@ -228,7 +233,7 @@ async def main(args): openai_serving_embedding = OpenAIServingEmbedding( engine, model_config, - base_model_paths, + openai_serving_models, request_logger=request_logger, chat_template=None, chat_template_content_format="auto", diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index d085333563d19..9ba5eeb7709c9 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -21,10 +21,8 @@ from vllm.entrypoints.openai.protocol import ( ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage, DeltaToolCall, ErrorResponse, FunctionCall, PromptTokenUsageInfo, RequestResponseMetadata, ToolCall, UsageInfo) -from vllm.entrypoints.openai.serving_engine import (BaseModelPath, - LoRAModulePath, - OpenAIServing, - PromptAdapterPath) +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager from vllm.logger import init_logger from vllm.outputs import CompletionOutput, RequestOutput @@ -42,11 +40,9 @@ class OpenAIServingChat(OpenAIServing): self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, response_role: str, *, - lora_modules: Optional[List[LoRAModulePath]], - prompt_adapters: Optional[List[PromptAdapterPath]], request_logger: Optional[RequestLogger], chat_template: Optional[str], chat_template_content_format: ChatTemplateContentFormatOption, @@ -57,9 +53,7 @@ class OpenAIServingChat(OpenAIServing): ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, - base_model_paths=base_model_paths, - lora_modules=lora_modules, - prompt_adapters=prompt_adapters, + models=models, request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids) @@ -126,7 +120,7 @@ class OpenAIServingChat(OpenAIServing): prompt_adapter_request, ) = self._maybe_get_adapters(request) - model_name = self._get_model_name(lora_request) + model_name = self.models.model_name(lora_request) tokenizer = await self.engine_client.get_tokenizer(lora_request) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index aaad7b8c7f44c..17197dce8da23 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -21,10 +21,8 @@ from vllm.entrypoints.openai.protocol import (CompletionLogProbs, RequestResponseMetadata, UsageInfo) # yapf: enable -from vllm.entrypoints.openai.serving_engine import (BaseModelPath, - LoRAModulePath, - OpenAIServing, - PromptAdapterPath) +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from 
vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams, SamplingParams @@ -41,18 +39,14 @@ class OpenAIServingCompletion(OpenAIServing): self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, *, - lora_modules: Optional[List[LoRAModulePath]], - prompt_adapters: Optional[List[PromptAdapterPath]], request_logger: Optional[RequestLogger], return_tokens_as_token_ids: bool = False, ): super().__init__(engine_client=engine_client, model_config=model_config, - base_model_paths=base_model_paths, - lora_modules=lora_modules, - prompt_adapters=prompt_adapters, + models=models, request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids) diff_sampling_param = self.model_config.get_diff_sampling_param() @@ -170,7 +164,7 @@ class OpenAIServingCompletion(OpenAIServing): result_generator = merge_async_iterators(*generators) - model_name = self._get_model_name(lora_request) + model_name = self.models.model_name(lora_request) num_prompts = len(engine_prompts) # Similar to the OpenAI API, when n != best_of, we do not stream the diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index b8fb9d6bd77f2..e7116a3d95d10 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -16,7 +16,8 @@ from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest, EmbeddingResponse, EmbeddingResponseData, ErrorResponse, UsageInfo) -from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.logger import init_logger from vllm.outputs import (EmbeddingOutput, EmbeddingRequestOutput, PoolingRequestOutput) @@ -46,7 +47,7 @@ class OpenAIServingEmbedding(OpenAIServing): self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, *, request_logger: Optional[RequestLogger], chat_template: Optional[str], @@ -54,9 +55,7 @@ class OpenAIServingEmbedding(OpenAIServing): ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, - base_model_paths=base_model_paths, - lora_modules=None, - prompt_adapters=None, + models=models, request_logger=request_logger) self.chat_template = chat_template diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 5b6a089e4c319..319f869240036 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1,7 +1,5 @@ import json -import pathlib from concurrent.futures.thread import ThreadPoolExecutor -from dataclasses import dataclass from http import HTTPStatus from typing import (Any, Callable, Dict, Iterable, Iterator, List, Mapping, Optional, Sequence, Tuple, TypedDict, Union) @@ -28,13 +26,10 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DetokenizeRequest, EmbeddingChatRequest, EmbeddingCompletionRequest, - ErrorResponse, - LoadLoraAdapterRequest, - ModelCard, ModelList, - ModelPermission, ScoreRequest, + ErrorResponse, ScoreRequest, TokenizeChatRequest, - TokenizeCompletionRequest, - UnloadLoraAdapterRequest) + TokenizeCompletionRequest) +from 
vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser # yapf: enable from vllm.inputs import TokensPrompt @@ -48,30 +43,10 @@ from vllm.sequence import Logprob from vllm.tracing import (contains_trace_headers, extract_trace_headers, log_tracing_disabled_warning) from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.utils import AtomicCounter, is_list_of, make_async, random_uuid +from vllm.utils import is_list_of, make_async, random_uuid logger = init_logger(__name__) - -@dataclass -class BaseModelPath: - name: str - model_path: str - - -@dataclass -class PromptAdapterPath: - name: str - local_path: str - - -@dataclass -class LoRAModulePath: - name: str - path: str - base_model_name: Optional[str] = None - - CompletionLikeRequest = Union[CompletionRequest, DetokenizeRequest, EmbeddingCompletionRequest, ScoreRequest, TokenizeCompletionRequest] @@ -96,10 +71,8 @@ class OpenAIServing: self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, *, - lora_modules: Optional[List[LoRAModulePath]], - prompt_adapters: Optional[List[PromptAdapterPath]], request_logger: Optional[RequestLogger], return_tokens_as_token_ids: bool = False, ): @@ -109,35 +82,7 @@ class OpenAIServing: self.model_config = model_config self.max_model_len = model_config.max_model_len - self.base_model_paths = base_model_paths - - self.lora_id_counter = AtomicCounter(0) - self.lora_requests = [] - if lora_modules is not None: - self.lora_requests = [ - LoRARequest(lora_name=lora.name, - lora_int_id=i, - lora_path=lora.path, - base_model_name=lora.base_model_name - if lora.base_model_name - and self._is_model_supported(lora.base_model_name) - else self.base_model_paths[0].name) - for i, lora in enumerate(lora_modules, start=1) - ] - - self.prompt_adapter_requests = [] - if prompt_adapters is not None: - for i, prompt_adapter in enumerate(prompt_adapters, start=1): - with pathlib.Path(prompt_adapter.local_path, - "adapter_config.json").open() as f: - adapter_config = json.load(f) - num_virtual_tokens = adapter_config["num_virtual_tokens"] - self.prompt_adapter_requests.append( - PromptAdapterRequest( - prompt_adapter_name=prompt_adapter.name, - prompt_adapter_id=i, - prompt_adapter_local_path=prompt_adapter.local_path, - prompt_adapter_num_virtual_tokens=num_virtual_tokens)) + self.models = models self.request_logger = request_logger self.return_tokens_as_token_ids = return_tokens_as_token_ids @@ -150,33 +95,6 @@ class OpenAIServing: self._tokenize_prompt_input_or_inputs, executor=self._tokenizer_executor) - async def show_available_models(self) -> ModelList: - """Show available models. 
Right now we only have one model.""" - model_cards = [ - ModelCard(id=base_model.name, - max_model_len=self.max_model_len, - root=base_model.model_path, - permission=[ModelPermission()]) - for base_model in self.base_model_paths - ] - lora_cards = [ - ModelCard(id=lora.lora_name, - root=lora.local_path, - parent=lora.base_model_name if lora.base_model_name else - self.base_model_paths[0].name, - permission=[ModelPermission()]) - for lora in self.lora_requests - ] - prompt_adapter_cards = [ - ModelCard(id=prompt_adapter.prompt_adapter_name, - root=self.base_model_paths[0].name, - permission=[ModelPermission()]) - for prompt_adapter in self.prompt_adapter_requests - ] - model_cards.extend(lora_cards) - model_cards.extend(prompt_adapter_cards) - return ModelList(data=model_cards) - def create_error_response( self, message: str, @@ -205,11 +123,13 @@ class OpenAIServing: ) -> Optional[ErrorResponse]: if self._is_model_supported(request.model): return None - if request.model in [lora.lora_name for lora in self.lora_requests]: + if request.model in [ + lora.lora_name for lora in self.models.lora_requests + ]: return None if request.model in [ prompt_adapter.prompt_adapter_name - for prompt_adapter in self.prompt_adapter_requests + for prompt_adapter in self.models.prompt_adapter_requests ]: return None return self.create_error_response( @@ -223,10 +143,10 @@ class OpenAIServing: None, PromptAdapterRequest]]: if self._is_model_supported(request.model): return None, None - for lora in self.lora_requests: + for lora in self.models.lora_requests: if request.model == lora.lora_name: return lora, None - for prompt_adapter in self.prompt_adapter_requests: + for prompt_adapter in self.models.prompt_adapter_requests: if request.model == prompt_adapter.prompt_adapter_name: return None, prompt_adapter # if _check_model has been called earlier, this will be unreachable @@ -588,91 +508,5 @@ class OpenAIServing: return logprob.decoded_token return tokenizer.decode(token_id) - async def _check_load_lora_adapter_request( - self, request: LoadLoraAdapterRequest) -> Optional[ErrorResponse]: - # Check if both 'lora_name' and 'lora_path' are provided - if not request.lora_name or not request.lora_path: - return self.create_error_response( - message="Both 'lora_name' and 'lora_path' must be provided.", - err_type="InvalidUserInput", - status_code=HTTPStatus.BAD_REQUEST) - - # Check if the lora adapter with the given name already exists - if any(lora_request.lora_name == request.lora_name - for lora_request in self.lora_requests): - return self.create_error_response( - message= - f"The lora adapter '{request.lora_name}' has already been" - "loaded.", - err_type="InvalidUserInput", - status_code=HTTPStatus.BAD_REQUEST) - - return None - - async def _check_unload_lora_adapter_request( - self, - request: UnloadLoraAdapterRequest) -> Optional[ErrorResponse]: - # Check if either 'lora_name' or 'lora_int_id' is provided - if not request.lora_name and not request.lora_int_id: - return self.create_error_response( - message= - "either 'lora_name' and 'lora_int_id' needs to be provided.", - err_type="InvalidUserInput", - status_code=HTTPStatus.BAD_REQUEST) - - # Check if the lora adapter with the given name exists - if not any(lora_request.lora_name == request.lora_name - for lora_request in self.lora_requests): - return self.create_error_response( - message= - f"The lora adapter '{request.lora_name}' cannot be found.", - err_type="InvalidUserInput", - status_code=HTTPStatus.BAD_REQUEST) - - return None - - async def 
load_lora_adapter( - self, - request: LoadLoraAdapterRequest) -> Union[ErrorResponse, str]: - error_check_ret = await self._check_load_lora_adapter_request(request) - if error_check_ret is not None: - return error_check_ret - - lora_name, lora_path = request.lora_name, request.lora_path - unique_id = self.lora_id_counter.inc(1) - self.lora_requests.append( - LoRARequest(lora_name=lora_name, - lora_int_id=unique_id, - lora_path=lora_path)) - return f"Success: LoRA adapter '{lora_name}' added successfully." - - async def unload_lora_adapter( - self, - request: UnloadLoraAdapterRequest) -> Union[ErrorResponse, str]: - error_check_ret = await self._check_unload_lora_adapter_request(request - ) - if error_check_ret is not None: - return error_check_ret - - lora_name = request.lora_name - self.lora_requests = [ - lora_request for lora_request in self.lora_requests - if lora_request.lora_name != lora_name - ] - return f"Success: LoRA adapter '{lora_name}' removed successfully." - def _is_model_supported(self, model_name): - return any(model.name == model_name for model in self.base_model_paths) - - def _get_model_name(self, lora: Optional[LoRARequest]): - """ - Returns the appropriate model name depending on the availability - and support of the LoRA or base model. - Parameters: - - lora: LoRARequest that contain a base_model_name. - Returns: - - str: The name of the base model or the first available model path. - """ - if lora is not None: - return lora.lora_name - return self.base_model_paths[0].name + return self.models.is_base_model(model_name) diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py new file mode 100644 index 0000000000000..26966896bc272 --- /dev/null +++ b/vllm/entrypoints/openai/serving_models.py @@ -0,0 +1,210 @@ +import json +import pathlib +from dataclasses import dataclass +from http import HTTPStatus +from typing import List, Optional, Union + +from vllm.config import ModelConfig +from vllm.entrypoints.openai.protocol import (ErrorResponse, + LoadLoraAdapterRequest, + ModelCard, ModelList, + ModelPermission, + UnloadLoraAdapterRequest) +from vllm.lora.request import LoRARequest +from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.utils import AtomicCounter + + +@dataclass +class BaseModelPath: + name: str + model_path: str + + +@dataclass +class PromptAdapterPath: + name: str + local_path: str + + +@dataclass +class LoRAModulePath: + name: str + path: str + base_model_name: Optional[str] = None + + +class OpenAIServingModels: + """Shared instance to hold data about the loaded base model(s) and adapters. 
+ + Handles the routes: + - /v1/models + - /v1/load_lora_adapter + - /v1/unload_lora_adapter + """ + + def __init__( + self, + model_config: ModelConfig, + base_model_paths: List[BaseModelPath], + *, + lora_modules: Optional[List[LoRAModulePath]] = None, + prompt_adapters: Optional[List[PromptAdapterPath]] = None, + ): + super().__init__() + + self.base_model_paths = base_model_paths + self.max_model_len = model_config.max_model_len + + self.lora_id_counter = AtomicCounter(0) + self.lora_requests = [] + if lora_modules is not None: + self.lora_requests = [ + LoRARequest(lora_name=lora.name, + lora_int_id=i, + lora_path=lora.path, + base_model_name=lora.base_model_name + if lora.base_model_name + and self.is_base_model(lora.base_model_name) else + self.base_model_paths[0].name) + for i, lora in enumerate(lora_modules, start=1) + ] + + self.prompt_adapter_requests = [] + if prompt_adapters is not None: + for i, prompt_adapter in enumerate(prompt_adapters, start=1): + with pathlib.Path(prompt_adapter.local_path, + "adapter_config.json").open() as f: + adapter_config = json.load(f) + num_virtual_tokens = adapter_config["num_virtual_tokens"] + self.prompt_adapter_requests.append( + PromptAdapterRequest( + prompt_adapter_name=prompt_adapter.name, + prompt_adapter_id=i, + prompt_adapter_local_path=prompt_adapter.local_path, + prompt_adapter_num_virtual_tokens=num_virtual_tokens)) + + def is_base_model(self, model_name): + return any(model.name == model_name for model in self.base_model_paths) + + def model_name(self, lora_request: Optional[LoRARequest] = None) -> str: + """Returns the appropriate model name depending on the availability + and support of the LoRA or base model. + Parameters: + - lora: LoRARequest that contain a base_model_name. + Returns: + - str: The name of the base model or the first available model path. + """ + if lora_request is not None: + return lora_request.lora_name + return self.base_model_paths[0].name + + async def show_available_models(self) -> ModelList: + """Show available models. This includes the base model and all + adapters""" + model_cards = [ + ModelCard(id=base_model.name, + max_model_len=self.max_model_len, + root=base_model.model_path, + permission=[ModelPermission()]) + for base_model in self.base_model_paths + ] + lora_cards = [ + ModelCard(id=lora.lora_name, + root=lora.local_path, + parent=lora.base_model_name if lora.base_model_name else + self.base_model_paths[0].name, + permission=[ModelPermission()]) + for lora in self.lora_requests + ] + prompt_adapter_cards = [ + ModelCard(id=prompt_adapter.prompt_adapter_name, + root=self.base_model_paths[0].name, + permission=[ModelPermission()]) + for prompt_adapter in self.prompt_adapter_requests + ] + model_cards.extend(lora_cards) + model_cards.extend(prompt_adapter_cards) + return ModelList(data=model_cards) + + async def load_lora_adapter( + self, + request: LoadLoraAdapterRequest) -> Union[ErrorResponse, str]: + error_check_ret = await self._check_load_lora_adapter_request(request) + if error_check_ret is not None: + return error_check_ret + + lora_name, lora_path = request.lora_name, request.lora_path + unique_id = self.lora_id_counter.inc(1) + self.lora_requests.append( + LoRARequest(lora_name=lora_name, + lora_int_id=unique_id, + lora_path=lora_path)) + return f"Success: LoRA adapter '{lora_name}' added successfully." 
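# --- Illustrative aside (not part of the patch): how the adapter-management
# routes now backed by OpenAIServingModels are typically exercised from a
# client. The server URL, adapter path, and adapter name below are assumptions
# for this sketch; the endpoints and request fields ('lora_name'/'lora_path'
# for /v1/load_lora_adapter, /v1/unload_lora_adapter, /v1/models) come from the
# api_server.py changes above and require VLLM_ALLOW_RUNTIME_LORA_UPDATING.
import requests  # assumed available in the client environment

BASE_URL = "http://localhost:8000"  # assumed local server address

# Register a new adapter at runtime; on success the server replies 200 with
# "Success: LoRA adapter '<name>' added successfully."
resp = requests.post(f"{BASE_URL}/v1/load_lora_adapter",
                     json={"lora_name": "my-adapter",          # hypothetical name
                           "lora_path": "/path/to/adapter"})   # hypothetical path
print(resp.status_code, resp.text)

# The adapter now shows up alongside the base model in /v1/models.
print(requests.get(f"{BASE_URL}/v1/models").json())

# Remove it again by name.
requests.post(f"{BASE_URL}/v1/unload_lora_adapter",
              json={"lora_name": "my-adapter"})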
+ + async def unload_lora_adapter( + self, + request: UnloadLoraAdapterRequest) -> Union[ErrorResponse, str]: + error_check_ret = await self._check_unload_lora_adapter_request(request + ) + if error_check_ret is not None: + return error_check_ret + + lora_name = request.lora_name + self.lora_requests = [ + lora_request for lora_request in self.lora_requests + if lora_request.lora_name != lora_name + ] + return f"Success: LoRA adapter '{lora_name}' removed successfully." + + async def _check_load_lora_adapter_request( + self, request: LoadLoraAdapterRequest) -> Optional[ErrorResponse]: + # Check if both 'lora_name' and 'lora_path' are provided + if not request.lora_name or not request.lora_path: + return create_error_response( + message="Both 'lora_name' and 'lora_path' must be provided.", + err_type="InvalidUserInput", + status_code=HTTPStatus.BAD_REQUEST) + + # Check if the lora adapter with the given name already exists + if any(lora_request.lora_name == request.lora_name + for lora_request in self.lora_requests): + return create_error_response( + message= + f"The lora adapter '{request.lora_name}' has already been" + "loaded.", + err_type="InvalidUserInput", + status_code=HTTPStatus.BAD_REQUEST) + + return None + + async def _check_unload_lora_adapter_request( + self, + request: UnloadLoraAdapterRequest) -> Optional[ErrorResponse]: + # Check if either 'lora_name' or 'lora_int_id' is provided + if not request.lora_name and not request.lora_int_id: + return create_error_response( + message= + "either 'lora_name' and 'lora_int_id' needs to be provided.", + err_type="InvalidUserInput", + status_code=HTTPStatus.BAD_REQUEST) + + # Check if the lora adapter with the given name exists + if not any(lora_request.lora_name == request.lora_name + for lora_request in self.lora_requests): + return create_error_response( + message= + f"The lora adapter '{request.lora_name}' cannot be found.", + err_type="InvalidUserInput", + status_code=HTTPStatus.BAD_REQUEST) + + return None + + +def create_error_response( + message: str, + err_type: str = "BadRequestError", + status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse: + return ErrorResponse(message=message, + type=err_type, + code=status_code.value) diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 01852f0df1eca..5830322071e58 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -15,7 +15,8 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, PoolingChatRequest, PoolingRequest, PoolingResponse, PoolingResponseData, UsageInfo) -from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.logger import init_logger from vllm.outputs import PoolingOutput, PoolingRequestOutput from vllm.utils import merge_async_iterators @@ -44,7 +45,7 @@ class OpenAIServingPooling(OpenAIServing): self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, *, request_logger: Optional[RequestLogger], chat_template: Optional[str], @@ -52,9 +53,7 @@ class OpenAIServingPooling(OpenAIServing): ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, - base_model_paths=base_model_paths, - lora_modules=None, - prompt_adapters=None, + models=models, request_logger=request_logger) 
self.chat_template = chat_template diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index a8a126e697641..5d3e7139d7a17 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -10,7 +10,8 @@ from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import (ErrorResponse, ScoreRequest, ScoreResponse, ScoreResponseData, UsageInfo) -from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.inputs.data import TokensPrompt from vllm.logger import init_logger from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput @@ -50,15 +51,13 @@ class OpenAIServingScores(OpenAIServing): self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, *, request_logger: Optional[RequestLogger], ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, - base_model_paths=base_model_paths, - lora_modules=None, - prompt_adapters=None, + models=models, request_logger=request_logger) async def create_score( diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 2e849333680d4..b67ecfb01316f 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -15,9 +15,8 @@ from vllm.entrypoints.openai.protocol import (DetokenizeRequest, TokenizeRequest, TokenizeResponse) # yapf: enable -from vllm.entrypoints.openai.serving_engine import (BaseModelPath, - LoRAModulePath, - OpenAIServing) +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.logger import init_logger logger = init_logger(__name__) @@ -29,18 +28,15 @@ class OpenAIServingTokenization(OpenAIServing): self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, *, - lora_modules: Optional[List[LoRAModulePath]], request_logger: Optional[RequestLogger], chat_template: Optional[str], chat_template_content_format: ChatTemplateContentFormatOption, ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, - base_model_paths=base_model_paths, - lora_modules=lora_modules, - prompt_adapters=None, + models=models, request_logger=request_logger) self.chat_template = chat_template diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py index c4d90f0856f86..bc32826529eef 100644 --- a/vllm/executor/multiproc_worker_utils.py +++ b/vllm/executor/multiproc_worker_utils.py @@ -1,5 +1,4 @@ import asyncio -import multiprocessing import os import sys import threading @@ -13,10 +12,9 @@ from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO, import torch -import vllm.envs as envs from vllm.logger import init_logger from vllm.triton_utils.importing import HAS_TRITON -from vllm.utils import cuda_is_initialized +from vllm.utils import _check_multiproc_method, get_mp_context if HAS_TRITON: from vllm.triton_utils import maybe_set_triton_cache_manager @@ -274,24 +272,6 @@ def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None: file.write = write_with_prefix # type: ignore[method-assign] -def _check_multiproc_method(): - 
if (cuda_is_initialized() - and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"): - logger.warning("CUDA was previously initialized. We must use " - "the `spawn` multiprocessing start method. Setting " - "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " - "See https://docs.vllm.ai/en/latest/getting_started/" - "debugging.html#python-multiprocessing " - "for more information.") - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - - -def get_mp_context(): - _check_multiproc_method() - mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD - return multiprocessing.get_context(mp_method) - - def set_multiprocessing_worker_envs(parallel_config): """ Set up environment variables that should be used when there are workers in a multiprocessing environment. This should be called by the parent diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 426aa1b5c728f..8d766bad1a072 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -8,7 +8,6 @@ import msgspec from vllm.config import ParallelConfig from vllm.executor.msgspec_utils import decode_hook, encode_hook from vllm.logger import init_logger -from vllm.platforms import current_platform from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.utils import get_ip from vllm.worker.worker_base import WorkerWrapperBase @@ -229,6 +228,7 @@ def initialize_ray_cluster( the default Ray cluster address. """ assert_ray_available() + from vllm.platforms import current_platform # Connect to a ray cluster. if current_platform.is_rocm() or current_platform.is_xpu(): diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index f3ec9d115c9ba..46346b08e99c2 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -99,6 +99,9 @@ class InputContext: merged_kwargs = {**base_kwargs, **kwargs} + if isinstance(typ, type): + merged_kwargs["processor_cls"] = typ + hf_processor = cached_get_processor( self.model_config.model, trust_remote_code=self.model_config.trust_remote_code, @@ -132,10 +135,13 @@ class InputProcessingContext(InputContext): def call_hf_processor( self, hf_processor: ProcessorMixin, - prompt: str, - processor_data: Mapping[str, object], - inference_kwargs: Mapping[str, object], + data: Mapping[str, object], + kwargs: Mapping[str, object] = {}, ) -> BatchFeature: + """ + Call :code:`hf_processor` on the prompt :code:`data` + (text, image, audio...) with configurable options :code:`kwargs`. 
+ """ assert callable(hf_processor) base_kwargs = self.model_config.mm_processor_kwargs @@ -144,21 +150,15 @@ class InputProcessingContext(InputContext): merged_kwargs = resolve_mm_processor_kwargs( base_kwargs, - inference_kwargs, + kwargs, hf_processor, requires_kw_only=False, allow_var_kwargs=True, ) try: - return hf_processor( - text=prompt, - **processor_data, - **merged_kwargs, - return_tensors="pt", - ) + return hf_processor(**data, **merged_kwargs, return_tensors="pt") except Exception as exc: - data = dict(text=prompt, **processor_data) msg = (f"Failed to apply {type(hf_processor).__name__} " f"on data={data} with kwargs={merged_kwargs}") diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py index dde347b78bf81..93ad4651f4b77 100644 --- a/vllm/lora/lora.py +++ b/vllm/lora/lora.py @@ -67,15 +67,9 @@ class LoRALayerWeights: peft_helper: PEFTHelper, embeddings_tensor: Optional[torch.Tensor] = None, ) -> "LoRALayerWeights": - return cls( - module_name, - peft_helper.r, - peft_helper.lora_alpha, - None, - None, - None, - embeddings_tensor, - ) + return cls(module_name, peft_helper.r, peft_helper.lora_alpha, None, + None, None, embeddings_tensor, + peft_helper.vllm_lora_scaling_factor) @classmethod def create_dummy_lora_weights( diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 5c0e4e5cbc636..9cfcc6bba727f 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -173,7 +173,7 @@ class LoRAModel(AdapterModel): return cls(lora_model_id, peft_helper.r, loras, - scaling_factor=peft_helper.vllm_scaling_factor) + scaling_factor=peft_helper.vllm_long_context_scaling_factor) @classmethod def from_local_checkpoint( diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index edf4ba5659575..ddd42ae93d290 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -4,6 +4,8 @@ import math from dataclasses import MISSING, dataclass, field, fields from typing import Literal, Optional, Union +from vllm.utils import print_info_once + @dataclass class PEFTHelper: @@ -14,21 +16,22 @@ class PEFTHelper: bias: Literal["none", "all", "lora_only"] = field(default="none") modules_to_save: Optional[list[str]] = field(default=None) + # True to use Rank-Stabilized LoRA (rsLoRA, see: https://arxiv.org/abs/2312.03732) use_rslora: bool = field(default=False) + # True to use Weight-Decomposed Low-Rank Adaptation (DoRA, see: https://arxiv.org/abs/2402.09353) use_dora: bool = field(default=False) - # long lora field + # long context lora field context_length: int = field(default=0) # Extra vllm field, start with 'vllm_' to avoid conflict + vllm_lora_scaling_factor: float = field(default=1.0) vllm_max_position_embeddings: Optional[int] = field(default=False) - vllm_scaling_factor: Optional[float] = field(default=None) + vllm_long_context_scaling_factor: Optional[float] = field(default=None) def _validate_features(self): error_msg = [] if self.modules_to_save: error_msg.append("vLLM only supports modules_to_save being None.") - if self.use_rslora: - error_msg.append("vLLM does not yet support RSLoRA.") if self.use_dora: error_msg.append("vLLM does not yet support DoRA.") @@ -38,10 +41,15 @@ class PEFTHelper: def __post_init__(self): self._validate_features() + if self.use_rslora: + print_info_once("Loading LoRA weights trained with rsLoRA.") + self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r) + else: + self.vllm_lora_scaling_factor = self.lora_alpha / self.r if self.context_length: if self.vllm_max_position_embeddings is None: self.vllm_max_position_embeddings = 
self.context_length - self.vllm_scaling_factor = float( + self.vllm_long_context_scaling_factor = float( math.ceil(self.context_length / self.vllm_max_position_embeddings)) diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index 694c5b68b1cbd..18b435a42544a 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -6,7 +6,7 @@ from vllm.logger import init_logger from vllm.model_executor.guided_decoding.utils import ( convert_lark_to_gbnf, grammar_is_likely_lark, has_lmf_unsupported_json_features, has_xgrammar_unsupported_json_features) -from vllm.platforms import CpuArchEnum, current_platform +from vllm.platforms import CpuArchEnum if TYPE_CHECKING: from transformers import PreTrainedTokenizer @@ -39,6 +39,7 @@ def maybe_backend_fallback( if guided_params.backend == "xgrammar": # xgrammar only has x86 wheels for linux, fallback to outlines + from vllm.platforms import current_platform if current_platform.get_cpu_architecture() is not CpuArchEnum.X86: logger.warning("xgrammar is only supported on x86 CPUs. " "Falling back to use outlines instead.") diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index 5e1948977bff4..f10a8fb8e03cf 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -1,6 +1,7 @@ # noqa: UP007 from __future__ import annotations +import copy import json from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any @@ -309,3 +310,7 @@ class XGrammarLogitsProcessor: scores = scores.to(device_type).squeeze() return scores + + def clone(self) -> XGrammarLogitsProcessor: + """Deepcopy due to per-sequence state in the matchers""" + return copy.deepcopy(self) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 01ffac4550f28..b108cbd52c218 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -41,9 +41,20 @@ class FusedMoEMethodBase(QuantizeMethodBase): raise NotImplementedError @abstractmethod - def apply(self, layer: torch.nn.Module, x: torch.Tensor, - router_logits: torch.Tensor, top_k: int, renormalize: bool, - use_grouped_topk: bool) -> torch.Tensor: + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None + ) -> torch.Tensor: raise NotImplementedError @@ -79,7 +90,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): router_logits: torch.Tensor, top_k: int, renormalize: bool, - use_grouped_topk: bool, + use_grouped_topk: bool = False, topk_group: Optional[int] = None, num_expert_group: Optional[int] = None, custom_routing_function: Optional[Callable] = None, diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index 10bec75f49fdf..606c796d503cf 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -42,12 +42,14 @@ class MambaMixer(CustomOp): use_rms_norm: bool, rms_norm_has_weight: bool = True, 
rms_norm_eps: float = 1e-5, - activation="silu"): + activation="silu", + is_lora_enabled: bool = False): super().__init__() self.time_step_rank = time_step_rank self.ssm_state_size = ssm_state_size self.use_rms_norm = use_rms_norm self.activation = activation + self.is_lora_enabled = is_lora_enabled self.conv1d = ColumnParallelLinear( input_size=conv_kernel_size, @@ -63,6 +65,7 @@ class MambaMixer(CustomOp): self.in_proj = MergedColumnParallelLinear(hidden_size, [intermediate_size] * 2, bias=use_bias) + # selective projection used to make dt, B and C input dependent self.x_proj = RowParallelLinear( intermediate_size, @@ -170,7 +173,13 @@ class MambaMixer(CustomOp): # 3. State Space Model sequence transformation # 3.a. input varying initialization of time_step, B and C - ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0] + + if self.is_lora_enabled: + # lora kernel requires contiguous tensor + ssm_parameters = self.x_proj( + hidden_states.transpose(-2, -1).contiguous())[0] + else: + ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0] time_step, B, C = torch.split( ssm_parameters, @@ -222,6 +231,11 @@ class MambaMixer(CustomOp): scan_outputs = scan_outputs.transpose(0, 1) # 4. Final linear projection - contextualized_states = self.out_proj(scan_outputs.transpose(-2, - -1))[0] + if self.is_lora_enabled: + # lora kernel requires contiguous tensor + contextualized_states = self.out_proj( + scan_outputs.transpose(-2, -1).contiguous())[0] + else: + contextualized_states = self.out_proj( + scan_outputs.transpose(-2, -1))[0] return contextualized_states diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 4d1a837d11585..c28fd0c6737e0 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -440,11 +440,13 @@ class AWQMoEMethod(FusedMoEMethodBase): x: torch.Tensor, router_logits: torch.Tensor, top_k: int, - renormalize: bool = True, + renormalize: bool, use_grouped_topk: bool = False, - num_expert_group: Optional[int] = None, topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: topk_weights, topk_ids = FusedMoE.select_experts( hidden_states=x, @@ -454,7 +456,9 @@ class AWQMoEMethod(FusedMoEMethodBase): renormalize=renormalize, topk_group=topk_group, num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function) + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias) return torch.ops.vllm.fused_marlin_moe( x, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index dad04017d3212..5fd6b017f444b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -203,13 +203,14 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): x: torch.Tensor, router_logits: torch.Tensor, top_k: int, - renormalize: bool = True, + renormalize: bool, use_grouped_topk: bool = False, - num_expert_group: Optional[int] = None, topk_group: Optional[int] = None, + num_expert_group: Optional[int] = 
None, custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: - from vllm.model_executor.layers.fused_moe import fused_experts topk_weights, topk_ids = FusedMoE.select_experts( @@ -220,7 +221,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): renormalize=renormalize, topk_group=topk_group, num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function) + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias) return fused_experts(x, layer.w13_weight, @@ -476,12 +479,15 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): x: torch.Tensor, router_logits: torch.Tensor, top_k: int, - renormalize: bool = True, + renormalize: bool, use_grouped_topk: bool = False, - num_expert_group: Optional[int] = None, topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: + topk_weights, topk_ids = FusedMoE.select_experts( hidden_states=x, router_logits=router_logits, @@ -490,7 +496,9 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): renormalize=renormalize, topk_group=topk_group, num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function) + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias) return torch.ops.vllm.fused_marlin_moe( x, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 73cc8ce0d2a4b..1d4e4bd52adaa 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -41,10 +41,12 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): ) if current_platform.is_rocm(): + input_scale = getattr(layer, 'input_scale', None) + weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( weight=weight, weight_scale=max_w_scale, - input_scale=layer.input_scale) + input_scale=input_scale) if input_scale is not None: layer.input_scale = Parameter(input_scale, requires_grad=False) @@ -57,11 +59,13 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): weight = layer.weight if current_platform.is_rocm(): + input_scale = getattr(layer, 'input_scale', None) + weight, weight_scale, input_scale = \ normalize_e4m3fn_to_e4m3fnuz( weight=weight, weight_scale=layer.weight_scale, - input_scale=layer.input_scale) + input_scale=input_scale) if input_scale is not None: layer.input_scale = Parameter(input_scale, requires_grad=False) @@ -76,7 +80,7 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): raise ValueError(f"Unknown quantization strategy {self.strategy}") # INPUT SCALE - if self.is_static_input_scheme: + if self.is_static_input_scheme and hasattr(layer, 'input_scale'): layer.input_scale = Parameter(layer.input_scale.max(), requires_grad=False) else: diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 97297970d9317..209f12c6dfec9 100644 --- 
a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -99,11 +99,13 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase): x: torch.Tensor, router_logits: torch.Tensor, top_k: int, - renormalize: bool = True, + renormalize: bool, use_grouped_topk: bool = False, - num_expert_group: Optional[int] = None, topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe import fused_experts @@ -115,7 +117,9 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase): renormalize=renormalize, topk_group=topk_group, num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function) + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias) return fused_experts(x, layer.w13_weight, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 4362468c1db69..2fe22903a385b 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -15,8 +15,6 @@ from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod -from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - apply_w8a8_block_fp8_linear) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin) from vllm.model_executor.layers.quantization.utils.quant_utils import ( @@ -337,6 +335,9 @@ class Fp8LinearMethod(LinearMethodBase): size_k=layer.input_size_per_partition, bias=bias) + # Note: lazy import to avoid triton import error. 
+ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + apply_w8a8_block_fp8_linear) if self.block_quant: assert self.quant_config.weight_block_size is not None return apply_w8a8_block_fp8_linear( @@ -601,14 +602,13 @@ class Fp8MoEMethod(FusedMoEMethodBase): router_logits: torch.Tensor, top_k: int, renormalize: bool, - use_grouped_topk: bool, + use_grouped_topk: bool = False, topk_group: Optional[int] = None, num_expert_group: Optional[int] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", e_score_correction_bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: - from vllm.model_executor.layers.fused_moe import fused_experts topk_weights, topk_ids = FusedMoE.select_experts( diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index a3e58bf1b2a4c..a006d729cc627 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -532,11 +532,13 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): x: torch.Tensor, router_logits: torch.Tensor, top_k: int, - renormalize: bool = True, + renormalize: bool, use_grouped_topk: bool = False, - num_expert_group: Optional[int] = None, topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, ) -> torch.Tensor: # The input must currently be float16 orig_dtype = x.dtype @@ -550,7 +552,9 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): renormalize=renormalize, topk_group=topk_group, num_expert_group=num_expert_group, - custom_routing_function=None) + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias) return torch.ops.vllm.fused_marlin_moe( x, diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 97a1b0c9603bd..165e8309fee64 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -39,7 +39,7 @@ class RejectionSampler(SpecDecodeStochasticBaseSampler): strict_mode: Whether or not to perform shape/device/dtype checks during sampling. This catches correctness issues but adds nontrivial latency. - use_falshinfer: We will use this parameter to determine whether + use_flashinfer: We will use this parameter to determine whether to use the FlashInfer rejection sampling kernel or not. If it's None, we will use the default value from the environment variable. This parameter is only used for testing purposes. 
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 117fe086e5e87..3fcd81a3c4213 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -541,19 +541,12 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module): short_cache = self._compute_cos_sin_cache( original_max_position_embeddings, short_factor, short_mscale) short_cache = short_cache.to(dtype) - self.register_buffer("short_cos_sin_cache", - short_cache, - persistent=False) long_cache = self._compute_cos_sin_cache(max_position_embeddings, long_factor, long_mscale) long_cache = long_cache.to(dtype) - self.register_buffer("long_cos_sin_cache", - long_cache, - persistent=False) - long_short_cache = torch.cat( - [self.short_cos_sin_cache, self.long_cos_sin_cache], dim=0) + long_short_cache = torch.cat([short_cache, long_cache], dim=0) self.register_buffer("long_short_cos_sin_cache", long_short_cache, persistent=False) @@ -593,8 +586,6 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module): torch.full_like(positions, k)).long() idx = (torch.add(positions, long_prompt_offset) if long_prompt_offset is not None else positions) - self.long_short_cos_sin_cache: torch.Tensor = ( - self.long_short_cos_sin_cache.to(idx.device)) idx = torch.add(idx, offsets) if offsets is not None else idx cos_sin = torch.index_select(self.long_short_cos_sin_cache, 0, idx) @@ -677,7 +668,6 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding): cos = (freqs.cos() * self.mscale) sin = (freqs.sin() * self.mscale) cache = torch.cat((cos, sin), dim=-1) - print("Cache shape", cache.shape) return cache def forward( diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index f2d9293b31a83..a9c1fa7221217 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -11,7 +11,8 @@ import os import warnings from abc import ABC, abstractmethod from contextlib import contextmanager -from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, cast +from typing import (Any, Callable, Dict, Generator, Iterable, List, Optional, + Tuple, cast) import gguf import huggingface_hub @@ -706,6 +707,8 @@ class BitsAndBytesModelLoader(BaseModelLoader): # Store all module names (from transformers) that support # BNB quantization. self.target_modules: List[str] = [] + # mapping weight names from transformers to vllm. + self.weight_mapper: Callable = lambda name: name def _get_weight_files( self, @@ -763,9 +766,12 @@ class BitsAndBytesModelLoader(BaseModelLoader): def _hf_weight_iter(self, hf_weights_files, use_safetensors: bool): if use_safetensors: - return safetensors_weights_iterator(hf_weights_files) + iterator = safetensors_weights_iterator(hf_weights_files) else: - return pt_weights_iterator(hf_weights_files) + iterator = pt_weights_iterator(hf_weights_files) + for name, param in iterator: + # mapping weight names from transformers to vllm. + yield self.weight_mapper(name), param def _get_quantized_weights_iterator( self, @@ -782,12 +788,12 @@ class BitsAndBytesModelLoader(BaseModelLoader): try: import bitsandbytes - if bitsandbytes.__version__ < "0.44.0": + if bitsandbytes.__version__ < "0.45.0": raise ImportError("bitsandbytes version is wrong. 
Please " - "install bitsandbytes>=0.44.0.") + "install bitsandbytes>=0.45.0.") except ImportError as err: - raise ImportError("Please install bitsandbytes>=0.44.0 via " - "`pip install bitsandbytes>=0.44.0` to use " + raise ImportError("Please install bitsandbytes>=0.45.0 via " + "`pip install bitsandbytes>=0.45.0` to use " "bitsandbytes quantizer.") from err hf_weights_files, use_safetensors = self._prepare_weights( @@ -991,12 +997,15 @@ class BitsAndBytesModelLoader(BaseModelLoader): if isinstance(module, (LinearBase, )): last_name = name.split(".")[-1] if sub_modules := inverse_stacked_mapping.get(last_name, []): - # Map vllm's names to transformers' names. + # Map vllm's names to transformers's names. for sub_name in sub_modules: self.target_modules.append( name.replace(last_name, sub_name)) - else: - self.target_modules.append(name) + # Add original module name even if the module has stacked map, + # in case model has a mixture of disk-merged and disk-splitted + # weights with same last name. + self.target_modules.append(name) + assert (self.target_modules ), "vllm currently does not support BNB quantization for" f" {type(model).__name__}" @@ -1013,6 +1022,10 @@ class BitsAndBytesModelLoader(BaseModelLoader): f"Model {type(model).__name__} does not support BitsAndBytes " "quantization yet.") + # For some models like Molmo, we need to use hf_to_vllm_mapper + # to ensure correct loading of weights. + if hf_to_vllm_mapper := getattr(model, "hf_to_vllm_mapper", None): + self.weight_mapper = lambda name: hf_to_vllm_mapper._map_name(name) # Modules whose weights might have fused on disk # we need their output_sizes to make shard in flight correctly with TP self.maybe_fused_weights_modules: Dict[str, List[int]] = {} diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 9437ad9688422..4ad6e859f4d93 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -1,15 +1,15 @@ -import math -from typing import Iterable, List, Optional, Set, Tuple, TypedDict, Union +from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, + Union) import torch import torch.nn as nn from torch.nn.init import trunc_normal_ -from transformers import LlamaConfig +from transformers import BatchFeature, PretrainedConfig from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, QuantizationConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_rank -from vllm.inputs import INPUT_REGISTRY, token_inputs +from vllm.inputs import InputContext from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -17,30 +17,27 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) -from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput, - SamplingMetadata) +from vllm.model_executor.layers.sampler import (SamplerOutput, + SamplingMetadata, get_sampler) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.models.idefics2_vision_model import ( - Idefics2VisionTransformer) -from 
vllm.model_executor.models.interfaces import SupportsMultiModal -from vllm.model_executor.models.llama import (LlamaDecoderLayer, LlamaMLP, - LlamaModel) -from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, - is_pp_missing_parameter, - maybe_prefix, - merge_multimodal_embeddings) from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors -from vllm.multimodal.utils import (cached_get_tokenizer, - repeat_and_pad_placeholder_tokens) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, ProcessorInputs, + PromptReplacement) from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.aria import (AriaMoELMConfig, AriaVisionConfig) -from .utils import flatten_bn +from .idefics2_vision_model import Idefics2VisionTransformer +from .interfaces import SupportsMultiModal +from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel +from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, + is_pp_missing_parameter, maybe_prefix, + merge_multimodal_embeddings) class AriaImagePixelInputs(TypedDict): @@ -251,7 +248,7 @@ class AriaProjector(nn.Module): class AriaFusedMoE(FusedMoE): def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, - shard_id: str) -> Set[str]: + shard_id: str) -> None: # Override the weight_loader to handle the expert weights in the Aria # model, which are already packed with experts, and merge the gate and # up weights for each expert. @@ -346,7 +343,7 @@ class MoEDecoderLayer(LlamaDecoderLayer): def __init__( self, - config: LlamaConfig, + config: AriaMoELMConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -434,7 +431,7 @@ class AriaMoELMModel(LlamaModel): return loaded_params -def build_mm_projector(config): +def build_mm_projector(config: PretrainedConfig): return AriaProjector( patch_to_query_dict=config.projector_patch_to_query_dict, embed_dim=config.vision_config.hidden_size, @@ -445,75 +442,70 @@ def build_mm_projector(config): ) -def get_max_multimodal_tokens(ctx): - return max(ctx.model_config.hf_config.image_size2tokens.values()) +def get_max_aria_image_tokens(ctx: InputContext): + hf_config = ctx.get_hf_config() + return max(hf_config.projector_patch_to_query_dict.values()) -def input_mapper_for_aria(ctx, data): - return MultiModalKwargs(data) +class AriaMultiModalProcessor(BaseMultiModalProcessor): - -def input_processor(ctx, llm_inputs): - multi_modal_data = llm_inputs.get("multi_modal_data") - # if it is pure text input, use it as is - if multi_modal_data is None or "image" not in multi_modal_data: - return llm_inputs - - model_config = ctx.model_config - - tokenizer = cached_get_tokenizer(model_config.tokenizer) - image_processor = cached_get_image_processor( - model_config.model, trust_remote_code=model_config.trust_remote_code) - hf_config = model_config.hf_config - - # prepare image tokens, the max_image_size is used to determine the number - # of patch_size for every image - max_image_size = multi_modal_data.pop("max_image_size", 980) - _split_image = multi_modal_data.pop("split_image", False) - - assert isinstance(max_image_size, - (int, float)), "max_image_size should be float or int" - images = (multi_modal_data["image"] if isinstance( - multi_modal_data["image"], 
list) else [multi_modal_data["image"]]) - - image_inputs = image_processor.preprocess(images, - max_image_size=max_image_size, - split_image=_split_image, - return_tensors="pt").data - image_inputs['pixel_values'] = image_inputs['pixel_values'].to( - ctx.model_config.dtype) - num_crops = image_inputs.pop("num_crops") - - prompt_token_ids = llm_inputs["prompt_token_ids"] - if num_crops.sum().item() > 0: - _, prompt_token_ids, _ = repeat_and_pad_placeholder_tokens( - tokenizer, - None, - prompt_token_ids, - placeholder_token_id=hf_config.image_token_index, - repeat_count=num_crops, + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + pixel_mask=MultiModalFieldConfig.batched("image"), ) - repeat_count = [hf_config.image_size2tokens[max_image_size] - ] * sum(num_crops).item() - new_prompt, new_token_ids, _ = repeat_and_pad_placeholder_tokens( - tokenizer, - None, - prompt_token_ids, - placeholder_token_id=hf_config.image_token_index, - repeat_count=repeat_count, - ) + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_config = self.ctx.get_hf_config() + image_token_id = hf_config.image_token_index - return token_inputs( - prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data={"image": image_inputs}, - ) + max_image_tokens = get_max_aria_image_tokens(self.ctx) + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=[image_token_id] * max_image_tokens, + ) + ] + + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + hf_config = self.ctx.get_hf_config() + vision_config: AriaVisionConfig = hf_config.vision_config + + max_image_size = vision_config.image_size + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=max_image_size, + height=max_image_size, + num_images=num_images) + } + + hf_processor = self._get_hf_processor() + image_token: str = hf_processor.image_token # type: ignore + + return ProcessorInputs( + prompt_text=image_token * num_images, + mm_data=mm_data, + ) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_multimodal_tokens) -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_aria) -@INPUT_REGISTRY.register_input_processor(input_processor) +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_aria_image_tokens) +@MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor) class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): """ Aria model for conditional generation tasks. 
@@ -540,12 +532,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - # prepare the image_size to tokens mapping for the image preprocess, see - # input_processor - config.image_size2tokens = { - int(math.sqrt(k) * config.vision_config.patch_size): v - for k, v in config.projector_patch_to_query_dict.items() - } self.config = config self.vision_tower = AriaVisionModel(config.vision_config) self.multi_modal_projector = build_mm_projector(config) @@ -566,7 +552,7 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, self.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() def _validate_image_sizes( self, images: List[torch.Tensor]) -> List[torch.Tensor]: @@ -588,7 +574,12 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): pixel_values = self._validate_image_sizes(pixel_values) pixel_values = flatten_bn(pixel_values, concat=True) + if pixel_mask is not None: + if not isinstance(pixel_mask, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel mask. " + f"Got type: {type(pixel_mask)}") + pixel_mask = flatten_bn(pixel_mask, concat=True) return AriaImagePixelInputs( diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 42a239cadac46..987dfaf44f228 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -4,22 +4,16 @@ from typing import Iterable, Optional, Set, Tuple, Union import torch import torch.nn as nn -from PIL import Image from transformers import Blip2VisionConfig, BlipVisionConfig from vllm.attention.layer import MultiHeadAttention -from vllm.config import ModelConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size -from vllm.inputs import DecoderOnlyInputs, token_inputs from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.multimodal.utils import (cached_get_tokenizer, - repeat_and_pad_placeholder_tokens) -from vllm.sequence import SequenceData def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -33,92 +27,6 @@ def get_blip_num_patches(*, image_size: int, patch_size: int) -> int: return grid_length * grid_length -def get_blip_image_feature_size( - hf_config: Union[BlipVisionConfig, Blip2VisionConfig]) -> int: - return get_blip_num_patches(image_size=hf_config.image_size, - patch_size=hf_config.patch_size) - - -def get_max_blip_image_tokens( - hf_config: Union[BlipVisionConfig, Blip2VisionConfig]) -> int: - return get_blip_image_feature_size(hf_config) - - -def dummy_seq_data_for_blip( - hf_config: Union[BlipVisionConfig, Blip2VisionConfig], - seq_len: int, - num_images: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - if image_feature_size_override is None: - image_feature_size = get_blip_image_feature_size(hf_config) - else: - image_feature_size = image_feature_size_override - - return SequenceData.from_prompt_token_counts( - (image_token_id, image_feature_size * num_images), - (0, seq_len - image_feature_size * num_images), - ) - - -def 
dummy_image_for_blip( - hf_config: Union[BlipVisionConfig, Blip2VisionConfig], - num_images: int, - *, - image_width_override: Optional[int] = None, - image_height_override: Optional[int] = None, -): - width = height = hf_config.image_size - if image_width_override is not None: - width = image_width_override - if image_height_override is not None: - height = image_height_override - - image = Image.new("RGB", (width, height), color=0) - return {"image": image if num_images == 1 else [image] * num_images} - - -def input_processor_for_blip( - model_config: ModelConfig, - hf_config: Union[BlipVisionConfig, Blip2VisionConfig], - inputs: DecoderOnlyInputs, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - if "multi_modal_placeholders" in inputs and "image" in inputs[ - "multi_modal_placeholders"]: - # The inputs already have placeholders. - return inputs - - tokenizer = cached_get_tokenizer(model_config.tokenizer) - - if image_feature_size_override is None: - image_feature_size = get_blip_image_feature_size(hf_config) - else: - image_feature_size = image_feature_size_override - - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=image_token_id, - repeat_count=image_feature_size, - ) - - # NOTE: Create a defensive copy of the original inputs - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"image": ranges}) - - # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa class BlipVisionEmbeddings(nn.Module): diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 76b8505ee1c2a..50680fadc4aa3 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -4,32 +4,34 @@ from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, import torch import torch.nn as nn -from transformers import (Blip2Config, Blip2QFormerConfig, Blip2VisionConfig, - apply_chunking_to_forward) +from transformers import (BatchFeature, Blip2Config, Blip2Processor, + Blip2QFormerConfig, apply_chunking_to_forward) from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) +from vllm.inputs import InputContext from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors -from vllm.multimodal.utils import consecutive_placeholder_ranges -from vllm.sequence import IntermediateTensors, SequenceData +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors, PlaceholderRange) +from vllm.multimodal.parse import MultiModalDataParser +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, ProcessorInputs, + PromptReplacement) +from vllm.sequence import IntermediateTensors 
-from .blip import (BlipVisionModel, dummy_image_for_blip, - get_max_blip_image_tokens) +from .blip import BlipVisionModel from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) # We use this internally as placeholders since there is no image token # defined on the HuggingFace repo -BLIP2_IMAGE_TOKEN = "<image>" -BLIP2_IMAGE_TOKEN_ID = 50265 +_IMAGE_TOKEN_ID = 50265 class Blip2ImagePixelInputs(TypedDict): @@ -396,92 +398,90 @@ class Blip2QFormerModel(nn.Module): return sequence_output -def get_blip2_image_feature_size(hf_config: Blip2Config) -> int: +def get_max_blip2_image_tokens(ctx: InputContext): + hf_config = ctx.get_hf_config(Blip2Config) return hf_config.num_query_tokens -def get_max_blip2_image_tokens(ctx: InputContext): - hf_config = ctx.get_hf_config(Blip2Config) - vision_config = hf_config.vision_config +class Blip2MultiModalProcessor(BaseMultiModalProcessor): - if isinstance(vision_config, Blip2VisionConfig): - return get_max_blip_image_tokens(vision_config) + def _get_data_parser(self) -> MultiModalDataParser: + return MultiModalDataParser(max_mm_counts={"image": 1}) - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) + def _get_hf_processor(self) -> Blip2Processor: + return self.ctx.get_hf_processor(Blip2Processor) + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + max_image_tokens = get_max_blip2_image_tokens(self.ctx) + + return [ + PromptReplacement( + modality="image", + target="</s>", + replacement="<image>" * max_image_tokens + "</s>", + ) + ] + + def apply( + self, + prompt_text: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + + # Only <image> tokens should be considered as placeholders, + # so we ignore the trailing bos_token + result["mm_placeholders"] = { + modality: [ + PlaceholderRange(offset=p["offset"], length=p["length"] - 1) + for p in ps + ] + for modality, ps in result["mm_placeholders"].items() + } + + return result + + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + hf_config = self.ctx.get_hf_config(Blip2Config) + vision_config = hf_config.vision_config + + max_image_size = vision_config.image_size + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=max_image_size, + height=max_image_size, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="", + mm_data=mm_data, + ) -def dummy_seq_data_for_blip2( - hf_config: Blip2Config, - seq_len: int, - num_images: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - if image_feature_size_override is None: - image_feature_size = get_blip2_image_feature_size(hf_config) - else: - image_feature_size = image_feature_size_override - - return SequenceData.from_prompt_token_counts( - (image_token_id, image_feature_size * num_images), - (0, seq_len - image_feature_size * num_images), - ), { - "image": -
consecutive_placeholder_ranges(num_items=num_images, - item_size=image_feature_size) - } - - -def dummy_data_for_blip2(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - hf_config = ctx.get_hf_config(Blip2Config) - vision_config = hf_config.vision_config - num_images = mm_counts["image"] - - seq_data, ranges = dummy_seq_data_for_blip2( - hf_config, - seq_len, - num_images, - image_token_id=BLIP2_IMAGE_TOKEN_ID, - ) - - if isinstance(vision_config, Blip2VisionConfig): - mm_data = dummy_image_for_blip(vision_config, num_images) - - return DummyData(seq_data, mm_data, ranges) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - -def input_processor_for_blip2(ctx: InputContext, inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - hf_config = ctx.get_hf_config(Blip2Config) - image_feature_size = get_blip2_image_feature_size(hf_config) - - # The original model places image tokens at the front - # https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1514 - new_token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size - new_token_ids += inputs["prompt_token_ids"] - - new_prompt = inputs.get("prompt") - if new_prompt is not None: - new_prompt = BLIP2_IMAGE_TOKEN * image_feature_size + new_prompt - - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data) - - -@MULTIMODAL_REGISTRY.register_image_input_mapper() @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_blip2_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_blip2) -@INPUT_REGISTRY.register_input_processor(input_processor_for_blip2) +@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor) class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -627,7 +627,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, - BLIP2_IMAGE_TOKEN_ID) + _IMAGE_TOKEN_ID) return inputs_embeds def forward( diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index a40c321ce0a58..c731934e792fc 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -3,16 +3,15 @@ from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) import torch +import torch.nn as nn import torch.nn.functional as F -from PIL import Image -from torch import nn -from transformers import ChameleonConfig, ChameleonVQVAEConfig +from transformers import (BatchFeature, ChameleonConfig, ChameleonProcessor, + ChameleonVQVAEConfig) from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) +from vllm.inputs import InputContext from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -29,11 +28,14 @@ from vllm.model_executor.model_loader.weight_utils 
import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors -from vllm.multimodal.utils import (cached_get_tokenizer, - consecutive_placeholder_ranges, - repeat_and_pad_placeholder_tokens) -from vllm.sequence import IntermediateTensors, SequenceData +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors, PlaceholderRange) +from vllm.multimodal.parse import MultiModalDataParser +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, ProcessorInputs, + PromptReplacement) +from vllm.sequence import IntermediateTensors from vllm.utils import print_warning_once from .interfaces import SupportsMultiModal, SupportsPP @@ -45,10 +47,6 @@ from .utils import (is_pp_missing_parameter, # and processor files, so we hardcode them in the model file for now. CHAMELEON_CROP_SIZE_HEIGHT = CHAMELEON_CROP_SIZE_WIDTH = 512 CHAMELEON_IMAGE_SEQ_LENGTH = 1024 -CHAMELEON_IMAGE_TOKEN_ID = 8711 -CHAMELEON_IMAGE_START_TOKEN_ID = 8197 -CHAMELEON_IMAGE_END_TOKEN_ID = 8196 -CHAMELEON_SEP_TOKEN_ID = 8710 class ChameleonImagePixelInputs(TypedDict): @@ -61,99 +59,78 @@ def get_max_chameleon_image_tokens(ctx: InputContext): return CHAMELEON_IMAGE_SEQ_LENGTH -def dummy_seq_data_for_chameleon( - seq_len: int, - num_images: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - if image_feature_size_override is None: - image_feature_size = CHAMELEON_IMAGE_SEQ_LENGTH - else: - image_feature_size = image_feature_size_override +class ChameleonMultiModalProcessor(BaseMultiModalProcessor): - return SequenceData.from_prompt_token_counts( - (image_token_id, image_feature_size * num_images), - (0, seq_len - image_feature_size * num_images), - ), { - "image": - consecutive_placeholder_ranges(num_items=num_images, - item_size=image_feature_size) - } + def _get_data_parser(self) -> MultiModalDataParser: + return MultiModalDataParser(max_mm_counts={"image": 1}) + def _get_hf_processor(self) -> ChameleonProcessor: + return self.ctx.get_hf_processor(ChameleonProcessor) -def dummy_image_for_chameleon( - num_images: int, - *, - image_width_override: Optional[int] = None, - image_height_override: Optional[int] = None, -): - width = CHAMELEON_CROP_SIZE_WIDTH - height = CHAMELEON_CROP_SIZE_HEIGHT - if image_width_override is not None: - width = image_width_override - if image_height_override is not None: - height = image_height_override + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(pixel_values=MultiModalFieldConfig.batched("image")) - image = Image.new("RGB", (width, height), color=0) - return {"image": image if num_images == 1 else [image] * num_images} + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + processor = self._get_hf_processor() + return [ + PromptReplacement( + modality="image", + target="<image>", + replacement="".join([ + processor.image_start_token, + processor.image_token * CHAMELEON_IMAGE_SEQ_LENGTH, + processor.image_end_token, + ]), + ) + ] -def dummy_data_for_chameleon(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - num_images =
mm_counts["image"] + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) - seq_data, ranges = dummy_seq_data_for_chameleon( - seq_len, - num_images, - image_token_id=CHAMELEON_IMAGE_TOKEN_ID, - ) + mm_data = { + "image": + self._get_dummy_images(width=CHAMELEON_CROP_SIZE_WIDTH, + height=CHAMELEON_CROP_SIZE_HEIGHT, + num_images=num_images) + } - mm_data = dummy_image_for_chameleon(num_images) - return DummyData(seq_data, mm_data, ranges) + return ProcessorInputs( + prompt_text="<image>" * num_images, + mm_data=mm_data, + ) + def apply( + self, + prompt_text: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) -def input_processor_for_chameleon(ctx: InputContext, - inputs: DecoderOnlyInputs): + # Only <image> tokens should be considered as placeholders, + # so we ignore the image_start_token and image_end_token + result["mm_placeholders"] = { + modality: [ + PlaceholderRange(offset=p["offset"] + 1, + length=p["length"] - 2) for p in ps + ] + for modality, ps in result["mm_placeholders"].items() + } - """ - Processing input prompt to insert required tokens for image placeholder. - - See https://github.com/huggingface/transformers/blob/0fdea8607d7e01eb0e38a1ebeb7feee30a22f0cf/src/transformers/models/chameleon/processing_chameleon.py#L58 - """ # noqa - - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - if "multi_modal_placeholders" in inputs and "image" in inputs[ - "multi_modal_placeholders"]: - # The inputs already have placeholders. - return inputs - - model_config = ctx.model_config - tokenizer = cached_get_tokenizer(model_config.tokenizer) - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=CHAMELEON_IMAGE_TOKEN_ID, - repeat_count=CHAMELEON_IMAGE_SEQ_LENGTH, - pad_token_left=CHAMELEON_IMAGE_START_TOKEN_ID, - pad_token_right=CHAMELEON_IMAGE_END_TOKEN_ID, - ) - - # Appending sep token for chat mode to follow default processor - # behavior - if new_prompt is not None: - new_prompt += tokenizer.sep_token - new_token_ids += [CHAMELEON_SEP_TOKEN_ID] - - # NOTE: Create a defensive copy of the original inputs - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data) + return result class ChameleonLayerNorm(nn.LayerNorm): @@ -736,7 +713,7 @@ class ChameleonVQVAEEncoder(nn.Module): for i_level in range(self.num_resolutions): for i_block in range(self.num_res_blocks): hidden_state = self.down[i_level].block[i_block]( - hidden_states[-1], ) + hidden_states[-1]) if len(self.down[i_level].attn) > 0: hidden_state = self.down[i_level].attn[i_block]( hidden_state) @@ -925,10 +902,8 @@ class ChameleonModel(nn.Module): return hidden_states -@MULTIMODAL_REGISTRY.register_image_input_mapper() @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_chameleon_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_chameleon) -@INPUT_REGISTRY.register_input_processor(input_processor_for_chameleon) +@MULTIMODAL_REGISTRY.register_processor(ChameleonMultiModalProcessor) class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index
6c50882d83c3b..ffd6891b25965 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -33,7 +33,7 @@ from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalData, MultiModalKwargs, +from vllm.multimodal.inputs import (ModalityData, MultiModalKwargs, NestedTensors) from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, @@ -54,7 +54,7 @@ def calculate_image_placeholder(vision_config): def mm_input_mapper_for_glmv( ctx: InputContext, - data: MultiModalData[object], + data: ModalityData[object], ) -> Dict: model_config = ctx.model_config tokenizer = cached_get_tokenizer( diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index c846e42f1b0c3..d22d1f3171463 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -172,16 +172,18 @@ class CohereAttention(nn.Module): is_neox_style=False, ) - sliding_window = getattr(config, "sliding_window", None) - # Model v2 has sliding windows, v1 does not - self.v1 = sliding_window is None + # Model v2 has interleaved sliding windows, v1 does not + interleaved_sliding_window = getattr(config, + "interleaved_sliding_window", + None) + self.v1 = interleaved_sliding_window is None layer_idx = extract_layer_index(prefix) layer_has_sliding_window = ( getattr(config, "sliding_window_pattern", False) and (layer_idx + 1) % self.config.sliding_window_pattern != 0) - self.sliding_window = (sliding_window + self.sliding_window = (interleaved_sliding_window if layer_has_sliding_window else None) self.attn = Attention(self.num_heads, diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 6e86900326c4b..0a48fa3fe11c0 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -15,32 +15,30 @@ # limitations under the License. 
""" PyTorch Fuyu model.""" import math -from array import array from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict) import torch import torch.nn as nn -import torch.utils.checkpoint -from PIL import Image -from transformers import FuyuImageProcessor +from transformers import (BatchFeature, FuyuConfig, FuyuImageProcessor, + FuyuProcessor) from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) +from vllm.inputs import InputContext from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.inputs import NestedTensors -from vllm.multimodal.utils import (cached_get_tokenizer, - consecutive_placeholder_ranges) -from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, - SequenceData) -from vllm.utils import is_list_of +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors, PlaceholderRange) +from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataParser +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, ProcessorInputs, + PromptReplacement) +from vllm.sequence import IntermediateTensors from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, @@ -54,178 +52,196 @@ MAX_IMAGE_FEATURE_SIZE_HEIGHT = 1080 MAX_IMAGE_FEATURE_SIZE_WIDTH = 1920 -class FuyuImagePixelInputs(TypedDict): - type: Literal["pixel_values"] - data: torch.Tensor +class FuyuImagePatchInputs(TypedDict): + type: Literal["image_patches"] + flat_data: torch.Tensor """ Shape: - (batch_size, num_patches, patch_size_x * patch_size_y * num_channels) + `(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)` + """ + + patches_per_image: List[int] + """ + List of number of total patches for each image in the batch. + This is used to restore the first two dimensions of `flat_data`. """ -def _calculate_num_image_tokens( - height: int, - width: int, +def _get_fuyu_num_image_tokens( + image_height: int, + image_width: int, ) -> Tuple[int, int]: """ - calculate number of image tokens needed for a given image size - The expected Fuyu image prompts is in format: + Calculate the number of image tokens needed for a given image size. + + The expected Fuyu image prompts can be expressed as: + + .. 
code-block:: (image_token * ncols + newline_token) * nrows - args: - image_size: Tuple[int, int] - (width, height) of the image - returns: - ncols: int - number of image tokens in x direction - nrows: int - number of image tokens in y direction + + Args: + image_size: Tuple[int, int] - `(width, height)` of the image + + Returns: + ncols: int - number of image tokens in `x` direction + nrows: int - number of image tokens in `y` direction """ - ncol = math.ceil(width / 30) - nrow = math.ceil(height / 30) - return ncol, nrow - - -def get_max_fuyu_image_feature_size(): - - return _calculate_num_image_tokens( - height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, - width=MAX_IMAGE_FEATURE_SIZE_WIDTH, - ) + ncols = math.ceil(image_width / 30) + nrows = math.ceil(image_height / 30) + return ncols, nrows def get_max_fuyu_image_tokens(ctx: InputContext): - ncol, nrow = get_max_fuyu_image_feature_size() - return (ncol + 1) * nrow - - -def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int, num_images: int): - ncol, nrow = get_max_fuyu_image_feature_size() - image_feature_size = get_max_fuyu_image_tokens(ctx) - - image_token_ids = ( - array(VLLM_TOKEN_ID_ARRAY_TYPE, [_IMAGE_TOKEN_ID]) * ncol + - array(VLLM_TOKEN_ID_ARRAY_TYPE, [_NEWLINE_TOKEN_ID])) * nrow - token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, image_token_ids) * num_images - token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, - [0]) * (seq_len - image_feature_size * num_images) - return SequenceData(token_ids), { - "image": - consecutive_placeholder_ranges(num_items=num_images, - item_size=image_feature_size) - } - - -def dummy_image_for_fuyu( - num_images: int, - *, - image_width: int, - image_height: int, -): - image = Image.new("RGB", (image_width, image_height), color=0) - return {"image": image if num_images == 1 else [image] * num_images} - - -def dummy_data_for_fuyu(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - num_images = mm_counts["image"] - seq_data, ranges = dummy_seq_data_for_fuyu(ctx, seq_len, num_images) - mm_data = dummy_image_for_fuyu(num_images, - image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, - image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT) - return DummyData(seq_data, mm_data, ranges) - - -def _fuyu_image_preprocess(image_processor: FuyuImageProcessor, - data: List[Image.Image]): - image_encoding = image_processor.preprocess(data, return_tensors="pt") - batch_images = torch.stack([img[0] for img in image_encoding["images"] - ]).unsqueeze(1) - image_unpadded_heights = torch.tensor( - image_encoding["image_unpadded_heights"]) - image_unpadded_widths = torch.tensor( - image_encoding["image_unpadded_widths"]) - - batch_size = len(image_encoding["images"]) - image_present = torch.ones(batch_size, 1, 1) - model_image_input = image_processor.preprocess_with_tokenizer_info( - image_input=batch_images, - image_present=image_present, - image_unpadded_h=image_unpadded_heights, - image_unpadded_w=image_unpadded_widths, - image_placeholder_id=_IMAGE_TOKEN_ID, - image_newline_id=_NEWLINE_TOKEN_ID, - variable_sized=True, + ncols, nrows = _get_fuyu_num_image_tokens( + image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, + image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, ) - return model_image_input + + return (ncols + 1) * nrows -def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs +class FuyuMultiModalProcessor(BaseMultiModalProcessor): - model_config = ctx.model_config - image_data = 
multi_modal_data["image"] - new_multi_modal_data = {} - image_list = image_data if isinstance(image_data, list) else [image_data] + def _get_data_parser(self) -> MultiModalDataParser: + return MultiModalDataParser(max_mm_counts={"image": 1}) - # process image data - if is_list_of(image_list, Image.Image): - # Fuyu's image_processor can also finish token padding - image_processor: FuyuImageProcessor = cached_get_image_processor( - model_config.model) + def _get_hf_processor(self) -> FuyuProcessor: + return self.ctx.get_hf_processor(FuyuProcessor) - model_image_input = _fuyu_image_preprocess(image_processor, image_data) - image_patches = torch.cat([ - image_patch[0] - for image_patch in model_image_input["image_patches"] - ]) - new_multi_modal_data["image"] = image_patches + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: - elif is_list_of(image_list, torch.Tensor): - raise NotImplementedError("Embeddings input is not supported yet") - else: - raise TypeError(f"Invalid image type: {type(image_data)}") + if not mm_data: + # Avoid warning from HF logger for text-only input + # Input_ids format: bos_token_id + prompt_token_ids + boa_token_id + # Tokenizer won't add boa_token_id by default, we add it manually. + tokenizer = self._get_tokenizer() + boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore + prompt_ids = tokenizer.encode(prompt) + [boa_token_id] + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") - # process prompts - prompt = inputs.get("prompt") - prompt_token_ids = inputs["prompt_token_ids"] - tokenizer = cached_get_tokenizer(model_config.model) - # dim0 is batch_size, dim1 is subseq_size which will always be 1 - image_input_ids: List[List[ - torch.Tensor]] = model_image_input["image_input_ids"] - image_input_ids = image_input_ids[0][0].tolist() - bos_token = tokenizer.encode("", add_special_tokens=False)[1:] - boa_token = tokenizer.encode("\x04", add_special_tokens=False)[1:] + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) - new_prompt = prompt + "\x04" - new_prompt_token_ids = image_input_ids + bos_token + prompt_token_ids[ - 1:] + boa_token + image_patches = processed_outputs.get("image_patches") + if image_patches is not None: + images = mm_data["images"] + assert isinstance(images, list) - return token_inputs(prompt=new_prompt, - prompt_token_ids=new_prompt_token_ids, - multi_modal_data=new_multi_modal_data) + # Original output: (1, num_images, Pn, Px * Py * C) + # New output: (num_images, Pn, Px * Py * C) + assert (isinstance(image_patches, list) + and len(image_patches) == 1) + assert (isinstance(image_patches[0], torch.Tensor) + and len(image_patches[0]) == len(images)) + + processed_outputs["image_patches"] = image_patches[0] + + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(image_patches=MultiModalFieldConfig.batched("image")) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_config = self.ctx.get_hf_config(FuyuConfig) + bos_token_id = hf_config.bos_token_id + + tokenizer = self._get_tokenizer() + eot_token_id = tokenizer.bos_token_id + assert isinstance(eot_token_id, int) + + hf_processor = 
self._get_hf_processor() + image_processor: FuyuImageProcessor = hf_processor.image_processor + target_size = image_processor.size + target_height, target_width = (target_size["height"], + target_size["width"]) + + def get_replacement_fuyu(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + width, height = image_size.width, image_size.height + if not (width <= target_width and height <= target_height): + height_scale_factor = target_height / height + width_scale_factor = target_width / width + optimal_scale_factor = min(height_scale_factor, + width_scale_factor) + + height = int(height * optimal_scale_factor) + width = int(width * optimal_scale_factor) + + ncols, nrows = _get_fuyu_num_image_tokens( + image_width=width, + image_height=height, + ) + + return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows + + [bos_token_id]) + + return [ + PromptReplacement( + modality="image", + target=[eot_token_id], + replacement=get_replacement_fuyu, + ) + ] + + def apply( + self, + prompt_text: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + + # Only |SPEAKER| (image) tokens should be considered as placeholders, + # so we ignore the trailing bos_token_id + result["mm_placeholders"] = { + modality: [ + PlaceholderRange(offset=p["offset"], length=p["length"] - 1) + for p in ps + ] + for modality, ps in result["mm_placeholders"].items() + } + + return result + + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=MAX_IMAGE_FEATURE_SIZE_WIDTH, + height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="", + mm_data=mm_data, + ) -def input_mapper_for_fuyu(ctx: InputContext, data: object): - model_config = ctx.model_config - data_list = data if isinstance(data, list) else [data] - if is_list_of(data_list, Image.Image): - # Fuyu's image_processor can also finish token padding - image_processor: FuyuImageProcessor = cached_get_image_processor( - model_config.model) - - model_image_input = _fuyu_image_preprocess(image_processor, data_list) - data = torch.stack([ - image_patch[0] - for image_patch in model_image_input["image_patches"] - ]) - - # image has been processed with prompt in input processor - return MultiModalKwargs({"pixel_values": data}) - - -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu) @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_fuyu_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_fuyu) -@INPUT_REGISTRY.register_input_processor(input_processor_for_fuyu) +@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor) class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -280,28 +296,33 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): return data.to(self.vision_embed_tokens.weight.dtype) def _parse_and_validate_image_input( - self, **kwargs: object) -> Optional[FuyuImagePixelInputs]: - pixel_values = kwargs.pop("pixel_values", None) - - if pixel_values is not None: - if not isinstance(pixel_values, (torch.Tensor, list)): + self, **kwargs: object) -> Optional[FuyuImagePatchInputs]: + image_patches = kwargs.pop("image_patches", None) + if 
image_patches is not None: + if not isinstance(image_patches, (torch.Tensor, list)): raise ValueError("Incorrect type of image patches. " - f"Got type: {type(pixel_values)}") + f"Got type: {type(image_patches)}") - return FuyuImagePixelInputs( - type="pixel_values", - data=self._validate_pixel_values( - flatten_bn(pixel_values, concat=True)), + image_patches_flat = flatten_bn(image_patches) + + return FuyuImagePatchInputs( + type="image_patches", + flat_data=self._validate_pixel_values( + flatten_bn(image_patches_flat, concat=True)), + patches_per_image=[x.size(0) for x in image_patches_flat], ) return None def _process_image_input( - self, image_input: FuyuImagePixelInputs) -> torch.Tensor: + self, image_input: FuyuImagePatchInputs) -> NestedTensors: + image_patches_flat = image_input["flat_data"] + patches_per_image = image_input["patches_per_image"] assert self.vision_embed_tokens is not None - vision_embeddings, _ = self.vision_embed_tokens(image_input["data"]) - return vision_embeddings + vision_embeddings_flat, _ = self.vision_embed_tokens( + image_patches_flat) + return vision_embeddings_flat.split(patches_per_image, dim=0) def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index e430a158d869a..4e42a4b6f9e64 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -69,7 +69,8 @@ class Idefics2VisionEmbeddings(nn.Module): patch_attention_mask: torch.BoolTensor, tgt_sizes: Optional[torch.IntTensor] = None) -> torch.Tensor: batch_size, _, max_im_h, max_im_w = pixel_values.shape - patch_embeds = self.patch_embedding(pixel_values) + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(target_dtype)) embeddings = patch_embeds.flatten(2).transpose(1, 2) max_nb_patches_h, max_nb_patches_w = ( max_im_h // self.patch_size, @@ -309,7 +310,8 @@ class Idefics2VisionTransformer(nn.Module): hidden_states = self.embeddings( pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, - tgt_sizes=tgt_sizes) + tgt_sizes=tgt_sizes, + ) encoder_outputs = self.encoder(hidden_states) last_hidden_state = self.post_layernorm(encoder_outputs) return last_hidden_state diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 41b9f110d771f..28c23edd4c8e8 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -18,14 +18,16 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors +from vllm.sequence import IntermediateTensors, PoolerOutput 
from .interfaces import SupportsLoRA, SupportsPP from .utils import (is_pp_missing_parameter, @@ -433,3 +435,59 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params + + +class InternLM2ForRewardModel(InternLM2ForCausalLM): + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + model_type: Type[InternLM2Model] = InternLM2Model, + ): + super().__init__(vllm_config=vllm_config, + prefix=prefix, + model_type=model_type) + + for attr in ("output", "logits_processor", "sampler"): + delattr(self, attr) + + config = vllm_config.model_config.hf_config + self.v_head = RowParallelLinear( + config.hidden_size, + 1, + bias=False, + input_is_parallel=False, + prefix=maybe_prefix(prefix, "v_head"), + ) + + pooler_config = vllm_config.model_config.pooler_config + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.ALL, + normalize=False, + softmax=False, + ) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.model(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors, + inputs_embeds) + logits, _ = self.v_head(hidden_states) + return logits + + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 91786db5ddc96..890b5530b97d6 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -107,9 +107,11 @@ class JambaMambaDecoderLayer(nn.Module): layer_idx: int, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - prefix: str = "") -> None: + is_lora_enabled: Optional[bool] = False, + **kwargs) -> None: super().__init__() self.config = config + self.is_lora_enabled = is_lora_enabled self.mamba = MambaMixer(hidden_size= config.hidden_size, ssm_state_size = config.mamba_d_state, conv_kernel_size = config.mamba_d_conv, @@ -120,7 +122,9 @@ class JambaMambaDecoderLayer(nn.Module): use_bias = config.mamba_proj_bias, use_rms_norm=True, rms_norm_eps=config.rms_norm_eps, - activation=config.hidden_act) + activation=config.hidden_act, + is_lora_enabled = self.is_lora_enabled + ) num_experts = config.layers_num_experts[layer_idx] ffn_layer_class = JambaMoE if num_experts > 1 else JambaMLP @@ -156,14 +160,13 @@ class JambaMambaDecoderLayer(nn.Module): class JambaAttentionDecoderLayer(nn.Module): - def __init__( - self, - config: JambaConfig, - layer_idx: int, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, + config: JambaConfig, + layer_idx: int, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + **kwargs) -> None: super().__init__() self.hidden_size = config.hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -287,17 +290,18 @@ class JambaModel(nn.Module): org_num_embeddings=config.vocab_size, ) + extra_kwargs = {"is_lora_enabled": bool(vllm_config.lora_config)} + def get_layer(prefix: str): layer_idx 
= int(prefix.rsplit(".", 1)[1]) layer_class = ALL_DECODER_LAYER_TYPES[ config.layers_block_type[layer_idx]] - return layer_class( - config, - layer_idx, - cache_config, - quant_config=quant_config, - prefix=prefix, - ) + return layer_class(config, + layer_idx, + cache_config, + quant_config=quant_config, + prefix=prefix, + **extra_kwargs) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers") @@ -371,14 +375,13 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, "k_proj", "v_proj", ], + "in_proj": ["in_proj"], } # LoRA specific attributes supported_lora_modules = [ - "qkv_proj", - "o_proj", - "embed_tokens", - "lm_head", + "qkv_proj", "o_proj", "embed_tokens", "lm_head", "up_proj", + "down_proj", "gate_proj", "out_proj", "in_proj", "x_proj" ] embedding_modules = { "embed_tokens": "input_embeddings", @@ -423,9 +426,9 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) if self.scheduler_config is not None and \ - not self.model_config.enforce_eager: + not self.model_config.enforce_eager: if self.scheduler_config.max_num_seqs > \ - vllm_config.compilation_config.max_capture_size: + vllm_config.compilation_config.max_capture_size: self.max_batch_size = \ vllm_config.compilation_config.max_capture_size else: @@ -446,7 +449,6 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): if self.mamba_cache is None: - num_mamba_layers = self.model_config.get_num_layers_by_block_type( self.vllm_config.parallel_config, LayerBlockType.mamba) self.mamba_cache = MambaCacheManager( diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 0662d90e79b92..34dc7fa31ce6f 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,5 +1,4 @@ from functools import cached_property -from types import MethodType from typing import (Iterable, List, Literal, Mapping, Optional, Protocol, Set, Tuple, TypedDict, Union) @@ -7,7 +6,7 @@ import torch import torch.nn as nn from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig, PixtralVisionConfig, PretrainedConfig, - ProcessorMixin, SiglipVisionConfig) + SiglipVisionConfig) from transformers.models.llava import LlavaProcessor from transformers.models.pixtral import PixtralProcessor @@ -21,10 +20,14 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import ImageProcessorItems from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessorInputs, - PromptReplacement) + PromptReplacement, + full_groupby_modality) from vllm.sequence import IntermediateTensors from .clip import (CLIPVisionModel, dummy_image_for_clip, @@ -116,36 +119,54 @@ def get_max_llava_image_tokens(ctx: InputContext): class LlavaMultiModalProcessor(BaseMultiModalProcessor): - def _patch_pixtral_processor(self, hf_processor: PixtralProcessor): - if getattr(hf_processor, "__is_patched__", False): 
- return # Already patched - - image_processor = hf_processor.image_processor # type: ignore - orig_preprocess = image_processor.preprocess - - def preprocess(__self, *args, **kwargs): - hf_inputs = orig_preprocess(*args, **kwargs) - hf_inputs["is_pixtral"] = torch.tensor(True) - return hf_inputs - - image_processor.preprocess = MethodType(preprocess, image_processor) - - hf_processor.__is_patched__ = True # type: ignore - def _get_hf_processor(self) -> Union[LlavaProcessor, PixtralProcessor]: - hf_processor = self.ctx.get_hf_processor( - (LlavaProcessor, PixtralProcessor)) + return self.ctx.get_hf_processor((LlavaProcessor, PixtralProcessor)) - if isinstance(hf_processor, PixtralProcessor): - self._patch_pixtral_processor(hf_processor) + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) - return hf_processor + # NOTE: pixel_values=None for MLlavaProcessor + pixel_values = processed_outputs.get("pixel_values") + if pixel_values is not None: + images = mm_data["images"] + assert isinstance(images, list) + + if isinstance(self._get_hf_processor(), PixtralProcessor): + # Original output: (1, num_images, C, H, W) + # New output: (num_images, C, H, W) + assert (isinstance(pixel_values, list) + and len(pixel_values) == 1) + assert (isinstance(pixel_values[0], list) + and len(pixel_values[0]) == len(images)) + + processed_outputs["pixel_values"] = pixel_values[0] + + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_inputs: BatchFeature, - mm_processor_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: hf_config = self.ctx.get_hf_config(LlavaConfig) image_token_id = hf_config.image_token_index @@ -160,7 +181,9 @@ class LlavaMultiModalProcessor(BaseMultiModalProcessor): assert isinstance(vision_config, PixtralVisionConfig) def get_replacement_pixtral(item_idx: int): - image_size = mm_items.get_image_size(item_idx) + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + ( num_width_tokens, num_height_tokens, @@ -200,7 +223,7 @@ class LlavaMultiModalProcessor(BaseMultiModalProcessor): ) -> ProcessorInputs: hf_config = self.ctx.get_hf_config(LlavaConfig) vision_config = hf_config.vision_config - num_images = mm_counts["image"] + num_images = mm_counts.get("image", 0) if isinstance(vision_config, CLIPVisionConfig): data = dummy_image_for_clip(vision_config, num_images) @@ -218,7 +241,6 @@ class LlavaMultiModalProcessor(BaseMultiModalProcessor): return ProcessorInputs( prompt_text=image_token * num_images, mm_data=data, - mm_processor_kwargs={}, ) @@ -379,7 +401,6 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[LlavaImageInputs]: pixel_values = kwargs.pop("pixel_values", None) - is_pixtral = kwargs.pop("is_pixtral", torch.tensor([False])) image_embeds = kwargs.pop("image_embeds", None) if pixel_values is None and image_embeds is 
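On the Pixtral branch of _call_hf_processor above: the processor wraps the per-image tensors in a singleton outer list, and the override unwraps it so downstream code sees one tensor per image. A small sketch of that unwrapping with made-up tensor sizes (the image objects are stand-ins, not real PIL images):

import torch

images = ["img0", "img1"]                                            # two input images
pixel_values = [[torch.randn(3, 16, 16), torch.randn(3, 32, 32)]]    # assumed raw output

assert isinstance(pixel_values, list) and len(pixel_values) == 1
assert isinstance(pixel_values[0], list) and len(pixel_values[0]) == len(images)

pixel_values = pixel_values[0]                  # now one tensor per image
print([tuple(p.shape) for p in pixel_values])   # [(3, 16, 16), (3, 32, 32)]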
None: @@ -390,33 +411,6 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): raise ValueError("Incorrect type of pixel values. " f"Got type: {type(pixel_values)}") - assert isinstance(is_pixtral, torch.Tensor) - if is_pixtral.any(): - images = pixel_values - - def flatten_to_3d_tensors(item): - if isinstance(item, torch.Tensor): - if item.dim() >= 3: - return [t for t in item.view(-1, *item.shape[-3:])] - else: - raise ValueError( - f"Unexpected tensor dimension: {item.dim()}") - elif isinstance(item, list): - return [ - t for subitem in item - for t in flatten_to_3d_tensors(subitem) - ] - else: - raise ValueError(f"Unexpected type: {type(item)}") - - # Restructure the batched images into a list of lists of images - images = flatten_to_3d_tensors(pixel_values) - - return LlavaImagePixelInputs( - type="pixel_values", - data=images, - ) - return LlavaImagePixelInputs( type="pixel_values", data=self._validate_pixel_values( @@ -586,19 +580,71 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): class MantisMultiModalProcessor(LlavaMultiModalProcessor): - def _get_hf_processor(self) -> ProcessorMixin: - try: - from mantis.models.mllava import MLlavaProcessor - except ModuleNotFoundError as exc: - raise ModuleNotFoundError( - "You need to `pip install " - "git+https://github.com/TIGER-AI-Lab/Mantis.git` " - "to use this model") from exc + def _get_hf_processor(self): + return self.ctx.get_hf_processor(LlavaProcessor) - processor = MLlavaProcessor.from_pretrained( - self.ctx.model_config.tokenizer) - assert isinstance(processor, ProcessorMixin) - return processor + def apply( + self, + prompt_text: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + hf_config = self.ctx.get_hf_config(LlavaConfig) + image_token_id = hf_config.image_token_index + max_image_tokens = get_max_llava_image_tokens(self.ctx) + + result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + + mm_items = self._to_mm_items(mm_data) + mm_item_counts = mm_items.get_all_counts() + mm_kwargs = result["mm_kwargs"] + + # We reimplement the functionality of MLlavaProcessor from + # https://github.com/TIGER-AI-Lab/Mantis.git + def get_replacement_mantis(item_idx: int): + return "".join([ + f"(image {item_idx+1}: ", # 7 tokens + "" * max_image_tokens, + ")", # 3 tokens + ]) + + mantis_repls = self._bind_prompt_replacements([ + PromptReplacement( + modality="image", + target=[image_token_id] * max_image_tokens, + replacement=get_replacement_mantis, + ) + ]) + + prompt_ids, prompt_text, _ = self._apply_prompt_replacements( + result["prompt_token_ids"], + mantis_repls, + mm_item_counts, + ) + + unbound_orig_repls = self._get_prompt_replacements( + mm_items, + hf_processor_mm_kwargs, + mm_kwargs, + ) + orig_repls = self._bind_prompt_replacements(unbound_orig_repls) + + all_placeholders = self._find_placeholders(orig_repls, prompt_ids, + mm_item_counts) + assert len(all_placeholders) == mm_item_counts.get("image", 0) + + mm_placeholders = { + modality: [item.to_range() for item in items] + for modality, items in full_groupby_modality(all_placeholders) + } + + return MultiModalInputsV2( + type="multimodal", + prompt=prompt_text, + prompt_token_ids=prompt_ids, + mm_kwargs=mm_kwargs, + mm_placeholders=mm_placeholders, + ) # To use this model, please use diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index a39f2f4124d05..5e70c11363c83 100644 --- 
a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -528,10 +528,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, stacked_image_features = self._image_pixels_to_features( self.vision_tower, stacked_pixel_values) - return [ - self.multi_modal_projector(image_features) for image_features in - torch.split(stacked_image_features, num_patches_per_batch) - ] + return torch.split(self.multi_modal_projector(stacked_image_features), + num_patches_per_batch) def _process_image_input( self, diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 06c8d9723cd01..553bc9c28cb21 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -38,10 +38,12 @@ class MambaDecoderLayer(nn.Module): def __init__(self, config: MambaConfig, cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + quant_config: Optional[QuantizationConfig] = None, + is_lora_enabled: Optional[bool] = False) -> None: super().__init__() self.config = config self.is_falcon_mamba = config.model_type == "falcon_mamba" + self.is_lora_enabled = is_lora_enabled mixer_rms_eps = config.mixer_rms_eps if self.is_falcon_mamba else None self.mixer = MambaMixer(hidden_size=config.hidden_size, ssm_state_size=config.state_size, @@ -53,7 +55,8 @@ class MambaDecoderLayer(nn.Module): use_rms_norm=self.is_falcon_mamba, rms_norm_has_weight=not self.is_falcon_mamba, rms_norm_eps=mixer_rms_eps, - activation=config.hidden_act) + activation=config.hidden_act, + is_lora_enabled=self.is_lora_enabled) self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) @@ -85,6 +88,7 @@ class MambaModel(nn.Module): cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config + is_lora_enabled = bool(lora_config) self.config = config self.padding_idx = config.pad_token_id @@ -101,8 +105,10 @@ class MambaModel(nn.Module): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: MambaDecoderLayer( - config, cache_config=cache_config, quant_config=quant_config), + lambda prefix: MambaDecoderLayer(config, + cache_config=cache_config, + quant_config=quant_config, + is_lora_enabled=is_lora_enabled), prefix=f"{prefix}.layers") self.norm_f = RMSNorm(config.hidden_size, diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 1e8f9bd4cf418..712022502539b 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -487,6 +487,12 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): image_embeds = kwargs.pop("image_embeds", None) if image_embeds is not None: + if not isinstance(image_embeds, (torch.Tensor, list)): + raise ValueError(f"Incorrect type of image embeds. 
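The llava_next.py change above runs the multi-modal projector once on the stacked features and only then splits per image, instead of splitting first and projecting every chunk. Assuming the projector acts on each patch row independently, the two orderings give the same result; a quick check with a plain nn.Linear standing in for the projector:

import torch
import torch.nn as nn

torch.manual_seed(0)
projector = nn.Linear(16, 4, bias=False)        # stand-in for multi_modal_projector
stacked_image_features = torch.randn(7, 16)     # 7 patches across all images (made up)
num_patches_per_batch = [3, 4]                  # patches contributed by each image

# Old style: split first, project each chunk.
split_then_project = [projector(chunk) for chunk in
                      torch.split(stacked_image_features, num_patches_per_batch)]

# New style: project once, then split the result.
project_then_split = torch.split(projector(stacked_image_features),
                                 num_patches_per_batch)

for a, b in zip(split_then_project, project_then_split):
    assert torch.allclose(a, b)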
" + f"Got type: {type(image_embeds)}") + if isinstance(image_embeds, list): + image_embeds = torch.concat(image_embeds) + return MiniCPMVImageEmbeddingInputs( image_bounds=self._get_image_bounds(input_ids, im_start_id, im_end_id, slice_start_id, diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 8938f62d0c494..cc25be9f5b6a9 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -36,6 +36,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.inputs import NestedTensors, PlaceholderRange from vllm.multimodal.utils import cached_get_tokenizer @@ -43,7 +44,7 @@ from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, SequenceData) from vllm.transformers_utils.processor import get_processor -from .interfaces import SupportsMultiModal, SupportsPP +from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, merge_multimodal_embeddings) @@ -461,30 +462,71 @@ class MolmoAttention(nn.Module): return output -class MolmoMLP(nn.Module): +class SwiGLU(nn.Module): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x, gate = x.chunk(2, dim=-1) + # Note that the order is reversed compared to + # SiluAndMul. + return x * F.silu(gate) + + +class LanuageModelMLP(nn.Module): """Molmo's LLM mlp.""" def __init__(self, config: PretrainedConfig, input_dim: Optional[int] = None, - quant_config: Optional[QuantizationConfig] = None, - proj_name: str = "gate_up_proj") -> None: + quant_config: Optional[QuantizationConfig] = None) -> None: super().__init__() self.hidden_size = config.hidden_size self.intermediate_size = config.intermediate_size // 2 - # Molmo's LLM proj weights are already merged into the disk, while - # image_projector proj is separate. If the same proj_name were used, it - # would create ambiguity and make it difficult to support BNB and LoRA. - self.proj_name = proj_name - setattr( - self, proj_name, - MergedColumnParallelLinear( - input_dim or self.hidden_size, - [self.intermediate_size] * 2, - bias=False, - quant_config=quant_config, - )) + self.gate_up_proj = MergedColumnParallelLinear( + input_dim or self.hidden_size, + [self.intermediate_size] * 2, + bias=False, + quant_config=quant_config, + ) + # Activation function. + self.act_fn = SwiGLU() + # Feed-forward output projection. 
+ self.down_proj = RowParallelLinear( + self.intermediate_size, + self.hidden_size, + bias=False, + quant_config=quant_config, + ) + + def forward( + self, + x: torch.Tensor, + ) -> torch.Tensor: + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class ImageProjectorMLP(nn.Module): + """Molmo's image_projector mlp.""" + + def __init__( + self, + config: PretrainedConfig, + input_dim: Optional[int] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size // 2 + + self.merged_linear = MergedColumnParallelLinear( + input_dim or self.hidden_size, + [self.intermediate_size] * 2, + bias=False, + quant_config=quant_config, + ) # Activation function. self.act_fn = SiluAndMul() @@ -500,7 +542,7 @@ class MolmoMLP(nn.Module): self, x: torch.Tensor, ) -> torch.Tensor: - gate_up, _ = getattr(self, self.proj_name)(x) + gate_up, _ = self.merged_linear(x) x = self.act_fn(gate_up) x, _ = self.down_proj(x) return x @@ -523,9 +565,7 @@ class MolmoDecoderLayer(nn.Module): prefix=f"{prefix}.self_attn") # MLP block. - self.mlp = MolmoMLP(config, - quant_config=quant_config, - proj_name="gate_up_proj") + self.mlp = LanuageModelMLP(config, quant_config=quant_config) # LayerNorm assert config.layer_norm_type == "rms" @@ -617,11 +657,10 @@ class MolmoVisionBackbone(nn.Module): vision_config, nlayers=len(self.vit_layers), quant_config=quant_config) - self.image_projector = MolmoMLP( + self.image_projector = ImageProjectorMLP( config, input_dim=vision_config.image_emb_dim, quant_config=quant_config, - proj_name="merged_linear", ) image_dim = vision_config.image_emb_dim * len(self.vit_layers) @@ -842,10 +881,6 @@ class MolmoModel(nn.Module): loaded_params: Set[str] = set() for name, loaded_weight in weights: - if "gate_up_proj" in name: - up_proj, gate_proj = loaded_weight.chunk(2, dim=0) - loaded_weight = torch.cat([gate_proj, up_proj], dim=0) - if name.endswith(".bias") and name not in params_dict: continue if is_pp_missing_parameter(name, self): @@ -1127,8 +1162,8 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_molmo_image_tokens) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_molmo) @INPUT_REGISTRY.register_input_processor(input_processor_for_molmo) -class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): - +class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, + SupportsLoRA): hf_to_vllm_mapper = WeightsMapper( orig_to_new_substr={ # vision backbone mapping @@ -1157,13 +1192,47 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): }, ) + packed_modules_mapping = { + "qkv_proj": ["qkv_proj"], + "gate_up_proj": ["gate_up_proj"], # language model + "merged_linear": ["gate_proj", "up_proj"] # image_projector + } + + # LoRA specific attributes + supported_lora_modules = [ + # language model + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", # same name with image_projector + # vision tower + "wq", + "wk", + "wv", + "wo", + "w1", + "w2", + # image_projector + "merged_linear", + ] + embedding_modules = {} + embedding_padding_modules = [] + + # BitandBytes specific attributes + bitsandbytes_stacked_params_mapping = { + "gate_proj": ("merged_linear", 0), + "up_proj": ("merged_linear", 1), + } + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = 
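The SwiGLU module introduced for Molmo's language-model MLP above notes that its halves are reversed relative to vLLM's SiluAndMul, which is also why the old gate/up re-ordering in load_weights could be dropped. A tiny sketch of the two conventions under that assumption (input sizes are made up):

import torch
import torch.nn.functional as F

def swiglu_molmo(x: torch.Tensor) -> torch.Tensor:
    value, gate = x.chunk(2, dim=-1)     # value half first, gate half second
    return value * F.silu(gate)

def silu_and_mul(x: torch.Tensor) -> torch.Tensor:
    gate, value = x.chunk(2, dim=-1)     # SiluAndMul applies silu to the first half
    return F.silu(gate) * value

t = torch.randn(2, 8)
swapped = torch.cat(t.chunk(2, dim=-1)[::-1], dim=-1)
# The two agree only once the halves are swapped.
assert torch.allclose(swiglu_molmo(t), silu_and_mul(swapped))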
vllm_config.model_config.hf_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config + lora_config = vllm_config.lora_config self.config = config self.multimodal_config = multimodal_config + self.lora_config = lora_config vision_config = VisionBackboneConfig() self.vision_backbone = MolmoVisionBackbone(config, vision_config, @@ -1337,6 +1406,16 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): weights = _get_weights_with_merged_embedding(weights) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="model", + connector="vision_backbone.image_projector", + tower_model="vision_backbone", + ) + def _get_weights_with_merged_embedding( weights: Iterable[Tuple[str, torch.Tensor]] diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 4e2e7f5761544..15362db6cdfbf 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -12,9 +12,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from collections.abc import Iterable, Mapping, Sequence from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union import torch import torch.nn as nn @@ -32,10 +32,15 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors, PlaceholderRange) +from vllm.multimodal.parse import ImageProcessorItems from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessorInputs, - PromptReplacement) + PromptReplacement, + _BoundPromptReplacement, + _PlaceholderInfo) from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -306,11 +311,11 @@ def get_max_phi3v_image_tokens( *, num_crops: Optional[int] = None, ) -> int: - mm_processor_kwargs = {} + hf_processor_mm_kwargs = {} if num_crops: - mm_processor_kwargs["num_crops"] = num_crops + hf_processor_mm_kwargs["num_crops"] = num_crops - processor = ctx.get_hf_processor(**mm_processor_kwargs) + processor = ctx.get_hf_processor(**hf_processor_mm_kwargs) return processor.calc_num_image_tokens_from_image_size( width=MAX_IMAGE_FEATURE_SIZE_WIDTH, @@ -331,62 +336,100 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor): def _call_hf_processor( self, - hf_processor: ProcessorMixin, prompt: str, - processor_data: Mapping[str, object], - mm_processor_kwargs: Mapping[str, object], + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], ) -> BatchFeature: processed_outputs = super()._call_hf_processor( - hf_processor, prompt=prompt, - processor_data=processor_data, - mm_processor_kwargs=mm_processor_kwargs, + mm_data=mm_data, + mm_kwargs=mm_kwargs, ) + input_ids = processed_outputs["input_ids"] + assert isinstance(input_ids, torch.Tensor) + # Phi3v processor has 
inserted -1, -2 etc as placeholder in prompt_ids, # which will cause OverflowError when decoding the prompt_ids. # Therefore, we need to do an early replacement here - token_ids = processed_outputs['input_ids'] - token_ids[token_ids < 0] = _IMAGE_TOKEN_ID - processed_outputs['input_ids'] = token_ids + input_ids.masked_fill_(input_ids < 0, _IMAGE_TOKEN_ID) return processed_outputs + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_sizes=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_inputs: BatchFeature, - mm_processor_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: hf_processor = self._get_hf_processor() image_tokens: list[str] = hf_processor.img_tokens # type: ignore image_processor = hf_processor.image_processor # type: ignore - mm_config = self.ctx.get_mm_config() - max_images = mm_config.limit_per_prompt.get("image", 1) + tokenizer = self._get_tokenizer() + bos_token_id = tokenizer.bos_token_id + assert isinstance(bos_token_id, int) def get_replacement_phi3v(item_idx: int): - image_size = mm_items.get_image_size(item_idx) + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + num_tokens = image_processor.calc_num_image_tokens_from_image_size( width=image_size.width, height=image_size.height, ) - return [_IMAGE_TOKEN_ID] * num_tokens + return [_IMAGE_TOKEN_ID] * num_tokens + [bos_token_id] + + num_images = mm_items.get_count("image", strict=False) return [ PromptReplacement( modality="image", target=image_token, replacement=get_replacement_phi3v, - ) for image_token in image_tokens[:max_images] + ) for image_token in image_tokens[:num_images] ] + def _apply_prompt_replacements( + self, + token_ids: list[int], + prompt_repls: Sequence[_BoundPromptReplacement], + mm_item_counts: Mapping[str, int], + ) -> tuple[list[int], str, list[_PlaceholderInfo]]: + token_ids, text, placeholders = super()._apply_prompt_replacements( + token_ids=token_ids, + prompt_repls=prompt_repls, + mm_item_counts=mm_item_counts, + ) + + # Keep the behavior in line with HF processor + if text.startswith(" <|image|>"): + text = text.replace(" <|image|>", "<|image|>", 1) + token_ids = [token_ids[0], *token_ids[2:]] + placeholders = [ + _PlaceholderInfo(p.modality, p.start_idx - 1, p.replacement) + for p in placeholders + ] + + return token_ids, text, placeholders + def _get_dummy_mm_inputs( self, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - num_images = mm_counts["image"] + num_images = mm_counts.get("image", 0) data = dummy_image_for_clip( CLIP_VIT_LARGE_PATCH14_336_CONFIG, @@ -401,9 +444,28 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor): return ProcessorInputs( prompt_text="".join(image_tokens[:num_images]), mm_data=data, - mm_processor_kwargs={}, ) + def apply( + self, + prompt_text: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + + # Only <|image|> tokens should be considered as placeholders, + # so we ignore the trailing bos_token_id + result["mm_placeholders"] = { + modality: [ + 
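The early replacement described in the comment above amounts to an in-place masked fill of the negative placeholder ids before they can overflow during decoding. A self-contained sketch with made-up token ids (the concrete _IMAGE_TOKEN_ID value is an assumption for illustration):

import torch

_IMAGE_TOKEN_ID = 32044   # assumed value, for illustration only

# HF Phi-3-V processor output with negative per-image placeholders.
input_ids = torch.tensor([1, 450, -1, -1, -2, 29889])

input_ids.masked_fill_(input_ids < 0, _IMAGE_TOKEN_ID)
print(input_ids.tolist())   # [1, 450, 32044, 32044, 32044, 29889]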
PlaceholderRange(offset=p["offset"], length=p["length"] - 1) + for p in ps + ] + for modality, ps in result["mm_placeholders"].items() + } + + return result + @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_phi3v_image_tokens) @MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index f3d66c2313198..2bce13792a88d 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -1,8 +1,8 @@ +import math from dataclasses import dataclass, fields from functools import cached_property from typing import Iterable, List, Mapping, Optional, Set, Tuple, Union -import numpy import torch import torch.nn as nn import torch.nn.functional as F @@ -45,13 +45,6 @@ try: except ImportError: USE_XFORMERS_OPS = False -# These token ids cannot be retrieved from model config -# so we hardcode them here. -PIXTRAL_12B_IMAGE_BREAK_ID = 12 -PIXTRAL_12B_IMAGE_END_ID = 13 -PIXTRAL_LARGE_IMAGE_BREAK_ID = 14 -PIXTRAL_LARGE_IMAGE_END_ID = 15 - def get_max_pixtral_image_tokens(ctx: InputContext): tokenizer = cached_get_tokenizer( @@ -201,6 +194,13 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, if key in dataclass_fields } + if not ("image_break_token_id" in vision_args + and "image_end_token_id" in vision_args): + raise ValueError( + "'image_break_token_id' and 'image_end_token_id' not found " + "in the vision_encoder arguments. Please download the latest " + "version of 'params.json' from the model repository.") + self.vision_args = VisionEncoderArgs(**vision_args) # init MistralForCausalLM @@ -240,9 +240,8 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, # NOTE: Image embeddings are split into separate tensors for each image # by the indices of `[IMG_END]` token. 
- image_end_condition = (image_tokens == PIXTRAL_12B_IMAGE_END_ID) | ( - image_tokens == PIXTRAL_LARGE_IMAGE_END_ID) - split_indices = torch.where(image_end_condition)[0] + 1 + image_end_mask = image_tokens == self.vision_args.image_end_token_id + split_indices = torch.where(image_end_mask)[0] + 1 if len(split_indices) <= 1: # Do not split, return as tensor of shape [1, fs, hs] return image_embeds.unsqueeze(0) @@ -265,10 +264,8 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, [ self.vision_args.image_token_id, - PIXTRAL_12B_IMAGE_END_ID, - PIXTRAL_12B_IMAGE_BREAK_ID, - PIXTRAL_LARGE_IMAGE_BREAK_ID, - PIXTRAL_LARGE_IMAGE_END_ID, + self.vision_args.image_break_token_id, + self.vision_args.image_end_token_id, ]) return inputs_embeds @@ -309,7 +306,7 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, images: Optional[Union[List[List[torch.Tensor]], List[torch.Tensor], torch.Tensor]] = None, image_tokens: Optional[torch.Tensor] = None, - ) -> Optional[List[torch.Tensor]]: + ) -> Tuple[Optional[List[torch.Tensor]], Optional[torch.Tensor]]: if images is None: return None, None @@ -409,6 +406,8 @@ class VisionEncoderArgs: num_attention_heads: int rope_theta: float # for rope-2D image_token_id: int + image_break_token_id: int + image_end_token_id: int adapter_bias: bool = True @@ -605,11 +604,11 @@ class VisionTransformer(nn.Module): return self.args.image_size // self.args.patch_size @property - def device(self) -> torch.device: + def device(self) -> torch.types.Device: return next(self.parameters()).device @property - def dtype(self) -> torch.device: + def dtype(self) -> torch.dtype: return next(self.parameters()).dtype @property @@ -742,8 +741,8 @@ def get_pixtral_hf_image_feature_size(hf_config: PixtralVisionConfig, ratio = max(image_width / max_width, image_height / max_height) if ratio > 1: - image_width = int(numpy.ceil(image_width / ratio)) - image_height = int(numpy.ceil(image_height / ratio)) + image_width = int(math.ceil(image_width / ratio)) + image_height = int(math.ceil(image_height / ratio)) num_height_tokens, num_width_tokens = _get_pixtral_hf_num_image_tokens( (image_height, image_width), diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 63d1374ab4092..baf955f6b515d 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -225,7 +225,7 @@ class VisualAttentionBlock(nn.Module): d_model: int, n_head: int, mlp_ratio: float = 4.0, - norm_layer: Callable = nn.LayerNorm, + norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() @@ -266,7 +266,7 @@ class TransformerBlock(nn.Module): layers: int, heads: int, mlp_ratio: float = 4.0, - norm_layer: Callable = nn.LayerNorm, + norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 6259166a7fc57..de55bc6bcc123 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -20,13 +20,12 @@ # limitations under the License. 
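The Pixtral change above reads image_break_token_id and image_end_token_id from params.json instead of hardcoding per-variant ids, and splits the flattened image embeddings at every end token. A small sketch of that splitting, with assumed token ids and sizes:

import torch

# Assumed ids; the real ones come from params.json via VisionEncoderArgs.
image_end_token_id = 13
image_tokens = torch.tensor([10, 10, 10, 13, 10, 10, 13])   # two images, 10 = patch token
image_embeds = torch.randn(len(image_tokens), 4)            # flattened embeddings

split_indices = (torch.where(image_tokens == image_end_token_id)[0] + 1).tolist()
if len(split_indices) <= 1:
    chunks = (image_embeds.unsqueeze(0),)        # single image: keep as one chunk
else:
    # Split at every end token; the last index equals the tensor length, so drop it.
    chunks = torch.tensor_split(image_embeds, split_indices[:-1])

print([c.shape[0] for c in chunks])              # [4, 3]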
"""Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" from functools import cached_property -from typing import (Any, Iterable, List, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, + Union) -import numpy as np import torch import torch.nn as nn -from transformers import BatchFeature, ProcessorMixin +from transformers import BatchFeature from transformers.models.qwen2_audio import (Qwen2AudioConfig, Qwen2AudioEncoder, Qwen2AudioProcessor) @@ -38,7 +37,9 @@ from vllm.inputs import InputContext from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import MultiModalDataParser from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessorInputs, PromptReplacement) @@ -73,7 +74,7 @@ class Qwen2AudioMultiModalProjector(nn.Module): # From Qwen2AudioEncoder._get_feat_extract_output_lengths -def _get_feat_extract_output_lengths(input_lengths: torch.LongTensor): +def _get_feat_extract_output_lengths(input_lengths: torch.Tensor): feat_lengths = (input_lengths - 1) // 2 + 1 output_lengths = (feat_lengths - 2) // 2 + 1 return feat_lengths, output_lengths @@ -88,64 +89,74 @@ def get_max_qwen2_audio_audio_tokens(ctx: InputContext) -> int: class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): - def _get_hf_processor(self) -> Qwen2AudioProcessor: + def _get_hf_processor( + self, + *, + # Ignored in initialization + sampling_rate: Optional[int] = None, + ) -> Qwen2AudioProcessor: return self.ctx.get_hf_processor(Qwen2AudioProcessor) def _get_feature_extractor(self) -> WhisperFeatureExtractor: return self._get_hf_processor().feature_extractor # type: ignore - def _get_processor_data( - self, - mm_items: MultiModalDataItems, - ) -> tuple[dict[str, Any], dict[str, Any]]: - # resample audio to the model's sampling rate + def _get_data_parser(self) -> MultiModalDataParser: feature_extractor = self._get_feature_extractor() - mm_items.resample_audios(feature_extractor.sampling_rate) - - return super()._get_processor_data(mm_items) + return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) def _call_hf_processor( self, - hf_processor: ProcessorMixin, prompt: str, - processor_data: Mapping[str, object], - mm_processor_kwargs: Mapping[str, object], + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], ) -> BatchFeature: - processor_data = dict(processor_data) - audios = processor_data.pop("audios", []) + mm_data = dict(mm_data) + audios = mm_data.pop("audios", []) if audios: - processor_data["audios"] = audios + mm_data["audios"] = audios feature_extractor = self._get_feature_extractor() - mm_processor_kwargs = dict( - **mm_processor_kwargs, + mm_kwargs = dict( + **mm_kwargs, sampling_rate=feature_extractor.sampling_rate, ) else: # NOTE: WhisperFeatureExtractor cannot handle empty list of audios pass - return super()._call_hf_processor( - hf_processor, + processed_outputs = super()._call_hf_processor( prompt=prompt, - processor_data=processor_data, - mm_processor_kwargs=mm_processor_kwargs, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + 
hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + input_features=MultiModalFieldConfig.batched("audio"), + feature_attention_mask=MultiModalFieldConfig.batched("audio"), ) def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_inputs: BatchFeature, - mm_processor_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: hf_config = self.ctx.get_hf_config(Qwen2AudioConfig) placeholder = hf_config.audio_token_index - feature_attention_mask = hf_inputs.get("feature_attention_mask") + feature_attention_mask = out_mm_kwargs.get("feature_attention_mask") if feature_attention_mask is None: audio_output_lengths = [] else: + assert isinstance(feature_attention_mask, torch.Tensor) _, audio_output_lengths = _get_feat_extract_output_lengths( feature_attention_mask.sum(-1)) @@ -165,17 +176,19 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): mm_counts: Mapping[str, int], ) -> ProcessorInputs: feature_extractor = self._get_feature_extractor() + sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate + num_audios = mm_counts.get("audio", 0) - audio_count = mm_counts["audio"] - audio = np.zeros(audio_len) - data = {"audio": [audio] * audio_count} + mm_data = { + "audio": + self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } return ProcessorInputs( - prompt_text="<|AUDIO|>" * audio_count, - mm_data=data, - mm_processor_kwargs={}, + prompt_text="<|AUDIO|>" * num_audios, + mm_data=mm_data, ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index fb97eb1916002..0df101b3dcce4 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -22,14 +22,13 @@ # limitations under the License. 
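The audio token count used for the Qwen2-Audio prompt replacements above comes from summing the feature attention mask and pushing the lengths through the two stride-2 reductions of _get_feat_extract_output_lengths. A runnable sketch of that arithmetic (the 3000-frame padding is an assumption about the feature extractor):

import torch

def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
    # Two stride-2 reductions, mirroring the hunk above.
    feat_lengths = (input_lengths - 1) // 2 + 1
    output_lengths = (feat_lengths - 2) // 2 + 1
    return feat_lengths, output_lengths

feature_attention_mask = torch.zeros(2, 3000, dtype=torch.long)
feature_attention_mask[0, :3000] = 1     # full-length clip
feature_attention_mask[1, :1500] = 1     # half-length clip, rest is padding

_, audio_output_lengths = _get_feat_extract_output_lengths(
    feature_attention_mask.sum(-1))
print(audio_output_lengths.tolist())     # [750, 375]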
"""Inference-only Qwen2-VL model compatible with HuggingFace weights.""" from functools import cached_property, partial -from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set, - Tuple, Type, TypedDict, Union) +from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional, + Set, Tuple, Type, TypedDict, Union) import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange, repeat -from PIL import Image from transformers import BatchFeature from transformers.models.qwen2_vl import (Qwen2VLImageProcessor, Qwen2VLProcessor) @@ -53,15 +52,18 @@ from vllm.model_executor.layers.quantization.gptq_marlin import ( GPTQMarlinConfig) from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalDataDict, NestedTensors +from vllm.multimodal.inputs import (ImageItem, ModalityData, + MultiModalFieldConfig, MultiModalKwargs, + NestedTensors, VideoItem) +from vllm.multimodal.parse import ModalityDataItems, MultiModalDataParser from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessorInputs, PromptReplacement) from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope -from vllm.utils import is_list_of from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend, @@ -229,9 +231,9 @@ class Qwen2VisionAttention(nn.Module): def __init__( self, - embed_dim: Optional[int] = None, - num_heads: Optional[int] = None, - projection_size: Optional[int] = None, + embed_dim: int, + num_heads: int, + projection_size: int, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: @@ -264,7 +266,7 @@ class Qwen2VisionAttention(nn.Module): self, x: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor = None, + rotary_pos_emb: torch.Tensor, ) -> torch.Tensor: # [s, b, c] --> [s, b, head * 3 * head_dim] x, _ = self.qkv(x) @@ -347,7 +349,7 @@ class Qwen2VisionBlock(nn.Module): num_heads: int, mlp_ratio: float, act_layer: Type[nn.Module] = QuickGELU, - norm_layer: Type[nn.Module] = None, + norm_layer: Optional[Callable[[int], nn.Module]] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: @@ -384,7 +386,7 @@ class Qwen2VisionPatchEmbed(nn.Module): self, patch_size: int = 14, temporal_patch_size: int = 2, - in_chans: int = 3, + in_channels: int = 3, embed_dim: int = 1152, ) -> None: super().__init__() @@ -392,8 +394,8 @@ class Qwen2VisionPatchEmbed(nn.Module): self.temporal_patch_size = temporal_patch_size self.embed_dim = embed_dim - kernel_size = [temporal_patch_size, patch_size, patch_size] - self.proj = nn.Conv3d(in_chans, + kernel_size = (temporal_patch_size, patch_size, patch_size) + self.proj = nn.Conv3d(in_channels, embed_dim, kernel_size=kernel_size, stride=kernel_size, @@ -413,7 +415,7 @@ class Qwen2VisionPatchMerger(nn.Module): self, d_model: int, context_dim: int, - norm_layer: Type[nn.Module] = None, + norm_layer: Optional[Callable[[int], nn.Module]] = None, spatial_merge_size: int = 2, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -489,15 +491,15 @@ class Qwen2VisionTransformer(nn.Module): ) -> None: 
super().__init__() - patch_size: int = vision_config.patch_size - temporal_patch_size: int = vision_config.temporal_patch_size - spatial_merge_size: int = vision_config.spatial_merge_size - in_chans: int = vision_config.in_chans - hidden_size: int = vision_config.hidden_size - embed_dim: int = vision_config.embed_dim - depth: int = vision_config.depth - num_heads: int = vision_config.num_heads - mlp_ratio: float = vision_config.mlp_ratio + patch_size = vision_config.patch_size + temporal_patch_size = vision_config.temporal_patch_size + spatial_merge_size = vision_config.spatial_merge_size + in_channels = vision_config.in_channels + hidden_size = vision_config.hidden_size + embed_dim = vision_config.embed_dim + depth = vision_config.depth + num_heads = vision_config.num_heads + mlp_ratio = vision_config.mlp_ratio self.spatial_merge_size = spatial_merge_size self.num_heads = num_heads @@ -506,7 +508,7 @@ class Qwen2VisionTransformer(nn.Module): self.patch_embed = Qwen2VisionPatchEmbed( patch_size=patch_size, temporal_patch_size=temporal_patch_size, - in_chans=in_chans, + in_channels=in_channels, embed_dim=embed_dim, ) @@ -717,51 +719,81 @@ get_max_qwen2_vl_video_tokens = partial(get_max_qwen2_vl_mm_tokens, data_type_key="video") -class Qwen2VLMultiModalDataItems(MultiModalDataItems): +class Qwen2EmbeddingItems(ModalityDataItems[dict[str, torch.Tensor], + dict[str, torch.Tensor]]): - @staticmethod - def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems": - """ - Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`. - """ - multi_data = Qwen2VLMultiModalDataItems() + def __init__(self, data: dict, modality: str) -> None: + super().__init__(data) - for k, v in data.items(): - # TODO: Make a separate modality for embedding inputs - # to avoid confusion - # yapf: disable - if k == "video": - # Special case since even a single item can be a list - multi_data[k] = ( # type: ignore[index] - v if (isinstance(v, (dict, torch.Tensor)) # type: ignore[assignment] - or is_list_of(v, list)) else [v] - ) - elif k in ("image", "audio"): - multi_data[k] = ( # type: ignore[index] - v if isinstance(v, (dict, torch.Tensor, list)) else [v] - ) - else: - multi_data[k] = v if isinstance(v, list) else [v] # type: ignore[index] - # yapf: enable + self.modality = modality - return multi_data + grid_thw = data[f"{modality}_grid_thw"] + slice_idxs = [0] + grid_thw.prod(-1).cumsum_(0).tolist() + self._slices = [ + slice(slice_idxs[i], slice_idxs[i + 1]) + for i in range(len(grid_thw)) + ] - def get_item_counts(self) -> Mapping[str, int]: - return { - m: ( - len(items[f"{m}_grid_thw"]) # type: ignore - if isinstance(items, dict) else len(items)) - for m, items in self.items() - } + def __repr__(self) -> str: + return (f"{type(self).__name__}(modality={self.modality!r})") + + def get_count(self) -> int: + return len(self.data[f"{self.modality}_grid_thw"]) + + def get(self, index: int) -> dict[str, torch.Tensor]: + out = {} + for k, v in self.data.items(): + if v != f"{self.modality}_grid_thw": + v = v[self._slices[index]] + + out[k] = v + + return out + + def get_processor_data(self) -> Mapping[str, object]: + return {} + + def get_passthrough_data(self) -> Mapping[str, object]: + return self.data + + +class Qwen2ImageEmbeddingItems(Qwen2EmbeddingItems): + + def __init__(self, data: dict) -> None: + super().__init__(data, "image") + + +class Qwen2VideoEmbeddingItems(Qwen2EmbeddingItems): + + def __init__(self, data: dict) -> None: + super().__init__(data, "video") + + +class 
Qwen2MultiModalDataParser(MultiModalDataParser): + + def _parse_image_data( + self, + data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]], + ) -> ModalityDataItems[Any, Any]: + if isinstance(data, dict): + return Qwen2EmbeddingItems(data, modality="image") + + return super()._parse_image_data(data) + + def _parse_video_data( + self, + data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]], + ) -> ModalityDataItems[Any, Any]: + if isinstance(data, dict): + return Qwen2EmbeddingItems(data, modality="video") + + return super()._parse_video_data(data) class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): - def _get_mm_items( - self, - mm_data: MultiModalDataDict, - ) -> MultiModalDataItems: - return Qwen2VLMultiModalDataItems.from_dict(mm_data) + def _get_data_parser(self) -> MultiModalDataParser: + return Qwen2MultiModalDataParser() def _get_hf_processor( self, @@ -784,40 +816,11 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): return hf_processor - def _get_processor_data( - self, - mm_items: MultiModalDataItems, - ) -> tuple[dict[str, Any], dict[str, Any]]: - processor_data = dict[str, Any]() - passthrough_data = dict[str, Any]() - - for k, v in mm_items.items(): - # TODO: Make a separate modality for embedding inputs - # to avoid confusion - if k in ("image", "video", "audio"): - if isinstance(v, dict): - # Pass through embedding inputs (dict) - passthrough_data.update(v) - elif isinstance(v, torch.Tensor) and v.ndim == 3: - # Pass through embedding inputs (single) - passthrough_data[f"{k}_embeds"] = [v] - elif (is_list_of(v, torch.Tensor) and len(v) > 0 - and v[0].ndim == 2): - # Pass through embedding inputs (multi) - passthrough_data[f"{k}_embeds"] = v - else: - # Map keys to plural form, e.g.: image -> images - processor_data[f"{k}s"] = v - else: - processor_data[k] = v - - return processor_data, passthrough_data - def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_inputs: BatchFeature, - mm_processor_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: hf_processor = self._get_hf_processor() image_processor = _get_image_processor(hf_processor) @@ -831,7 +834,9 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): merge_length = image_processor.merge_size**2 def get_replacement_qwen2vl(item_idx: int, modality: str): - grid_thw = hf_inputs[f"{modality}_grid_thw"][item_idx] + grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx] + assert isinstance(grid_thw, torch.Tensor) + num_tokens = grid_thw.prod() // merge_length return placeholder[modality] * num_tokens @@ -844,16 +849,43 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): ) for modality in ("image", "video") ] + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3))) + image_slice_idxs = [0] + image_grid_thw.prod(-1).cumsum_(0).tolist() + image_slices = [ + slice(image_slice_idxs[i], image_slice_idxs[i + 1]) + for i in range(len(image_grid_thw)) + ] + + video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3))) + video_slice_idxs = [0] + video_grid_thw.prod(-1).cumsum_(0).tolist() + video_slices = [ + slice(video_slice_idxs[i], video_slice_idxs[i + 1]) + for i in range(len(video_grid_thw)) + ] + + return dict( + pixel_values=MultiModalFieldConfig.flat("image", image_slices), + 
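The Qwen2-VL field config here carves the flattened pixel_values rows into per-item slices using the product of each (t, h, w) grid, the same bookkeeping Qwen2EmbeddingItems does for embedding inputs. A self-contained sketch with made-up grids:

import torch

image_grid_thw = torch.tensor([[1, 4, 4], [1, 2, 2]])               # assumed grids
pixel_values = torch.randn(int(image_grid_thw.prod(-1).sum()), 8)   # 20 flattened rows

slice_idxs = [0] + image_grid_thw.prod(-1).cumsum(0).tolist()
image_slices = [slice(slice_idxs[i], slice_idxs[i + 1])
                for i in range(len(image_grid_thw))]

print([pixel_values[s].shape[0] for s in image_slices])             # [16, 4]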
image_embeds=MultiModalFieldConfig.flat("image", image_slices), + image_grid_thw=MultiModalFieldConfig.batched("image"), + pixel_values_videos=MultiModalFieldConfig.flat( + "video", video_slices), + video_embeds=MultiModalFieldConfig.flat("video", video_slices), + video_grid_thw=MultiModalFieldConfig.batched("video"), + ) + def _get_dummy_mm_inputs( self, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - num_images = mm_counts["image"] hf_processor = self._get_hf_processor() - image_token: str = hf_processor.image_token image_processor = _get_image_processor(hf_processor) - data = {} + image_token: str = hf_processor.image_token resized_height, resized_width = smart_resize( height=9999999, width=9999999, @@ -861,15 +893,18 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): min_pixels=image_processor.min_pixels, max_pixels=image_processor.max_pixels, ) + num_images = mm_counts.get("image", 0) - dummy_image = Image.new("RGB", (resized_width, resized_height), - color=0) - data["image"] = [dummy_image] * num_images + mm_data = { + "image": + self._get_dummy_images(width=resized_width, + height=resized_height, + num_images=num_images) + } return ProcessorInputs( prompt_text=image_token * num_images, - mm_data=data, - mm_processor_kwargs={}, + mm_data=mm_data, ) @@ -892,15 +927,23 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, } # LoRA specific attributes - # TODO Support LoRA for the visual encoder in the future. supported_lora_modules = [ "qkv_proj", "o_proj", "gate_up_proj", "down_proj", + # vision tower + "qkv", + "attn.proj", # Distinguish patch_embed.proj + "fc1", + "fc2", + # projector + "mlp.0", + "mlp.2" ] embedding_modules = {} embedding_padding_modules = [] + # To ensure correct weight loading and mapping. hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ "lm_head.": "language_model.lm_head.", @@ -950,9 +993,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, return None return quant_config - def _validate_and_reshape_mm_tensor(self, - mm_input: Union[torch.Tensor, - List[torch.Tensor]], + def _validate_and_reshape_mm_tensor(self, mm_input: object, name: str) -> torch.Tensor: if not isinstance(mm_input, (torch.Tensor, list)): raise ValueError(f"Incorrect type of {name}. " @@ -962,7 +1003,8 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, return mm_input if mm_input.ndim != 3: raise ValueError(f"{name} should be 2D or batched 3D tensor. 
" - f"Got ndim: {mm_input.ndim}") + f"Got ndim: {mm_input.ndim} " + f"(shape={mm_input.shape})") return torch.concat(list(mm_input)) else: return torch.concat(mm_input) @@ -1198,3 +1240,12 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="visual.", + tower_model="visual.merger.") diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index feb33bb373c3e..07f4b5a3b3bc8 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -18,7 +18,6 @@ import cloudpickle import torch.nn as nn from vllm.logger import init_logger -from vllm.platforms import current_platform from .interfaces import (has_inner_state, is_attention_free, is_hybrid, supports_cross_encoding, supports_multimodal, @@ -113,6 +112,7 @@ _EMBEDDING_MODELS = { "Gemma2Model": ("gemma2", "Gemma2ForCausalLM"), "GlmForCausalLM": ("glm", "GlmForCausalLM"), "GritLM": ("gritlm", "GritLM"), + "InternLM2ForRewardModel": ("internlm2", "InternLM2ForRewardModel"), "JambaForSequenceClassification": ("jamba", "JambaForSequenceClassification"), # noqa: E501 "LlamaModel": ("llama", "LlamaForCausalLM"), **{ @@ -187,31 +187,6 @@ _VLLM_MODELS = { **_SPECULATIVE_DECODING_MODELS, } -# Models not supported by ROCm. -_ROCM_UNSUPPORTED_MODELS: List[str] = [] - -# Models partially supported by ROCm. -# Architecture -> Reason. -_ROCM_SWA_REASON = ("Sliding window attention (SWA) is not yet supported in " - "Triton flash attention. For half-precision SWA support, " - "please use CK flash attention by setting " - "`VLLM_USE_TRITON_FLASH_ATTN=0`") -_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = { - "Qwen2ForCausalLM": - _ROCM_SWA_REASON, - "MistralForCausalLM": - _ROCM_SWA_REASON, - "MixtralForCausalLM": - _ROCM_SWA_REASON, - "PaliGemmaForConditionalGeneration": - ("ROCm flash attention does not yet " - "fully support 32-bit precision on PaliGemma"), - "Phi3VForCausalLM": - ("ROCm Triton flash attention may run into compilation errors due to " - "excessive use of shared memory. 
If this happens, disable Triton FA " - "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`") -} - @dataclass(frozen=True) class _ModelInfo: @@ -297,17 +272,8 @@ def _try_load_model_cls( model_arch: str, model: _BaseRegisteredModel, ) -> Optional[Type[nn.Module]]: - if current_platform.is_rocm(): - if model_arch in _ROCM_UNSUPPORTED_MODELS: - raise ValueError(f"Model architecture '{model_arch}' is not " - "supported by ROCm for now.") - - if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS: - msg = _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch] - logger.warning( - "Model architecture '%s' is partially " - "supported by ROCm: %s", model_arch, msg) - + from vllm.platforms import current_platform + current_platform.verify_model_arch(model_arch) try: return model.load_model_cls() except Exception: diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py index 28c37bb96612c..02ca7fe08e556 100644 --- a/vllm/model_executor/models/telechat2.py +++ b/vllm/model_executor/models/telechat2.py @@ -31,19 +31,6 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, class TeleChat2Model(LlamaModel): - hf_to_vllm_mapper = WeightsMapper( - orig_to_new_prefix={ - "transformer.": "model.", - }, - orig_to_new_substr={ - ".h.": ".layers.", - ".self_attention.": ".self_attn.", - ".word_embeddings.": ".embed_tokens.", - ".dense.": ".o_proj.", - ".ln_f.": ".norm.", - }, - ) - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # 1. Initialize the LlamaModel with bias vllm_config.model_config.hf_config.bias = True @@ -118,6 +105,19 @@ class TeleChat2Model(LlamaModel): class TeleChat2ForCausalLM(LlamaForCausalLM): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "transformer.": "model.", + }, + orig_to_new_substr={ + ".h.": ".layers.", + ".self_attention.": ".self_attn.", + ".word_embeddings.": ".embed_tokens.", + ".dense.": ".o_proj.", + ".ln_f.": ".norm.", + }, + ) + def _init_model(self, vllm_config: VllmConfig, prefix: str = ""): return TeleChat2Model(vllm_config=vllm_config, prefix=prefix) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 509ad9e580ddf..54be7fed3f2be 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -3,8 +3,8 @@ import math from functools import cached_property, lru_cache -from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set, - Tuple, TypedDict, Union) +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, + TypedDict, Union) import numpy as np import torch @@ -23,7 +23,10 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.loader import DefaultModelLoader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import MultiModalDataParser from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessorInputs, PromptReplacement) @@ -72,45 +75,55 @@ def get_ultravox_max_audio_tokens(ctx: InputContext): class UltravoxMultiModalProcessor(BaseMultiModalProcessor): + def _get_hf_processor( + self, + *, + # Ignored in initialization + sampling_rate: Optional[int] = None, + ) -> ProcessorMixin: + return 
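On the telechat2.py move above: the mapper, now attached to TeleChat2ForCausalLM, rewrites checkpoint parameter names by prefix and by substring before loading. A rough, self-contained equivalent of that renaming, trimmed to two substrings (the real mapping also covers .word_embeddings., .dense. and .ln_f., and the order of application here is an assumption):

orig_to_new_prefix = {"transformer.": "model."}
orig_to_new_substr = {".h.": ".layers.", ".self_attention.": ".self_attn."}

def remap(name: str) -> str:
    # Prefix rewrites first, then substring rewrites (ordering assumed for this sketch).
    for old, new in orig_to_new_prefix.items():
        if name.startswith(old):
            name = new + name[len(old):]
    for old, new in orig_to_new_substr.items():
        name = name.replace(old, new)
    return name

print(remap("transformer.h.0.self_attention.dense.weight"))
# model.layers.0.self_attn.dense.weight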
self.ctx.get_hf_processor() + def _get_feature_extractor(self) -> WhisperFeatureExtractor: hf_processor = self._get_hf_processor() return hf_processor.audio_processor.feature_extractor # type: ignore - def _get_processor_data( - self, - mm_items: MultiModalDataItems, - ) -> tuple[dict[str, Any], dict[str, Any]]: - # resample audio to the model's sampling rate + def _get_data_parser(self) -> MultiModalDataParser: feature_extractor = self._get_feature_extractor() - mm_items.resample_audios(feature_extractor.sampling_rate) - - return super()._get_processor_data(mm_items) + return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) def _call_hf_processor( self, - hf_processor: ProcessorMixin, prompt: str, - processor_data: Mapping[str, object], - mm_processor_kwargs: Mapping[str, object], + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], ) -> BatchFeature: - processor_data = dict(processor_data) - audios = processor_data.pop("audios", []) + # Text-only input not supported in composite processor + if not mm_data: + tokenizer = self._get_tokenizer() + + prompt_ids = tokenizer.encode( + prompt, + add_special_tokens=False, # type: ignore + ) + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + + mm_data = dict(mm_data) + audios = mm_data.pop("audios", []) if not audios: return super()._call_hf_processor( - hf_processor, prompt=prompt, - processor_data=processor_data, - mm_processor_kwargs=mm_processor_kwargs, + mm_data=mm_data, + mm_kwargs=mm_kwargs, ) feature_extractor = self._get_feature_extractor() - mm_processor_kwargs = dict( - **mm_processor_kwargs, + mm_kwargs = dict( + **mm_kwargs, sampling_rate=feature_extractor.sampling_rate, ) - # Already resampled by _get_processor_data + # Already resampled by _get_hf_mm_data assert is_list_of(audios, np.ndarray) # Ultravox processor doesn't support multiple inputs, @@ -119,13 +132,12 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor): shared_outputs = {} for audio in audios: # NOTE: Ultravox processor accepts "audio" instead of "audios" - item_processor_data = dict(**processor_data, audio=audio) + item_processor_data = dict(**mm_data, audio=audio) item_outputs = super()._call_hf_processor( - hf_processor, prompt=prompt, - processor_data=item_processor_data, - mm_processor_kwargs=mm_processor_kwargs, + mm_data=item_processor_data, + mm_kwargs=mm_kwargs, ) audio_features.append(item_outputs.pop("audio_values")[0]) @@ -139,17 +151,28 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor): ) return BatchFeature(combined_outputs) + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + audio_features=MultiModalFieldConfig.batched("audio"), + audio_token_len=MultiModalFieldConfig.batched("audio"), + audio_embeds=MultiModalFieldConfig.batched("audio"), + ) + def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_inputs: BatchFeature, - mm_processor_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: hf_processor = self._get_hf_processor() placeholder = hf_processor.audio_token_replacement # type: ignore def get_replacement_ultravox(item_idx: int): - audio_token_len = hf_inputs["audio_token_len"][item_idx] + audio_token_len = out_mm_kwargs["audio_token_len"][item_idx] return placeholder * audio_token_len return [ @@ -165,17 +188,19 @@ class 
UltravoxMultiModalProcessor(BaseMultiModalProcessor): mm_counts: Mapping[str, int], ) -> ProcessorInputs: feature_extractor = self._get_feature_extractor() + sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate + num_audios = mm_counts.get("audio", 0) - audio_count = mm_counts["audio"] - audio = np.zeros(audio_len) - data = {"audio": [audio] * audio_count} + mm_data = { + "audio": + self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } return ProcessorInputs( - prompt_text="<|audio|>" * audio_count, - mm_data=data, - mm_processor_kwargs={}, + prompt_text="<|audio|>" * num_audios, + mm_data=mm_data, ) diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 39ead08c238ce..6f1cc9d5e0c30 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -3,10 +3,9 @@ from typing import Any, Dict, Optional import torch -from vllm.platforms import current_platform - def set_random_seed(seed: int) -> None: + from vllm.platforms import current_platform current_platform.seed_everything(seed) @@ -38,6 +37,7 @@ def set_weight_attrs( # This sometimes causes OOM errors during model loading. To avoid this, # we sync the param tensor after its weight loader is called. # TODO(woosuk): Remove this hack once we have a better solution. + from vllm.platforms import current_platform if current_platform.is_tpu() and key == "weight_loader": value = _make_synced_weight_loader(value) setattr(weight, key, value) diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 9255e062e4870..e58bbe81717a0 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,8 +1,7 @@ from .base import MultiModalPlaceholderMap, MultiModalPlugin -from .inputs import (BatchedTensorInputs, MultiModalData, - MultiModalDataBuiltins, MultiModalDataDict, - MultiModalKwargs, MultiModalPlaceholderDict, - NestedTensors) +from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins, + MultiModalDataDict, MultiModalKwargs, + MultiModalPlaceholderDict, NestedTensors) from .registry import MultiModalRegistry MULTIMODAL_REGISTRY = MultiModalRegistry() @@ -16,7 +15,7 @@ See also: __all__ = [ "BatchedTensorInputs", - "MultiModalData", + "ModalityData", "MultiModalDataBuiltins", "MultiModalDataDict", "MultiModalKwargs", diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index ed3bb82bf0aaa..de80f22bac2a3 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -1,17 +1,26 @@ +import base64 +from io import BytesIO +from pathlib import Path + import numpy as np import numpy.typing as npt from vllm.inputs.registry import InputContext from vllm.utils import PlaceholderModule -from .base import MultiModalPlugin -from .inputs import AudioItem, MultiModalData, MultiModalKwargs +from .base import MediaIO, MultiModalPlugin +from .inputs import AudioItem, ModalityData, MultiModalKwargs try: import librosa except ImportError: librosa = PlaceholderModule("librosa") # type: ignore[assignment] +try: + import soundfile +except ImportError: + soundfile = PlaceholderModule("soundfile") # type: ignore[assignment] + class AudioPlugin(MultiModalPlugin): """Plugin for audio data.""" @@ -22,7 +31,7 @@ class AudioPlugin(MultiModalPlugin): def _default_input_mapper( self, ctx: InputContext, - data: MultiModalData[AudioItem], + data: ModalityData[AudioItem], **mm_processor_kwargs, ) -> MultiModalKwargs: raise NotImplementedError("There is no default audio input mapper") @@ -39,3 +48,28 @@ 
def resample_audio( target_sr: float, ) -> npt.NDArray[np.floating]: return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr) + + +class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): + + def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]: + return librosa.load(BytesIO(data), sr=None) + + def load_base64( + self, + media_type: str, + data: str, + ) -> tuple[npt.NDArray, float]: + return self.load_bytes(base64.b64decode(data)) + + def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]: + return librosa.load(filepath, sr=None) + + def encode_base64(self, media: tuple[npt.NDArray, float]) -> str: + audio, sr = media + + with BytesIO() as buffer: + soundfile.write(buffer, audio, sr, format="WAV") + data = buffer.getvalue() + + return base64.b64encode(data).decode('utf-8') diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 1e5a46946c6c0..7f4029e726332 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod from collections import defaultdict -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, +from pathlib import Path +from typing import (TYPE_CHECKING, Any, Callable, Generic, NamedTuple, Optional, Sequence, Tuple, Type, TypeVar, Union) from torch import nn @@ -14,12 +15,12 @@ if TYPE_CHECKING: from vllm.config import ModelConfig from vllm.sequence import SequenceGroupMetadata -from .inputs import (MultiModalData, MultiModalDataDict, MultiModalKwargs, +from .inputs import (ModalityData, MultiModalDataDict, MultiModalKwargs, PlaceholderRange) logger = init_logger(__name__) -MultiModalInputMapper = Callable[[InputContext, MultiModalData[object]], +MultiModalInputMapper = Callable[[InputContext, ModalityData[object]], MultiModalKwargs] """ Return a dictionary to be passed as keyword arguments to @@ -68,7 +69,7 @@ class MultiModalPlugin(ABC): def _default_input_mapper( self, ctx: InputContext, - data: MultiModalData[Any], + data: ModalityData[Any], **mm_processor_kwargs, ) -> MultiModalKwargs: """ @@ -117,8 +118,8 @@ class MultiModalPlugin(ABC): def map_input( self, model_config: "ModelConfig", - data: MultiModalData[Any], - mm_processor_kwargs: Optional[Dict[str, Any]], + data: ModalityData[Any], + mm_processor_kwargs: Optional[dict[str, Any]], ) -> MultiModalKwargs: """ Transform the data into a dictionary of model inputs using the @@ -254,10 +255,10 @@ class MultiModalPlaceholderMap: """ class IndexMap(NamedTuple): - src: List[int] - dest: List[int] + src: list[int] + dest: list[int] - src_ranges: List[range] + src_ranges: list[range] """ The indices of the multi-modal embeddings that will replace the corresponding placeholder embeddings pointed to by ``dest_ranges``. @@ -268,7 +269,7 @@ class MultiModalPlaceholderMap: The total number of flattened multi-modal embeddings. """ - dest_ranges: List[range] + dest_ranges: list[range] """ The indices of the placeholder embeddings that will be replaced by the multimodal embeddings. @@ -288,7 +289,7 @@ class MultiModalPlaceholderMap: @classmethod def from_seq_group( cls, seq_group: "SequenceGroupMetadata", positions: range - ) -> Tuple[Optional[MultiModalDataDict], Dict[str, + ) -> Tuple[Optional[MultiModalDataDict], dict[str, "MultiModalPlaceholderMap"]]: """ Returns the multi-modal items that intersect with the portion of a @@ -296,35 +297,37 @@ class MultiModalPlaceholderMap: ``MultiModalPlaceholderMap`` that relates the multi-modal embedding vectors to their corresponding placeholders. 
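As a usage sketch for the `AudioMediaIO` helper added to `vllm/multimodal/audio.py` above (the file path is hypothetical; loading relies on librosa and encoding on soundfile):

    from pathlib import Path

    from vllm.multimodal.audio import AudioMediaIO

    audio_io = AudioMediaIO()

    # Load a local file; librosa.load(..., sr=None) keeps the original sampling rate.
    audio, sr = audio_io.load_file(Path("/tmp/example.wav"))

    # Round-trip through base64: encode_base64 writes a WAV via soundfile.
    b64 = audio_io.encode_base64((audio, sr))
    audio2, sr2 = audio_io.load_base64("audio/wav", b64)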
- Consider the following scenarios: + Examples: - Prompt: |AAAA BBBB What's in these images?| - Positions: |.................................| + .. code-block:: - images = [A, B] - src_ranges = [(0, 4), (4, 8)] - dest_ranges = [(0, 4), (5, 9)] + Prompt: |AAAA BBBB What's in these images?| + Positions: |.................................| - Prompt: |AAAA BBBB What's in these images?| - Positions: | ..... | + images = [A, B] + src_ranges = [(0, 4), (4, 8)] + dest_ranges = [(0, 4), (5, 9)] - images = [A, B] - src_ranges = [(2, 4), (4, 6)] - dest_ranges = [(0, 2), (3, 5)] + Prompt: |AAAA BBBB What's in these images?| + Positions: | ..... | - Prompt: |AAAA BBBB What's in these images?| - Positions: | ......... | + images = [A, B] + src_ranges = [(2, 4), (4, 6)] + dest_ranges = [(0, 2), (3, 5)] - images = [B] - src_ranges = [(0, 4)] - dest_ranges = [(0, 4)] + Prompt: |AAAA BBBB What's in these images?| + Positions: | ......... | - Prompt: |AAAA BBBB What's in these images?| - Positions: | .......................| + images = [B] + src_ranges = [(0, 4)] + dest_ranges = [(0, 4)] - images = [] - src_ranges = [] - dest_ranges = [] + Prompt: |AAAA BBBB What's in these images?| + Positions: | .......................| + + images = [] + src_ranges = [] + dest_ranges = [] """ seq_mm_data = seq_group.multi_modal_data seq_mm_placeholders = seq_group.multi_modal_placeholders @@ -376,9 +379,9 @@ class MultiModalPlaceholderMap: def append_items_from_seq_group( self, positions: range, - multi_modal_items: List[_T], + multi_modal_items: list[_T], multi_modal_placeholders: Sequence[PlaceholderRange], - ) -> List[_T]: + ) -> list[_T]: """ Adds the multi-modal items that intersect ```positions`` to this placeholder map and returns the intersecting items. @@ -454,3 +457,22 @@ class MultiModalPlaceholderMap: return MultiModalPlaceholderMap.IndexMap(src=src_indices, dest=dest_indices) + + +class MediaIO(ABC, Generic[_T]): + + @abstractmethod + def load_bytes(self, data: bytes) -> _T: + raise NotImplementedError + + @abstractmethod + def load_base64(self, media_type: str, data: str) -> _T: + """ + List of media types: + https://www.iana.org/assignments/media-types/media-types.xhtml + """ + raise NotImplementedError + + @abstractmethod + def load_file(self, filepath: Path) -> _T: + raise NotImplementedError diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index c705e1a3d1554..da13a381c4530 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -1,4 +1,7 @@ +import base64 from functools import lru_cache +from io import BytesIO +from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, Optional import torch @@ -9,8 +12,8 @@ from vllm.logger import init_logger from vllm.transformers_utils.processor import get_image_processor from vllm.utils import is_list_of -from .base import MultiModalPlugin -from .inputs import ImageItem, MultiModalData, MultiModalKwargs +from .base import MediaIO, MultiModalPlugin +from .inputs import ImageItem, ModalityData, MultiModalKwargs if TYPE_CHECKING: from vllm.config import ModelConfig @@ -41,7 +44,7 @@ class ImagePlugin(MultiModalPlugin): def _default_input_mapper( self, ctx: InputContext, - data: MultiModalData[ImageItem], + data: ModalityData[ImageItem], **mm_processor_kwargs, ) -> MultiModalKwargs: model_config = ctx.model_config @@ -96,3 +99,39 @@ def rescale_image_size(image: Image.Image, if transpose >= 0: image = image.transpose(Image.Transpose(transpose)) return image + + +class ImageMediaIO(MediaIO[Image.Image]): + + def __init__(self, *, 
image_mode: str = "RGB") -> None: + super().__init__() + + self.image_mode = image_mode + + def load_bytes(self, data: bytes) -> Image.Image: + image = Image.open(BytesIO(data)) + image.load() + return image.convert(self.image_mode) + + def load_base64(self, media_type: str, data: str) -> Image.Image: + return self.load_bytes(base64.b64decode(data)) + + def load_file(self, filepath: Path) -> Image.Image: + image = Image.open(filepath) + image.load() + return image.convert(self.image_mode) + + def encode_base64( + self, + media: Image.Image, + *, + image_format: str = "JPEG", + ) -> str: + image = media + + with BytesIO() as buffer: + image = image.convert(self.image_mode) + image.save(buffer, image_format) + data = buffer.getvalue() + + return base64.b64encode(data).decode('utf-8') diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 9ecae2c1ca2bf..db489af7ac475 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -1,50 +1,75 @@ +from abc import ABC, abstractmethod from collections import UserDict, defaultdict -from typing import (Any, Dict, List, Literal, Mapping, Sequence, Tuple, - TypedDict, TypeVar, Union, cast, final) +from collections.abc import Mapping, Sequence +from dataclasses import dataclass +from typing import Any, Literal, TypedDict, TypeVar, Union, cast, final import numpy as np import torch import torch.types from PIL.Image import Image +from transformers import BatchFeature from typing_extensions import NotRequired, TypeAlias from vllm.utils import JSONTree, is_list_of, json_map_leaves _T = TypeVar("_T") -# yapf: disable -ImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor] +HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor] """ A :class:`transformers.image_utils.ImageInput` representing a single image item, which can be passed to a HuggingFace :code:`ImageProcessor`. """ -VideoItem: TypeAlias = Union[ - list[Image], - np.ndarray, - torch.Tensor, - list[np.ndarray], - list[torch.Tensor], -] +HfVideoItem: TypeAlias = Union[list[Image], np.ndarray, torch.Tensor, + list[np.ndarray], list[torch.Tensor]] """ A :class:`transformers.image_utils.VideoInput` representing a single video item, which can be passed to a HuggingFace :code:`VideoProcessor`. """ -AudioItem: TypeAlias = Union[ - np.ndarray, - list[float], - # `(audio, sampling_rate)`: If the audio's sampling rate is different - # from that expected by the model, we need to resample it. - tuple[np.ndarray, float], -] +HfAudioItem: TypeAlias = Union[list[float], np.ndarray, torch.Tensor] """ Represents a single audio item, which can be passed to a HuggingFace :code:`AudioProcessor`. """ -# yapf: enable -MultiModalData: TypeAlias = Union[_T, List[_T]] +ImageItem: TypeAlias = Union[HfImageItem, torch.Tensor] +""" +A :class:`transformers.image_utils.ImageInput` representing a single image +item, which can be passed to a HuggingFace :code:`ImageProcessor`. + +Alternatively, a 3-D tensor or batch of 2-D tensors, +which are treated as image embeddings; +these are directly passed to the model without HF processing. +""" + +VideoItem: TypeAlias = Union[HfVideoItem, torch.Tensor] +""" +A :class:`transformers.image_utils.VideoInput` representing a single video +item, which can be passed to a HuggingFace :code:`VideoProcessor`. + +Alternatively, a 3-D tensor or batch of 2-D tensors, +which are treated as video embeddings; +these are directly passed to the model without HF processing. 
+""" + +AudioItem: TypeAlias = Union[HfAudioItem, tuple[np.ndarray, float], + torch.Tensor] +""" +Represents a single audio +item, which can be passed to a HuggingFace :code:`AudioProcessor`. + +Alternatively, a tuple `(audio, sampling_rate)`, where the sampling rate +is different from that expected by the model; +these are resampled to the model's sampling rate before being processed by HF. + +Alternatively, a 3-D tensor or batch of 2-D tensors, +which are treated as audio embeddings; +these are directly passed to the model without HF processing. +""" + +ModalityData: TypeAlias = Union[_T, list[_T]] """ Either a single data item, or a list of data items. @@ -57,17 +82,17 @@ The number of data items allowed per modality is restricted by class MultiModalDataBuiltins(TypedDict, total=False): """Type annotations for modality types predefined by vLLM.""" - image: MultiModalData[ImageItem] + image: ModalityData[ImageItem] """The input image(s).""" - video: MultiModalData[VideoItem] + video: ModalityData[VideoItem] """The input video(s).""" - audio: MultiModalData[AudioItem] + audio: ModalityData[AudioItem] """The input audio(s).""" -MultiModalDataDict: TypeAlias = Mapping[str, MultiModalData[Any]] +MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]] """ A dictionary containing an entry for each modality type to input. @@ -83,9 +108,14 @@ class PlaceholderRange(TypedDict): """ Placeholder location information for multi-modal data. - For example: - Prompt: AAAA BBBB What is in these images? + Example: + + Prompt: :code:`AAAA BBBB What is in these images?` + Images A and B will have: + + .. code-block:: + A: { "offset": 0, "length": 4 } B: { "offset": 5, "length": 4 } """ @@ -97,25 +127,256 @@ class PlaceholderRange(TypedDict): """The length of the placeholder.""" -NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor, - Tuple[torch.Tensor, ...]] +NestedTensors = Union[list["NestedTensors"], list[torch.Tensor], torch.Tensor, + tuple[torch.Tensor, ...]] """ Uses a list instead of a tensor if the dimensions of each element do not match. """ -BatchedTensorInputs: TypeAlias = Dict[str, NestedTensors] + +def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: + """Equality check between :data:`NestedTensors` objects.""" + if isinstance(a, torch.Tensor): + return isinstance(b, torch.Tensor) and bool((a == b).all().item()) + elif isinstance(b, torch.Tensor): + return isinstance(a, torch.Tensor) and bool((b == a).all().item()) + + if isinstance(a, list): + return (isinstance(b, list) + and all(nested_tensors_equal(a_, b_) for a_, b_ in zip(a, b))) + if isinstance(b, list): + return (isinstance(a, list) + and all(nested_tensors_equal(b_, a_) for b_, a_ in zip(b, a))) + + # Both a and b are scalars + return a == b + + +BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors] """ A dictionary containing nested tensors which have been batched via :meth:`MultiModalKwargs.batch`. """ +@dataclass(frozen=True) +class MultiModalFieldItem: + """ + Contains metadata and data in :class:`MultiModalKwargs` + corresponding to a data item in :class:`MultiModalDataItems`. 
+ """ + field: "BaseMultiModalField" + data: NestedTensors + + def __eq__(self, other: object) -> bool: + if not isinstance(other, self.__class__): + return False + + return (self.field == other.field + and nested_tensors_equal(self.data, other.data)) + + +@dataclass(frozen=True) +class BaseMultiModalField(ABC): + """Abstract base class for a field in :class:`MultiModalKwargs`.""" + key: str + modality: str + + @abstractmethod + def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: + raise NotImplementedError + + def _build_item(self, data: NestedTensors) -> MultiModalFieldItem: + return MultiModalFieldItem(self, data) + + def reduce(self, batch: list[MultiModalFieldItem]) -> MultiModalFieldItem: + """Merge multiple instances of :class:`MultiModalFieldItem` together.""" + fields = [item.field for item in batch] + if len(set(fields)) > 1: + raise ValueError(f"Cannot merge different {fields=}") + + data = self._reduce_data([item.data for item in batch]) + + return self._build_item(data) + + +@dataclass(frozen=True) +class MultiModalBatchedField(BaseMultiModalField): + """ + A :class:`BaseMultiModalField` implementation where an item is obtained by + directly indexing into the first dimension of the underlying data. + """ + + def build_items(self, batch: NestedTensors) -> list[MultiModalFieldItem]: + return [self._build_item(item) for item in batch] + + def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: + if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"): + first_shape = batch[0].shape + if all(item.shape == first_shape for item in batch): + return torch.stack(batch) + + return batch + + +@dataclass(frozen=True) +class MultiModalFlatField(BaseMultiModalField): + """ + A :class:`BaseMultiModalField` implementation where an item is obtained by + slicing along the first dimension of the underlying data. + """ + + def build_items( + self, + batch: NestedTensors, + slices: Sequence[slice], + ) -> list[MultiModalFieldItem]: + return [self._build_item(batch[slice_]) for slice_ in slices] + + def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: + if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"): + first_shape = batch[0].shape + if all(item.shape[1:] == first_shape[1:] for item in batch): + return torch.concat(batch) + + return [elem for item in batch for elem in item] + + +class MultiModalFieldConfig: + + @staticmethod + def batched(modality: str): + return MultiModalFieldConfig( + field_cls=MultiModalBatchedField, + modality=modality, + ) + + @staticmethod + def flat(modality: str, slices: Sequence[slice]): + return MultiModalFieldConfig( + field_cls=MultiModalFlatField, + modality=modality, + slices=slices, + ) + + def __init__( + self, + field_cls: type[BaseMultiModalField], + modality: str, + **field_config: Any, + ) -> None: + super().__init__() + + self._field_cls = field_cls + self._modality = modality + self._field_config = field_config + + def build_items( + self, + key: str, + batch: NestedTensors, + ) -> list[MultiModalFieldItem]: + field = self._field_cls(key=key, modality=self._modality) + return field.build_items(batch, **self._field_config) # type: ignore + + class MultiModalKwargs(UserDict[str, NestedTensors]): """ A dictionary that represents the keyword arguments to :meth:`~torch.nn.Module.forward`. 
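As a rough sketch of how the field classes above behave (shapes are illustrative): a `batched` field config yields one `MultiModalFieldItem` per entry along the first dimension of the HF output, keeping the modality metadata alongside the data.

    import torch

    from vllm.multimodal.inputs import MultiModalFieldConfig

    # Hypothetical HF processor output for two images.
    pixel_values = torch.zeros(2, 3, 336, 336)

    config = MultiModalFieldConfig.batched("image")
    items = config.build_items("pixel_values", pixel_values)

    assert len(items) == 2                       # one item per image
    assert items[0].field.modality == "image"    # metadata travels with the data
    assert items[0].data.shape == (3, 336, 336)

`MultiModalFieldConfig.flat` works analogously, but slices a concatenated tensor using the provided `slices` instead of indexing the first dimension.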
+ + The metadata :code:`items_by_key` defines how to split batched keyword + arguments corresponding to each data item in :class:`MultiModalDataItems`: + + - For a keyword argument, we can access the :code:`i` th item in the batch + via :code:`items_by_key[key][i]`. + - We can gather the keyword arguments belonging to a modality by finding + the keys with items that belong to that modality, then accessing + the :code:`i` th item in the batch for each such key. + + Example: + + .. code-block:: python + + # All items belong to the "image" modality + items_by_key={ + "pixel_values": [a, b, c, d], # "image" modality + "image_grid_thw": [e, f, g, h], # "image" modality + "pixel_values_video": [h, i, j], # "video" modality + "video_grid_thw": [k, l, m], # "video" modality + } + + - The keyword arguments belonging to the first image are + :code:`{"pixel_values": a, "image_grid_thw": e}`. + - The keyword arguments belonging to the second video are + :code:`{"pixel_values_video": i, "video_grid_thw": l}`. """ + @staticmethod + def from_hf_inputs( + hf_inputs: BatchFeature, + config_by_key: Mapping[str, MultiModalFieldConfig], + *, + enable_sanity_checks: bool = False, + ): + # NOTE: This skips fields in `hf_inputs` that are not in `config_by_key` + # We assume that those fields are not used in vLLM + items_by_key = { + key: config.build_items(key, batch) + for key, config in config_by_key.items() + if (batch := hf_inputs.get(key)) is not None + } + + return MultiModalKwargs.from_items_by_key( + items_by_key, + enable_sanity_checks=enable_sanity_checks, + ) + + @staticmethod + def from_items_by_key( + items_by_key: Mapping[str, list[MultiModalFieldItem]], + *, + enable_sanity_checks: bool = False, + ) -> "MultiModalKwargs": + data = { + key: items[0].field.reduce(items).data + for key, items in items_by_key.items() if len(items) > 0 + } + + return MultiModalKwargs(data, + items_by_key=items_by_key, + enable_sanity_checks=enable_sanity_checks) + + def __init__( + self, + data: Mapping[str, NestedTensors], + *, + items_by_key: Mapping[str, list[MultiModalFieldItem]] = {}, + enable_sanity_checks: bool = False, + ) -> None: + super().__init__(data) + + # Shallow copy to avoid footgun in case a defaultdict is passed in + self._items_by_key = dict(items_by_key) + + keys_by_modality = defaultdict[str, set[str]](set) + for key, items in items_by_key.items(): + for item in items: + keys_by_modality[item.field.modality].add(key) + + self._keys_by_modality = dict(keys_by_modality) + + if enable_sanity_checks: + for modality, keys in keys_by_modality.items(): + items_in_modality = {k: items_by_key[k] for k in keys} + batch_sizes = {k: len(v) for k, v in items_in_modality.items()} + batch_size = next(iter(batch_sizes.values()), 0) + assert all(bs == batch_size + for bs in batch_sizes.values()), dict( + modality=modality, + batch_sizes=batch_sizes, + items_by_key=items_by_key) + @staticmethod def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: """ @@ -139,7 +400,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): # Only tensors (not lists) can be stacked. return stacked - tensors_ = cast(List[torch.Tensor], stacked) + tensors_ = cast(list[torch.Tensor], stacked) if any(t.shape != tensors_[0].shape for t in tensors_): # The tensors have incompatible shapes and can't be stacked. 
return tensors_ @@ -147,7 +408,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): return torch.stack(tensors_) @staticmethod - def batch(inputs_list: List["MultiModalKwargs"]) -> BatchedTensorInputs: + def batch(inputs_list: list["MultiModalKwargs"]) -> BatchedTensorInputs: """ Batch multiple inputs together into a dictionary. @@ -162,7 +423,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): # We need to consider the case where each item in the batch # contains different modalities (i.e. different keys). - item_lists: Dict[str, List[NestedTensors]] = defaultdict(list) + item_lists = defaultdict[str, list[NestedTensors]](list) for inputs in inputs_list: for k, v in inputs.items(): @@ -188,6 +449,62 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): return cast(BatchedTensorInputs, json_mapped) + def __eq__(self, other: object) -> bool: + if not isinstance(other, self.__class__): + return False + if self._items_by_key != other._items_by_key: + return False + + ks = self.keys() + return (ks == other.keys() + and all(nested_tensors_equal(self[k], other[k]) for k in ks)) + + def get_item(self, key: str, item_index: int) -> MultiModalFieldItem: + return self._items_by_key[key][item_index] + + def get_items_by_modality( + self, + modality: str, + item_index: int, + ) -> Mapping[str, MultiModalFieldItem]: + """ + Get the keyword arguments corresponding to an item identified by + its modality and index. + """ + if modality not in self._keys_by_modality: + available_modalities = set(self._keys_by_modality.keys()) + raise KeyError(f"Modality {modality!r} not found. " + f"Available modalities: {available_modalities}") + + keys_to_gather = self._keys_by_modality[modality] + + return { + key: self.get_item(key, item_index) + for key in keys_to_gather if key in self + } + + @staticmethod + def from_items_by_modality( + items_by_modality: Mapping[str, list[Mapping[str, + MultiModalFieldItem]]], + *, + enable_sanity_checks: bool = False, + ) -> "MultiModalKwargs": + """ + Construct a new :class:`MultiModalKwargs` from multiple items returned + by :meth:`get_fields_by_modality`. 
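A quick sketch of `MultiModalKwargs.batch` (shapes illustrative): tensors that share a shape under the same key are stacked, otherwise they are kept as a list.

    import torch

    from vllm.multimodal.inputs import MultiModalKwargs

    a = MultiModalKwargs({"pixel_values": torch.zeros(1, 3, 336, 336)})
    b = MultiModalKwargs({"pixel_values": torch.zeros(1, 3, 336, 336)})

    batched = MultiModalKwargs.batch([a, b])
    assert batched["pixel_values"].shape == (2, 1, 3, 336, 336)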
+ """ + items_by_key = defaultdict[str, list[MultiModalFieldItem]](list) + for fields in items_by_modality.values(): + for field in fields: + for k, v in field.items(): + items_by_key[k].append(v) + + return MultiModalKwargs.from_items_by_key( + items_by_key, + enable_sanity_checks=enable_sanity_checks, + ) + MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]] """ @@ -207,16 +524,16 @@ class MultiModalInputsV2(TypedDict): prompt: str """The processed prompt text.""" - prompt_token_ids: List[int] + prompt_token_ids: list[int] """The processed token IDs which includes placeholder tokens.""" - token_type_ids: NotRequired[List[int]] + token_type_ids: NotRequired[list[int]] """The token type IDs of the prompt.""" mm_kwargs: MultiModalKwargs """Keyword arguments to be directly passed to the model after batching.""" - mm_hashes: NotRequired[List[str]] + mm_hashes: NotRequired[list[str]] """The hashes of the multi-modal data.""" mm_placeholders: MultiModalPlaceholderDict diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py new file mode 100644 index 0000000000000..da111e999ebb8 --- /dev/null +++ b/vllm/multimodal/parse.py @@ -0,0 +1,368 @@ +from abc import ABC, abstractmethod +from collections import UserDict +from collections.abc import Callable, Iterator, Mapping, Sequence +from typing import TYPE_CHECKING, Any, Generic, NamedTuple, Optional, TypeVar + +import numpy as np +import torch +from PIL.Image import Image +from typing_extensions import TypeAlias, TypeGuard, assert_never + +from vllm.utils import is_list_of + +from .audio import resample_audio +from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem, + ImageItem, ModalityData, MultiModalDataDict, + NestedTensors, VideoItem) + +_T = TypeVar("_T") +_I = TypeVar("_I") + + +class ModalityDataItems(ABC, Generic[_T, _I]): + + def __init__(self, data: _T) -> None: + super().__init__() + + self.data = data + + def __len__(self) -> int: + return self.get_count() + + def __getitem__(self, index: int) -> _I: + return self.get(index) + + if TYPE_CHECKING: + # Auto-generated + def __iter__(self) -> Iterator[_I]: + ... 
+ + @abstractmethod + def get_count(self) -> int: + """Get the number of data items.""" + raise NotImplementedError + + @abstractmethod + def get(self, index: int) -> _I: + """Get a data item by its index.""" + raise NotImplementedError + + def get_all(self) -> list[_I]: + """Get all data items.""" + return [self.get(idx) for idx in range(self.get_count())] + + @abstractmethod + def get_processor_data(self) -> Mapping[str, object]: + """Get the data to pass to the HF processor.""" + raise NotImplementedError + + @abstractmethod + def get_passthrough_data(self) -> Mapping[str, object]: + """Get the data to pass directly to the model.""" + raise NotImplementedError + + +class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]): + + def __init__(self, data: Sequence[_T], modality: str) -> None: + super().__init__(data) + + self.modality = modality + + def __repr__(self) -> str: + return (f"{type(self).__name__}(modality={self.modality!r})") + + def get_count(self) -> int: + return len(self.data) + + def get(self, index: int) -> _T: + return self.data[index] + + def get_processor_data(self) -> Mapping[str, object]: + return {f"{self.modality}s": self.data} + + def get_passthrough_data(self) -> Mapping[str, object]: + return {} + + +class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]): + + def __init__(self, data: NestedTensors, modality: str) -> None: + super().__init__(data) + + self.modality = modality + + def __repr__(self) -> str: + return (f"{type(self).__name__}(modality={self.modality!r})") + + def get_count(self) -> int: + return len(self.data) + + def get(self, index: int) -> object: + return self.data[index] + + def get_processor_data(self) -> Mapping[str, object]: + return {} + + def get_passthrough_data(self) -> Mapping[str, object]: + return {f"{self.modality}_embeds": self.data} + + +class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]): + + def __init__(self, data: Sequence[HfAudioItem]) -> None: + super().__init__(data, "audio") + + +class AudioEmbeddingItems(EmbeddingItems): + + def __init__(self, data: NestedTensors) -> None: + super().__init__(data, "audio") + + +class ImageSize(NamedTuple): + width: int + height: int + + +class ImageProcessorItems(ProcessorBatchItems[HfImageItem]): + + def __init__(self, data: Sequence[HfImageItem]) -> None: + super().__init__(data, "image") + + def get_image_size(self, item_idx: int) -> ImageSize: + image = self.get(item_idx) + + if isinstance(image, Image): + return ImageSize(*image.size) + if isinstance(image, (np.ndarray, torch.Tensor)): + _, h, w = image.shape + return ImageSize(w, h) + + assert_never(image) + + +class ImageEmbeddingItems(EmbeddingItems): + + def __init__(self, data: NestedTensors) -> None: + super().__init__(data, "image") + + +class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]): + + def __init__(self, data: Sequence[HfVideoItem]) -> None: + super().__init__(data, "video") + + +class VideoEmbeddingItems(EmbeddingItems): + + def __init__(self, data: NestedTensors) -> None: + super().__init__(data, "video") + + +_D = TypeVar("_D", bound=ModalityDataItems[Any, Any]) + + +class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]): + """ + As :class:`MultiModalDataDict`, but normalized such that each entry + corresponds to a list. + """ + + def get_count(self, modality: str, *, strict: bool = True) -> int: + """ + Get the number of data items belonging to a modality. + + If `strict=False`, return `0` instead of raising :exc:`KeyError` + even if the modality is not found. 
+ """ + if modality not in self: + if strict: + available_modalities = set(self.keys()) + raise KeyError(f"Modality {modality!r} not found. " + f"Available modalities: {available_modalities}") + + return 0 + + return self[modality].get_count() + + def get_all_counts(self) -> Mapping[str, int]: + """Get the number of items belonging to each modality.""" + return {m: items.get_count() for m, items in self.items()} + + def get_items( + self, + modality: str, + typ: type[_D], + ) -> _D: + """ + Get the data items belonging to a modality, + requiring that they belong to a certain type. + """ + if modality not in self: + available_modalities = set(self.keys()) + raise KeyError(f"Modality {modality!r} not found. " + f"Available modalities: {available_modalities}") + + items = self[modality] + if not isinstance(items, typ): + raise TypeError(f"Invalid type of data items for {modality=}. " + f"Expected type: {typ}, but " + f"found type: {type(items)}") + + return items + + +ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]], + ModalityDataItems[Any, Any]] + + +class MultiModalDataParser: + """ + Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`. + + Args: + max_mm_counts (Mapping[str, int]): The maximum allowed number of items + belonging to each modality. This effectively sets a hard limit over + `--limit-mm-per-prompt`. + target_sr (float, optional): Enables automatic resampling of audio + items to the model's expected sampling rate. + """ + + def __init__( + self, + *, + max_mm_counts: Mapping[str, int] = {}, + target_sr: Optional[float] = None, + ) -> None: + super().__init__() + + self.max_mm_counts = max_mm_counts + self.target_sr = target_sr + + def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]: + if isinstance(data, torch.Tensor): + return data.ndim == 3 + if is_list_of(data, torch.Tensor): + return len(data) == 0 or data[0].ndim == 2 + + return False + + def _get_audio_with_sr( + self, + audio: AudioItem, + ) -> tuple[np.ndarray, Optional[float]]: + if isinstance(audio, tuple): + return audio + if isinstance(audio, list): + return np.array(audio), None + if isinstance(audio, np.ndarray): + return audio, None + if isinstance(audio, torch.Tensor): + return audio.numpy(), None + + assert_never(audio) + + def _parse_audio_data( + self, + data: ModalityData[AudioItem], + ) -> ModalityDataItems[Any, Any]: + if self._is_embeddings(data): + return AudioEmbeddingItems(data) + + if (is_list_of(data, float) + or isinstance(data, + (np.ndarray, torch.Tensor)) and data.ndim == 1 + or isinstance(data, tuple)): + data_items = [data] + elif isinstance(data, (np.ndarray, torch.Tensor)): + data_items = [elem for elem in data] + else: + data_items = data + + new_audios = list[np.ndarray]() + for data_item in data_items: + audio, orig_sr = self._get_audio_with_sr(data_item) + if orig_sr is None: + new_audio = audio + else: + target_sr = self.target_sr + if target_sr is None: + raise RuntimeError( + "Audio resampling is not supported when " + "`target_sr` is not provided") + + new_audio = resample_audio(audio, + orig_sr=orig_sr, + target_sr=target_sr) + + new_audios.append(new_audio) + + return AudioProcessorItems(new_audios) + + def _parse_image_data( + self, + data: ModalityData[ImageItem], + ) -> ModalityDataItems[Any, Any]: + if self._is_embeddings(data): + return ImageEmbeddingItems(data) + + if (isinstance(data, Image) + or isinstance(data, + (np.ndarray, torch.Tensor)) and data.ndim == 3): + data_items = [data] + elif isinstance(data, (np.ndarray, 
torch.Tensor)): + data_items = [elem for elem in data] + else: + data_items = data + + return ImageProcessorItems(data_items) + + def _parse_video_data( + self, + data: ModalityData[VideoItem], + ) -> ModalityDataItems[Any, Any]: + if self._is_embeddings(data): + return VideoEmbeddingItems(data) + + if (is_list_of(data, Image) + or isinstance(data, + (np.ndarray, torch.Tensor)) and data.ndim == 4): + data_items = [data] + elif isinstance(data, (np.ndarray, torch.Tensor)): + data_items = [elem for elem in data] + else: + data_items = data + + return VideoProcessorItems(data_items) + + def _get_subparsers(self) -> Mapping[str, ModalityDataParser]: + return { + "audio": self._parse_audio_data, + "image": self._parse_image_data, + "video": self._parse_video_data, + } + + def parse_mm_data(self, + mm_data: MultiModalDataDict) -> MultiModalDataItems: + max_mm_counts = self.max_mm_counts + subparsers = self._get_subparsers() + + mm_items = MultiModalDataItems() + for k, v in mm_data.items(): + if k not in subparsers: + raise ValueError(f"Unsupported modality: {k}") + + modality_items = subparsers[k](v) + + if k in max_mm_counts: + max_count = max_mm_counts[k] + if len(modality_items) > max_count: + raise ValueError( + f"This model supports at most {max_count} {k} items " + f"per prompt, but {len(modality_items)} {k} items " + "were given or set as its limit_mm_per_prompt.") + + mm_items[k] = modality_items + + return mm_items diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 6baf19d675d50..7712c3bcebe20 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,26 +1,28 @@ +import pickle import re from abc import ABC, abstractmethod -from collections import UserDict +from collections import defaultdict from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence from dataclasses import dataclass, field from functools import lru_cache from typing import Any, NamedTuple, Optional, Protocol, TypeVar, Union import numpy as np +import numpy.typing as npt import torch -from PIL.Image import Image +from blake3 import blake3 +from PIL import Image from transformers import BatchFeature, ProcessorMixin -from typing_extensions import assert_never from vllm.inputs import DummyData, InputProcessingContext from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.utils import flatten_2d_lists, full_groupby, is_list_of +from vllm.utils import LRUCache, flatten_2d_lists, full_groupby -from .audio import resample_audio -from .inputs import (AudioItem, ImageItem, MultiModalDataDict, - MultiModalInputsV2, MultiModalKwargs, PlaceholderRange, - VideoItem) +from .inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalFieldItem, MultiModalInputsV2, MultiModalKwargs, + PlaceholderRange) +from .parse import MultiModalDataItems, MultiModalDataParser logger = init_logger(__name__) @@ -201,111 +203,6 @@ class _BoundPromptReplacement: return bound_replacement -class ImageSize(NamedTuple): - width: int - height: int - - -class MultiModalDataItems(UserDict[str, list[Any]]): - """ - As :class:`MultiModalDataDict`, but normalized such that each entry - corresponds to a list. - """ - - @staticmethod - def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems": - """ - Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`. 
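To illustrate the new parser in `vllm/multimodal/parse.py` (a sketch with arbitrary sizes; audio resampling requires librosa):

    import numpy as np
    from PIL import Image

    from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataParser

    parser = MultiModalDataParser(target_sr=16000)
    mm_items = parser.parse_mm_data({
        "image": Image.new("RGB", (64, 48)),      # a single item is wrapped into a list
        "audio": [(np.zeros(32000), 44100)],      # (audio, orig_sr) resampled to 16 kHz
    })

    assert mm_items.get_all_counts() == {"image": 1, "audio": 1}

    images = mm_items.get_items("image", ImageProcessorItems)
    assert images.get_image_size(0) == (64, 48)   # ImageSize(width=64, height=48)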
- """ - multi_data = MultiModalDataItems() - - for k, v in data.items(): - # TODO: Make a separate modality for embedding inputs - # to avoid confusion - # yapf: disable - if k == "video": - # Special case since even a single item can be a list - multi_data[k] = ( # type: ignore[index] - v if (isinstance(v, torch.Tensor) - or is_list_of(v, list)) else [v] - ) - elif k in ("image", "audio"): - multi_data[k] = ( # type: ignore[index] - v if isinstance(v, (torch.Tensor, list)) else [v] - ) - else: - multi_data[k] = v if isinstance(v, list) else [v] # type: ignore[index] - # yapf: enable - - return multi_data - - # NOTE: When a field (e.g. `images`) doesn't exist, directly appending to - # `self.images` doesn't update this dictionary, which may be confusing - # We annotate the getter methods as `Sequence` to prevent others from - # trying to update the list in this way - @property - def images(self) -> Sequence[ImageItem]: - return self.get("image", []) - - @property - def videos(self) -> Sequence[VideoItem]: - return self.get("video", []) - - @property - def audios(self) -> Sequence[AudioItem]: - return self.get("audio", []) - - def get_item_counts(self) -> Mapping[str, int]: - return {m: len(items) for m, items in self.items()} - - def get_image_size(self, item_idx: int) -> ImageSize: - image = self.images[item_idx] - - if isinstance(image, Image): - return ImageSize(*image.size) - if isinstance(image, (np.ndarray, torch.Tensor)): - _, h, w = image.shape - return ImageSize(w, h) - - assert_never(image) - - def get_audio_with_sr( - self, - item_idx: int, - *, - default_sr: float, - ) -> tuple[np.ndarray, float]: - audio = self.audios[item_idx] - - if isinstance(audio, tuple): - return audio - if isinstance(audio, list): - return np.array(audio), default_sr - if isinstance(audio, np.ndarray): - return audio, default_sr - - assert_never(audio) - - def resample_audios(self, new_sr: float, *, drop_sr: bool = True) -> None: - """ - If :code:`drop_sr=True`, the audio items in this dictionary are updated - to be NumPy arrays which implicitly means that their sampling rate is - the same as the model's expected sampling rate; otherwise, they remain - as :code:`(audio, new_sr)` tuples. 
- """ - if not self.audios: - return - - new_audios = [] - for item_idx in range(len(self.audios)): - audio, sr = self.get_audio_with_sr(item_idx, default_sr=new_sr) - audio = resample_audio(audio, orig_sr=sr, target_sr=new_sr) - - new_audios.append(audio if drop_sr else (audio, new_sr)) - - self["audio"] = new_audios - - class _TokenMatch(NamedTuple): start_idx: int end_idx: int @@ -458,13 +355,13 @@ def _replace_matches( ) -> list[_S]: out_seqs = list[_S]() prev_end_idx = 0 - next_idx_by_modality = {modality: 0 for modality in mm_item_counts} + next_idx_by_modality = defaultdict[str, int](lambda: 0) for match in _resolve_matches(prompt, matches): modality = match.modality item_idx = next_idx_by_modality[modality] - if item_idx >= mm_item_counts[modality]: + if item_idx >= mm_item_counts.get(modality, 0): continue start_idx = match.start_idx @@ -583,11 +480,124 @@ def iter_placeholders( ) -class ProcessorInputs(NamedTuple): - """Keyword arguments to :meth:`BaseMultiModalProcessor`""" +@dataclass +class ProcessorInputs: + """Keyword arguments to :meth:`BaseMultiModalProcessor`.""" prompt_text: str mm_data: MultiModalDataDict - mm_processor_kwargs: Mapping[str, object] + hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict) + + +class ProcessingCache: + + def __init__(self, capacity: int) -> None: + super().__init__() + + # DEBUG: Set to None to disable + self.debug_cache_hit_ratio_steps: Optional[int] = None + + self._cache = LRUCache[str, Mapping[str, + MultiModalFieldItem]](capacity) + + def _maybe_log_cache_stats(self) -> None: + steps = self.debug_cache_hit_ratio_steps + if not steps: + return + + cache_stats = self._cache.stat() + if cache_stats.total % steps == 0: + logger.debug("ProcessingCache: hit_ratio = %.2f", + cache_stats.hit_ratio) + + def _serialize_item(self, obj: object) -> bytes: + # Simple cases + if isinstance(obj, str): + return obj.encode("utf-8") + if isinstance(obj, bytes): + return obj + if isinstance(obj, Image.Image): + return obj.tobytes() + + # Convertible to NumPy arrays + if isinstance(obj, torch.Tensor): + obj = obj.numpy() + if isinstance(obj, (int, float)): + obj = np.array(obj) + if isinstance(obj, np.ndarray): + return obj.tobytes() + + logger.warning( + "No serialization method found for %s. 
" + "Falling back to pickle.", type(obj)) + + return pickle.dumps(obj) + + def _item_to_bytes( + self, + key: str, + obj: object, + ) -> Iterable[tuple[bytes, bytes]]: + # Recursive cases + if isinstance(obj, (list, tuple)): + for i, elem in enumerate(obj): + yield from self._item_to_bytes(f"{key}.{i}", elem) + elif isinstance(obj, dict): + for k, v in obj.items(): + yield from self._item_to_bytes(f"{key}.{k}", v) + else: + key_bytes = self._serialize_item(key) + value_bytes = self._serialize_item(obj) + yield key_bytes, value_bytes + + def _hash_kwargs(self, **kwargs: object) -> str: + hasher = blake3() + + for k, v in kwargs.items(): + for k_bytes, v_bytes in self._item_to_bytes(k, v): + hasher.update(k_bytes) + hasher.update(v_bytes) + + return hasher.hexdigest() + + def get( + self, + model_id: str, + modality: str, + input_item: object, + input_kwargs: Mapping[str, object], + ) -> Optional[Mapping[str, MultiModalFieldItem]]: + """ + Get a processed multi-modal item from the cache + according to its dependencies, including: + + - The model ID + - The modality of the item + - The original data item passed to the HF processor + - The configuration options of the HF processor + """ + self._maybe_log_cache_stats() + + cache_key = self._hash_kwargs(model_id=model_id, + **{modality: input_item}, + **input_kwargs) + return self._cache.get(cache_key) + + def put( + self, + model_id: str, + modality: str, + input_item: object, + input_kwargs: Mapping[str, object], + output_kwargs: Mapping[str, MultiModalFieldItem], + ) -> None: + """ + Put a processed multi-modal item into the cache + according to its dependencies (see :meth:`get`). + """ + cache_key = self._hash_kwargs(model_id=model_id, + **{modality: input_item}, + **input_kwargs) + self._cache.put(cache_key, output_kwargs) class BaseMultiModalProcessor(ABC): @@ -595,18 +605,34 @@ class BaseMultiModalProcessor(ABC): Abstract base class to process multi-modal inputs to be used in vLLM. """ - def __init__(self, ctx: InputProcessingContext) -> None: + def __init__(self, + ctx: InputProcessingContext, + *, + cache: Optional[ProcessingCache] = None, + enable_sanity_checks: bool = True) -> None: super().__init__() self.ctx = ctx + self.cache = cache + self.enable_sanity_checks = enable_sanity_checks def __call__( self, prompt: str, mm_data: MultiModalDataDict, - mm_processor_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputsV2: - return self.apply(prompt, mm_data, mm_processor_kwargs) + return self.apply(prompt, mm_data, hf_processor_mm_kwargs) + + def _get_data_parser(self) -> MultiModalDataParser: + """ + Construct a data parser to preprocess multi-modal data items + before passing them to :meth:`_get_hf_mm_data`. + + You can support additional modalities by creating a subclass + of :class:`MultiModalDataParser` that has additional subparsers. + """ + return MultiModalDataParser() def _get_hf_processor(self) -> ProcessorMixin: """ @@ -618,27 +644,45 @@ class BaseMultiModalProcessor(ABC): def _get_tokenizer(self) -> AnyTokenizer: return self.ctx.tokenizer - def _get_mm_items( + def _to_mm_items( self, mm_data: MultiModalDataDict, ) -> MultiModalDataItems: - return MultiModalDataItems.from_dict(mm_data) + """ + Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems` + before passing them to :meth:`_get_hf_mm_data`. 
+ """ + parser = self._get_data_parser() + return parser.parse_mm_data(mm_data) + + @abstractmethod + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + """Given the HF-processed data, output the metadata of each field.""" + raise NotImplementedError @abstractmethod def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_inputs: BatchFeature, - mm_processor_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: """ Given the original multi-modal items for this modality and HF-processed data, output the replacements to perform. - Note: - Even when the HF processor already performs replacement for us, - we still use this replacement information to determine - the placeholder token positions for each multi-modal item. + Notes: + - You should not assume that HF processor always performs prompt + replacement: in :meth:`_apply_hf_processor_missing`, this method + is called on text-only and multimodal-only inputs separately, + instead of passing them in the same call. + - The replacement information returned by this method is also used + to determine the placeholder token positions for each multi-modal + item. """ raise NotImplementedError @@ -651,67 +695,203 @@ class BaseMultiModalProcessor(ABC): return list( iter_placeholders(all_prompt_repls, new_token_ids, mm_item_counts)) - def _get_processor_data( + def _get_hf_mm_data( self, mm_items: MultiModalDataItems, ) -> tuple[dict[str, Any], dict[str, Any]]: processor_data = dict[str, Any]() passthrough_data = dict[str, Any]() - for k, v in mm_items.items(): - # TODO: Make a separate modality for embedding inputs - # to avoid confusion - if k in ("image", "video", "audio"): - if isinstance(v, torch.Tensor) and v.ndim == 3: - # Pass through embedding inputs (single) - passthrough_data[f"{k}_embeds"] = [v] - elif (is_list_of(v, torch.Tensor) and len(v) > 0 - and v[0].ndim == 2): - # Pass through embedding inputs (multi) - passthrough_data[f"{k}_embeds"] = v - else: - # Map keys to plural form, e.g.: image -> images - processor_data[f"{k}s"] = v - else: - processor_data[k] = v + for items in mm_items.values(): + processor_data.update(items.get_processor_data()) + passthrough_data.update(items.get_passthrough_data()) return processor_data, passthrough_data def _call_hf_processor( self, - hf_processor: ProcessorMixin, prompt: str, - processor_data: Mapping[str, object], - mm_processor_kwargs: Mapping[str, object], + # Not to be confused with `mm_data` in `self.apply`. + # This refers to the data to be passed to HF processor. + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], ) -> BatchFeature: + """ + Call the HF processor on the prompt text and + associated multi-modal data. 
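A minimal sketch of the `ProcessingCache` introduced earlier in this file's diff (the model ID and inputs are hypothetical; keys are hashed with blake3):

    from vllm.multimodal.processing import ProcessingCache

    cache = ProcessingCache(capacity=256)

    model_id = "fixie-ai/ultravox-v0_3"       # hypothetical model ID
    item = b"raw audio bytes"                 # the original data item
    kwargs = {"sampling_rate": 16000}         # HF processor kwargs

    assert cache.get(model_id, "audio", item, kwargs) is None    # miss on a cold cache
    cache.put(model_id, "audio", item, kwargs, output_kwargs={})
    assert cache.get(model_id, "audio", item, kwargs) == {}      # hit after put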
+ """ return self.ctx.call_hf_processor( - hf_processor, - prompt, - processor_data, - mm_processor_kwargs, + self._get_hf_processor(**mm_kwargs), + dict(text=prompt, **mm_data), + mm_kwargs, ) def _apply_hf_processor( self, - prompt: str, + prompt_text: str, mm_items: MultiModalDataItems, - mm_processor_kwargs: Mapping[str, object], - ) -> BatchFeature: - # some mm_processor_kwargs may be used in processor initialization - # instead of processor call - hf_processor = self._get_hf_processor(**mm_processor_kwargs) + hf_processor_mm_kwargs: Mapping[str, object], + ) -> tuple[list[int], MultiModalKwargs]: + """ + Wrapper of :meth:`_call_hf_processor` that applies + additional pre-processing and post-processing. + """ + processor_data, passthrough_data = self._get_hf_mm_data(mm_items) - processor_data, passthrough_data = self._get_processor_data(mm_items) - - hf_inputs = self._call_hf_processor( - hf_processor, - prompt=prompt, - processor_data=processor_data, - mm_processor_kwargs=mm_processor_kwargs, + processed_data = self._call_hf_processor( + prompt=prompt_text, + mm_data=processor_data, + mm_kwargs=hf_processor_mm_kwargs, ) - hf_inputs.update(passthrough_data) + processed_data.update(passthrough_data) - return hf_inputs + prompt_ids, = processed_data.pop("input_ids").tolist() + + mm_kwargs = MultiModalKwargs.from_hf_inputs( + processed_data, + self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs), + enable_sanity_checks=self.enable_sanity_checks, + ) + + return prompt_ids, mm_kwargs + + def _apply_hf_processor_missing( + self, + prompt_text: str, + mm_missing_data_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + ): + """ + Apply the HF processor on the full prompt text, but only on the + multi-modal data that are missing from the cache. + + Note: + We pass prompt text and multi-modal data into the HF processor + in separate calls to avoid HF prompt replacement being done for + cached items; instead, we rely on our own prompt replacement logic + (:meth:`_get_prompt_replacements`) for the full text. + """ + mm_missing_counts = mm_missing_data_items.get_all_counts() + + prompt_ids, _ = self._apply_hf_processor( + prompt_text=prompt_text, + mm_items=MultiModalDataItems({}), + hf_processor_mm_kwargs={}, + ) + + # Some HF processors (e.g. Qwen2-VL) expect corresponding + # multi-modal tokens to be in the prompt text + dummy_inputs = self._get_dummy_mm_inputs(mm_missing_counts) + + _, mm_missing_kwargs = self._apply_hf_processor( + prompt_text=dummy_inputs.prompt_text, + mm_items=mm_missing_data_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) + + return prompt_ids, mm_missing_kwargs + + def _cached_apply_hf_processor( + self, + prompt_text: str, + mm_data_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> tuple[list[int], MultiModalKwargs]: + """ + Apply the HF processor on the full prompt text, + caching the results and reusing cached results. 
+ """ + cache = self.cache + model_id = self.ctx.model_config.model + + _, passthrough_data = self._get_hf_mm_data(mm_data_items) + if cache is None or passthrough_data: + return self._apply_hf_processor( + prompt_text=prompt_text, + mm_items=mm_data_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) + + mm_maybe_cached_field_items = { + modality: [ + cache.get(model_id, modality, item, hf_processor_mm_kwargs) + for item in items + ] + for modality, items in mm_data_items.items() + } + + mm_missing_idxs = { + modality: [idx for idx, out in enumerate(fields) if out is None] + for modality, fields in mm_maybe_cached_field_items.items() + } + mm_missing_data = { + modality: [mm_data_items[modality][idx] for idx in idxs] + for modality, idxs in mm_missing_idxs.items() + } + mm_missing_data_items = self._to_mm_items(mm_missing_data) + + prompt_ids, mm_missing_kwargs = self._apply_hf_processor_missing( + prompt_text=prompt_text, + mm_missing_data_items=mm_missing_data_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) + + mm_missing_next_idx = { + modality: 0 + for modality in mm_missing_data_items + } + + mm_merged_field_items = dict[str, list[Mapping[str, + MultiModalFieldItem]]]() + for modality, modal_items_lst in mm_maybe_cached_field_items.items(): + merged_modal_items_lst = list[Mapping[str, MultiModalFieldItem]]() + + for idx, modal_items in enumerate(modal_items_lst): + if modal_items is None: + modal_items = mm_missing_kwargs.get_items_by_modality( + modality, + mm_missing_next_idx[modality], + ) + + cache.put( + model_id, + modality, + mm_data_items[modality][idx], + hf_processor_mm_kwargs, + modal_items, + ) + + mm_missing_next_idx[modality] += 1 + + merged_modal_items_lst.append(modal_items) + + mm_merged_field_items[modality] = merged_modal_items_lst + + if self.enable_sanity_checks: + mm_missing_counts = mm_missing_data_items.get_all_counts() + assert all( + item_count == mm_missing_counts[modality] + for modality, item_count in mm_missing_next_idx.items()), dict( + mm_missing_next_idx=mm_missing_next_idx, + mm_missing_counts=mm_missing_counts) + + mm_kwargs = MultiModalKwargs.from_items_by_modality( + mm_merged_field_items, + enable_sanity_checks=self.enable_sanity_checks, + ) + + if self.enable_sanity_checks: + mm_item_counts = mm_data_items.get_all_counts() + + for modality, item_count in mm_item_counts.items(): + for item_idx in range(item_count): + try: + mm_kwargs.get_items_by_modality(modality, item_idx) + except Exception as e: + # Make it easy to set a breakpoint in the debugger + raise e + + return prompt_ids, mm_kwargs def _bind_prompt_replacements( self, @@ -730,6 +910,10 @@ class BaseMultiModalProcessor(ABC): tokenizer = self._get_tokenizer() token_matches = find_token_matches(token_ids, prompt_repls) + mm_match_counts = { + modality: len(matches) + for modality, matches in full_groupby_modality(token_matches) + } # If the search text does not represent a special token, # it may have different token IDs in the prompt, because @@ -742,8 +926,8 @@ class BaseMultiModalProcessor(ABC): # of the search text in the prompt, we instead perform string # replacement on the decoded token IDs, then encode them back. 
if all( - len(matches) >= mm_item_counts[modality] - for modality, matches in full_groupby_modality(token_matches) + mm_match_counts.get(modality, 0) >= item_count + for modality, item_count in mm_item_counts.items() ): # yapf: disable token_ids = replace_token_matches( token_ids, @@ -775,7 +959,7 @@ class BaseMultiModalProcessor(ABC): self, prompt_text: str, mm_data: MultiModalDataDict, - mm_processor_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputsV2: """ Process multi-modal inputs to be used in vLLM. @@ -790,22 +974,26 @@ class BaseMultiModalProcessor(ABC): 3. Extract information about the placeholder tokens from the processed token IDs. """ - mm_items = self._get_mm_items(mm_data) + mm_items = self._to_mm_items(mm_data) - hf_inputs = self._apply_hf_processor(prompt_text, mm_items, - mm_processor_kwargs) - prompt_ids, = hf_inputs.pop("input_ids").tolist() - mm_kwargs = MultiModalKwargs(hf_inputs) + prompt_ids, mm_kwargs = self._cached_apply_hf_processor( + prompt_text, + mm_items, + hf_processor_mm_kwargs, + ) - prompt_repls = self._get_prompt_replacements(mm_items, hf_inputs, - mm_processor_kwargs) - all_prompt_repls = self._bind_prompt_replacements(prompt_repls) + unbound_prompt_repls = self._get_prompt_replacements( + mm_items, + hf_processor_mm_kwargs, + mm_kwargs, + ) + prompt_repls = self._bind_prompt_replacements(unbound_prompt_repls) # If HF processor already inserts placeholder tokens, # there is no need for us to insert them - mm_item_counts = mm_items.get_item_counts() - all_placeholders = self._find_placeholders(all_prompt_repls, - prompt_ids, mm_item_counts) + mm_item_counts = mm_items.get_all_counts() + all_placeholders = self._find_placeholders(prompt_repls, prompt_ids, + mm_item_counts) if all_placeholders: tokenizer = self._get_tokenizer() @@ -817,7 +1005,7 @@ class BaseMultiModalProcessor(ABC): all_placeholders, ) = self._apply_prompt_replacements( prompt_ids, - all_prompt_repls, + prompt_repls, mm_item_counts, ) @@ -834,6 +1022,36 @@ class BaseMultiModalProcessor(ABC): mm_placeholders=mm_placeholders, ) + def _get_dummy_audios( + self, + *, + length: int, + num_audios: int, + ) -> list[npt.NDArray]: + audio = np.zeros((length, )) + return [audio] * num_audios + + def _get_dummy_images( + self, + *, + width: int, + height: int, + num_images: int, + ) -> list[Image.Image]: + image = Image.new("RGB", (width, height), color=0) + return [image] * num_images + + def _get_dummy_videos( + self, + *, + width: int, + height: int, + num_frames: int, + num_videos: int, + ) -> list[npt.NDArray]: + video = np.zeros((num_frames, width, height, 3)) + return [video] * num_videos + @abstractmethod def _get_dummy_mm_inputs( self, @@ -855,23 +1073,29 @@ class BaseMultiModalProcessor(ABC): from vllm.sequence import SequenceData processor_inputs = self._get_dummy_mm_inputs(mm_counts) - mm_inputs = self.apply(*processor_inputs) + mm_inputs = self.apply( + prompt_text=processor_inputs.prompt_text, + mm_data=processor_inputs.mm_data, + hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, + ) prompt_token_ids = mm_inputs["prompt_token_ids"] placeholders_by_modality = mm_inputs["mm_placeholders"] - total_placeholders_by_modality = dict[str, int]() - for modality, placeholders in placeholders_by_modality.items(): - num_placeholders = sum(item["length"] for item in placeholders) - max_tokens = mm_max_tokens[modality] - - if num_placeholders != max_tokens: - logger.warning( - "The processed dummy data has a total of %d placeholder " - 
"tokens for the '%s' modality, which is not the expected " - "%d tokens.", num_placeholders, modality, max_tokens) - - total_placeholders_by_modality[modality] = num_placeholders + total_placeholders_by_modality = { + modality: sum(item["length"] for item in placeholders) + for modality, placeholders in placeholders_by_modality.items() + } + expected_placeholders_by_modality = { + modality: mm_max_tokens[modality] + for modality in placeholders_by_modality + } + if total_placeholders_by_modality != expected_placeholders_by_modality: + raise AssertionError( + f"The processed dummy data has a total of " + f"{total_placeholders_by_modality} placeholder tokens, which " + f"is not the expected {expected_placeholders_by_modality} " + "tokens.") total_len = len(prompt_token_ids) if total_len > seq_len: diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index ded45a7184b5d..3a5e11867ad9e 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -1,10 +1,9 @@ import functools from collections import UserDict -from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, Optional, +from typing import (TYPE_CHECKING, Any, Dict, Mapping, Optional, Protocol, Sequence, Type, TypeVar) import torch.nn as nn -from typing_extensions import TypeAlias from vllm.inputs import InputProcessingContext from vllm.logger import init_logger @@ -15,7 +14,7 @@ from .audio import AudioPlugin from .base import MultiModalInputMapper, MultiModalPlugin, MultiModalTokensCalc from .image import ImagePlugin from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors -from .processing import BaseMultiModalProcessor +from .processing import BaseMultiModalProcessor, ProcessingCache from .video import VideoPlugin if TYPE_CHECKING: @@ -23,15 +22,22 @@ if TYPE_CHECKING: logger = init_logger(__name__) +# TODO: Tune the MM cache size +MM_CACHE_SIZE = 256 + N = TypeVar("N", bound=Type[nn.Module]) -MultiModalProcessorFactory: TypeAlias = Callable[[InputProcessingContext], - BaseMultiModalProcessor] -""" -Constructs a :class:`MultiModalProcessor` instance from the context. -The processing metadata should be derived from the context. -""" +class MultiModalProcessorFactory(Protocol): + """Constructs a :class:`MultiModalProcessor` instance from the context.""" + + def __call__( + self, + ctx: InputProcessingContext, + *, + cache: Optional[ProcessingCache] = None, + ) -> BaseMultiModalProcessor: + ... class _MultiModalLimits(UserDict["ModelConfig", Dict[str, int]]): @@ -71,6 +77,8 @@ class MultiModalRegistry: self._limits_by_model = _MultiModalLimits() + self._processing_cache = ProcessingCache(MM_CACHE_SIZE) + def register_plugin(self, plugin: MultiModalPlugin) -> None: """ Register a multi-modal plugin so it can be recognized by vLLM. @@ -328,15 +336,18 @@ class MultiModalRegistry: return wrapper - def has_processor(self, model_config: "ModelConfig") -> bool: - """ - Test whether a multi-modal processor is defined for a specific model. - """ + def _get_model_cls(self, model_config: "ModelConfig"): # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture model_cls, _ = get_model_architecture(model_config) - return model_cls in self._processor_factories + return model_cls + + def has_processor(self, model_config: "ModelConfig") -> bool: + """ + Test whether a multi-modal processor is defined for a specific model. 
+ """ + return self._get_model_cls(model_config) in self._processor_factories def create_processor( self, @@ -346,12 +357,11 @@ class MultiModalRegistry: """ Create a multi-modal processor for a specific model and tokenizer. """ - - # Avoid circular import - from vllm.model_executor.model_loader import get_model_architecture - - model_cls, _ = get_model_architecture(model_config) + model_cls = self._get_model_cls(model_config) processor_factory = self._processor_factories[model_cls] ctx = InputProcessingContext(model_config, tokenizer) - return processor_factory(ctx) + cache = (None if model_config.disable_mm_preprocessor_cache else + self._processing_cache) + + return processor_factory(ctx, cache=cache) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index a49da2bdee972..7b6ded6a27084 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,8 +1,7 @@ -import base64 -import os from functools import lru_cache -from io import BytesIO -from typing import List, Optional, Tuple, TypeVar, Union +from pathlib import Path +from typing import Optional, TypeVar, Union +from urllib.parse import ParseResult, urlparse import numpy as np import numpy.typing as npt @@ -10,283 +9,246 @@ import torch from PIL import Image import vllm.envs as envs -from vllm.connections import global_http_connection +from vllm.connections import HTTPConnection, global_http_connection from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer -from vllm.utils import PlaceholderModule -from .inputs import MultiModalDataDict, PlaceholderRange - -try: - import decord -except ImportError: - decord = PlaceholderModule("decord") # type: ignore[assignment] - -try: - import librosa -except ImportError: - librosa = PlaceholderModule("librosa") # type: ignore[assignment] - -try: - import soundfile -except ImportError: - soundfile = PlaceholderModule("soundfile") # type: ignore[assignment] +from .audio import AudioMediaIO +from .base import MediaIO +from .image import ImageMediaIO +from .inputs import PlaceholderRange +from .video import VideoMediaIO logger = init_logger(__name__) cached_get_tokenizer = lru_cache(get_tokenizer) - -def _load_image_from_bytes(b: bytes) -> Image.Image: - image = Image.open(BytesIO(b)) - image.load() - return image +_M = TypeVar("_M") -def _is_subpath(image_path: str, allowed_local_media_path: str) -> bool: - # Get the common path - common_path = os.path.commonpath([ - os.path.abspath(image_path), - os.path.abspath(allowed_local_media_path) - ]) - # Check if the common path is the same as allowed_local_media_path - return common_path == os.path.abspath(allowed_local_media_path) +class MediaConnector: + def __init__( + self, + connection: HTTPConnection = global_http_connection, + *, + allowed_local_media_path: str = "", + ) -> None: + super().__init__() -def _load_image_from_file(image_url: str, - allowed_local_media_path: str) -> Image.Image: - if not allowed_local_media_path: - raise ValueError("Invalid 'image_url': Cannot load local files without" - "'--allowed-local-media-path'.") - if allowed_local_media_path: - if not os.path.exists(allowed_local_media_path): + self.connection = connection + + if allowed_local_media_path: + allowed_local_media_path_ = Path(allowed_local_media_path) + + if not allowed_local_media_path_.exists(): + raise ValueError( + "Invalid `--allowed-local-media-path`: The path " + f"{allowed_local_media_path_} does not exist.") + if not allowed_local_media_path_.is_dir(): + raise ValueError( + 
"Invalid `--allowed-local-media-path`: The path " + f"{allowed_local_media_path_} must be a directory.") + else: + allowed_local_media_path_ = None + + self.allowed_local_media_path = allowed_local_media_path_ + + def _load_data_url( + self, + url_spec: ParseResult, + media_io: MediaIO[_M], + ) -> _M: + data_spec, data = url_spec.path.split(",", 1) + media_type, data_type = data_spec.split(";", 1) + + if data_type != "base64": + msg = "Only base64 data URLs are supported for now." + raise NotImplementedError(msg) + + return media_io.load_base64(media_type, data) + + def _load_file_url( + self, + url_spec: ParseResult, + media_io: MediaIO[_M], + ) -> _M: + allowed_local_media_path = self.allowed_local_media_path + if allowed_local_media_path is None: + raise RuntimeError("Cannot load local files without " + "`--allowed-local-media-path`.") + + filepath = Path(url_spec.path) + if allowed_local_media_path not in filepath.resolve().parents: raise ValueError( - "Invalid '--allowed-local-media-path': " - f"The path {allowed_local_media_path} does not exist.") - if not os.path.isdir(allowed_local_media_path): - raise ValueError( - "Invalid '--allowed-local-media-path': " - f"The path {allowed_local_media_path} must be a directory.") + f"The file path {filepath} must be a subpath " + f"of `--allowed-local-media-path` {allowed_local_media_path}.") - # Only split once and assume the second part is the image path - _, image_path = image_url.split("file://", 1) - if not _is_subpath(image_path, allowed_local_media_path): - raise ValueError( - f"Invalid 'image_url': The file path {image_path} must" - " be a subpath of '--allowed-local-media-path'" - f" '{allowed_local_media_path}'.") + return media_io.load_file(filepath) - image = Image.open(image_path) - image.load() - return image + def load_from_url( + self, + url: str, + media_io: MediaIO[_M], + *, + fetch_timeout: Optional[int] = None, + ) -> _M: + url_spec = urlparse(url) + if url_spec.scheme.startswith("http"): + connection = self.connection + data = connection.get_bytes(url, timeout=fetch_timeout) -def _load_image_from_data_url(image_url: str) -> Image.Image: - # Only split once and assume the second part is the base64 encoded image - _, image_base64 = image_url.split(",", 1) - return load_image_from_base64(image_base64) + return media_io.load_bytes(data) + if url_spec.scheme == "data": + return self._load_data_url(url_spec, media_io) -def fetch_image(image_url: str, - *, - image_mode: str = "RGB", - allowed_local_media_path: str = "") -> Image.Image: - """ - Load a PIL image from a HTTP or base64 data URL. + if url_spec.scheme == "file": + return self._load_file_url(url_spec, media_io) - By default, the image is converted into RGB format. - """ - if image_url.startswith('http'): - image_raw = global_http_connection.get_bytes( - image_url, - timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, - ) - image = _load_image_from_bytes(image_raw) + msg = "The URL must be either a HTTP, data or file URL." 
+ raise ValueError(msg) - elif image_url.startswith('data:image'): - image = _load_image_from_data_url(image_url) - elif image_url.startswith('file://'): - image = _load_image_from_file(image_url, allowed_local_media_path) - else: - raise ValueError("Invalid 'image_url': A valid 'image_url' must start " - "with either 'data:image', 'file://' or 'http'.") + async def load_from_url_async( + self, + url: str, + media_io: MediaIO[_M], + *, + fetch_timeout: Optional[int] = None, + ) -> _M: + url_spec = urlparse(url) - return image.convert(image_mode) + if url_spec.scheme.startswith("http"): + connection = self.connection + data = await connection.async_get_bytes(url, timeout=fetch_timeout) + return media_io.load_bytes(data) -async def async_fetch_image(image_url: str, - *, - image_mode: str = "RGB", - allowed_local_media_path: str = "") -> Image.Image: - """ - Asynchronously load a PIL image from a HTTP or base64 data URL. + if url_spec.scheme == "data": + return self._load_data_url(url_spec, media_io) - By default, the image is converted into RGB format. - """ - if image_url.startswith('http'): - image_raw = await global_http_connection.async_get_bytes( - image_url, - timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, - ) - image = _load_image_from_bytes(image_raw) + if url_spec.scheme == "file": + return self._load_file_url(url_spec, media_io) - elif image_url.startswith('data:image'): - image = _load_image_from_data_url(image_url) - elif image_url.startswith('file://'): - image = _load_image_from_file(image_url, allowed_local_media_path) - else: - raise ValueError("Invalid 'image_url': A valid 'image_url' must start " - "with either 'data:image', 'file://' or 'http'.") + msg = "The URL must be either a HTTP, data or file URL." + raise ValueError(msg) - return image.convert(image_mode) + def fetch_audio( + self, + audio_url: str, + ) -> tuple[np.ndarray, Union[int, float]]: + """ + Load audio from a URL. + """ + audio_io = AudioMediaIO() - -def _load_video_from_bytes(b: bytes, num_frames: int = 32) -> npt.NDArray: - video_path = BytesIO(b) - vr = decord.VideoReader(video_path, num_threads=1) - total_frame_num = len(vr) - - if total_frame_num > num_frames: - uniform_sampled_frames = np.linspace(0, - total_frame_num - 1, - num_frames, - dtype=int) - frame_idx = uniform_sampled_frames.tolist() - else: - frame_idx = [i for i in range(0, total_frame_num)] - frames = vr.get_batch(frame_idx).asnumpy() - - return frames - - -def _load_video_from_data_url(video_url: str) -> npt.NDArray: - # Only split once and assume the second part is the base64 encoded video - _, video_base64 = video_url.split(",", 1) - - if video_url.startswith("data:video/jpeg;"): - return np.stack([ - np.array(load_image_from_base64(frame_base64)) - for frame_base64 in video_base64.split(",") - ]) - - return load_video_from_base64(video_base64) - - -def fetch_video(video_url: str, *, num_frames: int = 32) -> npt.NDArray: - """ - Load video from a HTTP or base64 data URL. 
- """ - if video_url.startswith('http') or video_url.startswith('https'): - video_raw = global_http_connection.get_bytes( - video_url, - timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT, - ) - video = _load_video_from_bytes(video_raw, num_frames) - elif video_url.startswith('data:video'): - video = _load_video_from_data_url(video_url) - else: - raise ValueError("Invalid 'video_url': A valid 'video_url' must start " - "with either 'data:video' or 'http'.") - return video - - -async def async_fetch_video(video_url: str, - *, - num_frames: int = 32) -> npt.NDArray: - """ - Asynchronously load video from a HTTP or base64 data URL. - - By default, the image is converted into RGB format. - """ - if video_url.startswith('http') or video_url.startswith('https'): - video_raw = await global_http_connection.async_get_bytes( - video_url, - timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT, - ) - video = _load_video_from_bytes(video_raw, num_frames) - elif video_url.startswith('data:video'): - video = _load_video_from_data_url(video_url) - else: - raise ValueError("Invalid 'video_url': A valid 'video_url' must start " - "with either 'data:video' or 'http'.") - return video - - -def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]: - """ - Load audio from a URL. - """ - if audio_url.startswith("http"): - audio_bytes = global_http_connection.get_bytes( + return self.load_from_url( audio_url, - timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT, + audio_io, + fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT, ) - elif audio_url.startswith("data:audio"): - _, audio_base64 = audio_url.split(",", 1) - audio_bytes = base64.b64decode(audio_base64) - else: - raise ValueError("Invalid 'audio_url': A valid 'audio_url' must start " - "with either 'data:audio' or 'http'.") - return librosa.load(BytesIO(audio_bytes), sr=None) + async def fetch_audio_async( + self, + audio_url: str, + ) -> tuple[np.ndarray, Union[int, float]]: + """ + Asynchronously fetch audio from a URL. + """ + audio_io = AudioMediaIO() - -async def async_fetch_audio( - audio_url: str) -> Tuple[np.ndarray, Union[int, float]]: - """ - Asynchronously fetch audio from a URL. - """ - if audio_url.startswith("http"): - audio_bytes = await global_http_connection.async_get_bytes( + return await self.load_from_url_async( audio_url, - timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT, + audio_io, + fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT, ) - elif audio_url.startswith("data:audio"): - _, audio_base64 = audio_url.split(",", 1) - audio_bytes = base64.b64decode(audio_base64) - else: - raise ValueError("Invalid 'audio_url': A valid 'audio_url' must start " - "with either 'data:audio' or 'http'.") - return librosa.load(BytesIO(audio_bytes), sr=None) - - -def get_and_parse_audio(audio_url: str) -> MultiModalDataDict: - audio, sr = fetch_audio(audio_url) - return {"audio": (audio, sr)} - - -def get_and_parse_image( + def fetch_image( + self, image_url: str, *, - allowed_local_media_path: str = "") -> MultiModalDataDict: - image = fetch_image(image_url, - allowed_local_media_path=allowed_local_media_path) - return {"image": image} + image_mode: str = "RGB", + ) -> Image.Image: + """ + Load a PIL image from a HTTP or base64 data URL. + By default, the image is converted into RGB format. 
+ """ + image_io = ImageMediaIO(image_mode=image_mode) -def get_and_parse_video(video_url: str) -> MultiModalDataDict: - video = fetch_video(video_url) - return {"video": video} + return self.load_from_url( + image_url, + image_io, + fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, + ) - -async def async_get_and_parse_audio(audio_url: str) -> MultiModalDataDict: - audio, sr = await async_fetch_audio(audio_url) - return {"audio": (audio, sr)} - - -async def async_get_and_parse_image( + async def fetch_image_async( + self, image_url: str, *, - allowed_local_media_path: str = "") -> MultiModalDataDict: - image = await async_fetch_image( - image_url, allowed_local_media_path=allowed_local_media_path) - return {"image": image} + image_mode: str = "RGB", + ) -> Image.Image: + """ + Asynchronously load a PIL image from a HTTP or base64 data URL. + + By default, the image is converted into RGB format. + """ + image_io = ImageMediaIO(image_mode=image_mode) + + return await self.load_from_url_async( + image_url, + image_io, + fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, + ) + + def fetch_video( + self, + video_url: str, + *, + image_mode: str = "RGB", + num_frames: int = 32, + ) -> npt.NDArray: + """ + Load video from a HTTP or base64 data URL. + """ + image_io = ImageMediaIO(image_mode=image_mode) + video_io = VideoMediaIO(image_io, num_frames=num_frames) + + return self.load_from_url( + video_url, + video_io, + fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT, + ) + + async def fetch_video_async( + self, + video_url: str, + *, + image_mode: str = "RGB", + num_frames: int = 32, + ) -> npt.NDArray: + """ + Asynchronously load video from a HTTP or base64 data URL. + + By default, the image is converted into RGB format. + """ + image_io = ImageMediaIO(image_mode=image_mode) + video_io = VideoMediaIO(image_io, num_frames=num_frames) + + return await self.load_from_url_async( + video_url, + video_io, + fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT, + ) -async def async_get_and_parse_video(video_url: str) -> MultiModalDataDict: - video = await async_fetch_video(video_url) - return {"video": video} +global_media_connector = MediaConnector() +"""The global :class:`MediaConnector` instance used by vLLM.""" + +fetch_audio = global_media_connector.fetch_audio +fetch_image = global_media_connector.fetch_image +fetch_video = global_media_connector.fetch_video def encode_audio_base64( @@ -294,10 +256,8 @@ def encode_audio_base64( sampling_rate: int, ) -> str: """Encode audio as base64.""" - buffered = BytesIO() - soundfile.write(buffered, audio, sampling_rate, format="WAV") - - return base64.b64encode(buffered.getvalue()).decode('utf-8') + audio_io = AudioMediaIO() + return audio_io.encode_base64((audio, sampling_rate)) def encode_image_base64( @@ -311,29 +271,14 @@ def encode_image_base64( By default, the image is converted into RGB format before being encoded. 
""" - buffered = BytesIO() - image = image.convert(image_mode) - image.save(buffered, format) - return base64.b64encode(buffered.getvalue()).decode('utf-8') - - -def load_image_from_base64(image: Union[bytes, str]) -> Image.Image: - """Load image from base64 format.""" - return _load_image_from_bytes(base64.b64decode(image)) + image_io = ImageMediaIO(image_mode=image_mode) + return image_io.encode_base64(image, image_format=format) def encode_video_base64(frames: npt.NDArray) -> str: - base64_frames = [] - frames_list = [frames[i] for i in range(frames.shape[0])] - for frame in frames_list: - img_base64 = encode_image_base64(Image.fromarray(frame)) - base64_frames.append(img_base64) - return ",".join(base64_frames) - - -def load_video_from_base64(video: Union[bytes, str]) -> npt.NDArray: - """Load video from base64 format.""" - return _load_video_from_bytes(base64.b64decode(video)) + image_io = ImageMediaIO() + video_io = VideoMediaIO(image_io) + return video_io.encode_base64(frames) def resolve_visual_encoder_outputs( @@ -389,7 +334,7 @@ def repeat_and_pad_token( repeat_count: int = 1, pad_token_left: Optional[_T] = None, pad_token_right: Optional[_T] = None, -) -> List[_T]: +) -> list[_T]: replacement = [token] * repeat_count if pad_token_left is not None: replacement = [pad_token_left] + replacement @@ -402,13 +347,13 @@ def repeat_and_pad_token( def repeat_and_pad_placeholder_tokens( tokenizer: AnyTokenizer, prompt: Optional[str], - prompt_token_ids: List[int], + prompt_token_ids: list[int], *, placeholder_token_id: int, - repeat_count: Union[int, List[int]], + repeat_count: Union[int, list[int]], pad_token_left: Optional[int] = None, pad_token_right: Optional[int] = None, -) -> Tuple[Optional[str], List[int], List[PlaceholderRange]]: +) -> tuple[Optional[str], list[int], list[PlaceholderRange]]: if isinstance(repeat_count, int): repeat_count = [repeat_count] @@ -450,20 +395,24 @@ def repeat_and_pad_placeholder_tokens( new_prompt += prompt_parts[i] + replacement_str new_prompt += prompt_parts[-1] - new_token_ids: List[int] = [] - placeholder_ranges: List[PlaceholderRange] = [] + new_token_ids = list[int]() + placeholder_ranges = list[PlaceholderRange]() placeholder_token_idx = 0 for i, token in enumerate(prompt_token_ids): if token == placeholder_token_id: + curr_repeat_count = repeat_count[placeholder_token_idx] replacement_ids = repeat_and_pad_token( placeholder_token_id, - repeat_count=repeat_count[placeholder_token_idx], + repeat_count=curr_repeat_count, pad_token_left=pad_token_left, pad_token_right=pad_token_right, ) + offset = len(new_token_ids) + if pad_token_left is not None: + offset += 1 placeholder_ranges.append({ - "offset": len(new_token_ids), - "length": len(replacement_ids) + "offset": offset, + "length": curr_repeat_count, }) new_token_ids.extend(replacement_ids) placeholder_token_idx += 1 @@ -481,7 +430,7 @@ def repeat_and_pad_placeholder_tokens( def consecutive_placeholder_ranges( num_items: int, item_size: int, - initial_offset: int = 0) -> List[PlaceholderRange]: + initial_offset: int = 0) -> list[PlaceholderRange]: """Returns a list of consecutive PlaceholderRanges of a fixed size""" return [ diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index c4be100562703..1ad1f5abc27a2 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -1,23 +1,32 @@ -from functools import lru_cache +import base64 +from functools import lru_cache, partial +from io import BytesIO +from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, 
Optional import cv2 import numpy as np import numpy.typing as npt +from PIL import Image from vllm.inputs.registry import InputContext from vllm.logger import init_logger from vllm.transformers_utils.processor import get_video_processor from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.utils import is_list_of +from vllm.utils import PlaceholderModule, is_list_of -from .base import MultiModalData -from .image import ImagePlugin +from .base import MediaIO, ModalityData +from .image import ImageMediaIO, ImagePlugin from .inputs import MultiModalKwargs, VideoItem if TYPE_CHECKING: from vllm.config import ModelConfig +try: + import decord +except ImportError: + decord = PlaceholderModule("decord") # type: ignore[assignment] + logger = init_logger(__name__) cached_get_video_processor = lru_cache(get_video_processor) @@ -45,7 +54,7 @@ class VideoPlugin(ImagePlugin): def _default_input_mapper( self, ctx: InputContext, - data: MultiModalData[VideoItem], + data: ModalityData[VideoItem], **mm_processor_kwargs, ) -> MultiModalKwargs: model_config = ctx.model_config @@ -107,3 +116,73 @@ def sample_frames_from_video(frames: npt.NDArray, frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int) sampled_frames = frames[frame_indices, ...] return sampled_frames + + +class VideoMediaIO(MediaIO[npt.NDArray]): + + def __init__( + self, + image_io: ImageMediaIO, + *, + num_frames: int = 32, + ) -> None: + super().__init__() + + self.image_io = image_io + self.num_frames = num_frames + + def load_bytes(self, data: bytes) -> npt.NDArray: + vr = decord.VideoReader(BytesIO(data), num_threads=1) + total_frame_num = len(vr) + + num_frames = self.num_frames + if total_frame_num > num_frames: + uniform_sampled_frames = np.linspace(0, + total_frame_num - 1, + num_frames, + dtype=int) + frame_idx = uniform_sampled_frames.tolist() + else: + frame_idx = list(range(0, total_frame_num)) + + return vr.get_batch(frame_idx).asnumpy() + + def load_base64(self, media_type: str, data: str) -> npt.NDArray: + if media_type.lower() == "video/jpeg": + load_frame = partial( + self.image_io.load_base64, + "image/jpeg", + ) + + return np.stack([ + np.array(load_frame(frame_data)) + for frame_data in data.split(",") + ]) + + return self.load_bytes(base64.b64decode(data)) + + def load_file(self, filepath: Path) -> npt.NDArray: + with filepath.open("rb") as f: + data = f.read() + + return self.load_bytes(data) + + def encode_base64( + self, + media: npt.NDArray, + *, + video_format: str = "JPEG", + ) -> str: + video = media + + if video_format == "JPEG": + encode_frame = partial( + self.image_io.encode_base64, + image_format=video_format, + ) + + return ",".join( + encode_frame(Image.fromarray(frame)) for frame in video) + + msg = "Only JPEG format is supported for now." 
+ raise NotImplementedError(msg) diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 419237c252ffd..f6ac14446c021 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -1,123 +1,223 @@ +import logging +import traceback +from itertools import chain +from typing import TYPE_CHECKING, Optional + +from vllm.plugins import load_plugins_by_group +from vllm.utils import resolve_obj_by_qualname + from .interface import _Backend # noqa: F401 -from .interface import CpuArchEnum, Platform, PlatformEnum, UnspecifiedPlatform +from .interface import CpuArchEnum, Platform, PlatformEnum -current_platform: Platform +logger = logging.getLogger(__name__) -# NOTE: we don't use `torch.version.cuda` / `torch.version.hip` because -# they only indicate the build configuration, not the runtime environment. -# For example, people can install a cuda build of pytorch but run on tpu. -is_tpu = False -try: - # While it's technically possible to install libtpu on a non-TPU machine, - # this is a very uncommon scenario. Therefore, we assume that libtpu is - # installed if and only if the machine has TPUs. - import libtpu # noqa: F401 - is_tpu = True -except Exception: - pass - -is_cuda = False - -try: - import pynvml - pynvml.nvmlInit() +def tpu_platform_plugin() -> Optional[str]: + is_tpu = False try: - if pynvml.nvmlDeviceGetCount() > 0: + # While it's technically possible to install libtpu on a + # non-TPU machine, this is a very uncommon scenario. Therefore, + # we assume that libtpu is installed if and only if the machine + # has TPUs. + import libtpu # noqa: F401 + is_tpu = True + except Exception: + pass + + return "vllm.platforms.tpu.TpuPlatform" if is_tpu else None + + +def cuda_platform_plugin() -> Optional[str]: + is_cuda = False + + try: + import pynvml + pynvml.nvmlInit() + try: + if pynvml.nvmlDeviceGetCount() > 0: + is_cuda = True + finally: + pynvml.nvmlShutdown() + except Exception: + # CUDA is supported on Jetson, but NVML may not be. + import os + + def cuda_is_jetson() -> bool: + return os.path.isfile("/etc/nv_tegra_release") \ + or os.path.exists("/sys/class/tegra-firmware") + + if cuda_is_jetson(): is_cuda = True - finally: - pynvml.nvmlShutdown() -except Exception: - # CUDA is supported on Jetson, but NVML may not be. - import os - def cuda_is_jetson() -> bool: - return os.path.isfile("/etc/nv_tegra_release") \ - or os.path.exists("/sys/class/tegra-firmware") + return "vllm.platforms.cuda.CudaPlatform" if is_cuda else None - if cuda_is_jetson(): - is_cuda = True -is_rocm = False +def rocm_platform_plugin() -> Optional[str]: + is_rocm = False -try: - import amdsmi - amdsmi.amdsmi_init() try: - if len(amdsmi.amdsmi_get_processor_handles()) > 0: - is_rocm = True - finally: - amdsmi.amdsmi_shut_down() -except Exception: - pass + import amdsmi + amdsmi.amdsmi_init() + try: + if len(amdsmi.amdsmi_get_processor_handles()) > 0: + is_rocm = True + finally: + amdsmi.amdsmi_shut_down() + except Exception: + pass -is_hpu = False -try: - from importlib import util - is_hpu = util.find_spec('habana_frameworks') is not None -except Exception: - pass + return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None -is_xpu = False -try: - # installed IPEX if the machine has XPUs. 
- import intel_extension_for_pytorch # noqa: F401 - import oneccl_bindings_for_pytorch # noqa: F401 - import torch - if hasattr(torch, 'xpu') and torch.xpu.is_available(): - is_xpu = True -except Exception: - pass +def hpu_platform_plugin() -> Optional[str]: + is_hpu = False + try: + from importlib import util + is_hpu = util.find_spec('habana_frameworks') is not None + except Exception: + pass -is_cpu = False -try: - from importlib.metadata import version - is_cpu = "cpu" in version("vllm") -except Exception: - pass + return "vllm.platforms.hpu.HpuPlatform" if is_hpu else None -is_neuron = False -try: - import transformers_neuronx # noqa: F401 - is_neuron = True -except ImportError: - pass -is_openvino = False -try: - from importlib.metadata import version - is_openvino = "openvino" in version("vllm") -except Exception: - pass +def xpu_platform_plugin() -> Optional[str]: + is_xpu = False -if is_tpu: - # people might install pytorch built with cuda but run on tpu - # so we need to check tpu first - from .tpu import TpuPlatform - current_platform = TpuPlatform() -elif is_cuda: - from .cuda import CudaPlatform - current_platform = CudaPlatform() -elif is_rocm: - from .rocm import RocmPlatform - current_platform = RocmPlatform() -elif is_hpu: - from .hpu import HpuPlatform - current_platform = HpuPlatform() -elif is_xpu: - from .xpu import XPUPlatform - current_platform = XPUPlatform() -elif is_cpu: - from .cpu import CpuPlatform - current_platform = CpuPlatform() -elif is_neuron: - from .neuron import NeuronPlatform - current_platform = NeuronPlatform() -elif is_openvino: - from .openvino import OpenVinoPlatform - current_platform = OpenVinoPlatform() -else: - current_platform = UnspecifiedPlatform() + try: + # installed IPEX if the machine has XPUs. 
+ import intel_extension_for_pytorch # noqa: F401 + import oneccl_bindings_for_pytorch # noqa: F401 + import torch + if hasattr(torch, 'xpu') and torch.xpu.is_available(): + is_xpu = True + except Exception: + pass -__all__ = ['Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum'] + return "vllm.platforms.xpu.XPUPlatform" if is_xpu else None + + +def cpu_platform_plugin() -> Optional[str]: + is_cpu = False + try: + from importlib.metadata import version + is_cpu = "cpu" in version("vllm") + except Exception: + pass + + return "vllm.platforms.cpu.CpuPlatform" if is_cpu else None + + +def neuron_platform_plugin() -> Optional[str]: + is_neuron = False + try: + import transformers_neuronx # noqa: F401 + is_neuron = True + except ImportError: + pass + + return "vllm.platforms.neuron.NeuronPlatform" if is_neuron else None + + +def openvino_platform_plugin() -> Optional[str]: + is_openvino = False + try: + from importlib.metadata import version + is_openvino = "openvino" in version("vllm") + except Exception: + pass + + return "vllm.platforms.openvino.OpenVinoPlatform" if is_openvino else None + + +builtin_platform_plugins = { + 'tpu': tpu_platform_plugin, + 'cuda': cuda_platform_plugin, + 'rocm': rocm_platform_plugin, + 'hpu': hpu_platform_plugin, + 'xpu': xpu_platform_plugin, + 'cpu': cpu_platform_plugin, + 'neuron': neuron_platform_plugin, + 'openvino': openvino_platform_plugin, +} + + +def resolve_current_platform_cls_qualname() -> str: + platform_plugins = load_plugins_by_group('vllm.platform_plugins') + + activated_plugins = [] + + for name, func in chain(builtin_platform_plugins.items(), + platform_plugins.items()): + try: + assert callable(func) + platform_cls_qualname = func() + if platform_cls_qualname is not None: + activated_plugins.append(name) + except Exception: + pass + + activated_builtin_plugins = list( + set(activated_plugins) & set(builtin_platform_plugins.keys())) + activated_oot_plugins = list( + set(activated_plugins) & set(platform_plugins.keys())) + + if len(activated_oot_plugins) >= 2: + raise RuntimeError( + "Only one platform plugin can be activated, but got: " + f"{activated_oot_plugins}") + elif len(activated_oot_plugins) == 1: + platform_cls_qualname = platform_plugins[activated_oot_plugins[0]]() + logger.info("Platform plugin %s is activated", + activated_oot_plugins[0]) + elif len(activated_builtin_plugins) >= 2: + raise RuntimeError( + "Only one platform plugin can be activated, but got: " + f"{activated_builtin_plugins}") + elif len(activated_builtin_plugins) == 1: + platform_cls_qualname = builtin_platform_plugins[ + activated_builtin_plugins[0]]() + logger.info("Automatically detected platform %s.", + activated_builtin_plugins[0]) + else: + platform_cls_qualname = "vllm.interface.UnspecifiedPlatform" + logger.info( + "No platform detected, vLLM is running on UnspecifiedPlatform") + return platform_cls_qualname + + +_current_platform = None +_init_trace: str = '' + +if TYPE_CHECKING: + current_platform: Platform + + +def __getattr__(name: str): + if name == 'current_platform': + # lazy init current_platform. + # 1. out-of-tree platform plugins need `from vllm.platforms import + # Platform` so that they can inherit `Platform` class. Therefore, + # we cannot resolve `current_platform` during the import of + # `vllm.platforms`. + # 2. 
when users use out-of-tree platform plugins, they might run + # `import vllm`, some vllm internal code might access + # `current_platform` during the import, and we need to make sure + # `current_platform` is only resolved after the plugins are loaded + # (we have tests for this, if any developer violate this, they will + # see the test failures). + global _current_platform + if _current_platform is None: + platform_cls_qualname = resolve_current_platform_cls_qualname() + _current_platform = resolve_obj_by_qualname( + platform_cls_qualname)() + global _init_trace + _init_trace = "".join(traceback.format_stack()) + return _current_platform + else: + return globals()[name] + + +__all__ = [ + 'Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum', + "_init_trace" +] diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 4150b0cdf836a..ddccaa2ce0148 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -199,6 +199,18 @@ class Platform: """ pass + @classmethod + def verify_model_arch(cls, model_arch: str) -> None: + """ + Verify whether the current platform supports the specified model + architecture. + + - This will raise an Error or Warning based on the model support on + the current platform. + - By default all models are considered supported. + """ + pass + @classmethod def verify_quantization(cls, quant: str) -> None: """ diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 7778b565372cb..aa779f265135f 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -1,6 +1,6 @@ import os from functools import lru_cache -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Dict, List, Optional import torch @@ -33,6 +33,31 @@ if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) in ["fork", None]: " `spawn` instead.") os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +# Models not supported by ROCm. +_ROCM_UNSUPPORTED_MODELS: List[str] = [] + +# Models partially supported by ROCm. +# Architecture -> Reason. +_ROCM_SWA_REASON = ("Sliding window attention (SWA) is not yet supported in " + "Triton flash attention. For half-precision SWA support, " + "please use CK flash attention by setting " + "`VLLM_USE_TRITON_FLASH_ATTN=0`") +_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = { + "Qwen2ForCausalLM": + _ROCM_SWA_REASON, + "MistralForCausalLM": + _ROCM_SWA_REASON, + "MixtralForCausalLM": + _ROCM_SWA_REASON, + "PaliGemmaForConditionalGeneration": + ("ROCm flash attention does not yet " + "fully support 32-bit precision on PaliGemma"), + "Phi3VForCausalLM": + ("ROCm Triton flash attention may run into compilation errors due to " + "excessive use of shared memory. 
If this happens, disable Triton FA " + "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`") +} + class RocmPlatform(Platform): _enum = PlatformEnum.ROCM @@ -102,6 +127,18 @@ class RocmPlatform(Platform): else: parallel_config.worker_cls = "vllm.worker.worker.Worker" + @classmethod + def verify_model_arch(cls, model_arch: str) -> None: + if model_arch in _ROCM_UNSUPPORTED_MODELS: + raise ValueError(f"Model architecture '{model_arch}' is not " + "supported by ROCm for now.") + + if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS: + msg = _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch] + logger.warning( + "Model architecture '%s' is partially " + "supported by ROCm: %s", model_arch, msg) + @classmethod def verify_quantization(cls, quant: str) -> None: super().verify_quantization(quant) diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 17f604ea0e202..c50eb2cef4cd5 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,10 +1,10 @@ import logging import os +from typing import Callable, Dict import torch import vllm.envs as envs -from vllm.platforms import current_platform logger = logging.getLogger(__name__) @@ -12,6 +12,39 @@ logger = logging.getLogger(__name__) plugins_loaded = False +def load_plugins_by_group(group: str) -> Dict[str, Callable]: + import sys + if sys.version_info < (3, 10): + from importlib_metadata import entry_points + else: + from importlib.metadata import entry_points + + allowed_plugins = envs.VLLM_PLUGINS + + discovered_plugins = entry_points(group=group) + if len(discovered_plugins) == 0: + logger.debug("No plugins for group %s found.", group) + return {} + logger.info("Available plugins for group %s:", group) + for plugin in discovered_plugins: + logger.info("name=%s, value=%s", plugin.name, plugin.value) + if allowed_plugins is None: + logger.info("all available plugins for group %s will be loaded.", + group) + logger.info("set environment variable VLLM_PLUGINS to control" + " which plugins to load.") + plugins = {} + for plugin in discovered_plugins: + if allowed_plugins is None or plugin.name in allowed_plugins: + try: + func = plugin.load() + plugins[plugin.name] = func + logger.info("plugin %s loaded.", plugin.name) + except Exception: + logger.exception("Failed to load plugin %s", plugin.name) + return plugins + + def load_general_plugins(): """WARNING: plugins can be loaded for multiple times in different processes. 
They should be designed in a way that they can be loaded @@ -26,6 +59,9 @@ def load_general_plugins(): os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1' # see https://github.com/vllm-project/vllm/issues/10619 torch._inductor.config.compile_threads = 1 + + from vllm.platforms import current_platform + if current_platform.is_xpu(): # see https://github.com/pytorch/pytorch/blob/8cada5cbe5450e17c26fb8b358116785324537b2/torch/_dynamo/config.py#L158 # noqa os.environ['TORCH_COMPILE_DISABLE'] = 'True' @@ -47,33 +83,7 @@ def load_general_plugins(): if plugins_loaded: return plugins_loaded = True - import sys - if sys.version_info < (3, 10): - from importlib_metadata import entry_points - else: - from importlib.metadata import entry_points - - allowed_plugins = envs.VLLM_PLUGINS - - discovered_plugins = entry_points(group='vllm.general_plugins') - if len(discovered_plugins) == 0: - logger.debug("No plugins found.") - return - logger.info("Available plugins:") - for plugin in discovered_plugins: - logger.info("name=%s, value=%s, group=%s", plugin.name, plugin.value, - plugin.group) - if allowed_plugins is None: - logger.info("all available plugins will be loaded.") - logger.info("set environment variable VLLM_PLUGINS to control" - " which plugins to load.") - else: - logger.info("plugins to load: %s", allowed_plugins) - for plugin in discovered_plugins: - if allowed_plugins is None or plugin.name in allowed_plugins: - try: - func = plugin.load() - func() - logger.info("plugin %s loaded.", plugin.name) - except Exception: - logger.exception("Failed to load plugin %s", plugin.name) + plugins = load_plugins_by_group(group='vllm.general_plugins') + # general plugins, we only need to execute the loaded functions + for func in plugins.values(): + func() diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index fc77f3ca529b2..605c09b8d7225 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -450,15 +450,16 @@ class SamplingParams( return self._all_stop_token_ids def clone(self) -> "SamplingParams": - """Deep copy excluding LogitsProcessor objects. + """Deep copy, but maybe not the LogitsProcessor objects. - LogitsProcessor objects are excluded because they may contain an - arbitrary, nontrivial amount of data. + LogitsProcessor objects may contain an arbitrary, nontrivial amount of + data that is expensive to copy. However, if not copied, the processor + needs to support parallel decoding for multiple sequences See https://github.com/vllm-project/vllm/issues/3087 """ logit_processor_refs = None if self.logits_processors is None else { - id(lp): lp + id(lp): lp.clone() if hasattr(lp, 'clone') else lp for lp in self.logits_processors } return copy.deepcopy(self, memo=logit_processor_refs) diff --git a/vllm/sequence.py b/vllm/sequence.py index cc3d96fc93a79..034f89c0ddbe9 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -667,6 +667,7 @@ class SequenceGroup: first_scheduled_time=None, first_token_time=None, time_in_queue=None) + self.last_token_latency = 0.0 self.lora_request = lora_request self.prompt_logprobs: Optional[PromptLogprobs] = None self.state = SequenceGroupState() @@ -762,18 +763,21 @@ class SequenceGroup: assert num_lookahead_slots + 1 == num_scheduler_steps or is_prefill self.init_multi_step(num_steps=num_lookahead_slots + 1) - def get_last_latency(self, now: float) -> float: + def set_last_token_time(self, now: float) -> None: """Sets the last token time for Request level timings.""" - # If still in prefill phase, raise Error. 
- if self.is_prefill(): - raise ValueError( - "seq_group.get_last_latency() should not be called " - "if the seq_group is in prefill phase.") - - # Otherwise return token latency. - latency = now - self.metrics.last_token_time + # If still in prefill phase, assertion fails. + assert not self.is_prefill(), ( + "seq_group.set_last_token_time() should not be called " + "if the seq_group is in prefill phase.") + self.last_token_latency = now - self.metrics.last_token_time self.metrics.last_token_time = now - return latency + + def get_last_token_latency(self) -> float: + """Returns the latency of the last token.""" + assert not self.is_prefill(), ( + "seq_group.get_last_token_latency() should not be called " + "if the seq_group is in prefill phase.") + return self.last_token_latency def maybe_set_first_token_time(self, time: float) -> None: """Sets the first token time for Request level timings.""" @@ -1368,7 +1372,7 @@ class ParallelSampleSequenceGroup(SequenceGroupBase): @staticmethod def add_request(request_id: str, engine, params, **kwargs): original_params = params - params = copy.deepcopy(original_params) + params = original_params.clone() params.n = 1 group = ParallelSampleSequenceGroup(request_id) seqs = [] diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index 03dc46600d8a9..d678f4578499b 100644 --- a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -6,7 +6,6 @@ import torch from vllm.model_executor.layers.spec_decode_base_sampler import ( SpecDecodeBaseSampler) -from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available @@ -94,6 +93,7 @@ class AsyncMetricsCollector: def maybe_collect_rejsample_metrics( self, k: int) -> Optional[SpecDecodeWorkerMetrics]: # currently using cuda.Event, skip for any non_cuda_alike platform + from vllm.platforms import current_platform if not current_platform.is_cuda_alike(): return None diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 4529cf27ef565..58417980e7b47 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -22,9 +22,9 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger # yapf conflicts with isort for this block # yapf: disable -from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, - EAGLEConfig, ExaoneConfig, - H2OVLChatConfig, +from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config, + DbrxConfig, EAGLEConfig, + ExaoneConfig, H2OVLChatConfig, InternVLChatConfig, JAISConfig, MedusaConfig, MllamaConfig, MLPSpeculatorConfig, MPTConfig, @@ -52,6 +52,7 @@ _CONFIG_REGISTRY_OVERRIDE_HF: Dict[str, Type[PretrainedConfig]] = { _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { "chatglm": ChatGLMConfig, + "cohere2": Cohere2Config, "dbrx": DbrxConfig, "mpt": MPTConfig, "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index c24433cd436b4..a41a35c88b3a1 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,4 +1,5 @@ from vllm.transformers_utils.configs.chatglm import ChatGLMConfig +from vllm.transformers_utils.configs.cohere2 import Cohere2Config from vllm.transformers_utils.configs.dbrx import DbrxConfig from vllm.transformers_utils.configs.eagle import EAGLEConfig from vllm.transformers_utils.configs.exaone import ExaoneConfig @@ -22,6 +23,7 @@ from 
vllm.transformers_utils.configs.ultravox import UltravoxConfig __all__ = [ "ChatGLMConfig", + "Cohere2Config", "DbrxConfig", "MPTConfig", "RWConfig", diff --git a/vllm/transformers_utils/configs/cohere2.py b/vllm/transformers_utils/configs/cohere2.py new file mode 100644 index 0000000000000..1509330fc2179 --- /dev/null +++ b/vllm/transformers_utils/configs/cohere2.py @@ -0,0 +1,192 @@ +# ruff: noqa + +# Adapted from +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/cohere2/configuration_cohere2.py +from transformers import PretrainedConfig +from transformers.modeling_rope_utils import rope_config_validation + + +class Cohere2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere + model according to the specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. Instantiating a configuration + with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model. + + + Args: + vocab_size (`int`, *optional*, defaults to 256000): + Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`CohereModel`] + hidden_size (`int`, *optional*, defaults to 8192): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 22528): + Dimension of the MLP representations. + logit_scale (`float`, *optional*, defaults to 0.0625): + The scaling factor for the output logits. + num_hidden_layers (`int`, *optional*, defaults to 40): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 64): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 8192): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. 
+ bos_token_id (`int`, *optional*, defaults to 5): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 255001): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + sliding_window (`int`, *optional*, defaults to 4096): + Size of the sliding window attention context. + sliding_window_pattern (`int`, *optional*, defaults to 4): + Pattern for the sliding window attention. + cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`. 
+ + ```python + >>> from transformers import Cohere2Model, Cohere2Config + + >>> # Initializing a Cohere Nextmodel configuration + >>> configuration = Cohere2Config() + + >>> # Initializing a model from the Cohere2 configuration + >>> model = Cohere2Model(configuration) # doctest: +SKIP + + >>> # Accessing the model configuration + >>> configuration = model.config # doctest: +SKIP + ``` + """ + + model_type = "cohere2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=256000, + hidden_size=8192, + intermediate_size=22528, + logit_scale=0.0625, + num_hidden_layers=40, + num_attention_heads=64, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=8192, + initializer_range=0.02, + layer_norm_eps=1e-5, + use_cache=True, + pad_token_id=0, + bos_token_id=5, + eos_token_id=255001, + tie_word_embeddings=True, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + sliding_window=4096, + sliding_window_pattern=4, + cache_implementation="hybrid", + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.logit_scale = logit_scale + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.sliding_window = sliding_window + self.sliding_window_pattern = sliding_window_pattern + # Need to specify head_dim in the config so it can be used in the attention forward functions + self.head_dim = hidden_size // num_attention_heads + self.cache_implementation = cache_implementation + + # Validate the correctness of rotary position embeddings parameters + rope_config_validation(self) + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +__all__ = ["Cohere2Config"] diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index f1523667b0466..b12cc83a22970 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -1,25 +1,31 @@ from functools import lru_cache from typing import Any, cast +from transformers.processing_utils import ProcessorMixin + def get_processor( processor_name: str, *args: Any, trust_remote_code: bool = False, + processor_cls: type[ProcessorMixin] = ProcessorMixin, **kwargs: Any, ): """Load a processor for the given model name via HuggingFace.""" # don't put this import at the top level # it will call torch.cuda.device_count() from transformers import AutoProcessor - from transformers.processing_utils import ProcessorMixin + + processor_factory = (AutoProcessor + if processor_cls == ProcessorMixin else processor_cls) try: - processor = AutoProcessor.from_pretrained( + processor = processor_factory.from_pretrained( processor_name, *args, trust_remote_code=trust_remote_code, - **kwargs) + **kwargs, + ) except ValueError as e: # If the error pertains to the processor class not existing or not # 
currently being imported, suggest using the --trust-remote-code flag. diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py index 36315abcdfcda..0c96e0632f646 100644 --- a/vllm/triton_utils/importing.py +++ b/vllm/triton_utils/importing.py @@ -8,7 +8,6 @@ logger = init_logger(__name__) HAS_TRITON = ( find_spec("triton") is not None and not current_platform.is_xpu() # Not compatible - and not current_platform.is_neuron() # neuron has too old torch ) if not HAS_TRITON: diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 9ae46ff43a916..a9deee881f41a 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -17,7 +17,6 @@ import torch import vllm.envs as envs from vllm.connections import global_http_connection -from vllm.platforms import current_platform from vllm.version import __version__ as VLLM_VERSION _config_home = envs.VLLM_CONFIG_ROOT @@ -152,6 +151,7 @@ class UsageMessage: usage_context: UsageContext, extra_kvs: Dict[str, Any]) -> None: # Platform information + from vllm.platforms import current_platform if current_platform.is_cuda_alike(): device_property = torch.cuda.get_device_properties(0) self.gpu_count = torch.cuda.device_count() diff --git a/vllm/utils.py b/vllm/utils.py index 6a2bc7bc54771..9bcfb861af9e1 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -10,6 +10,7 @@ import importlib.metadata import importlib.util import inspect import ipaddress +import multiprocessing import os import re import resource @@ -20,17 +21,19 @@ import sys import tempfile import threading import time +import traceback import uuid import warnings import weakref from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task from collections import OrderedDict, UserDict, defaultdict -from collections.abc import Iterable, Mapping +from collections.abc import Hashable, Iterable, Mapping from dataclasses import dataclass, field from functools import lru_cache, partial, wraps from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, - Dict, Generator, Generic, Hashable, List, Literal, - Optional, Tuple, Type, TypeVar, Union, overload) + Dict, Generator, Generic, Iterator, List, Literal, + NamedTuple, Optional, Tuple, Type, TypeVar, Union, + overload) from uuid import uuid4 import numpy as np @@ -39,13 +42,14 @@ import psutil import torch import torch.types import yaml +import zmq +import zmq.asyncio from packaging.version import Version from torch.library import Library from typing_extensions import ParamSpec, TypeIs, assert_never import vllm.envs as envs from vllm.logger import enable_trace_function_call, init_logger -from vllm.platforms import current_platform if TYPE_CHECKING: from vllm.config import VllmConfig @@ -194,13 +198,29 @@ class Counter: self.counter = 0 +class CacheInfo(NamedTuple): + hits: int + total: int + + @property + def hit_ratio(self) -> float: + if self.total == 0: + return 0 + + return self.hits / self.total + + class LRUCache(Generic[_K, _V]): + """Note: This class is not thread safe!""" def __init__(self, capacity: int) -> None: self.cache = OrderedDict[_K, _V]() self.pinned_items = set[_K]() self.capacity = capacity + self._hits = 0 + self._total = 0 + def __contains__(self, key: _K) -> bool: return key in self.cache @@ -218,6 +238,9 @@ class LRUCache(Generic[_K, _V]): def __delitem__(self, key: _K) -> None: self.pop(key) + def stat(self) -> CacheInfo: + return CacheInfo(hits=self._hits, total=self._total) + def touch(self, key: _K) -> None: self.cache.move_to_end(key) @@ -226,8 +249,12 @@ class 
LRUCache(Generic[_K, _V]): if key in self.cache: value = self.cache[key] self.cache.move_to_end(key) + + self._hits += 1 else: value = default + + self._total += 1 return value def put(self, key: _K, value: _V) -> None: @@ -581,6 +608,7 @@ def create_kv_caches_with_random_flash( seed: int = 0, device: Optional[str] = "cuda", ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + from vllm.platforms import current_platform current_platform.seed_everything(seed) torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) @@ -622,7 +650,7 @@ def create_kv_caches_with_random( raise ValueError( f"Does not support key cache of type fp8 with head_size {head_size}" ) - + from vllm.platforms import current_platform current_platform.seed_everything(seed) torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) @@ -675,6 +703,7 @@ def print_warning_once(msg: str) -> None: @lru_cache(maxsize=None) def is_pin_memory_available() -> bool: + from vllm.platforms import current_platform return current_platform.is_pin_memory_available() @@ -693,6 +722,7 @@ class DeviceMemoryProfiler: def current_memory_usage(self) -> float: # Return the memory usage in bytes. + from vllm.platforms import current_platform if current_platform.is_cuda_alike(): torch.cuda.reset_peak_memory_stats(self.device) mem = torch.cuda.max_memory_allocated(self.device) @@ -1046,6 +1076,7 @@ def _cuda_device_count_stateless( import torch.cuda import torch.version + from vllm.platforms import current_platform if not torch.cuda._is_compiled(): return 0 if current_platform.is_rocm(): @@ -1661,6 +1692,7 @@ def direct_register_custom_op( return if not supports_custom_op(): + from vllm.platforms import current_platform assert not current_platform.is_cuda_alike(), ( "cuda platform needs torch>=2.4 to support custom op, " "chances are you are using an old version of pytorch " @@ -1837,7 +1869,7 @@ def memory_profiling( result.non_kv_cache_memory_in_bytes = result.non_torch_increase_in_bytes + result.torch_peak_increase_in_bytes + result.weights_memory_in_bytes # noqa -# Adapted from: https://github.com/sgl-project/sglang/blob/f46f394f4d4dbe4aae85403dec006199b34d2840/python/sglang/srt/utils.py#L630 # noqa: E501Curre +# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501 def set_ulimit(target_soft_limit=65535): resource_type = resource.RLIMIT_NOFILE current_soft, current_hard = resource.getrlimit(resource_type) @@ -1852,3 +1884,82 @@ def set_ulimit(target_soft_limit=65535): "with error %s. This can cause fd limit errors like" "`OSError: [Errno 24] Too many open files`. 
Consider " "increasing with ulimit -n", current_soft, e) + + +# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/utils.py#L28 # noqa: E501 +def get_exception_traceback(): + etype, value, tb = sys.exc_info() + err_str = "".join(traceback.format_exception(etype, value, tb)) + return err_str + + +# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L783 # noqa: E501 +def make_zmq_socket( + ctx: Union[zmq.asyncio.Context, zmq.Context], # type: ignore[name-defined] + path: str, + type: Any, +) -> Union[zmq.Socket, zmq.asyncio.Socket]: # type: ignore[name-defined] + """Make a ZMQ socket with the proper bind/connect semantics.""" + + mem = psutil.virtual_memory() + socket = ctx.socket(type) + + # Calculate buffer size based on system memory + total_mem = mem.total / 1024**3 + available_mem = mem.available / 1024**3 + # For systems with substantial memory (>32GB total, >16GB available): + # - Set a large 0.5GB buffer to improve throughput + # For systems with less memory: + # - Use system default (-1) to avoid excessive memory consumption + if total_mem > 32 and available_mem > 16: + buf_size = int(0.5 * 1024**3) # 0.5GB in bytes + else: + buf_size = -1 # Use system default buffer size + + if type == zmq.constants.PULL: + socket.setsockopt(zmq.constants.RCVHWM, 0) + socket.setsockopt(zmq.constants.RCVBUF, buf_size) + socket.connect(path) + elif type == zmq.constants.PUSH: + socket.setsockopt(zmq.constants.SNDHWM, 0) + socket.setsockopt(zmq.constants.SNDBUF, buf_size) + socket.bind(path) + else: + raise ValueError(f"Unknown Socket Type: {type}") + + return socket + + +@contextlib.contextmanager +def zmq_socket_ctx( + path: str, + type: Any) -> Iterator[zmq.Socket]: # type: ignore[name-defined] + """Context manager for a ZMQ socket""" + + ctx = zmq.Context(io_threads=2) # type: ignore[attr-defined] + try: + yield make_zmq_socket(ctx, path, type) + + except KeyboardInterrupt: + logger.debug("Got Keyboard Interrupt.") + + finally: + ctx.destroy(linger=0) + + +def _check_multiproc_method(): + if (cuda_is_initialized() + and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"): + logger.warning("CUDA was previously initialized. We must use " + "the `spawn` multiprocessing start method. Setting " + "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " + "See https://docs.vllm.ai/en/latest/getting_started/" + "debugging.html#python-multiprocessing " + "for more information.") + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + +def get_mp_context(): + _check_multiproc_method() + mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD + return multiprocessing.get_context(mp_method) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 78efacccfa078..00d0de51634ae 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -191,7 +191,7 @@ class KVCacheManager: request: The request to allocate slots. num_tokens: The number of tokens to allocate. Note that this does not include the tokens that have already been computed. - computed_blocks: The blocks that have already been computed. + computed_blocks: A list of computed blocks. Returns: A list of new allocated blocks. @@ -200,6 +200,18 @@ class KVCacheManager: raise ValueError( f"num_tokens must be greater than 0, got {num_tokens}") + # If a computed block of a request is an eviction candidate (in the + # free queue and ref_cnt == 0), it cannot be counted as a free block + # when allocating this request. 
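[Editor's note] To make the arithmetic behind this `allocate_slots` change concrete, here is a minimal standalone sketch with hypothetical numbers; `block_size`, the free-queue size, and the `ref_cnt` values are made up, and `math.ceil` stands in for vLLM's `cdiv` helper. It is not the KVCacheManager code itself, only an illustration of why evictable computed blocks must be subtracted before the capacity check (the hunk continues right after this sketch).

```python
# Illustrative sketch only -- standalone restatement of the accounting this
# hunk introduces; all names and numbers here are hypothetical.
from math import ceil

block_size = 16
num_tokens = 48                      # tokens still to be allocated
num_free_blocks = 4                  # size of the free block queue
computed_block_ref_cnts = [0, 0, 2]  # ref_cnt of each computed (cached) block

# Computed blocks with ref_cnt == 0 sit in the free queue as eviction
# candidates, so they would be double-counted unless subtracted here.
num_evictable_computed = sum(1 for c in computed_block_ref_cnts if c == 0)

num_required_blocks = ceil(num_tokens / block_size)        # 3
effective_free = num_free_blocks - num_evictable_computed  # 4 - 2 = 2

can_allocate = num_required_blocks <= effective_free
print(can_allocate)  # False: the naive check (3 <= 4) would wrongly succeed
```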
+ num_evictable_computed_blocks = sum(1 for blk in computed_blocks + if blk.ref_cnt == 0) + + num_required_blocks = cdiv(num_tokens, self.block_size) + if (num_required_blocks > self.free_block_queue.num_free_blocks - + num_evictable_computed_blocks): + # Cannot allocate new blocks. + return None + # Touch the computed blocks to make sure they won't be evicted. if self.enable_caching: self._touch(computed_blocks) @@ -208,11 +220,6 @@ class KVCacheManager: "Computed blocks should be empty when " "prefix caching is disabled") - num_required_blocks = cdiv(num_tokens, self.block_size) - if (num_required_blocks > self.free_block_queue.num_free_blocks): - # Cannot allocate new blocks. - return None - # Determine the number of new blocks to allocate considering # preallocated blocks. num_new_blocks = min( diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 9ddbff7c9a604..84ff48bf428a0 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -218,8 +218,8 @@ def generate_block_hash_extra_keys( continue # The block contains the current mm input. - mm_start = max(0, start_token_idx - offset) - extra_keys.append((mm_hashes[curr_mm_idx], mm_start)) + extra_keys.append(mm_hashes[curr_mm_idx]) + if end_token_idx >= offset + length: # If this block contains the end of the current mm input, # move to the next mm input as this block may also contain diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index cc0c7ea23469a..f70464fc88298 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -6,21 +6,7 @@ import msgspec from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict -from vllm.sampling_params import RequestOutputKind, SamplingParams - - -@dataclass -class DetokenizerRequest: - - request_id: str - prompt: Optional[str] - prompt_token_ids: List[int] - skip_special_tokens: bool - spaces_between_special_tokens: bool - output_kind: RequestOutputKind - - stop: List[str] - include_stop_str_in_output: bool +from vllm.sampling_params import SamplingParams @dataclass diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index cfdbea8004c35..3f097ca7f439c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,4 +1,6 @@ import asyncio +import os +import signal from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -9,14 +11,14 @@ from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.outputs import PoolingRequestOutput, RequestOutput +from vllm.outputs import RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.v1.engine.async_stream import AsyncStream +from vllm.utils import kill_process_tree from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor @@ -39,6 +41,22 @@ class AsyncLLM(EngineClient): log_requests: bool = True, start_engine_loop: bool = True, ) -> None: + + # The child processes 
will send SIGQUIT when unrecoverable + # errors happen. We kill the process tree here so that the + # stack trace is very evident. + # TODO: rather than killing the main process, we should + # figure out how to raise an AsyncEngineDeadError and + # handle at the API server level so we can return a better + # error code to the clients calling VLLM. + def sigquit_handler(signum, frame): + logger.fatal( + "AsyncLLM got SIGQUIT from worker processes, shutting " + "down. See stack trace above for root cause issue.") + kill_process_tree(os.getpid()) + + signal.signal(signal.SIGQUIT, sigquit_handler) + assert start_engine_loop self.log_requests = log_requests @@ -54,10 +72,8 @@ class AsyncLLM(EngineClient): lora_config=vllm_config.lora_config) self.tokenizer.ping() - # Request streams (map of request_id -> AsyncStream). - self.request_streams: Dict[str, AsyncStream] = {} - # List of cancelled request ids to be aborted. - self.client_aborted_requests: List[str] = [] + # Request streams (map of request_id -> queue). + self.rid_to_queue: Dict[str, asyncio.Queue] = {} # Processor (converts Inputs --> EngineCoreRequests). self.processor = Processor( @@ -78,11 +94,11 @@ class AsyncLLM(EngineClient): # EngineCore (starts the engine in background process). self.engine_core = EngineCoreClient.make_client( - vllm_config=vllm_config, - executor_class=executor_class, - usage_context=usage_context, multiprocess_mode=True, asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=self.log_stats, ) self.output_handler: Optional[asyncio.Task] = None @@ -153,28 +169,31 @@ class AsyncLLM(EngineClient): trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, - ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: + ) -> asyncio.Queue[RequestOutput]: """Add new request to the AsyncLLM.""" - if self.detokenizer.is_request_active(request_id): - raise ValueError(f"Request {request_id} already exists.") + # 1) Create a new output queue for the request. + if request_id in self.rid_to_queue: + raise ValueError(f"Request id {request_id} already running.") + self.rid_to_queue[request_id] = asyncio.Queue() - # 1) Create a new AsyncStream for the request. - stream = self._add_request_to_streams(request_id) - - # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. - detokenizer_req, engine_core_req = self.processor.process_inputs( - request_id, prompt, params, arrival_time, lora_request, - trace_headers, prompt_adapter_request, priority) + # 2) Convert Input --> Request. + request = self.processor.process_inputs(request_id, prompt, params, + arrival_time, lora_request, + trace_headers, + prompt_adapter_request, + priority) # 3) Add the request to Detokenizer (this process). - self.detokenizer.add_request(detokenizer_req) + self.detokenizer.add_request(request) # 4) Add the EngineCoreRequest to EngineCore (separate process). - await self.engine_core.add_request_async(engine_core_req) + await self.engine_core.add_request_async(request) - # 5) Return the generator. - return stream.generator() + if self.log_requests: + logger.info("Added request %s.", request_id) + + return self.rid_to_queue[request_id] # TODO: we should support multiple prompts in one call, as you # can do with LLM.generate. So that for multi-prompt completion @@ -194,7 +213,7 @@ class AsyncLLM(EngineClient): """ Main function called by the API server to kick off a request * 1) Making an AsyncStream corresponding to the Request. 
- # 2) Processing the Input. + * 2) Processing the Input. * 3) Adding the Request to the Detokenizer. * 4) Adding the Request to the EngineCore (separate process). @@ -206,14 +225,15 @@ class AsyncLLM(EngineClient): returning the RequestOutput back to the caller. """ - # We start the output_handler on the first call to generate() so that - # we can call __init__ before the event loop starts, which enables us - # to handle startup failure gracefully in the OpenAI server. - if self.output_handler is None: - self.output_handler = asyncio.create_task( - self._run_output_handler()) + try: + # We start the output_handler on the first call to generate() so + # we can call __init__ before the event loop, which enables us + # to handle startup failure gracefully in the OpenAI server. + if self.output_handler is None: + self.output_handler = asyncio.create_task( + self._run_output_handler()) - async for output in await self.add_request( + q = await self.add_request( request_id, prompt, sampling_params, @@ -221,79 +241,42 @@ class AsyncLLM(EngineClient): trace_headers=trace_headers, prompt_adapter_request=prompt_adapter_request, priority=priority, - ): - yield output + ) - def _finish_stream(self, request_id: str): - stream = self.request_streams.pop(request_id, None) - if stream is not None: - stream.finish() + # The output_handler task pushes items into the queue. + # This task pulls from the queue and yields to caller. + while True: + # Note: drain queue without await if possible (avoids + # task switching under load which helps performance). + out = q.get_nowait() if q.qsize() > 0 else await q.get() - def _add_request_to_streams( - self, - request_id: str, - ) -> AsyncStream: + # Note: both Detokenizer and EngineCore handle their + # own request cleanup based on finished. + if out.finished: + del self.rid_to_queue[request_id] + yield out + break - if request_id in self.request_streams: - raise ValueError(f"Request id {request_id} already running.") + yield out - # Avoid streams having circular ref to parent AsyncLLM object. - aborted_reqs = self.client_aborted_requests - stream = AsyncStream(request_id, aborted_reqs.append) - self.request_streams[request_id] = stream - - if self.log_requests: - logger.info("Added request %s.", request_id) - - return stream - - async def _process_cancellations(self) -> None: - """ - Process requests cancelled from user disconnecting. - - When a client disconnects, AsyncStream._cancel() is called. - We passed a callback to AsyncStream(), which appends to - self.client_aborted_requests. - - As a result, if any requests are canceled from the user side - the request_id will show up in self.client_aborted_requests. - """ - - # Avoid streams having circular ref to parent AsyncLLM object. - if not self.client_aborted_requests: - return - reqs_to_abort = self.client_aborted_requests.copy() - self.client_aborted_requests.clear() - - # Remove from Detokenizer. - self.detokenizer.abort_requests(reqs_to_abort) - - # Remove from RequestStreams. - for request_id in reqs_to_abort: - if self.log_requests: - logger.info("User-cancelled request %s.", request_id) - self._finish_stream(request_id) - - # Remove from EngineCore. - await self.engine_core.abort_requests_async(reqs_to_abort) + # If the request is disconnected by the client, the + # generate() task will be canceled. So, we abort the + # request if we end up here. 
+ except asyncio.CancelledError: + await self.abort(request_id) + raise def _process_request_outputs(self, request_outputs: List[RequestOutput]): - """Process outputs by putting them into per-request AsyncStreams.""" + """Process outputs by putting them into per-request queues.""" for request_output in request_outputs: request_id = request_output.request_id - assert request_id in self.request_streams - # Each request in the API server pulls from the per-request stream. - stream = self.request_streams.get(request_id) - if stream is not None: - stream.put(request_output) - - # If finished, remove from the tracker. - if request_output.finished: - if self.log_requests: - logger.info("Finished request %s.", request_id) - self._finish_stream(request_id) + # Note: it is possible a request was aborted and removed from + # the state due to client cancellations, so if we encounter a + # request id not in the state, we skip. + if request_id in self.rid_to_queue: + self.rid_to_queue[request_id].put_nowait(request_output) async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" @@ -306,24 +289,27 @@ class AsyncLLM(EngineClient): # 2) Detokenize based on the output. request_outputs, reqs_to_abort = self.detokenizer.step(outputs) - # 3) Put the RequestOutputs into the per-request AsyncStreams. + # 3) Put the RequestOutputs into the per-request queues. self._process_request_outputs(request_outputs) # 4) Abort any requests that finished due to stop strings. await self.engine_core.abort_requests_async(reqs_to_abort) - # 5) Abort any requests due to client cancellations. - await self._process_cancellations() - - except BaseException as e: - logger.error(e) - raise e - - # TODO: can we eliminate these? + except Exception as e: + logger.exception("EngineCore output handler hit an error: %s", e) + kill_process_tree(os.getpid()) async def abort(self, request_id: str) -> None: - # Note: Who Calls this? I dont think this is actually used. - raise ValueError("Not Supported on V1 yet.") + """Abort RequestId in self, detokenizer, and engine core.""" + + request_ids = [request_id] + await self.engine_core.abort_requests_async(request_ids) + self.detokenizer.abort_requests(request_ids) + + # If a request finishes while we await then the request_id + # will be removed from the tracked queues before we get here. 
+ if request_id in self.rid_to_queue: + del self.rid_to_queue[request_id] def encode( self, diff --git a/vllm/v1/engine/async_stream.py b/vllm/v1/engine/async_stream.py deleted file mode 100644 index 35449238c3259..0000000000000 --- a/vllm/v1/engine/async_stream.py +++ /dev/null @@ -1,55 +0,0 @@ -import asyncio -from typing import Any, AsyncGenerator, Callable, Optional, Type, Union - -from vllm.outputs import PoolingRequestOutput, RequestOutput - - -class AsyncStream: - """A stream of RequestOutputs or PoolingRequestOutputs for a request - that can be iterated over asynchronously via an async generator.""" - - STOP_ITERATION = Exception() # Sentinel - - def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None: - self.request_id = request_id - self._cancel = cancel - self._queue: asyncio.Queue = asyncio.Queue() - self._finished = False - - def put(self, item: Union[RequestOutput, PoolingRequestOutput, - Exception]) -> None: - if not self._finished: - self._queue.put_nowait(item) - - def finish( - self, - exception: Optional[Union[BaseException, Type[BaseException]]] = None, - ) -> None: - if not self._finished: - self._finished = True - self._queue.put_nowait(exception if self._is_raisable(exception) - else AsyncStream.STOP_ITERATION) - - async def generator( - self - ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: - finished = False - try: - while True: - result = await self._queue.get() - if self._is_raisable(result): - finished = True - if result == AsyncStream.STOP_ITERATION: - return - raise result - yield result - finally: - self._finished = True - if not finished: - self._cancel(self.request_id) - - @staticmethod - def _is_raisable(value: Any): - return isinstance(value, BaseException) or \ - (isinstance(value, type) and \ - issubclass(value, BaseException)) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 497d5db5b4c99..5840541d774ba 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -3,20 +3,19 @@ import queue import signal import threading import time -from dataclasses import dataclass -from multiprocessing.process import BaseProcess +from multiprocessing.connection import Connection from typing import List, Tuple, Type +import psutil import zmq import zmq.asyncio from msgspec import msgpack from vllm.config import CacheConfig, VllmConfig -from vllm.executor.multiproc_worker_utils import get_mp_context from vllm.logger import init_logger from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) -from vllm.usage.usage_lib import UsageContext +from vllm.utils import get_exception_traceback, zmq_socket_ctx from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, @@ -25,14 +24,13 @@ from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus from vllm.v1.serial_utils import PickleEncoder -from vllm.v1.utils import make_zmq_socket from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) POLLING_TIMEOUT_MS = 5000 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 -LOGGING_TIME_S = 5000 +LOGGING_TIME_S = 5 class EngineCore: @@ -42,9 +40,10 @@ class EngineCore: self, vllm_config: VllmConfig, executor_class: Type[Executor], - usage_context: UsageContext, + log_stats: bool = False, ): assert vllm_config.model_config.runner_type != "pooling" + self.log_stats = log_stats 
logger.info("Initializing an LLM engine (v%s) with config: %s", VLLM_VERSION, vllm_config) @@ -134,29 +133,19 @@ class EngineCore: self.model_executor.profile(is_start) -@dataclass -class EngineCoreProcHandle: - proc: BaseProcess - ready_path: str - input_path: str - output_path: str - - class EngineCoreProc(EngineCore): """ZMQ-wrapper for running EngineCore in background process.""" - READY_STR = "READY" - def __init__( self, - vllm_config: VllmConfig, - executor_class: Type[Executor], - usage_context: UsageContext, input_path: str, output_path: str, - ready_path: str, + ready_pipe: Connection, + vllm_config: VllmConfig, + executor_class: Type[Executor], + log_stats: bool = False, ): - super().__init__(vllm_config, executor_class, usage_context) + super().__init__(vllm_config, executor_class, log_stats) # Background Threads and Queues for IO. These enable us to # overlap ZMQ socket IO with GPU since they release the GIL, @@ -173,68 +162,7 @@ class EngineCoreProc(EngineCore): daemon=True).start() # Send Readiness signal to EngineClient. - with make_zmq_socket(ready_path, zmq.constants.PUSH) as ready_socket: - ready_socket.send_string(EngineCoreProc.READY_STR) - - @staticmethod - def wait_for_startup( - proc: BaseProcess, - ready_path: str, - ) -> None: - """Wait until the EngineCore is ready.""" - - try: - sync_ctx = zmq.Context() # type: ignore[attr-defined] - socket = sync_ctx.socket(zmq.constants.PULL) - socket.connect(ready_path) - - # Wait for EngineCore to send EngineCoreProc.READY_STR. - while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: - logger.debug("Waiting for EngineCoreProc to startup.") - - if not proc.is_alive(): - raise RuntimeError("EngineCoreProc failed to start.") - - message = socket.recv_string() - assert message == EngineCoreProc.READY_STR - - except BaseException as e: - logger.exception(e) - raise e - - finally: - sync_ctx.destroy(linger=0) - - @staticmethod - def make_engine_core_process( - vllm_config: VllmConfig, - executor_class: Type[Executor], - usage_context: UsageContext, - input_path: str, - output_path: str, - ready_path: str, - ) -> EngineCoreProcHandle: - context = get_mp_context() - - process_kwargs = { - "input_path": input_path, - "output_path": output_path, - "ready_path": ready_path, - "vllm_config": vllm_config, - "executor_class": executor_class, - "usage_context": usage_context, - } - # Run EngineCore busy loop in background process. 
- proc = context.Process(target=EngineCoreProc.run_engine_core, - kwargs=process_kwargs) - proc.start() - - # Wait for startup - EngineCoreProc.wait_for_startup(proc, ready_path) - return EngineCoreProcHandle(proc=proc, - ready_path=ready_path, - input_path=input_path, - output_path=output_path) + ready_pipe.send({"status": "READY"}) @staticmethod def run_engine_core(*args, **kwargs): @@ -258,6 +186,7 @@ class EngineCoreProc(EngineCore): signal.signal(signal.SIGTERM, signal_handler) signal.signal(signal.SIGINT, signal_handler) + parent_process = psutil.Process().parent() engine_core = None try: engine_core = EngineCoreProc(*args, **kwargs) @@ -266,9 +195,10 @@ class EngineCoreProc(EngineCore): except SystemExit: logger.debug("EngineCore interrupted.") - except BaseException as e: - logger.exception(e) - raise e + except Exception: + traceback = get_exception_traceback() + logger.error("EngineCore hit an exception: %s", traceback) + parent_process.send_signal(signal.SIGQUIT) finally: if engine_core is not None: @@ -309,6 +239,9 @@ class EngineCoreProc(EngineCore): def _log_stats(self): """Log basic stats every LOGGING_TIME_S""" + if not self.log_stats: + return + now = time.time() if now - self._last_logging_time > LOGGING_TIME_S: @@ -339,7 +272,7 @@ class EngineCoreProc(EngineCore): decoder_add_req = PickleEncoder() decoder_abort_req = PickleEncoder() - with make_zmq_socket(input_path, zmq.constants.PULL) as socket: + with zmq_socket_ctx(input_path, zmq.constants.PULL) as socket: while True: # (RequestType, RequestData) type_frame, data_frame = socket.recv_multipart(copy=False) @@ -367,7 +300,7 @@ class EngineCoreProc(EngineCore): # Reuse send buffer. buffer = bytearray() - with make_zmq_socket(output_path, zmq.constants.PUSH) as socket: + with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: while True: engine_core_outputs = self.output_queue.get() outputs = EngineCoreOutputs(outputs=engine_core_outputs) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index d56fcbdb1e7c4..3293205e110af 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,19 +1,19 @@ -import os -import weakref -from typing import List, Optional +from typing import List, Optional, Type import msgspec import zmq import zmq.asyncio +from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import get_open_zmq_ipc_path, kill_process_tree +from vllm.utils import get_open_zmq_ipc_path, make_zmq_socket from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, EngineCoreRequestUnion) -from vllm.v1.engine.core import (EngineCore, EngineCoreProc, - EngineCoreProcHandle) +from vllm.v1.engine.core import EngineCore, EngineCoreProc +from vllm.v1.executor.abstract import Executor from vllm.v1.serial_utils import PickleEncoder +from vllm.v1.utils import BackgroundProcHandle logger = init_logger(__name__) @@ -31,10 +31,11 @@ class EngineCoreClient: @staticmethod def make_client( - *args, multiprocess_mode: bool, asyncio_mode: bool, - **kwargs, + vllm_config: VllmConfig, + executor_class: Type[Executor], + log_stats: bool = False, ) -> "EngineCoreClient": # TODO: support this for debugging purposes. 
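[Editor's note] With the reworked `make_client` signature above, call sites pass explicit keywords instead of positional pass-through. The following is only a call-site sketch mirroring how `AsyncLLM.__init__` constructs the client elsewhere in this PR; `vllm_config` and `executor_class` are assumed to be already-built objects and are not defined here.

```python
# Sketch of a caller under the new make_client signature (not a runnable
# standalone script: vllm_config and executor_class come from engine setup).
from vllm.v1.engine.core_client import EngineCoreClient

engine_core = EngineCoreClient.make_client(
    multiprocess_mode=True,   # run EngineCore in a background process
    asyncio_mode=True,        # select the asyncio-compatible AsyncMPClient
    vllm_config=vllm_config,
    executor_class=executor_class,
    log_stats=True,
)
```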
@@ -44,12 +45,12 @@ class EngineCoreClient: "is not currently supported.") if multiprocess_mode and asyncio_mode: - return AsyncMPClient(*args, **kwargs) + return AsyncMPClient(vllm_config, executor_class, log_stats) if multiprocess_mode and not asyncio_mode: - return SyncMPClient(*args, **kwargs) + return SyncMPClient(vllm_config, executor_class, log_stats) - return InprocClient(*args, **kwargs) + return InprocClient(vllm_config, executor_class, log_stats) def shutdown(self): pass @@ -128,9 +129,10 @@ class MPClient(EngineCoreClient): def __init__( self, - *args, asyncio_mode: bool, - **kwargs, + vllm_config: VllmConfig, + executor_class: Type[Executor], + log_stats: bool = False, ): # Serialization setup. self.encoder = PickleEncoder() @@ -142,61 +144,49 @@ class MPClient(EngineCoreClient): else: self.ctx = zmq.Context() # type: ignore[attr-defined] - # Path for IPC. - ready_path = get_open_zmq_ipc_path() + # Paths and sockets for IPC. output_path = get_open_zmq_ipc_path() input_path = get_open_zmq_ipc_path() - - # Get output (EngineCoreOutput) from EngineCore. - self.output_socket = self.ctx.socket(zmq.constants.PULL) - self.output_socket.connect(output_path) - - # Send input (EngineCoreRequest) to EngineCore. - self.input_socket = self.ctx.socket(zmq.constants.PUSH) - self.input_socket.bind(input_path) + self.output_socket = make_zmq_socket(self.ctx, output_path, + zmq.constants.PULL) + self.input_socket = make_zmq_socket(self.ctx, input_path, + zmq.constants.PUSH) # Start EngineCore in background process. - self.proc_handle: Optional[EngineCoreProcHandle] - self.proc_handle = EngineCoreProc.make_engine_core_process( - *args, - input_path= - input_path, # type: ignore[misc] # MyPy incorrectly flags duplicate keywords - output_path=output_path, # type: ignore[misc] - ready_path=ready_path, # type: ignore[misc] - **kwargs, - ) - self._finalizer = weakref.finalize(self, self.shutdown) + self.proc_handle: Optional[BackgroundProcHandle] + self.proc_handle = BackgroundProcHandle( + input_path=input_path, + output_path=output_path, + process_name="EngineCore", + target_fn=EngineCoreProc.run_engine_core, + process_kwargs={ + "vllm_config": vllm_config, + "executor_class": executor_class, + "log_stats": log_stats, + }) def shutdown(self): # Shut down the zmq context. self.ctx.destroy(linger=0) if hasattr(self, "proc_handle") and self.proc_handle: - # Shutdown the process if needed. 
- if self.proc_handle.proc.is_alive(): - self.proc_handle.proc.terminate() - self.proc_handle.proc.join(5) - - if self.proc_handle.proc.is_alive(): - kill_process_tree(self.proc_handle.proc.pid) - - # Remove zmq ipc socket files - ipc_sockets = [ - self.proc_handle.ready_path, self.proc_handle.output_path, - self.proc_handle.input_path - ] - for ipc_socket in ipc_sockets: - socket_file = ipc_socket.replace("ipc://", "") - if os and os.path.exists(socket_file): - os.remove(socket_file) + self.proc_handle.shutdown() self.proc_handle = None class SyncMPClient(MPClient): """Synchronous client for multi-proc EngineCore.""" - def __init__(self, *args, **kwargs): - super().__init__(*args, asyncio_mode=False, **kwargs) + def __init__(self, + vllm_config: VllmConfig, + executor_class: Type[Executor], + log_stats: bool = False): + super().__init__( + asyncio_mode=False, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=log_stats, + ) def get_output(self) -> List[EngineCoreOutput]: @@ -225,8 +215,16 @@ class SyncMPClient(MPClient): class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" - def __init__(self, *args, **kwargs): - super().__init__(*args, asyncio_mode=True, **kwargs) + def __init__(self, + vllm_config: VllmConfig, + executor_class: Type[Executor], + log_stats: bool = False): + super().__init__( + asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=log_stats, + ) async def get_output_async(self) -> List[EngineCoreOutput]: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 02f34e2b54dd5..65be9e58e03c8 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -8,7 +8,7 @@ from vllm.sampling_params import RequestOutputKind from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.v1.engine import DetokenizerRequest, EngineCoreOutput +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest logger = init_logger(__name__) @@ -55,19 +55,19 @@ class IncrementalDetokenizer: def from_new_request( cls, tokenizer: AnyTokenizer, - request: DetokenizerRequest, + request: EngineCoreRequest, ) -> "IncrementalDetokenizer": tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( tokenizer=tokenizer, prompt_ids=request.prompt_token_ids, - skip_special_tokens=request.skip_special_tokens, + skip_special_tokens=request.sampling_params.skip_special_tokens, ) - stops = request.stop + stops = request.sampling_params.stop # Number of chars to hold back when stop strings are to be excluded # from streamed output. - if stops and not request.include_stop_str_in_output: + if stops and not request.sampling_params.include_stop_str_in_output: stop_buffer_length = max(len(s) for s in stops) - 1 else: stop_buffer_length = 0 @@ -79,13 +79,14 @@ class IncrementalDetokenizer: # NOTE(Nick): could we take ownership of it though? token_ids=request.prompt_token_ids.copy(), stop=stops, - include_stop_str_in_output=request.include_stop_str_in_output, + include_stop_str_in_output=request.sampling_params. + include_stop_str_in_output, prefix_offset=prefix_offset, read_offset=read_offset, - skip_special_tokens=request.skip_special_tokens, - spaces_between_special_tokens=request. + skip_special_tokens=request.sampling_params.skip_special_tokens, + spaces_between_special_tokens=request.sampling_params. 
spaces_between_special_tokens, - output_kind=request.output_kind, + output_kind=request.sampling_params.output_kind, request_id=request.request_id, prompt=request.prompt, prompt_token_ids=request.prompt_token_ids, @@ -227,7 +228,7 @@ class Detokenizer: def add_request( self, - request: DetokenizerRequest, + request: EngineCoreRequest, ): """Add new request to the Detokenizer.""" diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index b58f62778ffe9..a19109559eabf 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -72,11 +72,11 @@ class LLMEngine: # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) self.engine_core = EngineCoreClient.make_client( - vllm_config, - executor_class, - usage_context, multiprocess_mode=multiprocess_mode, asyncio_mode=False, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=False, ) @classmethod @@ -152,15 +152,17 @@ class LLMEngine: ) -> None: # 1) Process raw inputs into the request. - detokenizer_req, engine_core_req = self.processor.process_inputs( - request_id, prompt, params, arrival_time, lora_request, - trace_headers, prompt_adapter_request, priority) + request = self.processor.process_inputs(request_id, prompt, params, + arrival_time, lora_request, + trace_headers, + prompt_adapter_request, + priority) # 2) Add the request to Detokenizer. - self.detokenizer.add_request(detokenizer_req) + self.detokenizer.add_request(request) # 3) Add the request to EngineCore. - self.engine_core.add_request(engine_core_req) + self.engine_core.add_request(request) def step(self) -> List[RequestOutput]: diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 6ee8732bc902c..5b5a5a61cea7d 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -1,5 +1,5 @@ import time -from typing import Mapping, Optional, Tuple, Union +from typing import Mapping, Optional, Union from vllm.config import CacheConfig, LoRAConfig, ModelConfig from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, @@ -13,7 +13,7 @@ from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup -from vllm.v1.engine import DetokenizerRequest, EngineCoreRequest +from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.mm_input_mapper import MMHasher, MMInputMapperClient @@ -62,7 +62,7 @@ class Processor: trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, - ) -> Tuple[DetokenizerRequest, EngineCoreRequest]: + ) -> EngineCoreRequest: # TODO(woosuk): Support pooling models. # TODO(woosuk): Check max_logprobs @@ -123,20 +123,7 @@ class Processor: decoder_inputs.multi_modal_data, mm_hashes, decoder_inputs.mm_processor_kwargs, precomputed_mm_inputs) - # Make Request for Detokenizer. - detokenizer_request = DetokenizerRequest( - request_id, - decoder_inputs.prompt, - decoder_inputs.prompt_token_ids, - sampling_params.skip_special_tokens, - sampling_params.spaces_between_special_tokens, - sampling_params.output_kind, - sampling_params.stop, - sampling_params.include_stop_str_in_output, - ) - - # Make Request for EngineCore. 
- engine_core_request = EngineCoreRequest( + return EngineCoreRequest( request_id, decoder_inputs.prompt, decoder_inputs.prompt_token_ids, @@ -149,8 +136,6 @@ class Processor: lora_request, ) - return detokenizer_request, engine_core_request - def _validate_model_inputs(self, inputs: ProcessorInputs): if is_encoder_decoder_inputs(inputs): # For encoder-decoder multimodal models, the max_prompt_len diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 128101aa6956d..ed64e7741390d 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -17,13 +17,12 @@ from vllm.distributed import (destroy_distributed_environment, from vllm.distributed.device_communicators.shm_broadcast import (Handle, MessageQueue) from vllm.executor.multiproc_worker_utils import ( - _add_prefix, get_mp_context, set_multiprocessing_worker_envs) + _add_prefix, set_multiprocessing_worker_envs) from vllm.logger import init_logger -from vllm.utils import (get_distributed_init_method, get_open_port, - get_open_zmq_ipc_path) +from vllm.utils import (get_distributed_init_method, get_mp_context, + get_open_port, get_open_zmq_ipc_path, zmq_socket_ctx) from vllm.v1.executor.abstract import Executor from vllm.v1.outputs import ModelRunnerOutput -from vllm.v1.utils import make_zmq_socket from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -250,7 +249,7 @@ class WorkerProc: worker_response_mq_handle = self.worker_response_mq.export_handle() # Send Readiness signal to EngineCore process. - with make_zmq_socket(ready_path, zmq.constants.PUSH) as ready_socket: + with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: payload = pickle.dumps(worker_response_mq_handle, protocol=pickle.HIGHEST_PROTOCOL) ready_socket.send_string(WorkerProc.READY_STR) @@ -352,7 +351,7 @@ class WorkerProc: ready_path: str, ) -> Optional[Handle]: """Wait until the Worker is ready.""" - with make_zmq_socket(ready_path, zmq.constants.PULL) as socket: + with zmq_socket_ctx(ready_path, zmq.constants.PULL) as socket: # Wait for Worker to send READY. while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index c088c3c129ca5..f2007d85c61a5 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -44,7 +44,7 @@ class TopKTopPSampler(nn.Module): logger.warning( "FlashInfer is not available. Falling back to the PyTorch-" "native implementation of top-p & top-k sampling. 
For the " - "best performance, please install FalshInfer.") + "best performance, please install FlashInfer.") self.forward = self.forward_native else: self.forward = self.forward_native diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index e802c6439b740..19e0dd17237c9 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,11 +1,11 @@ +import os +import weakref from collections.abc import Sequence -from contextlib import contextmanager -from typing import (Any, Generic, Iterator, List, Optional, TypeVar, Union, - overload) - -import zmq +from typing import (Any, Callable, Dict, Generic, List, Optional, TypeVar, + Union, overload) from vllm.logger import init_logger +from vllm.utils import get_mp_context, kill_process_tree logger = init_logger(__name__) @@ -77,27 +77,58 @@ class ConstantList(Generic[T], Sequence): return len(self._x) -@contextmanager -def make_zmq_socket( - path: str, - type: Any) -> Iterator[zmq.Socket]: # type: ignore[name-defined] - """Context manager for a ZMQ socket""" +class BackgroundProcHandle: + """ + Utility class to handle creation, readiness, and shutdown + of background processes used by the AsyncLLM and LLMEngine. + """ - ctx = zmq.Context() # type: ignore[attr-defined] - try: - socket = ctx.socket(type) + def __init__( + self, + input_path: str, + output_path: str, + process_name: str, + target_fn: Callable, + process_kwargs: Dict[Any, Any], + ): + self._finalizer = weakref.finalize(self, self.shutdown) - if type == zmq.constants.PULL: - socket.connect(path) - elif type == zmq.constants.PUSH: - socket.bind(path) - else: - raise ValueError(f"Unknown Socket Type: {type}") + context = get_mp_context() + reader, writer = context.Pipe(duplex=False) - yield socket + assert ("ready_pipe" not in process_kwargs + and "input_path" not in process_kwargs + and "output_path" not in process_kwargs) + process_kwargs["ready_pipe"] = writer + process_kwargs["input_path"] = input_path + process_kwargs["output_path"] = output_path + self.input_path = input_path + self.output_path = output_path - except KeyboardInterrupt: - logger.debug("Worker had Keyboard Interrupt.") + # Run Detokenizer busy loop in background process. + self.proc = context.Process(target=target_fn, kwargs=process_kwargs) + self.proc.start() - finally: - ctx.destroy(linger=0) + # Wait for startup. + if reader.recv()["status"] != "READY": + raise RuntimeError(f"{process_name} initialization failed. " + "See root cause above.") + + def __del__(self): + self.shutdown() + + def shutdown(self): + # Shutdown the process if needed. + if hasattr(self, "proc") and self.proc.is_alive(): + self.proc.terminate() + self.proc.join(5) + + if self.proc.is_alive(): + kill_process_tree(self.proc.pid) + + # Remove zmq ipc socket files + ipc_sockets = [self.output_path, self.input_path] + for ipc_socket in ipc_sockets: + socket_file = ipc_socket.replace("ipc://", "") + if os and os.path.exists(socket_file): + os.remove(socket_file) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 10cf7436609e7..8055cf1debd72 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -58,11 +58,13 @@ class InputBatch: # TODO(woosuk): This buffer could be too large if max_model_len is big. # Find a way to reduce the CPU memory usage. + # This buffer is not directly transferred to the GPU, so it does not + # need to be pinned. 
self.token_ids_cpu_tensor = torch.zeros( (max_num_reqs, max_model_len), device="cpu", dtype=torch.int32, - pin_memory=pin_memory, + pin_memory=False, ) self.token_ids_cpu = self.token_ids_cpu_tensor.numpy() self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 7d1337b049283..0ab8118167765 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -647,10 +647,23 @@ class GPUModelRunner: self.mm_registry.get_max_tokens_per_item_by_modality( self.model_config).values()) - max_num_mm_items = min( + max_num_mm_items_encoder_budget = min( self.max_num_encoder_input_tokens, self.encoder_cache_size) // max_tokens_per_mm_item + max_mm_items_per_req = max( + self.mm_registry.get_mm_limits_per_prompt( + self.model_config).values()) + + # NOTE: We do not consider max_num_batched_tokens on purpose + # because the multimodal embeddings can be generated in advance + # and chunked prefilled. + max_num_mm_items_decoder_budget = self.max_num_reqs * \ + max_mm_items_per_req + + max_num_mm_items = min(max_num_mm_items_encoder_budget, + max_num_mm_items_decoder_budget) + # Dummy data definition in V0 may contain multiple multimodal items # (e.g, multiple images) for a single request, therefore here we # always replicate first item by max_num_mm_items times since in V1 diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 0000b09bfaa36..af438f7d5820c 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -48,6 +48,7 @@ class Worker: self.prompt_adapter_config = vllm_config.prompt_adapter_config self.observability_config = vllm_config.observability_config + self.parallel_config.rank = rank self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index cd4770202a186..c7abad7e0258d 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -12,7 +12,6 @@ from torch import is_tensor from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors, SequenceGroupMetadata if TYPE_CHECKING: @@ -265,13 +264,13 @@ class ModelRunnerBase(ABC, Generic[T]): """ raise NotImplementedError - @current_platform.inference_mode() def execute_model( self, model_input: T, kv_caches: Optional[List[torch.Tensor]], - intermediate_tensors: Optional[IntermediateTensors], + intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, + **kwargs, ) -> Optional[List[SamplerOutput]]: """ Execute the model on the given input. 
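[Editor's note] The `model_runner_base.py` hunk above drops the `@current_platform.inference_mode()` decorator (the platform is now resolved lazily) and relaxes the signature: `intermediate_tensors` defaults to `None` and extra keyword arguments are tolerated. A hedged sketch of what that implies for an override is below; `ToyRunner` is a hypothetical stand-in, not a vLLM class, and `torch.inference_mode()` is used only to keep the sketch self-contained, whereas real workers enter inference mode at the call site (see the `worker_base.py` hunk that follows).

```python
# Hedged sketch only: "ToyRunner" is hypothetical and not part of vLLM.
from typing import Any, List, Optional

import torch


class ToyRunner:
    def execute_model(
        self,
        model_input: Any,
        kv_caches: Optional[List[torch.Tensor]],
        intermediate_tensors: Any = None,  # now optional for single-rank callers
        num_steps: int = 1,
        **kwargs: Any,                     # tolerate extra keywords from newer callers
    ) -> Optional[list]:
        # Inference mode is applied explicitly instead of via a class decorator.
        with torch.inference_mode():
            return None


runner = ToyRunner()
print(runner.execute_model(model_input=None, kv_caches=None))  # -> None
```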
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 65d9bab0e2822..dee63a75c0605 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -544,6 +544,7 @@ class MultiStepModelRunner(GPUModelRunnerBase[StatefulModelInput]): model_input.record_step_event(current_stream) if get_pp_group().is_last_rank and self.is_driver_worker: + assert isinstance(output, list) assert len( output ) == 1, "MultiStepModelRunner requires single-step base_models" diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 3ac7fb8dfb766..249b3ed2dfd37 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -11,7 +11,6 @@ from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.platforms import current_platform from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.utils import (enable_trace_function_call_for_thread, resolve_obj_by_qualname, update_environment_variables) @@ -44,6 +43,8 @@ class WorkerBase(ABC): self.prompt_adapter_config = vllm_config.prompt_adapter_config self.observability_config = vllm_config.observability_config self.kv_transfer_config = vllm_config.kv_transfer_config + from vllm.platforms import current_platform + self.current_platform = current_platform @abstractmethod def init_device(self) -> None: @@ -74,17 +75,17 @@ class WorkerBase(ABC): """ raise NotImplementedError - @current_platform.inference_mode() def start_worker_execution_loop(self) -> None: """Execute model loop in parallel worker. You can stop the loop by executing a driver worker with an empty output. See `stop_remote_worker_execution_loop` for more details. """ - while True: - output = self.execute_model(execute_model_req=None) - if output is None: - return None + with self.current_platform.inference_mode(): + while True: + output = self.execute_model(execute_model_req=None) + if output is None: + return None @abstractmethod def execute_model( @@ -352,6 +353,7 @@ class LocalOrDistributedWorkerBase(WorkerBase): model_execute_time = time.perf_counter() - start_time if not get_pp_group().is_last_rank: # output is IntermediateTensors + assert isinstance(output, IntermediateTensors) if (self.observability_config is not None and self.observability_config.collect_model_execute_time): output.tensors["model_execute_time"] = torch.tensor(