From 3fb17d26c83d6314c50c1c2eedc61625738a047d Mon Sep 17 00:00:00 2001 From: yasu52 <65061005+yasu52@users.noreply.github.com> Date: Thu, 13 Mar 2025 20:33:09 -0700 Subject: [PATCH 001/169] [Doc] Fix typo in documentation (#14783) Signed-off-by: yasu52 --- docs/source/deployment/frameworks/helm.md | 4 ++-- docs/source/deployment/k8s.md | 2 +- docs/source/design/kernel/paged_attention.md | 2 +- docs/source/design/v1/metrics.md | 4 ++-- docs/source/features/lora.md | 2 +- docs/source/getting_started/faq.md | 2 +- .../installation/ai_accelerator/hpu-gaudi.inc.md | 2 +- .../installation/ai_accelerator/openvino.inc.md | 2 +- docs/source/getting_started/installation/gpu/xpu.inc.md | 8 ++++---- docs/source/serving/distributed_serving.md | 4 ++-- docs/source/training/rlhf.md | 2 +- examples/other/logging_configuration.md | 2 +- vllm/distributed/kv_transfer/README.md | 2 +- 13 files changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/source/deployment/frameworks/helm.md b/docs/source/deployment/frameworks/helm.md index e4fc5e1313079..7320d727fbaa4 100644 --- a/docs/source/deployment/frameworks/helm.md +++ b/docs/source/deployment/frameworks/helm.md @@ -4,9 +4,9 @@ A Helm chart to deploy vLLM for Kubernetes -Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLMm Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variables values. +Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLM Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values. -This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm install and documentation on architecture and values file. 
+This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm installation and documentation on architecture and values file. ## Prerequisites diff --git a/docs/source/deployment/k8s.md b/docs/source/deployment/k8s.md index 64071ba042d0b..dd3769c47fc50 100644 --- a/docs/source/deployment/k8s.md +++ b/docs/source/deployment/k8s.md @@ -14,7 +14,7 @@ Alternatively, you can also deploy Kubernetes using [helm chart](https://docs.vl ## Pre-requisite -Ensure that you have a running Kubernetes environment with GPU (you can follow [this tutorial](https://github.com/vllm-project/production-stack/blob/main/tutorials/00-install-kubernetes-env.md) to install a Kubernetes environment on a bare-medal GPU machine). +Ensure that you have a running Kubernetes environment with GPU (you can follow [this tutorial](https://github.com/vllm-project/production-stack/blob/main/tutorials/00-install-kubernetes-env.md) to install a Kubernetes environment on a bare-metal GPU machine). ## Deployment using native K8s diff --git a/docs/source/design/kernel/paged_attention.md b/docs/source/design/kernel/paged_attention.md index 5f2582877260a..e1770c8226435 100644 --- a/docs/source/design/kernel/paged_attention.md +++ b/docs/source/design/kernel/paged_attention.md @@ -419,7 +419,7 @@ List of `v_vec` for one thread which is also `V_VEC_SIZE` elements from `logits`. Overall, with multiple inner iterations, each warp will process one block of value tokens. And with multiple outer iterations, the whole context value - tokens are processd + tokens are processed ```cpp float accs[NUM_ROWS_PER_THREAD]; diff --git a/docs/source/design/v1/metrics.md b/docs/source/design/v1/metrics.md index bed40516ca46a..b3981b2dc24a7 100644 --- a/docs/source/design/v1/metrics.md +++ b/docs/source/design/v1/metrics.md @@ -13,7 +13,7 @@ Ensure the v1 LLM Engine exposes a superset of the metrics available in v0. Metrics in vLLM can be categorized as follows: 1. 
Server-level metrics: these are global metrics that track the state and performance of the LLM engine. These are typically exposed as Gauges or Counters in Prometheus. -2. Request-level metrics: these are metrics that track the characteristics - e.g. size and timing - of individual requests. These are typically exposed as Histrograms in Prometheus, and are often the SLO that an SRE monitoring vLLM will be tracking. +2. Request-level metrics: these are metrics that track the characteristics - e.g. size and timing - of individual requests. These are typically exposed as Histograms in Prometheus, and are often the SLO that an SRE monitoring vLLM will be tracking. The mental model is that the "Server-level Metrics" explain why the "Request-level Metrics" are what they are. @@ -47,7 +47,7 @@ In v0, the following metrics are exposed via a Prometheus-compatible `/metrics` - `vllm:tokens_total` (Counter) - `vllm:iteration_tokens_total` (Histogram) - `vllm:time_in_queue_requests` (Histogram) -- `vllm:model_forward_time_milliseconds` (Histogram +- `vllm:model_forward_time_milliseconds` (Histogram) - `vllm:model_execute_time_milliseconds` (Histogram) - `vllm:request_params_n` (Histogram) - `vllm:request_params_max_tokens` (Histogram) diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md index dff7e916fb460..a71da72e4360a 100644 --- a/docs/source/features/lora.md +++ b/docs/source/features/lora.md @@ -110,7 +110,7 @@ In addition to serving LoRA adapters at server startup, the vLLM server now supp LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility to change models on-the-fly is needed. -Note: Enabling this feature in production environments is risky as user may participate model adapter management. +Note: Enabling this feature in production environments is risky as users may participate in model adapter management. 
To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING` is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active. diff --git a/docs/source/getting_started/faq.md b/docs/source/getting_started/faq.md index 4751b325e6fc4..c1bb28937c144 100644 --- a/docs/source/getting_started/faq.md +++ b/docs/source/getting_started/faq.md @@ -15,7 +15,7 @@ more are listed [here](#supported-models). By extracting hidden states, vLLM can automatically convert text generation models like [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B), [Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) into embedding models, -but they are expected be inferior to models that are specifically trained on embedding tasks. +but they are expected to be inferior to models that are specifically trained on embedding tasks. ______________________________________________________________________ diff --git a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md index 7e52f6048909c..e91ed6fbd7a88 100644 --- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md @@ -119,7 +119,7 @@ If you're observing the following error: `docker: Error response from daemon: Un ## Supported configurations -The following configurations have been validated to be function with +The following configurations have been validated to function with Gaudi2 devices. Configurations that are not listed may or may not work. 
- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) diff --git a/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md index 5641c1563656c..ab0db4795da77 100644 --- a/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md @@ -19,7 +19,7 @@ Currently, there are no pre-built OpenVINO wheels. ### Build wheel from source -First, install Python and ensure you lave the latest pip. For example, on Ubuntu 22.04, you can run: +First, install Python and ensure you have the latest pip. For example, on Ubuntu 22.04, you can run: ```console sudo apt-get update -y diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/source/getting_started/installation/gpu/xpu.inc.md index 5a47b16f77661..84a9b387789c7 100644 --- a/docs/source/getting_started/installation/gpu/xpu.inc.md +++ b/docs/source/getting_started/installation/gpu/xpu.inc.md @@ -1,6 +1,6 @@ # Installation -vLLM initially supports basic model inferencing and serving on Intel GPU platform. +vLLM initially supports basic model inference and serving on Intel GPU platform. :::{attention} There are no pre-built wheels or images for this device, so you must build vLLM from source. @@ -65,7 +65,7 @@ $ docker run -it \ ## Supported features -XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We requires Ray as the distributed runtime backend. For example, a reference execution likes following: +XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We require Ray as the distributed runtime backend. 
For example, a reference execution like following: ```console python -m vllm.entrypoints.openai.api_server \ @@ -78,6 +78,6 @@ python -m vllm.entrypoints.openai.api_server \ -tp=8 ``` -By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. +By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. -There are some new features coming with ipex-xpu 2.6, eg: **chunked prefill**, **V1 engine support**, **lora**, **MoE**, etc. +There are some new features coming with ipex-xpu 2.6, e.g. **chunked prefill**, **V1 engine support**, **lora**, **MoE**, etc. diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index e6be644b73932..b36a3dcb170f2 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -20,7 +20,7 @@ There is one edge case: if the model fits in a single node with multiple GPUs, b ## Running vLLM on a single node -vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray. +vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). 
We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inference currently requires Ray. Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured `tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the `LLM` class `distributed_executor_backend` argument or `--distributed-executor-backend` API server argument. Set it to `mp` for multiprocessing or `ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. @@ -29,7 +29,7 @@ To run multi-GPU inference with the `LLM` class, set the `tensor_parallel_size` ```python from vllm import LLM llm = LLM("facebook/opt-13b", tensor_parallel_size=4) -output = llm.generate("San Franciso is a") +output = llm.generate("San Francisco is a") ``` To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: diff --git a/docs/source/training/rlhf.md b/docs/source/training/rlhf.md index 00822aefe11e6..72e89c0c7478c 100644 --- a/docs/source/training/rlhf.md +++ b/docs/source/training/rlhf.md @@ -1,6 +1,6 @@ # Reinforcement Learning from Human Feedback -Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviours. +Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors. vLLM can be used to generate the completions for RLHF. The best way to do this is with libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) and [verl](https://github.com/volcengine/verl). 
diff --git a/examples/other/logging_configuration.md b/examples/other/logging_configuration.md index c70b853c12769..fbdbce6a4612a 100644 --- a/examples/other/logging_configuration.md +++ b/examples/other/logging_configuration.md @@ -127,7 +127,7 @@ configuration for the root vLLM logger and for the logger you wish to silence: "vllm": { "handlers": ["vllm"], "level": "DEBUG", - "propagage": false + "propagate": false }, "vllm.example_noisy_logger": { "propagate": false diff --git a/vllm/distributed/kv_transfer/README.md b/vllm/distributed/kv_transfer/README.md index c408d4a67522c..349d3dfbd84fc 100644 --- a/vllm/distributed/kv_transfer/README.md +++ b/vllm/distributed/kv_transfer/README.md @@ -24,6 +24,6 @@ NOTE: If you want to not only transfer KV caches, but adjust the model execution The example usage is in [this file](../../../examples/online_serving/disaggregated_prefill.sh). -Here is the diagram of how we run disaggretgated prefilling. +Here is the diagram of how we run disaggregated prefilling. ![Disaggregated prefill workflow](./disagg_prefill_workflow.jpg) From 60c872d4b665786ce4b4e1e9b82bacc0ca8e8cc2 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Fri, 14 Mar 2025 11:33:12 +0800 Subject: [PATCH 002/169] [Doc] Fix small typo in Transformers fallback (#14791) Signed-off-by: Chen Zhang --- docs/source/models/supported_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index bcbd7bf9600c5..3d42d5f6b529e 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -101,7 +101,7 @@ class MyAttention(nn.Module): def forward(self, hidden_states, **kwargs): # <- kwargs are required ... 
- attention_interface = attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( self, query_states, From 7888e1d0a3eb83becda81bbad5e9848dbb598453 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-redhat@users.noreply.github.com> Date: Thu, 13 Mar 2025 23:40:05 -0400 Subject: [PATCH 003/169] [V1] TPU - Enable prefix caching by default (#14773) --- vllm/platforms/tpu.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index fc68e5d63a6e5..8e2c28d9327b5 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -108,12 +108,6 @@ class TpuPlatform(Platform): parallel_config.worker_cls = \ "vllm.worker.tpu_worker.TPUWorker" - # Adjust scheduler config for V1 - # TODO: Add support for these - if envs.VLLM_USE_V1 and vllm_config.cache_config.enable_prefix_caching: - logger.warning("[V1][TPU] Disable prefix caching") - vllm_config.cache_config.enable_prefix_caching = False - assert not vllm_config.speculative_config, ( "Speculative decoding is not yet supported for TPU backend") From 2a602b055a180c982126e8a438d188325fdb01a5 Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Thu, 13 Mar 2025 20:40:15 -0700 Subject: [PATCH 004/169] forward fix PR 14245, restore build on ROCm 6.2 (#14709) Signed-off-by: Jeff Daily --- csrc/quantization/fp8/amd/quant_utils.cuh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/csrc/quantization/fp8/amd/quant_utils.cuh b/csrc/quantization/fp8/amd/quant_utils.cuh index f01427cc3d0ca..feda497d0210e 100644 --- a/csrc/quantization/fp8/amd/quant_utils.cuh +++ b/csrc/quantization/fp8/amd/quant_utils.cuh @@ -19,12 +19,24 @@ __device__ __forceinline__ fp8_type cvt_c10(float const r) { return {}; } +// __hip_fp8_e4m3 only exists starting in ROCm 6.3. 
The macro +// HIP_FP8_TYPE_OCP comes from the hip_fp8.h header and also makes +// its first appearance in ROCm 6.3. Since VLLM_DISPATCH_FP8_TYPES +// on ROCm instantiates both OCP and FNUZ kernels, we need to replace +// the new HW cvt with something reasonable that doesn't rely on the +// ROCm 6.3 feature. This allows compiling on ROCm 6.2 or newer. template <> __device__ __forceinline__ c10::Float8_e4m3fn cvt_c10(float const r) { + #if HIP_FP8_TYPE_OCP return c10::Float8_e4m3fn( __hip_cvt_float_to_fp8(r, __hip_fp8_e4m3::__default_saturation, __hip_fp8_e4m3::__default_interpret), c10::Float8_e4m3fn::from_bits()); + #else + // Cast implemented by pytorch. Uses bit manipulation instead of HW cvt. + // HW cvt above is faster when it is available (ROCm 6.3 or newer). + return static_cast(r); + #endif } template <> From ad19c8a00313e9d3f6016ac16333ce0d817a9c2a Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Thu, 13 Mar 2025 20:40:23 -0700 Subject: [PATCH 005/169] [V1] Move OOM check into sampler run (#14728) Signed-off-by: Roger Wang Co-authored-by: Simon Mo --- vllm/v1/worker/gpu_model_runner.py | 15 ++++++++++++--- vllm/v1/worker/gpu_worker.py | 20 +++++--------------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index df7ca70924bf5..c2a976108e4d4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1288,9 +1288,18 @@ class GPUModelRunner(LoRAModelRunnerMixin): allowed_token_ids_mask=None, bad_words_token_ids={}, ) - sampler_output = self.model.sample(logits=logits, - sampling_metadata=dummy_metadata) - + try: + sampler_output = self.model.sample( + logits=logits, sampling_metadata=dummy_metadata) + except RuntimeError as e: + if 'out of memory' in str(e): + raise RuntimeError( + "CUDA out of memory occurred when warming up sampler with " + f"{num_reqs} dummy requests. 
Please try lowering " + "`max_num_seqs` or `gpu_memory_utilization` when " + "initializing the engine.") from e + else: + raise e return sampler_output def profile_run(self) -> None: diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 5527a105f8670..241869e35c620 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -221,21 +221,11 @@ class Worker(WorkerBase): # NOTE: This is called after `capture_model` on purpose to prevent # memory buffers from being cleared by `torch.cuda.empty_cache`. if get_pp_group().is_last_rank: - try: - max_num_reqs = min( - self.scheduler_config.max_num_seqs, - self.scheduler_config.max_num_batched_tokens) - self.model_runner._dummy_sampler_run( - hidden_states=self.model_runner._dummy_run( - num_tokens=max_num_reqs)) - except RuntimeError as e: - if 'out of memory' in str(e): - raise RuntimeError( - "CUDA out of memory occurred when warming up sampler. " - "Please try lowering `gpu_memory_utilization` when " - "initializing the engine.") from None - else: - raise e + max_num_reqs = min(self.scheduler_config.max_num_seqs, + self.scheduler_config.max_num_batched_tokens) + self.model_runner._dummy_sampler_run( + hidden_states=self.model_runner._dummy_run( + num_tokens=max_num_reqs)) # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. 
From 32ef4983cd029d613172dbcf1edf91e62920bbc8 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 13 Mar 2025 20:40:35 -0700 Subject: [PATCH 006/169] [V1] Temporarily disable FlashInfer Rejection Sampler (#14788) Signed-off-by: Woosuk Kwon --- vllm/v1/sample/ops/topk_topp_sampler.py | 2 +- vllm/v1/sample/rejection_sampler.py | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index 1bb950be822c1..7d70e839b6f4e 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -22,7 +22,7 @@ class TopKTopPSampler(nn.Module): def __init__(self): super().__init__() - if current_platform.is_cuda: + if current_platform.is_cuda(): if is_flashinfer_available: if envs.VLLM_USE_FLASHINFER_SAMPLER is not False: # NOTE(woosuk): The V0 sampler doesn't use FlashInfer for diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index 80a4b24186ab7..ea7f3353c115f 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -24,9 +24,18 @@ class RejectionSampler(nn.Module): def __init__(self): super().__init__() - if current_platform.is_cuda: + if current_platform.is_cuda(): if is_flashinfer_available: if envs.VLLM_USE_FLASHINFER_SAMPLER is not False: + # FIXME(woosuk): Currently, we have errors when using + # FlashInfer for rejection sampling. As a workaround, we + # disable FlashInfer for rejection sampling by default. + logger.info("Currently, FlashInfer rejection sampler is " + "disabled because of a bug. Falling back to " + "the PyTorch-native implementation of " + "rejection sampling.") + self.forward_method = self.forward_native + # NOTE(woosuk): The V0 sampler doesn't use FlashInfer for # sampling unless VLLM_USE_FLASHINFER_SAMPLER=1 (i.e., by # default it is unused). 
For backward compatibility, we set @@ -35,8 +44,8 @@ class RejectionSampler(nn.Module): # None means False, while in V1, None means True. This is # why we use the condition # `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here. - logger.info("Using FlashInfer for rejection sampling.") - self.forward_method = self.flashinfer_sample + # logger.info("Using FlashInfer for rejection sampling.") + # self.forward_method = self.flashinfer_sample else: logger.warning( "FlashInfer is available, but it is not enabled. " From 0b1cfa61806d8d0f3e3d8d6f8303d0fe644f329e Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Thu, 13 Mar 2025 23:42:04 -0400 Subject: [PATCH 007/169] [Kernel] LoRA - Enable CUDAGraphs for V1 (#14626) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- tests/lora/test_worker.py | 1 + vllm/config.py | 25 +++++++++++++++++++------ vllm/lora/layers.py | 15 +++++++++------ vllm/lora/punica_wrapper/punica_gpu.py | 8 ++++++-- 4 files changed, 35 insertions(+), 14 deletions(-) diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index fc1be4ed440a6..30b74ce3ef70e 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -52,6 +52,7 @@ def test_worker_apply_lora(sql_lora_files): seed=0, dtype="float16", revision=None, + enforce_eager=True, ), load_config=LoadConfig( download_dir=None, diff --git a/vllm/config.py b/vllm/config.py index 35411ca73ad23..429ec0dd51c13 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2287,9 +2287,14 @@ class LoRAConfig: excluding anything before input ids/embeddings and after the final hidden states. """ - # no factors to consider. - # LoRA is not compatible with `torch.compile` . 
factors: list[Any] = [] + factors.append(self.max_lora_rank) + factors.append(self.max_loras) + factors.append(self.fully_sharded_loras) + factors.append(self.lora_dtype) + factors.append(self.lora_extra_vocab_size) + factors.append(self.long_lora_scaling_factors) + factors.append(self.bias_enabled) hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -3303,6 +3308,11 @@ class VllmConfig: vllm_factors.append("None") if self.lora_config: vllm_factors.append(self.lora_config.compute_hash()) + # LoRA creates static buffers based on max_num_batched_tokens. + # The tensor sizes and strides get captured in the torch.compile + # graph explicitly. + vllm_factors.append( + str(self.scheduler_config.max_num_batched_tokens)) else: vllm_factors.append("None") if self.speculative_config: @@ -3453,12 +3463,15 @@ class VllmConfig: " Disabling `torch.compile`.") self.compilation_config.level = CompilationLevel.NO_COMPILATION - if self.lora_config is not None and self.compilation_config.level !=\ - CompilationLevel.NO_COMPILATION: - logger.warning("LoRA is not supported with `torch.compile` yet. " - "Disabling `torch.compile`.") + if ((not envs.VLLM_USE_V1) and self.lora_config is not None + and self.compilation_config.level + != CompilationLevel.NO_COMPILATION): + logger.warning( + "LoRA for V0 is not supported with `torch.compile` yet. 
" + "Disabling `torch.compile`.") self.compilation_config.level = CompilationLevel.NO_COMPILATION + if self.model_config and self.model_config.use_mla and \ not (current_platform.is_cuda() or current_platform.is_rocm()): logger.info( diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 1c1f76702ddbc..7a9d5237ab754 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -237,16 +237,19 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA): self.embeddings_weights[:embeddings.shape[0]].copy_(embeddings) def forward(self, x: torch.Tensor) -> torch.Tensor: - added_tokens_mask = x > self.base_layer.org_vocab_size - 1 - embeddings_indices = self.punica_wrapper.embeddings_indices - indices = embeddings_indices[1].view_as(x) + added_tokens_mask = torch.where(x > self.base_layer.org_vocab_size - 1, + 1, 0) + embeddings_indices = torch.narrow( + self.punica_wrapper._embeddings_indices, 1, 0, x.size(0)) + + indices = embeddings_indices[1] full_lora_a_embeddings = F.embedding( x + indices, self.lora_a_stacked_2d, ) - indices = embeddings_indices[0].view_as(x) - full_output = self.base_layer.forward( - x.add_(indices * added_tokens_mask)) + indices = embeddings_indices[0] + full_output = self.base_layer.forward(x + + (indices * added_tokens_mask)) full_output_org = full_output if full_output.ndim == 3: diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index 3a4fcd04dbeb6..19a94eea910c4 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -254,7 +254,9 @@ class PunicaWrapperGPU(PunicaWrapperBase, V1KernelMixin): y_org = y y = y.view(-1, y.shape[-1]) if lora_bias_stacked is not None: - self._apply_bias(self.token_lora_indices, y, output_slices, + token_lora_indices = torch.narrow(self._token_lora_indices, 0, 0, + y.size(0)) + self._apply_bias(token_lora_indices, y, output_slices, lora_bias_stacked) if env.VLLM_USE_V1: @@ -365,7 +367,9 @@ class 
PunicaWrapperGPU(PunicaWrapperBase, V1KernelMixin): assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) if lora_bias_stacked is not None: assert len(lora_bias_stacked) == len(output_slices) - y = self._apply_bias(self.token_lora_indices, y, output_slices, + token_lora_indices = torch.narrow(self._token_lora_indices, 0, 0, + y.size(0)) + y = self._apply_bias(token_lora_indices, y, output_slices, lora_bias_stacked) if buffer is None: From fb4c7f8ef0163cd12a3368a34691464271c0274d Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Fri, 14 Mar 2025 04:42:27 +0100 Subject: [PATCH 008/169] [Kernel] [V1] Further optimizations to ROCm (Triton) Backend to better handle GQA. (#14431) Signed-off-by: Thomas Parnell Co-authored-by: Jan van Lunteren Co-authored-by: Burkhard Ringlein Co-authored-by: Chih-Chieh Yang --- .../ops/chunked_prefill_paged_decode.py | 103 +++++++++++------- 1 file changed, 63 insertions(+), 40 deletions(-) diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py index 16d67e3abe848..48db3ebfd7412 100644 --- a/vllm/attention/ops/chunked_prefill_paged_decode.py +++ b/vllm/attention/ops/chunked_prefill_paged_decode.py @@ -1,9 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # Authors: -# - Burkhard Ringlein -# - Jan van Lunteren -# - Thomas Parnell +# - Burkhard Ringlein +# - Jan van Lunteren +# - Chih-Chieh Yang +# - Thomas Parnell import torch import triton @@ -31,6 +32,7 @@ def kernel_paged_attention_2d( v_scale, # float32 num_query_heads: tl.constexpr, # int num_queries_per_kv: tl.constexpr, # int + num_queries_per_kv_padded: tl.constexpr, # int block_table_stride: tl.constexpr, # int query_stride_0: tl.constexpr, # int query_stride_1: tl.constexpr, # int, should be equal to head_size @@ -55,8 +57,7 @@ def kernel_paged_attention_2d( query_start_len_ptr, # [num_seqs+1] ): seq_idx = tl.program_id(0) - query_head_idx = tl.program_id(1) - kv_head_idx = query_head_idx // 
num_queries_per_kv + kv_head_idx = tl.program_id(1) if filter_by_query_len: cur_batch_in_all_start_index = tl.load(query_start_len_ptr + seq_idx) @@ -69,31 +70,40 @@ def kernel_paged_attention_2d( else: cur_batch_in_all_start_index = seq_idx + query_head_idx = kv_head_idx * num_queries_per_kv + tl.arange( + 0, num_queries_per_kv_padded) + query_offset = (cur_batch_in_all_start_index * query_stride_0 + - query_head_idx * query_stride_1) + query_head_idx[:, None] * query_stride_1) + + head_mask = query_head_idx < (kv_head_idx + 1) * num_queries_per_kv + head_mask = head_mask & (query_head_idx < num_query_heads) dim_mask = tl.where(tl.arange(0, HEAD_SIZE_PADDED) < HEAD_SIZE, 1, 0).to(tl.int1) - # Q : (HEAD_SIZE,) + # Q : (num_queries_per_kv, HEAD_SIZE,) Q = tl.load( - query_ptr + query_offset + tl.arange(0, HEAD_SIZE_PADDED), - mask=dim_mask, + query_ptr + query_offset + tl.arange(0, HEAD_SIZE_PADDED)[None, :], + mask=dim_mask[None, :] & head_mask[:, None], other=0.0, ) block_table_offset = seq_idx * block_table_stride - M = tl.full([1], float("-inf"), dtype=tl.float32) - L = tl.full([1], 1.0, dtype=tl.float32) - acc = tl.zeros([HEAD_SIZE_PADDED], dtype=tl.float32) + M = tl.full([num_queries_per_kv_padded], float("-inf"), dtype=tl.float32) + L = tl.full([num_queries_per_kv_padded], 1.0, dtype=tl.float32) + acc = tl.zeros([num_queries_per_kv_padded, HEAD_SIZE_PADDED], + dtype=tl.float32) # sequence len for this particular sequence seq_len = tl.load(seq_lens_ptr + seq_idx) # alibi slope for this head if USE_ALIBI_SLOPES: - alibi_slope = tl.load(alibi_slopes_ptr + query_head_idx) + alibi_slope = tl.load(alibi_slopes_ptr + query_head_idx, + mask=head_mask, + other=0.0) num_blocks = cdiv_fn(seq_len, BLOCK_SIZE) @@ -107,8 +117,8 @@ def kernel_paged_attention_2d( v_offset = (physical_block_idx * stride_v_cache_0 + kv_head_idx * stride_v_cache_1 + - offs_d[:, None] * stride_v_cache_2 + - offs_n[None, :] * stride_v_cache_3) + offs_d[None, :] * stride_v_cache_2 + + offs_n[:, 
None] * stride_v_cache_3) k_offset = (physical_block_idx * stride_k_cache_0 + kv_head_idx * stride_k_cache_1 + @@ -126,9 +136,9 @@ def kernel_paged_attention_2d( else: K = K_load - # V : (HEAD_SIZE, BLOCK_SIZE) + # V : (BLOCK_SIZE, HEAD_SIZE) V_load = tl.load(value_cache_ptr + v_offset, - mask=dim_mask[:, None], + mask=dim_mask[None, :], other=0.0) if V_load.dtype.is_fp8(): @@ -136,51 +146,59 @@ def kernel_paged_attention_2d( else: V = V_load - tmp = j * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + seq_offset = j * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) boundary = tl.full([BLOCK_SIZE], seq_len, dtype=tl.int32) - mask_new = tmp < boundary - # S : (BLOCK_SIZE,) - S = tl.where(mask_new, 0.0, float("-inf")).to(tl.float32) - S += scale * tl.sum(K * Q[:, None], axis=0) + seq_mask = seq_offset[None, :] < boundary + + # S : (num_queries_per_kv, BLOCK_SIZE,) + S = tl.where(head_mask[:, None] & seq_mask, 0.0, + float("-inf")).to(tl.float32) + S += scale * tl.dot(Q, K) + + context_len = seq_len - 1 if SLIDING_WINDOW > 0: - S = tl.where((seq_len - 1 - tmp) < SLIDING_WINDOW, S, -10000) + S = tl.where((context_len - seq_offset) < SLIDING_WINDOW, S, + -10000) if USE_ALIBI_SLOPES: - S += alibi_slope * (tmp - seq_len + 1) + S += alibi_slope[:, None] * (seq_offset - context_len) # compute running maximum - # m_j : (1,) - m_j = tl.maximum(M, tl.max(S, axis=0)) + # m_j : (num_queries_per_kv,) + m_j = tl.maximum(M, tl.max(S, axis=1)) - # P : (BLOCK_SIZE,) - P = tl.exp(S - m_j) + # P : (num_queries_per_kv, BLOCK_SIZE,) + P = tl.exp(S - m_j[:, None]) - # l_j : (1,) - l_j = tl.sum(P, axis=0) + # l_j : (num_queries_per_kv,) + l_j = tl.sum(P, axis=1) - # alpha : (1, ) + # alpha : (num_queries_per_kv, ) alpha = tl.exp(M - m_j) - # acc : (BLOCK_SIZE,) - acc = acc * alpha + # acc : (num_queries_per_kv, BLOCK_SIZE,) + acc = acc * alpha[:, None] # update constants L = L * alpha + l_j M = m_j - # acc : (BLOCK_SIZE,) - acc += tl.sum(V * P[None, :], axis=1) + # acc : (num_queries_per_kv, BLOCK_SIZE,) + 
acc += tl.dot(P.to(V.dtype), V) # epilogue - acc = acc / L + acc = acc / L[:, None] output_offset = (cur_batch_in_all_start_index * output_stride_0 + query_head_idx * output_stride_1) - tl.store(output_ptr + output_offset + tl.arange(0, HEAD_SIZE_PADDED), - acc, - mask=dim_mask) + tl.store( + output_ptr + output_offset[:, None] + + tl.arange(0, HEAD_SIZE_PADDED)[None, :], + acc, + mask=dim_mask[None, :] & head_mask[:, None], + ) def chunked_prefill_paged_decode( @@ -234,6 +252,7 @@ def chunked_prefill_paged_decode( block_size = value_cache.shape[3] num_seqs = len(seq_lens) num_query_heads = query.shape[1] + num_kv_heads = key.shape[1] num_queries_per_kv = query.shape[1] // key.shape[1] head_size = query.shape[2] @@ -253,9 +272,12 @@ def chunked_prefill_paged_decode( key_cache = key_cache.view(target_dtype) value_cache = value_cache.view(target_dtype) + num_queries_per_kv_padded = max(triton.next_power_of_2(num_queries_per_kv), + 16) + kernel_paged_attention_2d[( num_seqs, - num_query_heads, + num_kv_heads, )]( output_ptr=output, query_ptr=query, @@ -269,6 +291,7 @@ def chunked_prefill_paged_decode( v_scale=v_scale, num_query_heads=num_query_heads, num_queries_per_kv=num_queries_per_kv, + num_queries_per_kv_padded=num_queries_per_kv_padded, block_table_stride=block_table.stride(0), query_stride_0=query.stride(0), query_stride_1=query.stride(1), From 95d680b8620b4f1a087a2857142a05c15c45aceb Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 14 Mar 2025 11:43:18 +0800 Subject: [PATCH 009/169] [Bugfix][IPEX] Add `VLLM_CPU_MOE_PREPACK` to allow disabling MoE prepack when CPU does not support it (#14681) Signed-off-by: Thien Tran --- docs/source/getting_started/installation/cpu.md | 1 + vllm/envs.py | 7 +++++++ vllm/model_executor/layers/fused_moe/layer.py | 3 ++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/source/getting_started/installation/cpu.md b/docs/source/getting_started/installation/cpu.md index 9ca25e4709e86..43c9187f072e1 100644 --- 
a/docs/source/getting_started/installation/cpu.md +++ b/docs/source/getting_started/installation/cpu.md @@ -195,6 +195,7 @@ vLLM CPU backend supports the following vLLM features: - `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. - `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. +- `VLLM_CPU_MOE_PREPACK`: whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False). ## Performance tips diff --git a/vllm/envs.py b/vllm/envs.py index 24ee4583c75d8..259501056cc3b 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -40,6 +40,7 @@ if TYPE_CHECKING: VLLM_PP_LAYER_PARTITION: Optional[str] = None VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_CPU_OMP_THREADS_BIND: str = "" + VLLM_CPU_MOE_PREPACK: bool = True VLLM_OPENVINO_DEVICE: str = "CPU" VLLM_OPENVINO_KVCACHE_SPACE: int = 0 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None @@ -349,6 +350,12 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_CPU_OMP_THREADS_BIND": lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "all"), + # (CPU backend only) whether to use prepack for MoE layer. This will be + # passed to ipex.llm.modules.GatedMLPMOE. On unsupported CPUs, you might + # need to set this to "0" (False). 
+ "VLLM_CPU_MOE_PREPACK": + lambda: bool(int(os.getenv("VLLM_CPU_MOE_PREPACK", "1"))), + # OpenVINO device selection # default is CPU "VLLM_OPENVINO_DEVICE": diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 2c5fa509c595d..917643134645f 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -7,6 +7,7 @@ from typing import Callable, List, Optional, Tuple import torch from torch.nn.parameter import UninitializedParameter +from vllm import envs from vllm.config import get_current_vllm_config from vllm.distributed import (get_dp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -104,7 +105,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE( layer.w13_weight, layer.w2_weight, - use_prepack=True, + use_prepack=envs.VLLM_CPU_MOE_PREPACK, ) else: raise NotImplementedError("CPU MOE only supports x86 arch.") From f1f632d9eca4967f25d2f09a8b444724d2cda624 Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Thu, 13 Mar 2025 20:43:45 -0700 Subject: [PATCH 010/169] [ci] Reduce number of tests in fastcheck (#14782) --- .buildkite/test-pipeline.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 2af76cb24dd14..81a971390472d 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -41,7 +41,6 @@ steps: - grep \"sig sig-object py\" build/html/api/inference_params.html - label: Async Engine, Inputs, Utils, Worker Test # 24min - fast_check: true source_file_dependencies: - vllm/ - tests/mq_llm_engine @@ -126,7 +125,6 @@ steps: - label: Distributed Tests (4 GPUs) # 10min working_dir: "/vllm-workspace/tests" num_gpus: 4 - fast_check: true source_file_dependencies: - vllm/distributed/ - vllm/core/ @@ -152,7 +150,6 @@ steps: - label: Metrics, Tracing Test # 10min num_gpus: 2 - fast_check: true source_file_dependencies: - vllm/ - tests/metrics @@ -284,7 +281,6 @@ steps: parallelism: 4 - label: PyTorch Fullgraph Smoke Test # 9min - fast_check: true source_file_dependencies: - vllm/ - tests/compile @@ -528,7 +524,6 @@ steps: - label: Plugin Tests (2 GPUs) # 40min working_dir: "/vllm-workspace/tests" num_gpus: 2 - fast_check: true source_file_dependencies: - vllm/plugins/ - tests/plugins/ From 4059adc31b7a58f3b14c34d8bdb2805191d8a067 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 13 Mar 2025 23:44:20 -0400 Subject: [PATCH 011/169] [Misc][Minor] Simplify `SamplingParams.__post_init__()` (#14772) Signed-off-by: Nick Hill --- vllm/sampling_params.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 110efa2298223..b0a5777cc8d56 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -338,29 +338,23 @@ class SamplingParams( if self.seed == -1: self.seed = None - else: - self.seed = self.seed if self.stop is None: self.stop = [] elif isinstance(self.stop, str): self.stop = 
[self.stop] - else: - self.stop = list(self.stop) if self.stop_token_ids is None: self.stop_token_ids = [] - else: - self.stop_token_ids = list(self.stop_token_ids) if self.bad_words is None: self.bad_words = [] - else: - self.bad_words = list(self.bad_words) - self.logprobs = 1 if self.logprobs is True else self.logprobs - self.prompt_logprobs = (1 if self.prompt_logprobs is True else - self.prompt_logprobs) + if self.logprobs is True: + self.logprobs = 1 + + if self.prompt_logprobs is True: + self.prompt_logprobs = 1 # Number of characters to hold back for stop string evaluation # until sequence is finished. From d3d4956261e864b492150251dec2d3d062293537 Mon Sep 17 00:00:00 2001 From: Liangfu Chen Date: Thu, 13 Mar 2025 20:46:56 -0700 Subject: [PATCH 012/169] [Neuron] flatten test parameterization for neuron attention kernels (#14712) --- .buildkite/run-neuron-test.sh | 2 +- tests/neuron/{ => 1_core}/test_activation.py | 0 tests/neuron/{ => 1_core}/test_block_table.py | 0 tests/neuron/{ => 1_core}/test_cache.py | 0 tests/neuron/{ => 1_core}/test_layernorm.py | 0 .../{ => 1_core}/test_logits_processor.py | 0 .../{ => 1_core}/test_prefix_prefill.py | 46 ++++++++++--------- .../{ => 1_core}/test_rotary_embedding.py | 0 tests/neuron/{ => 2_core}/test_comm_ops.py | 0 9 files changed, 26 insertions(+), 22 deletions(-) rename tests/neuron/{ => 1_core}/test_activation.py (100%) rename tests/neuron/{ => 1_core}/test_block_table.py (100%) rename tests/neuron/{ => 1_core}/test_cache.py (100%) rename tests/neuron/{ => 1_core}/test_layernorm.py (100%) rename tests/neuron/{ => 1_core}/test_logits_processor.py (100%) rename tests/neuron/{ => 1_core}/test_prefix_prefill.py (92%) rename tests/neuron/{ => 1_core}/test_rotary_embedding.py (100%) rename tests/neuron/{ => 2_core}/test_comm_ops.py (100%) diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index 55c374fcc33de..06924fea6195e 100644 --- a/.buildkite/run-neuron-test.sh +++ 
b/.buildkite/run-neuron-test.sh @@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ --name "${container_name}" \ ${image_name} \ - /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys" + /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys" diff --git a/tests/neuron/test_activation.py b/tests/neuron/1_core/test_activation.py similarity index 100% rename from tests/neuron/test_activation.py rename to tests/neuron/1_core/test_activation.py diff --git a/tests/neuron/test_block_table.py b/tests/neuron/1_core/test_block_table.py similarity index 100% rename from tests/neuron/test_block_table.py rename to tests/neuron/1_core/test_block_table.py diff --git a/tests/neuron/test_cache.py b/tests/neuron/1_core/test_cache.py similarity index 100% rename from tests/neuron/test_cache.py rename to tests/neuron/1_core/test_cache.py diff --git a/tests/neuron/test_layernorm.py b/tests/neuron/1_core/test_layernorm.py similarity index 100% rename from tests/neuron/test_layernorm.py rename to tests/neuron/1_core/test_layernorm.py diff --git a/tests/neuron/test_logits_processor.py b/tests/neuron/1_core/test_logits_processor.py similarity index 100% rename from tests/neuron/test_logits_processor.py rename to tests/neuron/1_core/test_logits_processor.py diff --git a/tests/neuron/test_prefix_prefill.py b/tests/neuron/1_core/test_prefix_prefill.py similarity index 92% rename from tests/neuron/test_prefix_prefill.py rename to tests/neuron/1_core/test_prefix_prefill.py index 2c6ac47888d51..326a1f82e9b30 100644 --- a/tests/neuron/test_prefix_prefill.py +++ b/tests/neuron/1_core/test_prefix_prefill.py @@ -292,28 
+292,32 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size, @pytest.mark.parametrize( - "prefill_batch_size,decode_batch_size,block_size,large_tile_size", + "prefill_batch_size,decode_batch_size,block_size,large_tile_size,num_heads,num_queries_per_kv,head_size,mixed_precision", [ - (1, 199, 1, 512), # 512 blocks - (4, 12, 256, 2048), # 128 blocks - (4, 12, 16, 2048), # 128 blocks - (4, 12, 4, 1024), # 256 blocks - (4, 12, 32, 2048), # 64 blocks - (4, 12, 32, 4096), # 128 blocks - (4, 12, 32, 8192), # 256 blocks - (4, 12, 64, 8192), # 128 blocks - ], -) -@pytest.mark.parametrize( - "num_heads,num_queries_per_kv,head_size", - [ - (4, 2, 8), - (32, 8, 64), - (4, 4, 128), - (8, 1, 32), - ], -) -@pytest.mark.parametrize("mixed_precision", [True, False]) + # Test minimal configurations (small block size) + (1, 199, 1, 512, 4, 2, 8, False + ), # minimal block size, small dimensions + (1, 199, 1, 512, 4, 2, 8, True), # same with mixed precision + + # Test common/medium configurations + (4, 12, 32, 2048, 32, 8, 64, False), # common case, larger heads + (4, 12, 32, 2048, 16, 4, 32, + True), # medium size, mixed precision, grouped-query attention (GQA) + + # Test large configurations + (4, 12, 256, 8192, 8, 1, 128, False), # large blocks, large head size + (4, 12, 256, 8192, 64, 8, 64, True), # large blocks, many heads + + # Test asymmetric configurations + (2, 24, 64, 4096, 12, 4, 96, False), # varied batch sizes + (8, 8, 128, 2048, 24, 2, 48, True), # balanced batches + + # Test edge cases + (1, 128, 16, 1024, 4, 2, 16, False), # large decode batch + (16, 4, 8, 8192, 48, 1, 128, True), # large prefill batch + (4, 12, 32, 2048, 16, 1, 32, True), # multi-head attention (MHA) + (4, 12, 32, 2048, 16, 16, 32, True), # multi-query attention (MQA) + ]) @torch.inference_mode() def test_contexted_kv_attention( prefill_batch_size: int, diff --git a/tests/neuron/test_rotary_embedding.py b/tests/neuron/1_core/test_rotary_embedding.py similarity index 100% 
rename from tests/neuron/test_rotary_embedding.py rename to tests/neuron/1_core/test_rotary_embedding.py diff --git a/tests/neuron/test_comm_ops.py b/tests/neuron/2_core/test_comm_ops.py similarity index 100% rename from tests/neuron/test_comm_ops.py rename to tests/neuron/2_core/test_comm_ops.py From a6e0d096dd48d1190fb548dd81fdd31f310391e3 Mon Sep 17 00:00:00 2001 From: Jennifer Zhao Date: Thu, 13 Mar 2025 21:07:54 -0700 Subject: [PATCH 013/169] [Feature] Add visionarena offline support for benchmark_throughput (#14654) Signed-off-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com> Signed-off-by: Jennifer Zhao Co-authored-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com> Co-authored-by: Jennifer Zhao Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> --- benchmarks/README.md | 58 +++++-- benchmarks/benchmark_dataset.py | 65 ++++--- benchmarks/benchmark_throughput.py | 267 ++++++++++++++++++++++------- 3 files changed, 291 insertions(+), 99 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index edc10d8b43eeb..c64c24fd3ad05 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -43,20 +43,26 @@ become available. HuggingFace βœ… - 🚧 + 🟑 Specify your dataset path on HuggingFace VisionArena βœ… - 🚧 + βœ… lmarena-ai/vision-arena-bench-v0.1 (a HuggingFace dataset) -βœ…: supported + +βœ…: supported + 🚧: to be supported +🟑: Partial support. Currently, HuggingFaceDataset only supports dataset formats +similar to `lmms-lab/LLaVA-OneVision-Data`. If you need support for other dataset +formats, please consider contributing. 
+ **Note**: VisionArena’s `dataset-name` should be set to `hf` --- @@ -79,7 +85,7 @@ NUM_PROMPTS=10 BACKEND="openai-chat" DATASET_NAME="sharegpt" DATASET_PATH="/ShareGPT_V3_unfiltered_cleaned_split.json" -python3 benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/chat/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS} +python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/chat/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS} ``` If successful, you will see the following output @@ -123,7 +129,7 @@ DATASET_NAME="hf" DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1" DATASET_SPLIT='train' -python3 benchmarks/benchmark_serving.py \ +python3 vllm/benchmarks/benchmark_serving.py \ --backend "${BACKEND}" \ --model "${MODEL_NAME}" \ --endpoint "/v1/chat/completions" \ @@ -140,35 +146,65 @@ python3 benchmarks/benchmark_serving.py \ MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B" NUM_PROMPTS=10 DATASET_NAME="sonnet" -DATASET_PATH="benchmarks/sonnet.txt" +DATASET_PATH="vllm/benchmarks/sonnet.txt" -python3 benchmarks/benchmark_throughput.py \ +python3 vllm/benchmarks/benchmark_throughput.py \ --model "${MODEL_NAME}" \ --dataset-name "${DATASET_NAME}" \ --dataset-path "${DATASET_PATH}" \ --num-prompts "${NUM_PROMPTS}" - ``` +``` If successful, you will see the following output ``` -Throughput: 7.35 requests/s, 4789.20 total tokens/s, 1102.83 output tokens/s +Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s +Total num prompt tokens: 5014 +Total num output tokens: 1500 +``` + +### VisionArena Benchmark for Vision Language Models + +``` bash +MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct" +NUM_PROMPTS=10 +DATASET_NAME="hf" +DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1" +DATASET_SPLIT="train" + +python3 vllm/benchmarks/benchmark_throughput.py \ + --model 
"${MODEL_NAME}" \ + --backend "vllm-chat" \ + --dataset-name "${DATASET_NAME}" \ + --dataset-path "${DATASET_PATH}" \ + --num-prompts "${NUM_PROMPTS}" \ + --hf-split "${DATASET_SPLIT}" +``` + +The `num prompt tokens` now includes image token counts + +``` +Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s +Total num prompt tokens: 14527 +Total num output tokens: 1280 ``` ### Benchmark with LoRA Adapters ``` bash +# download dataset +# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json MODEL_NAME="meta-llama/Llama-2-7b-hf" BACKEND="vllm" DATASET_NAME="sharegpt" -DATASET_PATH="/home/jovyan/data/vllm_benchmark_datasets/ShareGPT_V3_unfiltered_cleaned_split.json" +DATASET_PATH="/ShareGPT_V3_unfiltered_cleaned_split.json" NUM_PROMPTS=10 MAX_LORAS=2 MAX_LORA_RANK=8 ENABLE_LORA="--enable-lora" LORA_PATH="yard1/llama-2-7b-sql-lora-test" -python3 benchmarks/benchmark_throughput.py \ +python3 vllm/benchmarks/benchmark_throughput.py \ --model "${MODEL_NAME}" \ --backend "${BACKEND}" \ --dataset_path "${DATASET_PATH}" \ diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 30fffdda491d0..55109dab00035 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -46,7 +46,7 @@ class SampleRequest: Represents a single inference request for benchmarking. """ - prompt: str + prompt: Union[str, Any] prompt_len: int expected_output_len: int multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None @@ -84,6 +84,20 @@ class BenchmarkDataset(ABC): if random_seed is not None else self.DEFAULT_SEED) self.data = None + def apply_multimodal_chat_transformation( + self, + prompt: str, + mm_content: Optional[MultiModalDataDict] = None) -> list[dict]: + """ + Transform a prompt and optional multimodal content into a chat format. + This method is used for chat models that expect a specific + conversation format. 
+ """ + content = [{"text": prompt, "type": "text"}] + if mm_content is not None: + content.append(mm_content) + return [{"role": "user", "content": content}] + def load_data(self) -> None: """ Load data from the dataset path into self.data. @@ -338,6 +352,7 @@ class ShareGPTDataset(BenchmarkDataset): lora_path: Optional[str] = None, max_loras: Optional[int] = None, output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, **kwargs) -> list: samples: list = [] for entry in self.data: @@ -358,6 +373,9 @@ class ShareGPTDataset(BenchmarkDataset): skip_min_output_len_check=output_len is not None): continue + if enable_multimodal_chat: + prompt = self.apply_multimodal_chat_transformation( + prompt, None) samples.append( SampleRequest( prompt=prompt, @@ -550,10 +568,13 @@ class HuggingFaceDataset(BenchmarkDataset): split=self.dataset_split, streaming=True, ) - - if "conversations" not in self.data.features: - raise ValueError("HF Dataset must have a 'conversations' column.") - + if self.data.features is None or "conversations" \ + not in self.data.features: + raise ValueError( + "HuggingFaceDataset currently only supports datasets with " + "a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. " + "Please consider contributing if you would like to add " + "support for additional dataset formats.") # Shuffle and filter examples with at least 2 conversations. 
self.data = self.data.shuffle(seed=self.random_seed).filter( lambda x: len(x["conversations"]) >= 2) @@ -561,9 +582,8 @@ class HuggingFaceDataset(BenchmarkDataset): def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, - lora_path: Optional[str] = None, - max_loras: Optional[int] = None, output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, **kwargs) -> list: sampled_requests = [] dynamic_output = output_len is None @@ -571,13 +591,9 @@ class HuggingFaceDataset(BenchmarkDataset): for item in self.data: if len(sampled_requests) >= num_requests: break - conv = item["conversations"] prompt, completion = conv[0]["value"], conv[1]["value"] - lora_request, tokenizer = self.get_random_lora_request( - tokenizer, lora_path=lora_path, max_loras=max_loras) - prompt_ids = tokenizer(prompt).input_ids completion_ids = tokenizer(completion).input_ids prompt_len = len(prompt_ids) @@ -587,16 +603,20 @@ class HuggingFaceDataset(BenchmarkDataset): if dynamic_output and not is_valid_sequence( prompt_len, completion_len): continue - mm_content = process_image( item["image"]) if "image" in item else None + if enable_multimodal_chat: + # Note: when chat is enabled the request prompt_len is no longer + # accurate and we will be using request output to count the + # actual prompt len and output len + prompt = self.apply_multimodal_chat_transformation( + prompt, mm_content) sampled_requests.append( SampleRequest( prompt=prompt, prompt_len=prompt_len, expected_output_len=output_len, multi_modal_data=mm_content, - lora_request=lora_request, )) return sampled_requests @@ -606,7 +626,7 @@ class HuggingFaceDataset(BenchmarkDataset): # ----------------------------------------------------------------------------- -class VisionArenaDataset(BenchmarkDataset): +class VisionArenaDataset(HuggingFaceDataset): """ Vision Arena Dataset. 
""" @@ -617,14 +637,9 @@ class VisionArenaDataset(BenchmarkDataset): def __init__( self, - dataset_split: str, - dataset_subset: Optional[str] = None, **kwargs, ) -> None: super().__init__(**kwargs) - self.dataset_split = dataset_split - self.dataset_subset = dataset_subset - if self.dataset_path != self.VISION_ARENA_DATASET_PATH: raise ValueError(f"Only support Vision Arena dataset.\ This data path {self.dataset_path} is not valid.") @@ -645,9 +660,9 @@ class VisionArenaDataset(BenchmarkDataset): def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, - output_len: int = DEFAULT_OUTPUT_LEN, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, **kwargs) -> list: - # TODO (jenniferzhao): Add support for offline benchmark sampling output_len = (output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN) sampled_requests = [] @@ -655,8 +670,14 @@ class VisionArenaDataset(BenchmarkDataset): if len(sampled_requests) >= num_requests: break prompt = item["turns"][0][0]["content"] - prompt_len = len(tokenizer(prompt).input_ids) mm_content = process_image(item["images"][0]) + prompt_len = len(tokenizer(prompt).input_ids) + if enable_multimodal_chat: + # Note: when chat is enabled the request prompt_len is no longer + # accurate and we will be using request output to count the + # actual prompt len + prompt = self.apply_multimodal_chat_transformation( + prompt, mm_content) sampled_requests.append( SampleRequest( prompt=prompt, diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 7e6556733b288..53869db478c51 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -11,8 +11,9 @@ from typing import Any, Optional, Union import torch import uvloop -from benchmark_dataset import (BurstGPTDataset, RandomDataset, SampleRequest, - ShareGPTDataset, SonnetDataset) +from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset, + RandomDataset, SampleRequest, 
ShareGPTDataset, + SonnetDataset, VisionArenaDataset) from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, @@ -23,6 +24,7 @@ from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args) from vllm.inputs import TextPrompt, TokensPrompt from vllm.lora.request import LoRARequest +from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams from vllm.utils import FlexibleArgumentParser, merge_async_iterators @@ -32,7 +34,7 @@ def run_vllm( n: int, engine_args: EngineArgs, disable_detokenize: bool = False, -) -> float: +) -> tuple[float, Optional[list[RequestOutput]]]: from vllm import LLM, SamplingParams llm = LLM(**dataclasses.asdict(engine_args)) assert all( @@ -66,12 +68,13 @@ def run_vllm( use_beam_search = False + outputs = None if not use_beam_search: start = time.perf_counter() - llm.generate(prompts, - sampling_params, - lora_request=lora_requests, - use_tqdm=True) + outputs = llm.generate(prompts, + sampling_params, + lora_request=lora_requests, + use_tqdm=True) end = time.perf_counter() else: assert lora_requests is None, "BeamSearch API does not support LoRA" @@ -89,7 +92,46 @@ def run_vllm( ignore_eos=True, )) end = time.perf_counter() - return end - start + return end - start, outputs + + +def run_vllm_chat( + requests: list[SampleRequest], + n: int, + engine_args: EngineArgs, + disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]: + """ + Run vLLM chat benchmark. This function is recommended ONLY for benchmarking + multimodal models as it properly handles multimodal inputs and chat + formatting. For non-multimodal models, use run_vllm() instead. 
+ """ + from vllm import LLM, SamplingParams + llm = LLM(**dataclasses.asdict(engine_args)) + + assert all( + llm.llm_engine.model_config.max_model_len >= ( + request.prompt_len + request.expected_output_len) + for request in requests), ( + "Please ensure that max_model_len is greater than the sum of " + "prompt_len and expected_output_len for all requests.") + + prompts = [] + sampling_params: list[SamplingParams] = [] + for request in requests: + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + detokenize=not disable_detokenize, + )) + start = time.perf_counter() + outputs = llm.chat(prompts, sampling_params, use_tqdm=True) + end = time.perf_counter() + return end - start, outputs async def run_vllm_async( @@ -264,6 +306,8 @@ def get_requests(args, tokenizer): dataset_cls = RandomDataset elif args.dataset_name == "sharegpt": dataset_cls = ShareGPTDataset + if args.backend == "vllm-chat": + sample_kwargs["enable_multimodal_chat"] = True elif args.dataset_name == "sonnet": assert tokenizer.chat_template or tokenizer.default_chat_template, ( "Tokenizer/model must have chat template for sonnet dataset.") @@ -272,6 +316,19 @@ def get_requests(args, tokenizer): sample_kwargs["return_prompt_formatted"] = True elif args.dataset_name == "burstgpt": dataset_cls = BurstGPTDataset + elif args.dataset_name == "hf": + if args.backend != "vllm-chat": + raise ValueError( + "hf datasets only are supported by vllm-chat backend") + # Choose between VisionArenaDataset and HuggingFaceDataset based on + # provided parameters. 
+ dataset_cls = (VisionArenaDataset if args.dataset_path + == VisionArenaDataset.VISION_ARENA_DATASET_PATH + and args.hf_subset is None else HuggingFaceDataset) + common_kwargs['dataset_subset'] = args.hf_subset + common_kwargs['dataset_split'] = args.hf_split + sample_kwargs["enable_multimodal_chat"] = True + else: raise ValueError(f"Unknown dataset name: {args.dataset_name}") # Remove None values @@ -290,6 +347,7 @@ def main(args: argparse.Namespace): requests = get_requests(args, tokenizer) is_multi_modal = any(request.multi_modal_data is not None for request in requests) + request_outputs: Optional[list[RequestOutput]] = None if args.backend == "vllm": if args.async_engine: elapsed_time = uvloop.run( @@ -301,9 +359,9 @@ def main(args: argparse.Namespace): args.disable_detokenize, )) else: - elapsed_time = run_vllm(requests, args.n, - EngineArgs.from_cli_args(args), - args.disable_detokenize) + elapsed_time, request_outputs = run_vllm( + requests, args.n, EngineArgs.from_cli_args(args), + args.disable_detokenize) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -312,20 +370,45 @@ def main(args: argparse.Namespace): elif args.backend == "mii": elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size, args.output_len) + elif args.backend == "vllm-chat": + elapsed_time, request_outputs = run_vllm_chat( + requests, args.n, EngineArgs.from_cli_args(args), + args.disable_detokenize) else: raise ValueError(f"Unknown backend: {args.backend}") - total_num_tokens = sum(request.prompt_len + request.expected_output_len - for request in requests) - total_output_tokens = sum(request.expected_output_len - for request in requests) - if is_multi_modal: - print("\033[91mWARNING\033[0m: Multi-modal request detected. The " + + if request_outputs: + # Note: with the vllm and vllm-chat backends, + # we have request_outputs, which we use to count tokens. 
+ total_prompt_tokens = 0 + total_output_tokens = 0 + for ro in request_outputs: + if not isinstance(ro, RequestOutput): + continue + total_prompt_tokens += len( + ro.prompt_token_ids) if ro.prompt_token_ids else 0 + total_output_tokens += sum( + len(o.token_ids) for o in ro.outputs if o) + total_num_tokens = total_prompt_tokens + total_output_tokens + else: + total_num_tokens = sum(r.prompt_len + r.expected_output_len + for r in requests) + total_output_tokens = sum(r.expected_output_len for r in requests) + total_prompt_tokens = total_num_tokens - total_output_tokens + + if is_multi_modal and args.backend != "vllm-chat": + print("\033[91mWARNING\033[0m: Multi-modal request with " + f"{args.backend} backend detected. The " "following metrics are not accurate because image tokens are not" " counted. See vllm-project/vllm/issues/9778 for details.") # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length. + # vllm-chat backend counts the image tokens now + print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " f"{total_output_tokens / elapsed_time:.2f} output tokens/s") + print(f"Total num prompt tokens: {total_prompt_tokens}") + print(f"Total num output tokens: {total_output_tokens}") # Output JSON results if specified if args.output_json: @@ -341,17 +424,100 @@ def main(args: argparse.Namespace): save_to_pytorch_benchmark_format(args, results) +def validate_args(args): + """ + Validate command-line arguments. + """ + + # === Deprecation and Defaulting === + if args.dataset is not None: + warnings.warn( + "The '--dataset' argument will be deprecated in the next release. 
" + "Please use '--dataset-name' and '--dataset-path' instead.", + stacklevel=2) + args.dataset_path = args.dataset + + if not getattr(args, "tokenizer", None): + args.tokenizer = args.model + + # === Backend Validation === + valid_backends = {"vllm", "hf", "mii", "vllm-chat"} + if args.backend not in valid_backends: + raise ValueError(f"Unsupported backend: {args.backend}") + + # === Dataset Configuration === + if not args.dataset and not args.dataset_path: + print( + "When dataset path is not set, it will default to random dataset") + args.dataset_name = 'random' + if args.input_len is None: + raise ValueError("input_len must be provided for a random dataset") + + # === Dataset Name Specific Checks === + # --hf-subset and --hf-split: only used + # when dataset_name is 'hf' + if args.dataset_name != "hf" and ( + getattr(args, "hf_subset", None) is not None + or getattr(args, "hf_split", None) is not None): + warnings.warn("--hf-subset and --hf-split will be ignored \ + since --dataset-name is not 'hf'.", + stacklevel=2) + elif args.dataset_name == "hf" and args.backend != "vllm-chat": + raise ValueError( + "When --dataset-name is 'hf', backend must be 'vllm-chat'") + + # --random-range-ratio: only used when dataset_name is 'random' + if args.dataset_name != 'random' and args.random_range_ratio is not None: + warnings.warn("--random-range-ratio will be ignored since \ + --dataset-name is not 'random'.", + stacklevel=2) + + # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not + # set. 
+ if args.dataset_name not in {"random", "sonnet", None + } and args.prefix_len is not None: + warnings.warn("--prefix-len will be ignored since --dataset-name\ + is not 'random', 'sonnet', or not set.", + stacklevel=2) + + # === LoRA Settings === + if getattr(args, "enable_lora", False) and args.backend != "vllm": + raise ValueError( + "LoRA benchmarking is only supported for vLLM backend") + if getattr(args, "enable_lora", False) and args.lora_path is None: + raise ValueError("LoRA path must be provided when enable_lora is True") + + # === Backend-specific Validations === + if args.backend == "hf" and args.hf_max_batch_size is None: + raise ValueError("HF max batch size is required for HF backend") + if args.backend != "hf" and args.hf_max_batch_size is not None: + raise ValueError("HF max batch size is only for HF backend.") + + if args.backend in {"hf", "mii"} and getattr(args, "quantization", + None) is not None: + raise ValueError("Quantization is only for vLLM backend.") + + if args.backend == "mii" and args.dtype != "auto": + raise ValueError("dtype must be auto for MII backend.") + if args.backend == "mii" and args.n != 1: + raise ValueError("n must be 1 for MII backend.") + if args.backend == "mii" and args.tokenizer != args.model: + raise ValueError( + "Tokenizer must be the same as the model for MII backend.") + + if __name__ == "__main__": parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser.add_argument("--backend", type=str, - choices=["vllm", "hf", "mii"], + choices=["vllm", "hf", "mii", "vllm-chat"], default="vllm") - parser.add_argument("--dataset-name", - type=str, - choices=["sharegpt", "random", "sonnet", "burstgpt"], - help="Name of the dataset to benchmark on.", - default="sharegpt") + parser.add_argument( + "--dataset-name", + type=str, + choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"], + help="Name of the dataset to benchmark on.", + default="sharegpt") parser.add_argument( "--dataset", type=str, @@ 
-419,55 +585,24 @@ if __name__ == "__main__": parser.add_argument( "--random-range-ratio", type=float, - default=1.0, + default=None, help="Range of sampled ratio of input/output length, " "used only for RandomDataSet.", ) + # hf dtaset + parser.add_argument("--hf-subset", + type=str, + default=None, + help="Subset of the HF dataset.") + parser.add_argument("--hf-split", + type=str, + default=None, + help="Split of the HF dataset.") + parser = AsyncEngineArgs.add_cli_args(parser) args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model - if args.dataset is not None: - warnings.warn( - "The '--dataset' argument will be deprecated in the next " - "release. Please use '--dataset-name' and " - "'--dataset-path' in the future runs.", - stacklevel=2) - args.dataset_path = args.dataset - if args.dataset is None and args.dataset_path is None: - # for random dataset, the default sampling setting is in - # benchmark_dataset.RandomDataset - print("When dataset is not set, it will default to random dataset") - else: - assert args.input_len is None - if args.enable_lora: - assert args.lora_path is not None - - if args.backend == "vllm": - if args.hf_max_batch_size is not None: - raise ValueError("HF max batch size is only for HF backend.") - elif args.backend == "hf": - if args.hf_max_batch_size is None: - raise ValueError("HF max batch size is required for HF backend.") - if args.quantization is not None: - raise ValueError("Quantization is only for vLLM backend.") - if args.enable_lora is not None: - raise ValueError("LoRA benchmarking is only supported for vLLM" - " backend") - elif args.backend == "mii": - if args.dtype != "auto": - raise ValueError("dtype must be auto for MII backend.") - if args.n != 1: - raise ValueError("n must be 1 for MII backend.") - if args.quantization is not None: - raise ValueError("Quantization is only for vLLM backend.") - if args.hf_max_batch_size is not None: - raise ValueError("HF max batch size is only for HF 
backend.") - if args.tokenizer != args.model: - raise ValueError("Tokenizer must be the same as the model for MII " - "backend.") - if args.enable_lora is not None: - raise ValueError("LoRA benchmarking is only supported for vLLM" - " backend") + validate_args(args) main(args) From 0c2af17c766f1e9c3e6827c0a04a07efe9e3ae9b Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Thu, 13 Mar 2025 22:52:15 -0700 Subject: [PATCH 014/169] [CI] Fix missing example model id in processor test (#14787) Signed-off-by: Roger Wang --- tests/models/multimodal/processing/test_common.py | 2 +- tests/models/registry.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 467114eedb01c..aef5db9bc06bb 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -215,7 +215,7 @@ def test_processing_correctness( # yapf: disable -@pytest.mark.parametrize("model_id", ["microsoft/Phi-3-vision-128k-instruct"]) +@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) @pytest.mark.parametrize("simplify_rate", [1.0]) diff --git a/tests/models/registry.py b/tests/models/registry.py index eadbd7e6f4927..372ea33ba9fdc 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -274,7 +274,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True), "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224", # noqa: E501 extras={"v2": "google/paligemma2-3b-ft-docci-448"}), # noqa: E501 - "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct", + "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-vision-instruct", trust_remote_code=True), "Phi4MMForCausalLM": 
_HfExamplesInfo("microsoft/Phi-4-multimodal-instruct", trust_remote_code=True), From 9532c49836ad9b5f2120ebba8caf0c56f998126f Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 14 Mar 2025 02:39:02 -0400 Subject: [PATCH 015/169] [Attention] MLA get rid of materialization (#14770) Signed-off-by: Lucas Wilkinson --- vllm/attention/backends/mla/common.py | 263 ++++------------- vllm/envs.py | 19 -- .../layers/quantization/utils/fp8_utils.py | 59 +--- vllm/v1/attention/backends/mla/common.py | 269 ++++-------------- 4 files changed, 114 insertions(+), 496 deletions(-) diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index fc5f3420e394d..ff411f75ae7ff 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -7,22 +7,22 @@ First we define: Sq as Q sequence length Skv as KV sequence length -MLA has two possible ways of computing, a data-movement friendly approach and a -compute friendly approach, we generally want to use the compute friendly -approach for "prefill" (i.e. the ratio Sq / Skv is "small", is near 1) -and the data-movement friendly approach for "decode" (i.e. the ratio -Sq / Skv is "large"). +MLA has two possible ways of computing, a data-movement friendly approach and a +compute friendly approach, we generally want to use the compute friendly +approach for "prefill" (i.e. the ratio Sq / Skv is "small", is near 1) +and the data-movement friendly approach for "decode" (i.e. the ratio +Sq / Skv is "large"). -NOTE what we deem small and large is currently determined by if its labelled -prefill or decode by the scheduler, but this is something we should probably +NOTE what we deem small and large is currently determined by if its labelled +prefill or decode by the scheduler, but this is something we should probably tune. Main reference: DeepseekV2 paper, and FlashInfer Implementation (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551). 
Deepseek's MLA attention works the following way: -* Use a single latent vector to represent the per-token entry of the KV cache. -* For decode (i.e. the memory friendly approach) the attention "simulates" a +* Use a single latent vector to represent the per-token entry of the KV cache. +* For decode (i.e. the memory friendly approach) the attention "simulates" a multi-head attention, while the compute is similar to multi-query attention. Below is example of both paths assuming batchsize = 1 @@ -54,9 +54,9 @@ W_DQ project h_t to q_c shape [H, Lq] W_UQ project q_c to q_nope shape [Lq, N * P] W_QR project q_c to q_pe shape [Lq, N * R] W_DKV project h_t to kv_c shape [H, Lkv] -W_UK project kv_c to k_nope shape [Lkv, N * P] -W_KR project h_t to k_pe shape [H, N * R] -W_UV project kv_c to v shape [Lkv, N * V] +W_UK project kv_c to k_nope shape [Lkv, N, P] +W_KR project h_t to k_pe shape [H, R] +W_UV project kv_c to v shape [Lkv, N, V] W_O project v to h_t shape [N * V, H] @@ -69,8 +69,8 @@ new_kv_c = h_t @ W_DKV new_k_pe = RoPE(h_t @ W_KR) kv_c = torch.cat([new_kv_c, cache_kv_c], dim=0) k_pe = torch.cat([new_k_pe, cache_k_pe], dim=0) -k_nope = (kv_c @ W_UK).view(Skv, N, P) -v = (kv_c @ W_UV).view(Skv, N, V) +k_nope = (kv_c @ W_UK.view(Lkv, N * P)).view(Skv, N, P) +v = (kv_c @ W_UV.view(Lkv, N * V)).view(Skv, N, V) // MHA with QK headdim = P + R // V headdim = V @@ -90,20 +90,10 @@ NOTE: in the actual code, ## Data-Movement Friendly Approach (i.e. 
"_forward_decode"): -Ahead of time, compute: - -% this projects from q_c to [Sq, N * Lkv] -W_UQ_UK = einsum("qnp,knp -> qnk" - W_UQ.view(Lq, N, P), W_UK.view(Lkv, N, P) - ).view(Lkv, N * Lkv) -% this projects from attn output [Sq, N * Lkv] to [Sq, H] -W_UV_O = einsum("knv,nvh -> nkh" - W_UV.view(Lkv, N, V), W_O.view(N, V, H) - ).view(N * Lkv, H) - Runtime q_c = h_t @ W_DQ -q_latent = q_c @ W_UQ_UK.view(Sq, N, Lkv) +q_nope = (q_c @ W_UQ).view(-1, N, P) +ql_nope = einsum("snh,lnh->snl", q, W_UK) q_pe = RoPE(q_c @ W_QR).view(Sq, N, R) new_kv_c = h_t @ W_DKV new_k_pe = RoPE(h_t @ W_KR) @@ -116,11 +106,13 @@ k_pe = torch.cat([new_k_pe, cache_k_pe], dim=0) // NOTE: this is less compute-friendly since Lkv > P // but is more data-movement friendly since its MQA vs MHA spda_o = scaled_dot_product_attention( - torch.cat([q_latent, q_pe], dim=-1), + torch.cat([ql_nope, q_pe], dim=-1), torch.cat([kv_c, k_pe], dim=-1), kv_c ) -return spda_o.reshape(-1, N * Lkv) @ W_UV_O + +o = einsum("snl,lnv->snv", spda_o.reshape(-1, N, Lkv), W_UV) +return o.view(-1, N * V) @ self.num_heads @ W_O ## Chunked Prefill @@ -146,8 +138,8 @@ q_nope = (q_c @ W_UQ).view(Sq, N, P) q_pe = RoPE(q_c @ W_QR).view(Sq, N, R) new_kv_c = h_t @ W_DKV new_k_pe = RoPE(h_t @ W_KR) -new_k_nope = (new_kv_c @ W_UK).view(Sq, N, P) -new_v = (new_kv_c @ W_UV).view(Sq, N, V) +new_k_nope = (new_kv_c @ W_UK.view(Lkv, N * P)).view(Sq, N, P) +new_v = (new_kv_c @ W_UV.view(Lkv, N * V)).view(Sq, N, V) // MHA between queries and new KV // with QK headdim = P + R @@ -171,17 +163,17 @@ for chunk_idx in range(cdiv(C, MCC)): cache_k_pe_chunk = cache_k_pe[chunk_start:chunk_end] cache_k_nope_chunk = (cache_kv_c_chunk @ W_UK).view(-1, N, P) cache_v_chunk = (cache_kv_c_chunk @ W_UV).view(-1, N, V) - + chunk_o, chunk_lse = scaled_dot_product_attention( torch.cat([q_nope, q_pe], dim=-1), - torch.cat([cache_k_nope_chunk, - cache_k_pe_chunk.unsqueeze(1).expand(-1, N, -1)], + torch.cat([cache_k_nope_chunk, + 
cache_k_pe_chunk.unsqueeze(1).expand(-1, N, -1)], dim=-1), cache_v_chunk, casual=False, return_softmax_lse=True ) - + curr_o, curr_lse = merge_attn_states( suffix_output=curr_o, suffix_lse=curr_lse, @@ -202,7 +194,6 @@ from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Tuple, Type, TypeVar) import torch -from compressed_tensors.quantization import QuantizationStrategy from vllm import _custom_ops as ops from vllm import envs @@ -215,20 +206,9 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping, get_flash_attn_version, is_block_tables_empty) from vllm.attention.ops.triton_merge_attn_states import merge_attn_states -from vllm.distributed import (get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce) from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearBase, RowParallelLinear, UnquantizedLinearMethod) -from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 - CompressedTensorsLinearMethod) -from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( - CompressedTensorsW8A8Fp8) -from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod -from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - Fp8LinearGenericOp, is_fp8) -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - scaled_quantize) from vllm.model_executor.layers.rotary_embedding import ( DeepseekScalingRotaryEmbedding, RotaryEmbedding) from vllm.multimodal import MultiModalPlaceholderMap @@ -1057,7 +1037,6 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): self.kv_b_proj = kv_b_proj self.o_proj = o_proj self.triton_fa_func = triton_attention - self.fp8_linear_generic = Fp8LinearGenericOp() # Handle the differences between the flash_attn_varlen from flash_attn # and the one from vllm_flash_attn. 
The former is used on RoCM and the @@ -1070,80 +1049,29 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): fa_version=self.vllm_flash_attn_version) def _v_up_proj_and_o_proj(self, x): - if envs.VLLM_MLA_PERFORM_MATRIX_ABSORPTION: - if is_fp8(self.W_UV_O): - output_parallel = self.fp8_linear_generic.apply( - x.flatten(start_dim=1), self.W_UV_O, self.W_UV_O_scales, - self.reqaunt_input_group_shape, - self.reqaunt_weight_group_shape) - else: - output_parallel = torch.matmul(x.flatten(start_dim=1), - self.W_UV_O) - if self.tp_size > 1: - output = tensor_model_parallel_all_reduce(output_parallel) - else: - output = output_parallel - return output - else: - x = torch.einsum("bnl,lnv->bnv", x, self.W_UV) - return self.o_proj(x.reshape(-1, - self.num_heads * self.v_head_dim))[0] + # Convert from (B, N, L) to (N, B, L) + x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) + # Multiply (N, B, L) x (N, L, V) -> (N, B, V) + x = torch.bmm(x, self.W_UV) + # Convert from (N, B, V) to (B, N * V) + x = x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim) + return self.o_proj(x)[0] + # Return `ql_nope`, `q_pe` def _q_proj_and_k_up_proj(self, x): - if envs.VLLM_MLA_PERFORM_MATRIX_ABSORPTION: - if is_fp8(self.W_Q_UK): - return self.fp8_linear_generic.apply( - x, self.W_Q_UK, self.W_Q_UK_scales, - self.reqaunt_input_group_shape, - self.reqaunt_weight_group_shape).view( - -1, self.num_heads, self.kv_lora_rank) - return torch.matmul(x, self.W_Q_UK)\ - .view(-1, self.num_heads, self.kv_lora_rank) - else: - x = torch.matmul(x, self.W_Q)\ - .view(-1, self.num_heads, self.qk_nope_head_dim) - return torch.einsum("bnp,lnp->bnl", x, self.W_UK)\ - .view(-1, self.num_heads, self.kv_lora_rank) + q_nope, q_pe = self.q_proj(x)[0]\ + .view(-1, self.num_heads, self.qk_head_dim)\ + .split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + + # Convert from (B, N, P) to (N, B, P) + q_nope = q_nope.transpose(0, 1) + # Multiply (N, B, P) x (N, P, L) -> (N, B, L) + 
ql_nope = torch.bmm(q_nope, self.W_UK_T) + # Convert from (N, B, L) to (B, N, L) + return ql_nope.transpose(0, 1), q_pe def process_weights_after_loading(self, act_dtype: torch.dtype): - # TODO(lucas) This is very gross, we need a more wide scale refactor of - # all the FP8 code with a more standard way of - # defining schemes/group-shapes, we should also potentially force - # quant_methods to support a decompress function - # - # returns input_group_shape, weight_group_shape - def get_scale_group_shapes_for_fp8(layer: LinearBase) -> \ - Tuple[Tuple[int, int], Tuple[int, int]]: - if isinstance(layer.quant_method, Fp8LinearMethod): - if layer.quant_method.block_quant: - weight_block_size = \ - layer.quant_method.quant_config.weight_block_size - # per-token-group (1, X), block-quantized (X, Y) - return (1, weight_block_size[-1]), weight_block_size - else: - return (-1, -1), (-1, -1) # per-tensor, per-tensor - elif isinstance(layer.quant_method, CompressedTensorsLinearMethod)\ - and isinstance(layer.scheme, CompressedTensorsW8A8Fp8): - # this is hacky but we always assume the for - # CompressedTensorsW8A8Fp8 the input is dynamic per-token - # we ignore if it is static-per-tensor since we are going to - # requantize after later anyways - strategy = layer.scheme.strategy - if strategy == QuantizationStrategy.TENSOR: - return (1, -1), (-1, -1) # per-token, per-tensor - elif strategy == QuantizationStrategy.CHANNEL: - return (1, -1), (-1, 1) # per-token, per-channel - else: - raise NotImplementedError( - f"QuantizationStrategy.{strategy} is not supported for " - "fp8 MLA, please run with VLLM_MLA_DISABLE=1") - else: - raise NotImplementedError( - "Can't determine scale group shapes for " - f"{layer.quant_method}, please run with VLLM_MLA_DISABLE=1" - ) - def get_layer_weight(layer): WEIGHT_NAMES = ("weight", "qweight", "weight_packed") for attr in WEIGHT_NAMES: @@ -1167,10 +1095,9 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): return dequant_weights.T return 
layer.weight - weight_dtype = get_layer_weight(self.kv_b_proj).dtype - assert get_layer_weight(self.o_proj).dtype == weight_dtype - assert get_layer_weight(self.q_proj).dtype == weight_dtype - + # we currently do not have quantized bmm's which are needed for + # `W_UV` and `W_UK_T`, we we just store fp16/bf16 copies and perform + # the bmm's in 16-bit, the extra memory overhead of this is fairly low kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T assert kv_b_proj_weight.shape == ( self.kv_lora_rank, @@ -1189,89 +1116,10 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): W_UK, W_UV = kv_b_proj_weight.split( [self.qk_nope_head_dim, self.v_head_dim], dim=-1) - q_proj_weight = get_and_maybe_dequant_weights(self.q_proj).T\ - .view(-1, self.num_heads, self.qk_head_dim) - - # can be W_Q or W_UQ depending q_lora_rank, the former if - # q_lora_rank is None, the latter otherwise. From the Attention backend - # perspective though we call these both W_Q and rely on the layer - # to pass in the correct matrix - W_Q = q_proj_weight[..., :self.qk_nope_head_dim] - self.W_QR = q_proj_weight[..., self.qk_nope_head_dim:]\ - .flatten(start_dim=1).contiguous() - - # W_QR is small so for simplicity we dont bother requantizing it - self.W_QR = self.W_QR.to(act_dtype) - - if envs.VLLM_MLA_PERFORM_MATRIX_ABSORPTION: - requantization_enabled = not envs.VLLM_MLA_DISABLE_REQUANTIZATION - if is_fp8(weight_dtype) and requantization_enabled: - # This assumes it wise to requantize using the same group shapes - # (i.e. strategy, per-tensor, per-channel, block etc.) 
that the - # weights were originally quantized - requant_input_group_shape, requant_weight_group_shape = \ - get_scale_group_shapes_for_fp8(self.q_proj) - assert (requant_input_group_shape, requant_weight_group_shape)\ - == get_scale_group_shapes_for_fp8(self.kv_b_proj) - assert (requant_input_group_shape, requant_weight_group_shape)\ - == get_scale_group_shapes_for_fp8(self.o_proj) - self.reqaunt_input_group_shape = requant_input_group_shape - self.reqaunt_weight_group_shape = requant_weight_group_shape - - # - # Perform matrix-absorption following - # https://github.com/flashinfer-ai/flashinfer/pull/551 - # for decode, as a result we end up with absorbed weights for decode - # and another copy of raw weights for prefill. - # - self.W_UK, self.W_UV = kv_b_proj_weight.split( - [self.qk_nope_head_dim, self.v_head_dim], dim=-1) - # We absorb `W_UK` into `W_Q` resulting in either W_Q_UK or W_UQ_UK - # depending q_lora_rank, the former if q_lora_rank is None, the - # latter otherwise - # basically if q_lora_rank is none we are absorbing into q_proj - # instead of UQ - W_Q_UK = torch.einsum("qnd,lnd -> qnl", W_Q, W_UK)\ - .flatten(start_dim=1).contiguous() - - if is_fp8(weight_dtype) and requantization_enabled: - W_Q_UK, W_Q_UK_scales = scaled_quantize( - W_Q_UK, - self.reqaunt_weight_group_shape, - quant_dtype=current_platform.fp8_dtype()) - # For FP8 save the transpose so we can use - # `apply_w8a8_block_fp8_linear` directly - self.W_Q_UK = W_Q_UK.T.contiguous() - self.W_Q_UK_scales = W_Q_UK_scales.T.contiguous() - else: - self.W_Q_UK = W_Q_UK.to(act_dtype) - - W_O = get_and_maybe_dequant_weights(self.o_proj)\ - .view(-1, self.num_heads, self.v_head_dim) - W_UV_O = torch.einsum("lnd,hnd -> nlh", W_UV, W_O)\ - .flatten(start_dim=0, end_dim=1).contiguous() - - if is_fp8(weight_dtype) and requantization_enabled: - W_UV_O, W_UV_O_scales = scaled_quantize( - W_UV_O, - self.reqaunt_weight_group_shape, - quant_dtype=current_platform.fp8_dtype()) - # For FP8 save the 
transpose so we can use - # `apply_w8a8_block_fp8_linear` directly - self.W_UV_O = W_UV_O.T.contiguous() - self.W_UV_O_scales = W_UV_O_scales.T.contiguous() - else: - self.W_UV_O = W_UV_O.to(act_dtype) - - self.tp_size = get_tensor_model_parallel_world_size() - else: - if is_fp8(weight_dtype): - raise NotImplementedError( - "Currently fp8 requires matrix absorption") - - self.W_UV = W_UV - self.W_UK = W_UK - self.W_Q = W_Q.flatten(start_dim=1) + # Convert from (L, N, V) to (N, L, V) + self.W_UV = W_UV.transpose(0, 1) + # Convert from (L, N, P) to (N, P, L) + self.W_UK_T = W_UK.permute(1, 2, 0) def _compute_prefill_context( self, @@ -1471,7 +1319,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): @abstractmethod def _forward_decode( self, - q_nope: torch.Tensor, + ql_nope: torch.Tensor, q_pe: torch.Tensor, kv_c_and_k_pe_cache: torch.Tensor, attn_metadata: T, @@ -1525,9 +1373,8 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): prefill_k_c_normed = k_c_normed[:num_prefill_tokens] if has_decode: - decode_q_nope = self._q_proj_and_k_up_proj(decode_hs_or_q_c) - decode_q_pe = torch.matmul(decode_hs_or_q_c, self.W_QR)\ - .view(-1, self.num_heads, self.qk_rope_head_dim) + decode_ql_nope, decode_q_pe = \ + self._q_proj_and_k_up_proj(decode_hs_or_q_c) decode_q_pe[...], decode_k_pe[...] 
= self.rotary_emb( decode_input_positions, decode_q_pe, decode_k_pe) @@ -1561,6 +1408,6 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): if has_decode: output[num_prefill_tokens:] = self._forward_decode( - decode_q_nope, decode_q_pe, kv_cache, attn_metadata) + decode_ql_nope, decode_q_pe, kv_cache, attn_metadata) return output diff --git a/vllm/envs.py b/vllm/envs.py index 259501056cc3b..a36d20a4f8b50 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -84,8 +84,6 @@ if TYPE_CHECKING: VLLM_SERVER_DEV_MODE: bool = False VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128 VLLM_MLA_DISABLE: bool = False - VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True - VLLM_MLA_DISABLE_REQUANTIZATION: bool = False VLLM_MLA_CUDA_MEM_ALIGN_KV_CACHE: bool = True VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False VLLM_RAY_PER_WORKER_GPUS: float = 1.0 @@ -563,23 +561,6 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_MLA_DISABLE": lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))), - # Flag that can control whether or not we perform matrix-absorption for MLA - # decode, i.e. absorb W_UK into W_Q/W_UK and W_UV into W_O, absorbing the - # matrices reduces the runtime FLOPs needed to compute MLA but requires - # storing more weights, W_Q_UK and W_UV_O, so can increase memory usage, - # the is enabled by default - "VLLM_MLA_PERFORM_MATRIX_ABSORPTION": - lambda: bool(int(os.getenv("VLLM_MLA_PERFORM_MATRIX_ABSORPTION", "1"))), - - # When running MLA with matrix-absorption enabled and fp8 quantized weights - # we perform the matrix-absorption in float32 precision, after the matrices - # are absorbed we requantize the weights back to fp8, this flag can be used - # to disable the requantization step, and instead convert the absorbed - # matrices to match the activation type. This can lead to higher memory and - # compute usage but better preserves the accuracy of the original model. 
- "VLLM_MLA_DISABLE_REQUANTIZATION": - lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0"))), - # If set, vLLM will use the Triton implementation of moe_align_block_size, # i.e. moe_align_block_size_triton in fused_moe.py. "VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON": diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 1e19302cbad81..ecb7996e1e8c5 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -13,10 +13,9 @@ import triton.language as tl from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.quant_utils import ( - _normalize_quant_group_shape, scaled_dequantize) + scaled_dequantize) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - CUTLASS_BLOCK_FP8_SUPPORTED, Fp8LinearOp, cutlass_block_fp8_supported, - cutlass_fp8_supported) + CUTLASS_BLOCK_FP8_SUPPORTED) from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op @@ -101,60 +100,6 @@ direct_register_custom_op( ) -# Unify the interface between `apply_w8a8_block_fp8_linear` and -# `apply_fp8_linear` -# NOTE(lucas): this is quite messy, we should think through this more formally -# TODO(luka): unify this better -# https://github.com/vllm-project/vllm/issues/14397 -class Fp8LinearGenericOp: - - def __init__( - self, - cutlass_fp8_supported: bool = cutlass_fp8_supported(), - cutlass_block_fp8_supported: bool = cutlass_block_fp8_supported(), - ): - self.cutlass_block_fp8_supported = cutlass_block_fp8_supported - self.fp8_linear = Fp8LinearOp( - cutlass_fp8_supported=cutlass_fp8_supported) - - def apply( - self, - input: torch.Tensor, - weight: torch.Tensor, - weight_scale: torch.Tensor, - input_group_shape: Tuple[int, int], - weight_group_shape: Tuple[int, int], - input_scale: Optional[torch.Tensor] = None, # 
static scale if one - ) -> torch.Tensor: - # View input as 2D matrix for fp8 methods - input = input.view(-1, input.shape[-1]) - - weight_group_shape = _normalize_quant_group_shape( \ - weight, weight_group_shape) - input_group_shape = _normalize_quant_group_shape( - input, input_group_shape) - - def is_dim_blocked(dim, shape, group_shape): - return group_shape < shape[dim] and group_shape > 1 - - if is_dim_blocked(0, weight.shape, weight_group_shape[0])\ - and is_dim_blocked(1, weight.shape, weight_group_shape[1]) and\ - input_group_shape == (1, weight_group_shape[1]): - return apply_w8a8_block_fp8_linear( - input, - weight, - list(weight_group_shape), - weight_scale, - cutlass_block_fp8_supported=self.cutlass_block_fp8_supported) - else: - # Despite having linear in the name it doesn't conform to - # `torch.nn.functional.linear` which is defined as - # `input @ weight.T` so we explicitly transpose the weight matrix - return self.fp8_linear.apply(input, weight.T, weight_scale.T, - use_per_token_if_dynamic=\ - (input_group_shape == (1, input.shape[1]))) - - def input_to_float8( x: torch.Tensor, dtype: Optional[torch.dtype] = None diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 14a7bd3535222..f801745ab5c7d 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -21,7 +21,7 @@ Main reference: DeepseekV2 paper, and FlashInfer Implementation (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551). Deepseek's MLA attention works the following way: -* Use a single latent vector to represent the per-token entry of the KV cache. +* Use a single latent vector to represent the per-token entry of the KV cache. * For decode (i.e. the memory friendly approach) the attention "simulates" a multi-head attention, while the compute is similar to multi-query attention. 
@@ -54,9 +54,9 @@ W_DQ project h_t to q_c shape [H, Lq] W_UQ project q_c to q_nope shape [Lq, N * P] W_QR project q_c to q_pe shape [Lq, N * R] W_DKV project h_t to kv_c shape [H, Lkv] -W_UK project kv_c to k_nope shape [Lkv, N * P] -W_KR project h_t to k_pe shape [H, N * R] -W_UV project kv_c to v shape [Lkv, N * V] +W_UK project kv_c to k_nope shape [Lkv, N, P] +W_KR project h_t to k_pe shape [H, R] +W_UV project kv_c to v shape [Lkv, N, V] W_O project v to h_t shape [N * V, H] @@ -69,8 +69,8 @@ new_kv_c = h_t @ W_DKV new_k_pe = RoPE(h_t @ W_KR) kv_c = torch.cat([new_kv_c, cache_kv_c], dim=0) k_pe = torch.cat([new_k_pe, cache_k_pe], dim=0) -k_nope = (kv_c @ W_UK).view(Skv, N, P) -v = (kv_c @ W_UV).view(Skv, N, V) +k_nope = (kv_c @ W_UK.view(Lkv, N * P)).view(Skv, N, P) +v = (kv_c @ W_UV.view(Lkv, N * V)).view(Skv, N, V) // MHA with QK headdim = P + R // V headdim = V @@ -79,7 +79,7 @@ spda_o = scaled_dot_product_attention( torch.cat([q_nope, q_pe], dim=-1), torch.cat([k_nope, k_pe.unsqueeze(1).expand(-1, N, -1)], dim=-1), v -) +) return spda_o @ W_O NOTE: in the actual code, @@ -90,20 +90,10 @@ NOTE: in the actual code, ## Data-Movement Friendly Approach (i.e. 
"_forward_decode"): -Ahead of time, compute: - -% this projects from q_c to [Sq, N * Lkv] -W_UQ_UK = einsum("qnp,knp -> qnk" - W_UQ.view(Lq, N, P), W_UK.view(Lkv, N, P) - ).view(Lkv, N * Lkv) -% this projects from attn output [Sq, N * Lkv] to [Sq, H] -W_UV_O = einsum("knv,nvh -> nkh" - W_UV.view(Lkv, N, V), W_O.view(N, V, H) - ).view(N * Lkv, H) - Runtime q_c = h_t @ W_DQ -q_latent = q_c @ W_UQ_UK.view(Sq, N, Lkv) +q_nope = (q_c @ W_UQ).view(-1, N, P) +ql_nope = einsum("snh,lnh->snl", q, W_UK) q_pe = RoPE(q_c @ W_QR).view(Sq, N, R) new_kv_c = h_t @ W_DKV new_k_pe = RoPE(h_t @ W_KR) @@ -116,29 +106,31 @@ k_pe = torch.cat([new_k_pe, cache_k_pe], dim=0) // NOTE: this is less compute-friendly since Lkv > P // but is more data-movement friendly since its MQA vs MHA spda_o = scaled_dot_product_attention( - torch.cat([q_latent, q_pe], dim=-1), + torch.cat([ql_nope, q_pe], dim=-1), torch.cat([kv_c, k_pe], dim=-1), kv_c ) -return spda_o.reshape(-1, N * Lkv) @ W_UV_O + +o = einsum("snl,lnv->snv", spda_o.reshape(-1, N, Lkv), W_UV) +return o.view(-1, N * V) @ self.num_heads @ W_O ## Chunked Prefill -For chunked prefill we want to use the compute friendly algorithm. We are -assuming sufficiently large Sq / Skv ratio, in the future may want to switch to +For chunked prefill we want to use the compute friendly algorithm. We are +assuming sufficiently large Sq / Skv ratio, in the future may want to switch to the data-movement friendly approach if the chunk (i.e. `Sq`) is small. However, the compute-friendly approach can potentially run out of memory if Skv is large due to: `k_nope = (kv_c @ W_UK).view(Skv, N, P)` -To mitigate this, we chunk the computation of attention with respect to the -current context (i.e. `cache_kv_c` and `cache_k_pe`) so that we can used a +To mitigate this, we chunk the computation of attention with respect to the +current context (i.e. `cache_kv_c` and `cache_k_pe`) so that we can used a fixed workspace size. 
The chunked prefill approach is as follows: -MCC Max chunk of context to process per iter, computed dynamically, +MCC Max chunk of context to process per iter, computed dynamically, used to bound the memory usage q_c = h_t @ W_DQ @@ -146,8 +138,8 @@ q_nope = (q_c @ W_UQ).view(Sq, N, P) q_pe = RoPE(q_c @ W_QR).view(Sq, N, R) new_kv_c = h_t @ W_DKV new_k_pe = RoPE(h_t @ W_KR) -new_k_nope = (new_kv_c @ W_UK).view(Sq, N, P) -new_v = (new_kv_c @ W_UV).view(Sq, N, V) +new_k_nope = (new_kv_c @ W_UK.view(Lkv, N * P)).view(Sq, N, P) +new_v = (new_kv_c @ W_UV.view(Lkv, N * V)).view(Sq, N, V) // MHA between queries and new KV // with QK headdim = P + R @@ -160,7 +152,7 @@ curr_o, curr_lse = scaled_dot_product_attention( new_v, casual=True, return_softmax_lse=True -) +) // Compute attention with the already existing context for chunk_idx in range(cdiv(C, MCC)): @@ -198,30 +190,17 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Generic, Optional, TypeVar import torch -from compressed_tensors.quantization import QuantizationStrategy from vllm import _custom_ops as ops -from vllm import envs from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer, AttentionMetadata, MLAAttentionImpl) from vllm.attention.backends.utils import get_flash_attn_version from vllm.attention.ops.triton_merge_attn_states import merge_attn_states -from vllm.distributed import (get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce) from vllm.logger import init_logger from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearBase, RowParallelLinear, UnquantizedLinearMethod) -from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 - CompressedTensorsLinearMethod) -from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( - CompressedTensorsW8A8Fp8) -from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod -from 
vllm.model_executor.layers.quantization.utils.fp8_utils import ( - Fp8LinearGenericOp, is_fp8) -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - scaled_quantize) from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding from vllm.platforms import current_platform from vllm.utils import cdiv, round_down @@ -646,7 +625,6 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): self.kv_b_proj = kv_b_proj self.o_proj = o_proj self.vllm_flash_attn_version = get_flash_attn_version() - self.fp8_linear_generic = Fp8LinearGenericOp() # Handle the differences between the flash_attn_varlen from flash_attn # and the one from vllm_flash_attn. The former is used on RoCM and the @@ -658,88 +636,37 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): fa_version=self.vllm_flash_attn_version) def _v_up_proj_and_o_proj(self, x): - if envs.VLLM_MLA_PERFORM_MATRIX_ABSORPTION: - if is_fp8(self.W_UV_O): - output_parallel = self.fp8_linear_generic.apply( - x.flatten(start_dim=1), self.W_UV_O, self.W_UV_O_scales, - self.reqaunt_input_group_shape, - self.reqaunt_weight_group_shape) - else: - output_parallel = torch.matmul(x.flatten(start_dim=1), - self.W_UV_O) - if self.tp_size > 1: - output = tensor_model_parallel_all_reduce(output_parallel) - else: - output = output_parallel - return output - else: - x = torch.einsum("bnl,lnv->bnv", x, self.W_UV) - return self.o_proj(x.reshape(-1, - self.num_heads * self.v_head_dim))[0] + # Convert from (B, N, L) to (N, B, L) + x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) + # Multiply (N, B, L) x (N, L, V) -> (N, B, V) + x = torch.bmm(x, self.W_UV) + # Convert from (N, B, V) to (B, N * V) + x = x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim) + return self.o_proj(x)[0] + # Return `ql_nope`, `q_pe` def _q_proj_and_k_up_proj(self, x): - if envs.VLLM_MLA_PERFORM_MATRIX_ABSORPTION: - if is_fp8(self.W_Q_UK): - return self.fp8_linear_generic.apply( - x, self.W_Q_UK, 
self.W_Q_UK_scales, - self.reqaunt_input_group_shape, - self.reqaunt_weight_group_shape).view( - -1, self.num_heads, self.kv_lora_rank) - return torch.matmul(x, self.W_Q_UK)\ - .view(-1, self.num_heads, self.kv_lora_rank) - else: - x = torch.matmul(x, self.W_Q)\ - .view(-1, self.num_heads, self.qk_nope_head_dim) - return torch.einsum("bnp,lnp->bnl", x, self.W_UK)\ - .view(-1, self.num_heads, self.kv_lora_rank) + q_nope, q_pe = self.q_proj(x)[0]\ + .view(-1, self.num_heads, self.qk_head_dim)\ + .split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + + # Convert from (B, N, P) to (N, B, P) + q_nope = q_nope.transpose(0, 1) + # Multiply (N, B, P) x (N, P, L) -> (N, B, L) + ql_nope = torch.bmm(q_nope, self.W_UK_T) + # Convert from (N, B, L) to (B, N, L) + return ql_nope.transpose(0, 1), q_pe def process_weights_after_loading(self, act_dtype: torch.dtype): - # TODO(lucas) This is very gross, we need a more wide scale refactor of - # all the FP8 code with a more standard way of - # defining schemes/group-shapes, we should also potentially force - # quant_methods to support a decompress function - # - # returns input_group_shape, weight_group_shape - def get_scale_group_shapes_for_fp8(layer: LinearBase) -> \ - tuple[tuple[int, int], tuple[int, int]]: - if isinstance(layer.quant_method, Fp8LinearMethod): - if layer.quant_method.block_quant: - weight_block_size = \ - layer.quant_method.quant_config.weight_block_size - # per-token-group (1, X), block-quantized (X, Y) - return (1, weight_block_size[-1]), weight_block_size - else: - return (-1, -1), (-1, -1) # per-tensor, per-tensor - elif isinstance(layer.quant_method, CompressedTensorsLinearMethod)\ - and isinstance(layer.scheme, CompressedTensorsW8A8Fp8): - # this is hacky but we always assume the for - # CompressedTensorsW8A8Fp8 the input is dynamic per-token - # we ignore if it is static-per-tensor since we are going to - # requantize after later anyways - strategy = layer.scheme.strategy - if strategy == 
QuantizationStrategy.TENSOR: - return (1, -1), (-1, -1) # per-token, per-tensor - elif strategy == QuantizationStrategy.CHANNEL: - return (1, -1), (-1, 1) # per-token, per-channel - else: - raise NotImplementedError( - f"QuantizationStrategy.{strategy} is not supported for " - "fp8 MLA, please run with VLLM_MLA_DISABLE=1") - else: - raise NotImplementedError( - "Can't determine scale group shapes for " - f"{layer.quant_method}, please run with VLLM_MLA_DISABLE=1" - ) - def get_layer_weight(layer): - if hasattr(layer, "weight"): - return layer.weight - elif hasattr(layer, "qweight"): - return layer.qweight - else: - raise AttributeError( - f"Layer '{layer}' has neither weight nor qweight") + WEIGHT_NAMES = ("weight", "qweight", "weight_packed") + for attr in WEIGHT_NAMES: + if hasattr(layer, attr): + return getattr(layer, attr) + raise AttributeError( + f"Layer '{layer}' has no recognized weight attribute:" + f" {WEIGHT_NAMES}.") def get_and_maybe_dequant_weights(layer: LinearBase): if not isinstance(layer.quant_method, UnquantizedLinearMethod): @@ -755,10 +682,9 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): return dequant_weights.T return layer.weight - weight_dtype = get_layer_weight(self.kv_b_proj).dtype - assert get_layer_weight(self.o_proj).dtype == weight_dtype - assert get_layer_weight(self.q_proj).dtype == weight_dtype - + # we currently do not have quantized bmm's which are needed for + # `W_UV` and `W_UK_T`, we we just store fp16/bf16 copies and perform + # the bmm's in 16-bit, the extra memory overhead of this is fairly low kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T assert kv_b_proj_weight.shape == ( self.kv_lora_rank, @@ -777,89 +703,10 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): W_UK, W_UV = kv_b_proj_weight.split( [self.qk_nope_head_dim, self.v_head_dim], dim=-1) - q_proj_weight = get_and_maybe_dequant_weights(self.q_proj).T\ - .view(-1, self.num_heads, self.qk_head_dim) - - # can be W_Q or W_UQ depending 
q_lora_rank, the former if - # q_lora_rank is None, the latter otherwise. From the Attention backend - # perspective though we call these both W_Q and rely on the layer - # to pass in the correct matrix - W_Q = q_proj_weight[..., :self.qk_nope_head_dim] - self.W_QR = q_proj_weight[..., self.qk_nope_head_dim:]\ - .flatten(start_dim=1).contiguous() - - # W_QR is small so for simplicity we dont bother requantizing it - self.W_QR = self.W_QR.to(act_dtype) - - if envs.VLLM_MLA_PERFORM_MATRIX_ABSORPTION: - requantization_enabled = not envs.VLLM_MLA_DISABLE_REQUANTIZATION - if is_fp8(weight_dtype) and requantization_enabled: - # This assumes it wise to requantize using the same group shapes - # (i.e. strategy, per-tensor, per-channel, block etc.) that the - # weights were originally quantized - requant_input_group_shape, requant_weight_group_shape = \ - get_scale_group_shapes_for_fp8(self.q_proj) - assert (requant_input_group_shape, requant_weight_group_shape)\ - == get_scale_group_shapes_for_fp8(self.kv_b_proj) - assert (requant_input_group_shape, requant_weight_group_shape)\ - == get_scale_group_shapes_for_fp8(self.o_proj) - self.reqaunt_input_group_shape = requant_input_group_shape - self.reqaunt_weight_group_shape = requant_weight_group_shape - - # - # Perform matrix-absorption following - # https://github.com/flashinfer-ai/flashinfer/pull/551 - # for decode, as a result we end up with absorbed weights for decode - # and another copy of raw weights for prefill. 
- # - self.W_UK, self.W_UV = kv_b_proj_weight.split( - [self.qk_nope_head_dim, self.v_head_dim], dim=-1) - # We absorb `W_UK` into `W_Q` resulting in either W_Q_UK or W_UQ_UK - # depending q_lora_rank, the former if q_lora_rank is None, the - # latter otherwise - # basically if q_lora_rank is none we are absorbing into q_proj - # instead of UQ - W_Q_UK = torch.einsum("qnd,lnd -> qnl", W_Q, W_UK)\ - .flatten(start_dim=1).contiguous() - - if is_fp8(weight_dtype) and requantization_enabled: - W_Q_UK, W_Q_UK_scales = scaled_quantize( - W_Q_UK, - self.reqaunt_weight_group_shape, - quant_dtype=current_platform.fp8_dtype()) - # For FP8 save the transpose so we can use - # `apply_w8a8_block_fp8_linear` directly - self.W_Q_UK = W_Q_UK.T.contiguous() - self.W_Q_UK_scales = W_Q_UK_scales.T.contiguous() - else: - self.W_Q_UK = W_Q_UK.to(act_dtype) - - W_O = get_and_maybe_dequant_weights(self.o_proj)\ - .view(-1, self.num_heads, self.v_head_dim) - W_UV_O = torch.einsum("lnd,hnd -> nlh", W_UV, W_O)\ - .flatten(start_dim=0, end_dim=1).contiguous() - - if is_fp8(weight_dtype) and requantization_enabled: - W_UV_O, W_UV_O_scales = scaled_quantize( - W_UV_O, - self.reqaunt_weight_group_shape, - quant_dtype=current_platform.fp8_dtype()) - # For FP8 save the transpose so we can use - # `apply_w8a8_block_fp8_linear` directly - self.W_UV_O = W_UV_O.T.contiguous() - self.W_UV_O_scales = W_UV_O_scales.T.contiguous() - else: - self.W_UV_O = W_UV_O.to(act_dtype) - - self.tp_size = get_tensor_model_parallel_world_size() - else: - if is_fp8(weight_dtype): - raise NotImplementedError( - "Currently fp8 requires matrix absorption") - - self.W_UV = W_UV - self.W_UK = W_UK - self.W_Q = W_Q.flatten(start_dim=1) + # Convert from (L, N, V) to (N, L, V) + self.W_UV = W_UV.transpose(0, 1) + # Convert from (L, N, P) to (N, P, L) + self.W_UK_T = W_UK.permute(1, 2, 0) def _compute_prefill_context( self, @@ -998,7 +845,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): @abstractmethod def 
_forward_decode( self, - q_nope: torch.Tensor, + ql_nope: torch.Tensor, q_pe: torch.Tensor, kv_c_and_k_pe_cache: torch.Tensor, attn_metadata: M, @@ -1051,10 +898,8 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): if has_decode: assert attn_metadata.decode is not None - decode_q_nope = self._q_proj_and_k_up_proj(decode_hs_or_q_c) - decode_q_pe = torch.matmul(decode_hs_or_q_c, self.W_QR)\ - .view(-1, self.num_heads, self.qk_rope_head_dim) - + decode_ql_nope, decode_q_pe = \ + self._q_proj_and_k_up_proj(decode_hs_or_q_c) decode_q_pe[...], decode_k_pe[...] = self.rotary_emb( attn_metadata.decode.input_positions, decode_q_pe.contiguous(), decode_k_pe) @@ -1087,6 +932,6 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): if has_decode: output[:num_decode_tokens] = self._forward_decode( - decode_q_nope, decode_q_pe, kv_cache, attn_metadata) + decode_ql_nope, decode_q_pe, kv_cache, attn_metadata) return output_padded From 27b50f1fe6e325f73b405263c3ac1fc668531118 Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 14 Mar 2025 14:47:49 +0800 Subject: [PATCH 016/169] [Bugfix][Kernel][CPU] Fix num_tokens in CPU rotary embedding kernel (#14667) Signed-off-by: Thien Tran --- csrc/cpu/pos_encoding.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/cpu/pos_encoding.cpp b/csrc/cpu/pos_encoding.cpp index 96bce7dda0132..8a59e884d6c82 100644 --- a/csrc/cpu/pos_encoding.cpp +++ b/csrc/cpu/pos_encoding.cpp @@ -170,7 +170,7 @@ void rotary_embedding_gptj_impl( void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, torch::Tensor& key, int64_t head_size, torch::Tensor& cos_sin_cache, bool is_neox) { - int num_tokens = query.numel() / query.size(-1); + int num_tokens = positions.numel(); int rot_dim = cos_sin_cache.size(1); int num_heads = query.size(-1) / head_size; int num_kv_heads = key.size(-1) / head_size; From 09269b31274952bc5fff73023d108e00eb891068 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Fri, 14 Mar 2025 15:02:05 +0800 
Subject: [PATCH 017/169] [BugFix]Fix performance serving benchmark when enable profiling (#14737) Signed-off-by: wangli --- benchmarks/backend_request_func.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index d53428d219e7a..6a7db920b5b63 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -333,7 +333,7 @@ async def async_request_openai_chat_completions( ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith( - "chat/completions" + ("chat/completions", "profile") ), "OpenAI Chat Completions API URL must end with 'chat/completions'." async with aiohttp.ClientSession(trust_env=True, From 601bd3268eddb3bd22aecafdc08d3600d691f7a8 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 14 Mar 2025 15:59:56 +0800 Subject: [PATCH 018/169] [Misc] Clean up type annotation for `SupportsMultiModal` (#14794) Signed-off-by: DarkLight1337 --- docs/source/contributing/model/multimodal.md | 5 ++-- tests/distributed/test_pipeline_parallel.py | 4 ++-- vllm/model_executor/models/aria.py | 10 ++++---- vllm/model_executor/models/blip2.py | 10 ++++---- vllm/model_executor/models/chameleon.py | 10 ++++---- vllm/model_executor/models/deepseek_vl2.py | 7 +++--- vllm/model_executor/models/florence2.py | 10 ++++---- vllm/model_executor/models/fuyu.py | 9 ++++--- vllm/model_executor/models/gemma3_mm.py | 10 ++++---- vllm/model_executor/models/glm4v.py | 10 ++++---- vllm/model_executor/models/idefics3.py | 7 +++--- vllm/model_executor/models/interfaces.py | 24 ++++++++++--------- vllm/model_executor/models/internvl.py | 7 +++--- vllm/model_executor/models/llava.py | 7 +++--- vllm/model_executor/models/llava_next.py | 9 ++++--- .../model_executor/models/llava_next_video.py | 10 ++++---- vllm/model_executor/models/llava_onevision.py | 14 +++++------ vllm/model_executor/models/molmo.py | 9 ++++--- vllm/model_executor/models/paligemma.py | 
10 ++++---- vllm/model_executor/models/phi3v.py | 11 ++++----- vllm/model_executor/models/pixtral.py | 9 ++++--- vllm/model_executor/models/qwen2_5_vl.py | 12 +++++----- vllm/model_executor/models/qwen2_audio.py | 10 ++++---- vllm/model_executor/models/qwen2_vl.py | 12 +++++----- vllm/model_executor/models/qwen_vl.py | 11 ++++----- vllm/model_executor/models/ultravox.py | 8 +++---- vllm/model_executor/models/whisper.py | 7 +++--- 27 files changed, 121 insertions(+), 141 deletions(-) diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md index f55a62ef01b4f..9cbfc32991f09 100644 --- a/docs/source/contributing/model/multimodal.md +++ b/docs/source/contributing/model/multimodal.md @@ -34,7 +34,8 @@ Further update the model as follows: image_features = self.vision_encoder(image_input) return self.multi_modal_projector(image_features) - def get_multimodal_embeddings(self, **kwargs: object) -> Optional[NestedTensors]: + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: # Validate the multimodal input keyword arguments image_input = self._parse_and_validate_image_input(**kwargs) @@ -61,7 +62,7 @@ Further update the model as follows: def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: # `get_input_embeddings` should already be implemented for the language diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 5562b36816c44..4b479a0c93a9a 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -214,7 +214,7 @@ MULTIMODAL_MODELS = { "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(), "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(), "allenai/Molmo-7B-D-0924": PPTestSettings.fast(), - 
"microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(), + "microsoft/Phi-3.5-vision-instruct": PPTestSettings.fast(), "mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"), "Qwen/Qwen-VL-Chat": PPTestSettings.fast(), "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(), @@ -237,7 +237,7 @@ TEST_MODELS = [ "BAAI/bge-multilingual-gemma2", # [MULTIMODAL GENERATION] "OpenGVLab/InternVL2-1B", - "microsoft/Phi-3-vision-128k-instruct", + "microsoft/Phi-3.5-vision-instruct", "fixie-ai/ultravox-v0_5-llama-3_2-1b", # [LANGUAGE GENERATION - HYBRID ARCH] "ai21labs/Jamba-tiny-dev", diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index de3512cf18d99..ecd0a04b1dff7 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -21,8 +21,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, - NestedTensors) +from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, @@ -35,7 +34,7 @@ from .idefics2_vision_model import Idefics2VisionConfig from .idefics2_vision_model import ( Idefics2VisionTransformer as Idefics3VisionTransformer) # yapf: enable -from .interfaces import SupportsMultiModal, SupportsQuant +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsQuant from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, is_pp_missing_parameter, maybe_prefix, @@ -607,8 +606,7 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): return 
self.multi_modal_projector(image_outputs, image_attn_mask) def get_multimodal_embeddings( - self, **kwargs - ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -618,7 +616,7 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index d7eaac2563f63..47362e3d89763 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -15,8 +15,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, - NestedTensors) +from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptIndexTargets, @@ -25,7 +24,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from .blip import BlipVisionModel -from .interfaces import SupportsMultiModal, SupportsPP +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -629,8 +628,7 @@ class 
Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): return self.language_projection(query_output) def get_multimodal_embeddings( - self, **kwargs - ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -640,7 +638,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 68284a018af8c..66bf85b59d1e2 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -30,8 +30,7 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, - NestedTensors) +from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, @@ -39,7 +38,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from .interfaces import SupportsMultiModal, SupportsPP +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .utils import (is_pp_missing_parameter, 
make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, merge_multimodal_embeddings) @@ -986,8 +985,7 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, ) def get_multimodal_embeddings( - self, **kwargs - ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -1000,7 +998,7 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.model.get_input_embeddings(input_ids) diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index fd5d5a564b5ea..6ea8de8450bc7 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -36,7 +36,7 @@ from vllm.transformers_utils.processors.deepseek_vl2 import ( from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from vllm.utils import is_list_of -from .interfaces import SupportsMultiModal, SupportsPP +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -605,8 +605,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): pixel_values=pixel_values, images_spatial_crop=images_spatial_crop) def get_multimodal_embeddings( - self, **kwargs: object - ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -616,7 +615,7 @@ class 
DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index e892a1a4fc665..3883cd4460f50 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -20,7 +20,7 @@ from vllm.model_executor.models.bart import (BartDecoder, BartEncoder, BartParallelLMHead, BartScaledWordEmbedding) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs from vllm.multimodal.parse import MultiModalDataDict, MultiModalDataItems from vllm.multimodal.processing import (BaseProcessingInfo, @@ -30,7 +30,8 @@ from vllm.multimodal.processing import (BaseProcessingInfo, from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from .interfaces import SupportsMultiModal, SupportsV0Only +from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, + SupportsV0Only) from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings @@ -1037,8 +1038,7 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal): return self._encode_image(pixel_values) def get_multimodal_embeddings( - self, **kwargs: object - ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -1048,7 +1048,7 @@ class 
Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal): def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 51c79ba846c94..a6fcb5b81b1dd 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -18,7 +18,7 @@ """ PyTorch Fuyu model.""" import math from collections.abc import Iterable, Mapping, Sequence -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import List, Literal, Optional, Set, Tuple, TypedDict import torch import torch.nn as nn @@ -41,7 +41,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from .interfaces import SupportsMultiModal, SupportsPP +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, merge_multimodal_embeddings) @@ -327,8 +327,7 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): return vision_embeddings_flat.split(patches_per_image, dim=0) def get_multimodal_embeddings( - self, **kwargs - ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -338,7 +337,7 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> 
torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index ac80059cbe6d8..ce7c89449e08f 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -14,8 +14,7 @@ from vllm.model_executor.layers.layernorm import GemmaRMSNorm from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, - NestedTensors) +from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -24,7 +23,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from .interfaces import SupportsMultiModal, SupportsPP +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -481,7 +480,8 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, ) return self.multi_modal_projector(vision_outputs) - def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -491,7 +491,7 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, def get_input_embeddings( self, input_ids: torch.Tensor, - 
multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: if multimodal_embeddings is None: inputs_embeds = self.language_model.get_input_embeddings(input_ids) diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 2700ebccb8318..9889b7e4de40a 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -28,7 +28,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors +from vllm.multimodal.inputs import MultiModalKwargs from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, BatchFeature, @@ -39,7 +39,8 @@ from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import ChatGLMConfig from .chatglm import ChatGLMBaseModel, ChatGLMModel -from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP +from .interfaces import (MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP) from .utils import flatten_bn, merge_multimodal_embeddings @@ -596,8 +597,7 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, return self.transformer.vision(pixel_values) def get_multimodal_embeddings( - self, **kwargs - ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -608,7 +608,7 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: 
Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.transformer.get_input_embeddings(input_ids) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 19d5a4c259973..234e4498f163b 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -49,7 +49,7 @@ from vllm.sequence import IntermediateTensors from .idefics2_vision_model import ( Idefics2VisionTransformer as Idefics3VisionTransformer) # yapf: enable -from .interfaces import SupportsLoRA, SupportsMultiModal +from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal from .llama import LlamaModel from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, merge_multimodal_embeddings) @@ -617,8 +617,7 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, self.sampler = get_sampler() def get_multimodal_embeddings( - self, **kwargs - ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: image_input = self.model._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -628,7 +627,7 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 43196bf544e8a..13d7394ac08bc 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -5,7 +5,7 @@ from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional, import torch from torch import Tensor -from 
typing_extensions import TypeIs, TypeVar +from typing_extensions import TypeIs from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import ( @@ -20,7 +20,14 @@ if TYPE_CHECKING: logger = init_logger(__name__) -T = TypeVar("T", default=Union[list[Tensor], Tensor, tuple[Tensor, ...]]) +MultiModalEmbeddings = Union[list[Tensor], Tensor, tuple[Tensor, ...]] +""" +The output embeddings must be one of the following formats: + +- A list or tuple of 2D tensors, where each tensor corresponds to + each input multimodal data item (e.g, image). +- A single 3D tensor, with the batch dimension grouping the 2D tensors. +""" @runtime_checkable @@ -36,17 +43,12 @@ class SupportsMultiModal(Protocol): MRO of your model class. """ - def get_multimodal_embeddings(self, **kwargs) -> T: + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: """ Returns multimodal embeddings generated from multimodal kwargs to be merged with text embeddings. - The output embeddings must be one of the following formats: - - - A list or tuple of 2D tensors, where each tensor corresponds to - each input multimodal data item (e.g, image). - - A single 3D tensor, with the batch dimension grouping the 2D tensors. - Note: The returned multimodal embeddings must be in the same order as the appearances of their corresponding multimodal data item in the @@ -60,7 +62,7 @@ class SupportsMultiModal(Protocol): def get_input_embeddings( self, input_ids: Tensor, - multimodal_embeddings: Optional[T] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, attn_metadata: Optional["AttentionMetadata"] = None, ) -> Tensor: ... 
@@ -69,7 +71,7 @@ class SupportsMultiModal(Protocol): def get_input_embeddings( self, input_ids: Tensor, - multimodal_embeddings: Optional[T] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> Tensor: """ Returns the input embeddings merged from the text embeddings from diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index fcaf7fecaafc9..e91d0ba1b382a 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -37,7 +37,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.tokenizer import AnyTokenizer -from .interfaces import SupportsMultiModal, SupportsPP +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -905,8 +905,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): self.visual_token_mask = None def get_multimodal_embeddings( - self, **kwargs - ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -916,7 +915,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 0c0d8e109c92e..ecdd6dfb0a72c 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -38,7 +38,7 @@ 
from vllm.sequence import IntermediateTensors from vllm.utils import JSONTree, flatten_2d_lists, json_map_leaves from .clip import CLIPVisionModel -from .interfaces import SupportsMultiModal, SupportsPP +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .pixtral import (PixtralHFVisionModel, get_pixtral_hf_image_feature_grid_size) from .siglip import SiglipVisionModel @@ -778,7 +778,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): return embeds_in_batch - def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -800,7 +801,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 04b0f29102926..db89bbf1af6ef 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -16,12 +16,12 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors +from vllm.multimodal.inputs import MultiModalFieldConfig from vllm.multimodal.parse import ImageSize from vllm.sequence import IntermediateTensors from .clip import CLIPVisionModel -from .interfaces import SupportsMultiModal, SupportsPP +from 
.interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .llava import (BaseLlavaMultiModalProcessor, BaseLlavaProcessingInfo, LlavaDummyInputsBuilder, LlavaLikeConfig, LlavaMultiModalProjector, init_vision_tower_for_llava) @@ -480,8 +480,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, ] def get_multimodal_embeddings( - self, **kwargs - ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -491,7 +490,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: if multimodal_embeddings is None: diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index d974c3d224094..5eb56d6711f3b 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -16,8 +16,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, - NestedTensors) +from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, VideoEmbeddingItems, VideoProcessorItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -27,7 +26,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of -from 
.interfaces import SupportsMultiModal, SupportsPP +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .llava import init_vision_tower_for_llava from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, init_vllm_registered_model, @@ -421,8 +420,7 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, f"Unsupported type of video input {type(video_pixels)}") def get_multimodal_embeddings( - self, **kwargs - ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: video_input = self._parse_and_validate_video_input(**kwargs) if video_input is None: return None @@ -432,7 +430,7 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index f41f45e3e4097..c6bc9ffcbf3d6 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -19,8 +19,7 @@ from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, - NestedTensors) +from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, VideoEmbeddingItems, VideoProcessorItems) from vllm.multimodal.processing import PromptReplacement, PromptUpdate @@ -29,7 +28,7 
@@ from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of from .clip import CLIPVisionModel -from .interfaces import SupportsMultiModal, SupportsPP +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .llava import LlavaDummyInputsBuilder, init_vision_tower_for_llava from .llava_next import (BaseLlavaNextMultiModalProcessor, LlavaNextLikeConfig, LlavaNextProcessingInfo) @@ -856,7 +855,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, return image_feature def get_multimodal_embeddings( - self, **kwargs) -> Optional[tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: modalities = self._parse_and_validate_multimodal_inputs(**kwargs) if not modalities: return None @@ -882,7 +881,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[tuple[torch.Tensor, ...]] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: @@ -894,10 +893,9 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, def get_input_embeddings_v0( self, input_ids: torch.Tensor, - image_input: Optional[NestedTensors] = None, - video_input: Optional[NestedTensors] = None, + image_input: Optional[LlavaOnevisionImagePixelInputs] = None, + video_input: Optional[LlavaOnevisionVideoPixelInputs] = None, ) -> torch.Tensor: - inputs_embeds = self.get_input_embeddings(input_ids) if image_input is not None: image_embeds = self._process_image_input(image_input) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 554080533059c..9696a858ecd54 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -52,8 +52,8 @@ from vllm.multimodal.profiling 
import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import JSONTree, flatten_2d_lists, json_map_leaves -from .interfaces import (SupportsLoRA, SupportsMultiModal, SupportsPP, - SupportsQuant) +from .interfaces import (MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP, SupportsQuant) from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, @@ -1577,8 +1577,7 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, return embeds_in_batch def get_multimodal_embeddings( - self, **kwargs - ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -1598,7 +1597,7 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index d4758079c42b9..88a6226d21448 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -13,8 +13,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputs, MultiModalKwargs, - NestedTensors) + MultiModalInputs, MultiModalKwargs) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import 
(BaseMultiModalProcessor, BaseProcessingInfo, PromptIndexTargets, @@ -23,7 +22,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from .interfaces import SupportsMultiModal, SupportsPP +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -328,8 +327,7 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, return self.multi_modal_projector(image_features) def get_multimodal_embeddings( - self, **kwargs - ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -341,7 +339,7 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 06fa5c5e01995..5305f1e03e1a1 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -31,8 +31,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, - NestedTensors) +from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs from 
vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) # yapf conflicts with isort for this block @@ -48,7 +47,8 @@ from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of from .clip import CLIPVisionModel -from .interfaces import SupportsMultiModal, SupportsPP, SupportsQuant +from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, SupportsPP, + SupportsQuant) from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -649,8 +649,7 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, return image_embeds def get_multimodal_embeddings( - self, **kwargs - ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -660,7 +659,7 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.embed_tokens(input_ids) if multimodal_embeddings is not None: diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index f17f9fb8e0c72..25b4cc4a9fb80 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -30,12 +30,12 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.inputs import NestedTensors, PlaceholderRange +from vllm.multimodal.inputs import PlaceholderRange from 
vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config -from .interfaces import SupportsMultiModal, SupportsPP +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .utils import (init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs @@ -221,8 +221,7 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, return get_sampler() def get_multimodal_embeddings( - self, **kwargs - ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: image_input, image_tokens = self._parse_and_validate_image_input( **kwargs) if image_input is None: @@ -255,7 +254,7 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index ae48c779481f7..8a570d138c6c2 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -59,7 +59,8 @@ from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope -from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP +from .interfaces import (MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP) from .qwen2_vl import Qwen2VLDummyInputsBuilder as Qwen2_5_VLDummyInputsBuilder from .qwen2_vl import (Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo, 
apply_rotary_pos_emb_vision) @@ -952,7 +953,7 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, return modalities def get_multimodal_embeddings( - self, **kwargs) -> Optional[tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: modalities = self._parse_and_validate_multimodal_inputs(**kwargs) if not modalities: @@ -978,7 +979,7 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[tuple[torch.Tensor, ...]] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: @@ -990,10 +991,9 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, def get_input_embeddings_v0( self, input_ids: torch.Tensor, - image_input: Optional[tuple[torch.Tensor, ...]] = None, - video_input: Optional[tuple[torch.Tensor, ...]] = None, + image_input: Optional[Qwen2_5_VLImageInputs] = None, + video_input: Optional[Qwen2_5_VLVideoInputs] = None, ) -> torch.Tensor: - inputs_embeds = self.get_input_embeddings(input_ids) if image_input is not None: image_embeds = self._process_image_input(image_input) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index c44f4fa4d75a3..aae30f1fd6635 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -37,8 +37,7 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, - NestedTensors) +from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs from vllm.multimodal.parse import 
(AudioProcessorItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -47,7 +46,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from .interfaces import SupportsMultiModal, SupportsPP +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -357,8 +356,7 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, audio_output_lengths.flatten().tolist()) def get_multimodal_embeddings( - self, **kwargs - ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: audio_input = self._parse_and_validate_audio_input(**kwargs) if audio_input is None: return None @@ -368,7 +366,7 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 0e9fa7183c89a..b8ac40b7e7f9b 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -71,7 +71,8 @@ from vllm.transformers_utils.config import uses_mrope from vllm.transformers_utils.processor import ( cached_image_processor_from_config) -from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP +from .interfaces import (MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, WeightsMapper, 
init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -1262,7 +1263,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, return modalities def get_multimodal_embeddings( - self, **kwargs) -> Optional[tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: modalities = self._parse_and_validate_multimodal_inputs(**kwargs) if not modalities: @@ -1289,7 +1290,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[tuple[torch.Tensor, ...]] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: @@ -1301,10 +1302,9 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, def get_input_embeddings_v0( self, input_ids: torch.Tensor, - image_input: Optional[tuple[torch.Tensor, ...]] = None, - video_input: Optional[tuple[torch.Tensor, ...]] = None, + image_input: Optional[Qwen2VLImagePixelInputs] = None, + video_input: Optional[Qwen2VLVideoPixelInputs] = None, ) -> torch.Tensor: - inputs_embeds = self.get_input_embeddings(input_ids) if image_input is not None: image_embeds = self._process_image_input(image_input) diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index ff581b093b47a..1a39d2e74b1ee 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -32,8 +32,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.resampler import Resampler2, get_abs_pos from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, - NestedTensors) +from vllm.multimodal.inputs import 
MultiModalFieldConfig, MultiModalKwargs from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, @@ -41,7 +40,8 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP +from .interfaces import (MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP) from .qwen import QWenBaseModel, QWenModel from .utils import flatten_bn, merge_multimodal_embeddings @@ -741,8 +741,7 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA, return self.transformer.visual(image_input["data"]) def get_multimodal_embeddings( - self, **kwargs - ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -753,7 +752,7 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA, def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.transformer.get_input_embeddings(input_ids) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index f639b8d8f9bed..51b1c33cfbdec 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -35,7 +35,8 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.ultravox import UltravoxConfig -from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP +from .interfaces import 
(MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings, @@ -555,8 +556,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): return flattened_embeddings def get_multimodal_embeddings( - self, **kwargs - ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: audio_input = self._parse_and_validate_audio_input(**kwargs) if audio_input is None: return None @@ -566,7 +566,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 8ed68bd89e5a0..eb6404922c6d0 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -34,8 +34,8 @@ from vllm.multimodal.processing import (BaseProcessingInfo, PromptReplacement, PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs -from .interfaces import (SupportsMultiModal, SupportsTranscription, - SupportsV0Only) +from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, + SupportsTranscription, SupportsV0Only) from .utils import (AutoWeightsLoader, WeightsMapper, cast_overflow_tensors, make_layers) @@ -689,8 +689,7 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription, return decoder_outputs def get_multimodal_embeddings( - self, **kwargs - ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]: + self, **kwargs: object) -> 
Optional[MultiModalEmbeddings]: # TODO: This method does not obey the interface for SupportsMultiModal. # Refactor this once encoder/decoder support is implemented in V1. audio_input = self._parse_and_validate_audio_input(**kwargs) From 54cc46f3ebcde68f072c37f2142ff37d60077ff0 Mon Sep 17 00:00:00 2001 From: WeiCheng Date: Fri, 14 Mar 2025 16:05:17 +0800 Subject: [PATCH 019/169] [Bugfix] Fix small typo in the example of Streaming delimiter (#14793) --- examples/online_serving/api_client.py | 2 +- examples/online_serving/gradio_webserver.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/online_serving/api_client.py b/examples/online_serving/api_client.py index 22bb1a87bfdf6..e2944896d1610 100644 --- a/examples/online_serving/api_client.py +++ b/examples/online_serving/api_client.py @@ -42,7 +42,7 @@ def post_http_request(prompt: str, def get_streaming_response(response: requests.Response) -> Iterable[list[str]]: for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, - delimiter=b"\0"): + delimiter=b"\n"): if chunk: data = json.loads(chunk.decode("utf-8")) output = data["text"] diff --git a/examples/online_serving/gradio_webserver.py b/examples/online_serving/gradio_webserver.py index c619146b03aed..85a9119c6aa2f 100644 --- a/examples/online_serving/gradio_webserver.py +++ b/examples/online_serving/gradio_webserver.py @@ -21,7 +21,7 @@ def http_bot(prompt): for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, - delimiter=b"\0"): + delimiter=b"\n"): if chunk: data = json.loads(chunk.decode("utf-8")) output = data["text"][0] From 989ecd200701bc011a186956a7b599bacf890d97 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 14 Mar 2025 16:07:30 +0800 Subject: [PATCH 020/169] [Misc] Gemma3ForConditionalGeneration supports LoRA (#14797) Signed-off-by: Jee Jee Li --- vllm/model_executor/models/gemma3_mm.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git 
a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index ce7c89449e08f..b945e4732a507 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -12,6 +12,7 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import GemmaRMSNorm from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs @@ -23,7 +24,8 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP +from .interfaces import (MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP) from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -371,8 +373,8 @@ class Gemma3MultiModalProjector(nn.Module): @MULTIMODAL_REGISTRY.register_processor(Gemma3MultiModalProcessor, info=Gemma3ProcessingInfo, dummy_inputs=Gemma3DummyInputsBuilder) -class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsPP): +class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, + SupportsLoRA): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -614,3 +616,12 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + 
language_model="language_model", + connector="multi_modal_projector", + tower_model="vision_tower") From c77620d22d43daa7e0440e6267cbdd83f849ac64 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 14 Mar 2025 01:21:28 -0700 Subject: [PATCH 021/169] [V1][Minor] Minor code cleanup for scheduling metrics (#14800) Signed-off-by: Woosuk Kwon --- vllm/v1/core/scheduler.py | 35 ++++++++++------------------------- vllm/v1/request.py | 28 +++++++++++++--------------- 2 files changed, 23 insertions(+), 40 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index d498891f476e7..056458ef9dd28 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -15,8 +15,8 @@ from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager, from vllm.v1.core.kv_cache_manager import KVCacheManager from vllm.v1.core.scheduler_output import (CachedRequestData, NewRequestData, SchedulerOutput) -from vllm.v1.engine import (EngineCoreEvent, EngineCoreEventType, - EngineCoreOutput, EngineCoreOutputs) +from vllm.v1.engine import (EngineCoreEventType, EngineCoreOutput, + EngineCoreOutputs) from vllm.v1.metrics.stats import SchedulerStats from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus @@ -178,7 +178,9 @@ class Scheduler: self.kv_cache_manager.free(preempted_req) preempted_req.status = RequestStatus.PREEMPTED preempted_req.num_computed_tokens = 0 - self.request_preempted(preempted_req, scheduled_timestamp) + if self.log_stats: + preempted_req.record_event( + EngineCoreEventType.PREEMPTED, scheduled_timestamp) self.waiting.appendleft(preempted_req) preempted_reqs.append(preempted_req) @@ -320,7 +322,9 @@ class Scheduler: req_index += 1 self.running.append(request) self.scheduled_req_ids.add(request.request_id) - self.request_scheduled(request, scheduled_timestamp) + if self.log_stats: + request.record_event(EngineCoreEventType.SCHEDULED, + scheduled_timestamp) if request.status == 
RequestStatus.WAITING: scheduled_new_reqs.append(request) elif request.status == RequestStatus.PREEMPTED: @@ -666,7 +670,8 @@ class Scheduler: def add_request(self, request: Request) -> None: self.waiting.append(request) self.requests[request.request_id] = request - self.request_queued(request) + if self.log_stats: + request.record_event(EngineCoreEventType.QUEUED) def finish_requests( self, @@ -728,26 +733,6 @@ class Scheduler: def reset_prefix_cache(self) -> bool: return self.kv_cache_manager.reset_prefix_cache() - def request_queued(self, request: Request): - if not self.log_stats: - return - request.events.append( - EngineCoreEvent.new_event(EngineCoreEventType.QUEUED)) - - def request_scheduled(self, request: Request, timestamp: float): - if not self.log_stats: - return - request.events.append( - EngineCoreEvent.new_event(EngineCoreEventType.SCHEDULED, - timestamp)) - - def request_preempted(self, request: Request, timestamp: float): - if not self.log_stats: - return - request.events.append( - EngineCoreEvent.new_event(EngineCoreEventType.PREEMPTED, - timestamp)) - def make_stats(self) -> Optional[SchedulerStats]: if not self.log_stats: return None diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 29609d313306d..efb5a54d12077 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -88,21 +88,6 @@ class Request: sampling_params=request.sampling_params), ) - def queued(self, timestamp: Optional[float] = None) -> None: - self.events.append( - EngineCoreEvent.new_event(EngineCoreEventType.QUEUED, timestamp)) - - def scheduled(self, timestamp: Optional[float] = None) -> None: - self.events.append( - EngineCoreEvent.new_event(EngineCoreEventType.SCHEDULED, - timestamp)) - - def take_events(self) -> Optional[list[EngineCoreEvent]]: - if not self.events: - return None - events, self.events = self.events, [] - return events - def append_output_token_ids( self, token_ids: Union[int, list[int]], @@ -146,6 +131,19 @@ class Request: def 
use_structured_output(self) -> bool: return self.sampling_params.guided_decoding is not None + def record_event( + self, + event_type: EngineCoreEventType, + timestamp: Optional[float] = None, + ) -> None: + self.events.append(EngineCoreEvent.new_event(event_type, timestamp)) + + def take_events(self) -> Optional[list[EngineCoreEvent]]: + if not self.events: + return None + events, self.events = self.events, [] + return events + class RequestStatus(enum.IntEnum): """Status of a request.""" From 40253bab443ad0cdd22ff33bd8f777d2f289cfc4 Mon Sep 17 00:00:00 2001 From: DefTruth <31974251+DefTruth@users.noreply.github.com> Date: Fri, 14 Mar 2025 18:32:42 +0800 Subject: [PATCH 022/169] [Bugfix][W8A8] fixed cutlass block fp8 binding (#14796) --- csrc/torch_bindings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index d3bcb86adbc80..eb3a2c911d55e 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -370,7 +370,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> " "bool"); ops.impl("cutlass_scaled_mm_supports_block_fp8", - &cutlass_scaled_mm_supports_fp8); + &cutlass_scaled_mm_supports_block_fp8); // Check if cutlass sparse scaled_mm is supported for CUDA devices of the // given capability From ab93f1360fb7289bf78b1880e5dfbcd3aec6be36 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 14 Mar 2025 20:58:19 +0800 Subject: [PATCH 023/169] [VLM] Various cleanup and fixes (#14806) Signed-off-by: DarkLight1337 --- vllm/entrypoints/chat_utils.py | 15 +- vllm/model_executor/models/fuyu.py | 15 +- vllm/model_executor/models/interfaces.py | 4 +- vllm/model_executor/models/llava.py | 136 +++++++++--------- vllm/model_executor/models/llava_next.py | 7 +- vllm/model_executor/models/llava_onevision.py | 29 ++-- vllm/model_executor/models/minicpmo.py | 84 ++++++----- vllm/model_executor/models/minicpmv.py | 120 
++++++++++------ vllm/model_executor/models/molmo.py | 2 +- vllm/model_executor/models/pixtral.py | 124 ++++++---------- vllm/model_executor/models/qwen2_audio.py | 8 +- vllm/multimodal/inputs.py | 4 + vllm/multimodal/parse.py | 4 + vllm/multimodal/processing.py | 4 +- 14 files changed, 283 insertions(+), 273 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 4ce4fa897cc96..61a91fe03d2e0 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -37,6 +37,7 @@ from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.multimodal import MultiModalDataDict from vllm.multimodal.utils import MediaConnector +from vllm.transformers_utils.processor import cached_get_processor from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer logger = init_logger(__name__) @@ -1070,7 +1071,19 @@ def apply_hf_chat_template( tokenize: bool = False, # Different from HF's default **kwargs: Any, ) -> str: - if chat_template is None and tokenizer.chat_template is None: + if chat_template is None: + chat_template = tokenizer.chat_template + + # FIXME: Temporary workaround for + # https://huggingface.co/mistral-community/pixtral-12b/discussions/31 + if chat_template is None: + try: + processor = cached_get_processor(tokenizer.name_or_path) + chat_template = processor.chat_template + except Exception: + pass + + if chat_template is None: raise ValueError( "As of transformers v4.44, default chat template is no longer " "allowed, so you must provide a chat template if the tokenizer " diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index a6fcb5b81b1dd..bd7ef29e1f63f 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -18,7 +18,7 @@ """ PyTorch Fuyu model.""" import math from collections.abc import Iterable, Mapping, Sequence -from typing import List, Literal, Optional, Set, Tuple, TypedDict +from typing 
import Literal, Optional, Set, Tuple, TypedDict import torch import torch.nn as nn @@ -31,8 +31,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, - NestedTensors) +from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -58,10 +57,12 @@ class FuyuImagePatchInputs(TypedDict): `(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)` """ - patches_per_image: List[int] + patches_per_image: list[int] """ - List of number of total patches for each image in the batch. - This is used to restore the first two dimensions of `flat_data`. + The number of total patches for each image in the batch. + + This is used to split the embeddings which has the first two dimensions + flattened just like `flat_data`. 
""" @@ -317,7 +318,7 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): return None def _process_image_input( - self, image_input: FuyuImagePatchInputs) -> NestedTensors: + self, image_input: FuyuImagePatchInputs) -> MultiModalEmbeddings: image_patches_flat = image_input["flat_data"] patches_per_image = image_input["patches_per_image"] diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 13d7394ac08bc..c77324bab59c6 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -5,7 +5,7 @@ from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional, import torch from torch import Tensor -from typing_extensions import TypeIs +from typing_extensions import Self, TypeIs from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import ( @@ -451,7 +451,7 @@ class SupportsQuant: packed_modules_mapping: ClassVar[Dict[str, List[str]]] = {} quant_config: Optional[QuantizationConfig] = None - def __new__(cls, *args, **kwargs) -> "SupportsQuant": + def __new__(cls, *args, **kwargs) -> Self: instance = super().__new__(cls) quant_config = cls._find_quant_config(*args, **kwargs) if quant_config is not None: diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index ecdd6dfb0a72c..478dbd83d3002 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -3,8 +3,8 @@ from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence from functools import cached_property -from typing import (Final, List, Literal, Optional, Protocol, Set, Tuple, - TypedDict, TypeVar, Union, cast) +from typing import (Final, Literal, Optional, Protocol, Set, Tuple, TypedDict, + TypeVar, Union, cast) import torch import torch.nn as nn @@ -39,8 +39,7 @@ from vllm.utils import JSONTree, flatten_2d_lists, json_map_leaves from .clip import CLIPVisionModel from 
.interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP -from .pixtral import (PixtralHFVisionModel, - get_pixtral_hf_image_feature_grid_size) +from .pixtral import PixtralHFEncoderInfo, PixtralHFVisionModel from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -49,7 +48,7 @@ from .vision import get_vision_encoder_info class LlavaImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, List[torch.Tensor]] + pixel_values: torch.Tensor """ Shape: `(batch_size * num_images, num_channels, height, width)` @@ -57,7 +56,18 @@ class LlavaImagePixelInputs(TypedDict): in which case the data is passed as a list instead of a batched tensor. """ - feat_is_patch: Union[torch.Tensor, List[torch.Tensor]] + +class PixtralHFImagePixelInputs(TypedDict): + type: Literal["pixel_values_pixtral"] + pixel_values: Union[torch.Tensor, list[torch.Tensor]] + """ + Shape: `(batch_size * num_images, num_channels, height, width)` + + Note that `height` or `width` may be different per batch and image, + in which case the data is passed as a list instead of a batched tensor. + """ + + feat_is_patch: Union[torch.Tensor, list[torch.Tensor]] """ A boolean mask indicating which image features correspond to patch tokens. @@ -65,7 +75,7 @@ class LlavaImagePixelInputs(TypedDict): Shape: `(batch_size, num_crops, num_patch)` """ - embed_is_patch: Union[torch.Tensor, List[torch.Tensor]] + embed_is_patch: Union[torch.Tensor, list[torch.Tensor]] """ A boolean mask indicating which image embeddings correspond to patch tokens. 
@@ -73,7 +83,7 @@ class LlavaImagePixelInputs(TypedDict): Shape: `(batch_size, num_embeds)` """ - num_crops: torch.Tensor + num_crops: Union[torch.Tensor, list[torch.Tensor]] """Shape: `(batch_size, num_images)`""" @@ -85,27 +95,9 @@ class LlavaImageEmbeddingInputs(TypedDict): `hidden_size` must match the hidden size of language model backbone. """ - feat_is_patch: Union[torch.Tensor, List[torch.Tensor]] - """ - A boolean mask indicating which image features correspond - to patch tokens. - Shape: `(batch_size, num_crops, num_patch)` - """ - - embed_is_patch: Union[torch.Tensor, List[torch.Tensor]] - """ - A boolean mask indicating which image embeddings correspond - to patch tokens. - - Shape: `(batch_size, num_embeds)` - """ - - num_crops: torch.Tensor - """Shape: `(batch_size, num_images)`""" - - -LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageEmbeddingInputs] +LlavaImageInputs = Union[LlavaImagePixelInputs, PixtralHFImagePixelInputs, + LlavaImageEmbeddingInputs] class LlavaMultiModalProjector(nn.Module): @@ -357,13 +349,15 @@ class PixtralHFMultiModalProcessor( ] hf_config = self.info.get_hf_config() + vision_config = hf_config.vision_config + assert isinstance(vision_config, PixtralVisionConfig) + encoder_info = PixtralHFEncoderInfo(vision_config) tile_sizes = [ - get_pixtral_hf_image_feature_grid_size( - hf_config.vision_config, + encoder_info.get_patch_grid_size( image_width=pixel_value.shape[-1], - image_height=pixel_value.shape[-2]) - for pixel_value in processed_outputs["pixel_values"] + image_height=pixel_value.shape[-2], + ) for pixel_value in processed_outputs["pixel_values"] ] num_crops = torch.tensor([(ncols + 1) * nrows for ncols, nrows in tile_sizes]) @@ -411,13 +405,13 @@ class PixtralHFMultiModalProcessor( vision_config = hf_config.vision_config assert isinstance(vision_config, PixtralVisionConfig) + encoder_info = PixtralHFEncoderInfo(vision_config) def get_replacement(item_idx: int): images = mm_items.get_items("image", 
ImageProcessorItems) image_size = images.get_image_size(item_idx) - ncols, nrows = get_pixtral_hf_image_feature_grid_size( - vision_config, + ncols, nrows = encoder_info.get_patch_grid_size( image_width=image_size.width, image_height=image_size.height, ) @@ -512,7 +506,7 @@ def init_vision_tower_for_llava( *, require_post_norm: Optional[bool] = None, prefix: str = "", -): +) -> Union[CLIPVisionModel, SiglipVisionModel, PixtralHFVisionModel]: vision_config = hf_config.vision_config # Initialize the vision tower only up to the deepest required feature layer @@ -627,32 +621,30 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): if pixel_values is None and image_embeds is None: return None - feat_is_patch = kwargs.pop("feat_is_patch", None) - if feat_is_patch is not None and not isinstance( - feat_is_patch, (torch.Tensor, list)): - raise ValueError("Incorrect type of feat_is_patch. " - f"Got type: {type(feat_is_patch)}") - - embed_is_patch = kwargs.pop("embed_is_patch", None) - if embed_is_patch is not None and not isinstance( - embed_is_patch, (torch.Tensor, list)): - raise ValueError("Incorrect type of embed_is_patch. " - f"Got type: {type(embed_is_patch)}") - - num_crops = kwargs.pop("num_crops", None) - if num_crops is not None and not isinstance(num_crops, torch.Tensor): - raise ValueError("Incorrect type of num_crops. " - f"Got type: {type(num_crops)}") - if pixel_values is not None: if not isinstance(pixel_values, (torch.Tensor, list)): raise ValueError("Incorrect type of pixel values. " f"Got type: {type(pixel_values)}") if self.config.vision_config.model_type == "pixtral": - return LlavaImagePixelInputs( - type="pixel_values", - data=flatten_bn(pixel_values), + feat_is_patch = kwargs.pop("feat_is_patch") + if not isinstance(feat_is_patch, (torch.Tensor, list)): + raise ValueError("Incorrect type of feat_is_patch. 
" + f"Got type: {type(feat_is_patch)}") + + embed_is_patch = kwargs.pop("embed_is_patch") + if not isinstance(embed_is_patch, (torch.Tensor, list)): + raise ValueError("Incorrect type of embed_is_patch. " + f"Got type: {type(embed_is_patch)}") + + num_crops = kwargs.pop("num_crops") + if not isinstance(num_crops, (torch.Tensor, list)): + raise ValueError("Incorrect type of num_crops. " + f"Got type: {type(num_crops)}") + + return PixtralHFImagePixelInputs( + type="pixel_values_pixtral", + pixel_values=flatten_bn(pixel_values), feat_is_patch=feat_is_patch, embed_is_patch=embed_is_patch, num_crops=num_crops, @@ -660,11 +652,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): return LlavaImagePixelInputs( type="pixel_values", - data=self._validate_pixel_values( + pixel_values=self._validate_pixel_values( flatten_bn(pixel_values, concat=True)), - feat_is_patch=feat_is_patch, - embed_is_patch=embed_is_patch, - num_crops=num_crops, ) if image_embeds is not None: @@ -672,12 +661,12 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): raise ValueError("Incorrect type of image embeddings. 
" f"Got type: {type(image_embeds)}") + if self.config.vision_config.model_type == "pixtral": + raise ValueError("Pixtral-HF does not support image_embeds.") + return LlavaImageEmbeddingInputs( type="image_embeds", data=flatten_bn(image_embeds, concat=True), - feat_is_patch=feat_is_patch, - embed_is_patch=embed_is_patch, - num_crops=num_crops, ) raise AssertionError("This line should be unreachable.") @@ -696,7 +685,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): self, vision_tower: Union[CLIPVisionModel, SiglipVisionModel, PixtralHFVisionModel], - pixel_values: torch.Tensor, + pixel_values: Union[torch.Tensor, list[torch.Tensor]], ) -> torch.Tensor: # NOTE: we skip the step to select the vision feature layer since @@ -708,17 +697,20 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): strategy=self.config.vision_feature_select_strategy, ) - def _process_image_pixels(self, - inputs: LlavaImagePixelInputs) -> torch.Tensor: + def _process_image_pixels( + self, + inputs: Union[LlavaImagePixelInputs, PixtralHFImagePixelInputs], + ) -> torch.Tensor: assert self.vision_tower is not None - pixel_values = inputs["data"] + pixel_values = inputs["pixel_values"] return self._image_pixels_to_features(self.vision_tower, pixel_values) - def _process_image_input(self, - image_input: LlavaImageInputs) -> torch.Tensor: - + def _process_image_input( + self, + image_input: LlavaImageInputs, + ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: if image_input["type"] == "image_embeds": return image_input["data"] @@ -783,11 +775,11 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None + vision_embeddings = self._process_image_input(image_input) - if kwargs.get("v0_path", False) or \ - image_input.get("feat_is_patch") is None or \ - image_input.get("embed_is_patch") is None: + if 
(kwargs.get("v0_path", False) + or image_input["type"] != "pixel_values_pixtral"): # The path is used for pixtral (V0 only) and llava (V0/V1) return vision_embeddings diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index db89bbf1af6ef..4de13e5407354 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -32,7 +32,7 @@ from .utils import (AutoWeightsLoader, embed_multimodal, flatten_bn, class LlavaNextImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, List[torch.Tensor]] + pixel_values: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` @@ -315,7 +315,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, return LlavaNextImagePixelInputs( type="pixel_values", - data=self._validate_pixel_values(flatten_bn(pixel_values)), + pixel_values=self._validate_pixel_values( + flatten_bn(pixel_values)), image_sizes=self._validate_image_sizes( flatten_bn(image_sizes, concat=True)), ) @@ -434,7 +435,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: assert self.vision_tower is not None - pixel_values = inputs["data"] + pixel_values = inputs["pixel_values"] if isinstance(pixel_values, torch.Tensor): b, num_patches, c, h, w = pixel_values.shape diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index c6bc9ffcbf3d6..52ec0abcdc5b5 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -42,7 +42,7 @@ _MAX_FRAMES_PER_VIDEO = 16 class LlavaOnevisionVideoPixelInputs(TypedDict): type: Literal["pixel_values_videos"] - data: Union[torch.Tensor, List[torch.Tensor]] + pixel_values_videos: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size, num_videos, num_frames, 
num_channels, height, width)` @@ -54,7 +54,7 @@ class LlavaOnevisionVideoPixelInputs(TypedDict): class LlavaOnevisionImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, List[torch.Tensor]] + pixel_values: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` @@ -521,7 +521,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, return LlavaOnevisionImagePixelInputs( type="pixel_values", - data=self._validate_image_pixel_values( + pixel_values=self._validate_image_pixel_values( flatten_bn(pixel_values)), image_sizes=self._validate_image_sizes( flatten_bn(image_sizes, concat=True)), @@ -570,21 +570,20 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, List[b, Tensor(nb_frames, nb_channels, height, width)] } """ - pixel_values = kwargs.pop("pixel_values_videos", None) - - if pixel_values is None: + pixel_values_videos = kwargs.pop("pixel_values_videos", None) + if pixel_values_videos is None: return None - if not (is_list_of(pixel_values, - (torch.Tensor)) # different shape videos - or isinstance(pixel_values, + if not (is_list_of(pixel_values_videos, + torch.Tensor) # different shape videos + or isinstance(pixel_values_videos, torch.Tensor)): # same shape videos - raise ValueError("Incorrect type of pixel values. " - f"Got type: {type(pixel_values)}") + raise ValueError("Incorrect type of pixel_values_videos. 
" + f"Got type: {type(pixel_values_videos)}") return LlavaOnevisionVideoPixelInputs( type="pixel_values_videos", - data=pixel_values, + pixel_values_videos=pixel_values_videos, ) def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: @@ -723,7 +722,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, ) -> Union[torch.Tensor, List[torch.Tensor]]: assert self.vision_tower is not None - pixel_values = inputs["data"] + pixel_values = inputs["pixel_values"] if isinstance(pixel_values, torch.Tensor): b, num_patches, c, h, w = pixel_values.shape @@ -757,7 +756,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, image_sizes = image_input.get("image_sizes") if image_sizes is None: - batch_size = len(image_input["data"]) + batch_size = len(image_input["pixel_values"]) vision_config = self.config.vision_config default_height = default_width = vision_config.image_size image_sizes = torch.as_tensor([[default_height, default_width] @@ -808,7 +807,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, def _process_video_pixels(self, inputs: LlavaOnevisionVideoPixelInputs): assert self.vision_tower is not None - video_pixels = inputs["data"] + video_pixels = inputs["pixel_values_videos"] if isinstance(video_pixels, torch.Tensor): b, num_videos, frames, c, h, w = video_pixels.shape diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index bf6c38d279633..ac10c211fa81f 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -23,7 +23,6 @@ # limitations under the License. 
"""Inference-only MiniCPM-O model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping, Sequence -from functools import partial from typing import (Any, Callable, Dict, List, Literal, Optional, Set, Tuple, TypedDict, Union) @@ -36,11 +35,12 @@ from transformers.models.whisper.modeling_whisper import ( from vllm.config import VllmConfig from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.inputs import MultiModalFieldConfig -from vllm.multimodal.parse import (AudioItem, DictEmbeddingItems, ModalityData, +from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors +from vllm.multimodal.parse import (AudioItem, AudioProcessorItems, + DictEmbeddingItems, ModalityData, ModalityDataItems, MultiModalDataItems, MultiModalDataParser) -from vllm.multimodal.processing import PromptReplacement +from vllm.multimodal.processing import PromptReplacement, PromptUpdate from vllm.multimodal.profiling import ProcessorInputs from vllm.sequence import IntermediateTensors @@ -272,8 +272,13 @@ class MiniCPMOMultiModalProcessor( tokenizer.audio_end_id) return special_tokens - def process_audios(self, mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object]) -> Dict[str, object]: + def process_audios( + self, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> Mapping[str, NestedTensors]: + mm_data = dict(mm_data) + audios = mm_data.pop("audios", []) audio_embeds = mm_data.pop("audio_embeds", []) if isinstance(audios, (list, torch.Tensor)) and len(audios) > 0: @@ -332,11 +337,15 @@ class MiniCPMOMultiModalProcessor( def get_placeholder_split_pattern(self) -> str: return r"\(<(?:image|video|audio)>./\)" - def process_mm_inputs(self, mm_data, mm_kwargs) -> object: + def process_mm_inputs( + self, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> Mapping[str, Mapping[str, NestedTensors]]: return { "image": self.process_images(mm_data, mm_kwargs), "video": 
self.process_videos(mm_data, mm_kwargs), - "audio": self.process_audios(mm_data, mm_kwargs) + "audio": self.process_audios(mm_data, mm_kwargs), } def get_modality_num_counter(self, modality: str) -> str: @@ -358,39 +367,38 @@ class MiniCPMOMultiModalProcessor( return super().get_prompt_texts_by_modality(inputs, modality, index) def _get_prompt_updates( - self, mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs) -> Sequence[PromptReplacement]: - placeholder = { - "image": self.info.image_pattern, - "video": self.info.video_pattern, - "audio": self.info.audio_pattern - } + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + base_updates = super()._get_prompt_updates( + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + out_mm_kwargs=out_mm_kwargs, + ) - def get_replacement_minicpmv(item_idx: int, modality: str): - if modality == "image": - return self.get_image_prompt_texts( - mm_items["image"].get_image_size(item_idx), item_idx) - elif modality == "video": - return self.get_video_prompt_texts( - mm_items["video"].get_frame_size(item_idx), - mm_items["video"].get_num_frames(item_idx)) - else: # audio - if isinstance(mm_items["audio"], MiniCPMOAudioEmbeddingItems): - single_audio_embeds = mm_items["audio"].get(item_idx) - audio_len = self.info.get_audio_len_by_num_chunks( - sum(chunk_embeds.shape[0] - for chunk_embeds in single_audio_embeds)) - return self.get_audio_prompt_texts(audio_len) - return self.get_audio_prompt_texts( - len(mm_items["audio"].get(item_idx))) + audio_placeholder = self.info.audio_pattern + + def get_audio_replacement(item_idx: int): + audios = mm_items.get_items( + "audio", (MiniCPMOAudioEmbeddingItems, AudioProcessorItems)) + + if isinstance(audios, MiniCPMOAudioEmbeddingItems): + single_audio_embeds = audios.get(item_idx)["audio_embeds"] + audio_len = 
self.info.get_audio_len_by_num_chunks( + sum(chunk_embeds.shape[0] + for chunk_embeds in single_audio_embeds)) + else: + audio_len = audios.get_audio_length(item_idx) + + return self.get_audio_prompt_texts(audio_len) return [ - PromptReplacement(modality=modality, - target=placeholder[modality], - replacement=partial(get_replacement_minicpmv, - modality=modality)) - for modality in ("image", "video", "audio") + *base_updates, + PromptReplacement(modality="audio", + target=audio_placeholder, + replacement=get_audio_replacement), ] def _get_mm_fields_config( diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 48f0c09cdfb37..48c8572c05f65 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -24,7 +24,6 @@ """Inference-only MiniCPM-V model compatible with HuggingFace weights.""" import math import re -from collections import Counter from collections.abc import Iterable, Mapping, Sequence from functools import cached_property, partial from typing import (Any, Callable, Dict, List, Literal, Optional, Set, Tuple, @@ -51,13 +50,16 @@ from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputs, PlaceholderRange) -from vllm.multimodal.parse import (DictEmbeddingItems, ImageItem, ImageSize, + MultiModalInputs, NestedTensors, + PlaceholderRange) +from vllm.multimodal.parse import (DictEmbeddingItems, ImageItem, + ImageProcessorItems, ImageSize, ModalityData, ModalityDataItems, MultiModalDataItems, MultiModalDataParser, - VideoItem) + VideoItem, VideoProcessorItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, PromptReplacement) + BaseProcessingInfo, PromptReplacement, + PromptUpdate) from vllm.multimodal.profiling 
import BaseDummyInputsBuilder, ProcessorInputs from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors @@ -557,8 +559,13 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): outputs = {key: outputs[key][0] for key in valid_keys} return outputs - def process_images(self, mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object]) -> Dict[str, object]: + def process_images( + self, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> Mapping[str, NestedTensors]: + mm_data = dict(mm_data) + images = mm_data.pop("images", []) image_embeds = mm_data.pop("image_embeds", []) if isinstance(images, Image.Image): @@ -568,8 +575,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): prompt=self.info.image_pattern * len(images), mm_data={"images": images}, mm_kwargs=mm_kwargs) - image_outputs = MiniCPMVMultiModalProcessor.\ - repack_processor_outputs(image_outputs) + image_outputs = self.repack_processor_outputs(image_outputs) elif len(image_embeds) > 0: image_sizes = mm_data.pop("image_sizes", None) image_outputs = { @@ -580,8 +586,13 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): image_outputs = {} return image_outputs - def process_videos(self, mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object]) -> Dict[str, object]: + def process_videos( + self, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> Mapping[str, NestedTensors]: + mm_data = dict(mm_data) + videos = mm_data.pop("videos", []) video_embeds = mm_data.pop("video_embeds", []) if len(videos) > 0 and isinstance(videos[0], Image.Image): @@ -635,10 +646,14 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): def get_placeholder_split_pattern(self) -> str: return r"\(<(?:image|video)>./\)" - def process_mm_inputs(self, mm_data, mm_kwargs) -> object: + def process_mm_inputs( + self, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> 
Mapping[str, Mapping[str, NestedTensors]]: return { "image": self.process_images(mm_data, mm_kwargs), - "video": self.process_videos(mm_data, mm_kwargs) + "video": self.process_videos(mm_data, mm_kwargs), } def get_input_modalities(self, mm_data) -> List[str]: @@ -655,8 +670,10 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): elif modality == "video": return "video_image_sizes" - def get_num_slices_by_modality(self, inputs: Dict[str, object], - modality: str, index: int) -> int: + raise NotImplementedError(modality) + + def get_num_slices_by_modality(self, inputs: dict[str, Any], modality: str, + index: int) -> int: if modality == "image": return self.info.get_image_slice_nums( inputs[modality]["image_sizes"][index], @@ -669,20 +686,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): else: raise ValueError(f"Unexpected modality: {modality}") - def check_mm_inputs(self, inputs: Dict[str, object], - matches: List[str]) -> None: - counts = Counter(matches) - for modality, count in counts.items(): - if modality not in inputs or not inputs[modality]: - raise ValueError(f"None input data of {modality}." 
- " But prompt requires.") - counter_key = self.get_modality_num_counter(modality) - if len(inputs[modality][counter_key]) != count: - raise ValueError(f"The prompt requires {count} " - f"{modality} inputs while you pass " - f"{len(inputs[modality][counter_key])}") - - def get_prompt_texts_by_modality(self, inputs: Dict[str, object], + def get_prompt_texts_by_modality(self, inputs: dict[str, Any], modality: str, index: int) -> str: if modality == "image": return self.get_image_prompt_texts( @@ -715,13 +719,23 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): tokenizer = self.info.get_tokenizer() inputs = self.process_mm_inputs(mm_data, mm_kwargs) mm_input_modalities = self.get_input_modalities(inputs) - num_mm_slices = {modality: [] for modality in mm_input_modalities} + + num_mm_slices_lst = { + modality: list[int]() + for modality in mm_input_modalities + } for modality in mm_input_modalities: num_counter_key = self.get_modality_num_counter(modality) for index in range(len(inputs[modality][num_counter_key])): - num_mm_slices[modality].append( + num_mm_slices_lst[modality].append( self.get_num_slices_by_modality(inputs, modality, index)) - return { + + num_mm_slices = { + modality: torch.tensor(v) + for modality, v in num_mm_slices_lst.items() + } + + return BatchFeature({ "input_ids": np.array([tokenizer.encode(prompt)]), **{ key: value @@ -732,7 +746,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): f"{modality}_num_slices": num_mm_slices[modality] for modality in mm_input_modalities } - } + }) def _hf_processor_applies_updates( self, @@ -743,28 +757,42 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): return False def _get_prompt_updates( - self, mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs) -> Sequence[PromptReplacement]: + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, 
+ ) -> Sequence[PromptUpdate]: placeholder = { "image": self.info.image_pattern, "video": self.info.video_pattern, } - def get_replacement_minicpmv(item_idx: int, modality: str): - if modality == "image": - return self.get_image_prompt_texts( - mm_items["image"].get_image_size(item_idx), item_idx) - else: # video - return self.get_video_prompt_texts( - mm_items["video"].get_frame_size(item_idx), - mm_items["video"].get_num_frames(item_idx)) + def get_image_replacement(item_idx: int): + images = mm_items.get_items( + "image", (MiniCPMVImageEmbeddingItems, ImageProcessorItems)) + + image_size = images.get_image_size(item_idx) + + return self.get_image_prompt_texts(image_size, item_idx) + + def get_video_replacement(item_idx: int): + videos = mm_items.get_items( + "video", (MiniCPMVVideoEmbeddingItems, VideoProcessorItems)) + + frame_size = videos.get_frame_size(item_idx) + num_frames = videos.get_num_frames(item_idx) + + return self.get_video_prompt_texts(frame_size, num_frames) + + get_replacement = { + "image": get_image_replacement, + "video": get_video_replacement, + } return [ PromptReplacement(modality=modality, target=placeholder[modality], - replacement=partial(get_replacement_minicpmv, - modality=modality)) + replacement=get_replacement[modality]) for modality in ("image", "video") ] diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 9696a858ecd54..444b619437a09 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1478,7 +1478,7 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, f"Got type: {type(embed_is_patch)}") num_crops = kwargs.pop("num_crops", None) - if not isinstance(num_crops, torch.Tensor): + if not isinstance(num_crops, (torch.Tensor, list)): raise ValueError("Incorrect type of num_crops. 
" f"Got type: {type(num_crops)}") diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 25b4cc4a9fb80..2e71390623fdf 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -1,9 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 import math +from collections.abc import Iterable, Mapping from dataclasses import dataclass, fields from functools import cached_property -from typing import Iterable, List, Mapping, Optional, Set, Tuple, Union +from typing import List, Optional, Set, Tuple, Union import torch import torch.nn as nn @@ -683,79 +684,6 @@ class VisionLanguageAdapter(nn.Module): # and [`MistralForCausalLM`] for its language decoder. -def get_pixtral_hf_patch_grid_length(*, image_size: int, - patch_size: int) -> int: - # Since interpolation is applied, the image size need not be divisible - # assert image_size % patch_size == 0 - return image_size // patch_size - - -def get_pixtral_hf_image_feature_size( - *, - image_size: int, - patch_size: int, -) -> int: - grid_length = get_pixtral_hf_patch_grid_length( - image_size=image_size, - patch_size=patch_size, - ) - - # Consider the image_break_token - return (grid_length + 1) * grid_length - - -def get_max_pixtral_hf_image_tokens(hf_config: PixtralVisionConfig) -> int: - grid_length = get_pixtral_hf_patch_grid_length( - image_size=hf_config.image_size, - patch_size=hf_config.patch_size, - ) - - # Consider the image_break_token - return (grid_length + 1) * grid_length - - -def dummy_image_for_pixtral_hf( - hf_config: PixtralVisionConfig, - num_images: int, - *, - image_width_override: Optional[int] = None, - image_height_override: Optional[int] = None, -): - width = height = hf_config.image_size - if image_width_override is not None: - width = image_width_override - if image_height_override is not None: - height = image_height_override - - image = Image.new("RGB", (width, height), color=0) - return {"image": image if num_images == 1 
else [image] * num_images} - - -# Adapted from transformers.models.pixtral.image_processing_pixtral.get_resize_output_image_size # noqa: E501 -# https://github.com/huggingface/transformers/blob/2bd4d5897dc73e8b172832070a6f9e567a0df017/src/transformers/models/pixtral/image_processing_pixtral.py#L180 -def get_pixtral_hf_image_feature_grid_size( - hf_config: PixtralVisionConfig, - *, - image_width: int, - image_height: int, -) -> tuple[int, int]: - max_width = max_height = hf_config.image_size - patch_width = patch_height = hf_config.patch_size - - ratio = max(image_width / max_width, image_height / max_height) - - if ratio > 1: - image_width = int(math.ceil(image_width / ratio)) - image_height = int(math.ceil(image_height / ratio)) - - nrows, ncols = _get_pixtral_hf_num_image_tokens( - (image_height, image_width), - (patch_height, patch_width), - ) # type: ignore - - return ncols, nrows - - class PixtralHFEncoderInfo(VisionEncoderInfo[PixtralVisionConfig]): def get_num_image_tokens( @@ -764,13 +692,21 @@ class PixtralHFEncoderInfo(VisionEncoderInfo[PixtralVisionConfig]): image_width: int, image_height: int, ) -> int: - return get_pixtral_hf_image_feature_size( - image_size=self.vision_config.image_size, - patch_size=self.vision_config.patch_size, + ncols, nrows = self.get_patch_grid_size( + image_width=image_width, + image_height=image_height, ) + # Consider the image_break_token + return (ncols + 1) * nrows + def get_max_image_tokens(self) -> int: - return get_max_pixtral_hf_image_tokens(self.vision_config) + image_size = self.get_image_size() + + return self.get_num_image_tokens( + image_width=image_size, + image_height=image_size, + ) def get_image_size(self) -> int: return self.vision_config.image_size @@ -779,10 +715,34 @@ class PixtralHFEncoderInfo(VisionEncoderInfo[PixtralVisionConfig]): return self.vision_config.patch_size def get_patch_grid_length(self) -> int: - return get_pixtral_hf_patch_grid_length( - image_size=self.vision_config.image_size, - 
patch_size=self.vision_config.patch_size, - ) + image_size, patch_size = self.get_image_size(), self.get_patch_size() + + # Since interpolation is applied, the image size need not be divisible + # assert image_size % patch_size == 0 + return image_size // patch_size + + # Adapted from: https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/pixtral/image_processing_pixtral.py#L99 + def get_patch_grid_size( + self, + *, + image_width: int, + image_height: int, + ) -> tuple[int, int]: + max_width = max_height = self.get_image_size() + patch_width = patch_height = self.get_patch_size() + + ratio = max(image_width / max_width, image_height / max_height) + + if ratio > 1: + image_width = int(math.ceil(image_width / ratio)) + image_height = int(math.ceil(image_height / ratio)) + + nrows, ncols = _get_pixtral_hf_num_image_tokens( + (image_height, image_width), + (patch_height, patch_width), + ) # type: ignore + + return ncols, nrows class PixtralHFMLP(nn.Module): diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index aae30f1fd6635..f63bd0a11459a 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -222,10 +222,10 @@ class Qwen2AudioMultiModalProcessor( num_features = audio_output_lengths[item_idx] if num_features == 0: audios = mm_items.get_items("audio", AudioProcessorItems) - audio = audios.get(item_idx) - raise ValueError( - f"The audio {audio} (len={len(audio)}) is too short " - "to be represented inside the model") + audio_len = audios.get_audio_length(item_idx) + + raise ValueError(f"The audio (len={audio_len}) is too short " + "to be represented inside the model") audio_tokens = [audio_token_id] * num_features diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index e93fa24a6e4dc..7b186d89dad4a 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -433,6 +433,10 @@ class MultiModalFieldConfig: 
:func:`MultiModalFieldConfig.flat` """ + if size_per_item.ndim != 1: + raise ValueError("size_per_item should be a 1-D tensor, " + f"but found shape: {size_per_item.shape}") + slice_idxs = [0, *accumulate(size_per_item)] slices = [ slice(slice_idxs[i], slice_idxs[i + 1]) diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 4e3e5b2088640..772b1609a9fbb 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -176,6 +176,10 @@ class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]): def __init__(self, data: Sequence[HfAudioItem]) -> None: super().__init__(data, "audio") + def get_audio_length(self, item_idx: int) -> int: + audio = self.get(item_idx) + return len(audio) + class AudioEmbeddingItems(EmbeddingItems): diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index ba8a458e84c8b..080a2362aac52 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1311,8 +1311,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): def _bind_and_group_updates( self, - prompt_updates: list[PromptUpdate], - ) -> dict[str, list[BoundPromptUpdate]]: + prompt_updates: Sequence[PromptUpdate], + ) -> dict[str, Sequence[BoundPromptUpdate]]: tokenizer = self.info.get_tokenizer() it = (update.bind(tokenizer) for update in prompt_updates) From fd8e055ffba508e094cd1793e49bbdc5e53b7266 Mon Sep 17 00:00:00 2001 From: Guillaume Calmettes Date: Fri, 14 Mar 2025 08:58:34 -0400 Subject: [PATCH 024/169] [BugFix]: properly catch templating error when preprocess input (#13976) Signed-off-by: Guillaume Calmettes --- vllm/entrypoints/openai/serving_chat.py | 10 ++++++++++ vllm/entrypoints/openai/serving_completion.py | 10 ++++++++++ vllm/entrypoints/openai/serving_embedding.py | 3 +++ vllm/entrypoints/openai/serving_pooling.py | 7 +++++++ vllm/entrypoints/openai/serving_tokenization.py | 7 +++++++ 5 files changed, 37 insertions(+) diff --git a/vllm/entrypoints/openai/serving_chat.py 
b/vllm/entrypoints/openai/serving_chat.py index 1ba33f78cde77..130dfe1841fda 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -7,6 +7,7 @@ from collections.abc import AsyncGenerator, AsyncIterator from collections.abc import Sequence as GenericSequence from typing import Callable, Final, Optional, Union +import jinja2 from fastapi import Request from vllm.config import ModelConfig @@ -199,6 +200,15 @@ class OpenAIServingChat(OpenAIServing): except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) + except TypeError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) + except RuntimeError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) + except jinja2.TemplateError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) request_id = "chatcmpl-" \ f"{self._base_request_id(raw_request, request.request_id)}" diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 1db91a91e37a9..1067f35ce2402 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -6,6 +6,7 @@ from collections.abc import AsyncGenerator, AsyncIterator from collections.abc import Sequence as GenericSequence from typing import Optional, Union, cast +import jinja2 from fastapi import Request from vllm.config import ModelConfig @@ -114,6 +115,15 @@ class OpenAIServingCompletion(OpenAIServing): except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) + except TypeError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) + except RuntimeError as e: + logger.exception("Error in preprocessing 
prompt inputs") + return self.create_error_response(str(e)) + except jinja2.TemplateError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) # Schedule the request and get the result generator. generators: list[AsyncGenerator[RequestOutput, None]] = [] diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 5f6e06e6f79f0..1c2c78aaf8926 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -142,6 +142,9 @@ class OpenAIServingEmbedding(OpenAIServing): except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) + except TypeError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) # Schedule the request and get the result generator. generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 0a3ca2aa7c5bf..894128ee974cd 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -6,6 +6,7 @@ import time from collections.abc import AsyncGenerator from typing import Final, Literal, Optional, Union, cast +import jinja2 import numpy as np from fastapi import Request from typing_extensions import assert_never @@ -138,6 +139,12 @@ class OpenAIServingPooling(OpenAIServing): except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) + except TypeError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) + except jinja2.TemplateError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) # Schedule the request and get the result generator. 
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 4e95ef59e80eb..90c0da2a24d51 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -2,6 +2,7 @@ from typing import Final, Optional, Union +import jinja2 from fastapi import Request from vllm.config import ModelConfig @@ -91,6 +92,12 @@ class OpenAIServingTokenization(OpenAIServing): except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) + except TypeError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) + except jinja2.TemplateError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) input_ids: list[int] = [] for i, engine_prompt in enumerate(engine_prompts): From 613c5bb9458f9ced270fd98c9d877e00c0ad165b Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 15 Mar 2025 00:11:23 +0800 Subject: [PATCH 025/169] [Bugfix] Fix Aria test loading (#14823) Signed-off-by: DarkLight1337 --- tests/models/decoder_only/vision_language/test_models.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 84a5260ad9a08..a0f1229f0af5a 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -9,7 +9,8 @@ from pathlib import PosixPath import pytest from packaging.version import Version -from transformers import AutoModelForPreTraining, AutoModelForVision2Seq +from transformers import (AutoModelForImageTextToText, AutoModelForPreTraining, + AutoModelForVision2Seq) from transformers import __version__ as TRANSFORMERS_VERSION from vllm.platforms import 
current_platform @@ -163,6 +164,7 @@ VLM_TEST_SETTINGS = { img_idx_to_prompt=lambda idx: "<|img|>\n", max_model_len=4096, max_num_seqs=2, + auto_cls=AutoModelForImageTextToText, single_image_prompts=IMAGE_ASSETS.prompts({ "stop_sign": "Please describe the image shortly.", "cherry_blossom": "Please infer the season with reason.", From 1140991a7b3a435b06fb819b683cc12780a19e52 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 14 Mar 2025 12:18:38 -0400 Subject: [PATCH 026/169] [V1] Fix vocab size calculation for structured output (#14826) Signed-off-by: Russell Bryant --- vllm/v1/structured_output/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index a341d74c5812b..32ea1852d0ac2 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -40,7 +40,7 @@ class StructuredOutputManager: tokenizer_group.ping() tokenizer = tokenizer_group.get_lora_tokenizer(None) - self.vocab_size = tokenizer.max_token_id + self.vocab_size = tokenizer.max_token_id + 1 if isinstance(tokenizer, MistralTokenizer): # NOTE: ideally, xgrammar should handle this accordingly. 
# refer to https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98 From 0b0d6421b242b2fa72bab4dd698da14985c31bbe Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 14 Mar 2025 12:21:09 -0400 Subject: [PATCH 027/169] [Frontend] Fix log message to use http vs https (#14774) Signed-off-by: Russell Bryant --- vllm/entrypoints/openai/api_server.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index ec2099d4cebf0..7583078e9462b 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -955,8 +955,10 @@ async def run_server(args, **uvicorn_kwargs) -> None: return '[' + a + ']' return a or "0.0.0.0" - logger.info("Starting vLLM API server on http://%s:%d", - _listen_addr(sock_addr[0]), sock_addr[1]) + is_ssl = args.ssl_keyfile and args.ssl_certfile + logger.info("Starting vLLM API server on http%s://%s:%d", + "s" if is_ssl else "", _listen_addr(sock_addr[0]), + sock_addr[1]) shutdown_task = await serve_http( app, From 9d2b4a70f43e91f49c3d2fd449d586ff8f25e31f Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 14 Mar 2025 16:45:25 +0000 Subject: [PATCH 028/169] [V1][Metrics] Updated list of deprecated metrics in v0.8 (#14695) Signed-off-by: Mark McLoughlin --- docs/source/serving/metrics.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md index 1d55f201503ce..647ece3f85f06 100644 --- a/docs/source/serving/metrics.md +++ b/docs/source/serving/metrics.md @@ -39,7 +39,16 @@ The following metrics are exposed: The following metrics are deprecated and due to be removed in a future version: -- *(No metrics are currently deprecated)* +- `vllm:num_requests_swapped`, `vllm:cpu_cache_usage_perc`, and + `vllm:cpu_prefix_cache_hit_rate` because KV cache offloading is not + used in 
V1. +- `vllm:gpu_prefix_cache_hit_rate` is replaced by queries+hits + counters in V1. +- `vllm:time_in_queue_requests` because it duplicates + `vllm:request_queue_time_seconds`. +- `vllm:model_forward_time_milliseconds` and + `vllm:model_execute_time_milliseconds` because + prefill/decode/inference time metrics should be used instead. Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1` but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch, From 73deea2fdba03690b9067e71819d5d385d2afa54 Mon Sep 17 00:00:00 2001 From: daniel-salib Date: Fri, 14 Mar 2025 09:53:17 -0700 Subject: [PATCH 029/169] [Frontend] track server_load (#13950) --- tests/entrypoints/openai/test_basic.py | 48 ++++++++++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 32 +++++++++++++++-- vllm/entrypoints/openai/cli_args.py | 7 ++++ vllm/entrypoints/utils.py | 48 ++++++++++++++++++++++++-- 4 files changed, 131 insertions(+), 4 deletions(-) diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py index e7bf974f13ed8..a4ac800707734 100644 --- a/tests/entrypoints/openai/test_basic.py +++ b/tests/entrypoints/openai/test_basic.py @@ -171,3 +171,51 @@ async def test_request_wrong_content_type(server: RemoteOpenAIServer): extra_headers={ "Content-Type": "application/x-www-form-urlencoded" }) + + +@pytest.mark.parametrize( + "server_args", + [ + pytest.param(["--enable-server-load-tracking"], + id="enable-server-load-tracking") + ], + indirect=True, +) +@pytest.mark.asyncio +async def test_server_load(server: RemoteOpenAIServer): + # Check initial server load + response = requests.get(server.url_for("load")) + assert response.status_code == HTTPStatus.OK + assert response.json().get("server_load") == 0 + + def make_long_completion_request(): + return requests.post( + server.url_for("v1/completions"), + headers={"Content-Type": "application/json"}, + json={ + "prompt": "Give me a long story", + 
"max_tokens": 1000, + "temperature": 0, + }, + ) + + # Start the completion request in a background thread. + completion_future = asyncio.create_task( + asyncio.to_thread(make_long_completion_request)) + + # Give a short delay to ensure the request has started. + await asyncio.sleep(0.1) + + # Check server load while the completion request is running. + response = requests.get(server.url_for("load")) + assert response.status_code == HTTPStatus.OK + assert response.json().get("server_load") == 1 + + # Wait for the completion request to finish. + await completion_future + await asyncio.sleep(0.1) + + # Check server load after the completion request has finished. + response = requests.get(server.url_for("load")) + assert response.status_code == HTTPStatus.OK + assert response.json().get("server_load") == 0 diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 7583078e9462b..52e65fc214bc7 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -80,7 +80,7 @@ from vllm.entrypoints.openai.serving_tokenization import ( from vllm.entrypoints.openai.serving_transcription import ( OpenAIServingTranscription) from vllm.entrypoints.openai.tool_parsers import ToolParserManager -from vllm.entrypoints.utils import with_cancellation +from vllm.entrypoints.utils import load_aware_call, with_cancellation from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path, @@ -347,6 +347,24 @@ async def health(raw_request: Request) -> Response: return Response(status_code=200) +@router.get("/load") +async def get_server_load_metrics(request: Request): + # This endpoint returns the current server load metrics. 
+ # It tracks requests utilizing the GPU from the following routes: + # - /v1/chat/completions + # - /v1/completions + # - /v1/audio/transcriptions + # - /v1/embeddings + # - /pooling + # - /score + # - /v1/score + # - /rerank + # - /v1/rerank + # - /v2/rerank + return JSONResponse( + content={'server_load': request.app.state.server_load_metrics}) + + @router.api_route("/ping", methods=["GET", "POST"]) async def ping(raw_request: Request) -> Response: """Ping check. Endpoint required for SageMaker""" @@ -400,6 +418,7 @@ async def show_version(): @router.post("/v1/chat/completions", dependencies=[Depends(validate_json_request)]) @with_cancellation +@load_aware_call async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request): handler = chat(raw_request) @@ -421,6 +440,7 @@ async def create_chat_completion(request: ChatCompletionRequest, @router.post("/v1/completions", dependencies=[Depends(validate_json_request)]) @with_cancellation +@load_aware_call async def create_completion(request: CompletionRequest, raw_request: Request): handler = completion(raw_request) if handler is None: @@ -439,6 +459,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request): @router.post("/v1/embeddings", dependencies=[Depends(validate_json_request)]) @with_cancellation +@load_aware_call async def create_embedding(request: EmbeddingRequest, raw_request: Request): handler = embedding(raw_request) if handler is None: @@ -485,6 +506,7 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): @router.post("/pooling", dependencies=[Depends(validate_json_request)]) @with_cancellation +@load_aware_call async def create_pooling(request: PoolingRequest, raw_request: Request): handler = pooling(raw_request) if handler is None: @@ -503,6 +525,7 @@ async def create_pooling(request: PoolingRequest, raw_request: Request): @router.post("/score", dependencies=[Depends(validate_json_request)]) @with_cancellation +@load_aware_call 
async def create_score(request: ScoreRequest, raw_request: Request): handler = score(raw_request) if handler is None: @@ -521,6 +544,7 @@ async def create_score(request: ScoreRequest, raw_request: Request): @router.post("/v1/score", dependencies=[Depends(validate_json_request)]) @with_cancellation +@load_aware_call async def create_score_v1(request: ScoreRequest, raw_request: Request): logger.warning( "To indicate that Score API is not part of standard OpenAI API, we " @@ -531,10 +555,10 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): @router.post("/v1/audio/transcriptions") @with_cancellation +@load_aware_call async def create_transcriptions(request: Annotated[TranscriptionRequest, Form()], raw_request: Request): - handler = transcription(raw_request) if handler is None: return base(raw_request).create_error_response( @@ -556,6 +580,7 @@ async def create_transcriptions(request: Annotated[TranscriptionRequest, @router.post("/rerank", dependencies=[Depends(validate_json_request)]) @with_cancellation +@load_aware_call async def do_rerank(request: RerankRequest, raw_request: Request): handler = rerank(raw_request) if handler is None: @@ -894,6 +919,9 @@ async def init_app_state( ) if model_config.runner_type == "transcription" else None state.task = model_config.task + state.enable_server_load_tracking = args.enable_server_load_tracking + state.server_load_metrics = 0 + def create_server_socket(addr: tuple[str, int]) -> socket.socket: family = socket.AF_INET diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index b8cc57430f85c..bd66416d90cc8 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -257,6 +257,13 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: action='store_true', default=False, help="If set to True, enable prompt_tokens_details in usage.") + parser.add_argument( + "--enable-server-load-tracking", + action='store_true', 
+ default=False, + help= + "If set to True, enable tracking server_load_metrics in the app state." + ) return parser diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index 9af37871d57c8..60cbb58af3d9a 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -4,6 +4,8 @@ import asyncio import functools from fastapi import Request +from fastapi.responses import JSONResponse, StreamingResponse +from starlette.background import BackgroundTask, BackgroundTasks async def listen_for_disconnect(request: Request) -> None: @@ -17,9 +19,9 @@ async def listen_for_disconnect(request: Request) -> None: def with_cancellation(handler_func): """Decorator that allows a route handler to be cancelled by client disconnections. - + This does _not_ use request.is_disconnected, which does not work with - middleware. Instead this follows the pattern from + middleware. Instead this follows the pattern from starlette.StreamingResponse, which simultaneously awaits on two tasks- one to wait for an http disconnect message, and the other to do the work that we want done. When the first task finishes, the other is cancelled. 
@@ -57,3 +59,45 @@ def with_cancellation(handler_func): return None return wrapper + + +def decrement_server_load(request: Request): + request.app.state.server_load_metrics -= 1 + + +def load_aware_call(func): + + @functools.wraps(func) + async def wrapper(*args, raw_request: Request, **kwargs): + if not raw_request.app.state.enable_server_load_tracking: + return await func(*args, raw_request=raw_request, **kwargs) + + raw_request.app.state.server_load_metrics += 1 + try: + response = await func(*args, raw_request=raw_request, **kwargs) + except Exception: + raw_request.app.state.server_load_metrics -= 1 + raise + + if isinstance(response, (JSONResponse, StreamingResponse)): + if response.background is None: + response.background = BackgroundTask(decrement_server_load, + raw_request) + elif isinstance(response.background, BackgroundTasks): + response.background.add_task(decrement_server_load, + raw_request) + elif isinstance(response.background, BackgroundTask): + # Convert the single BackgroundTask to BackgroundTasks + # and chain the decrement_server_load task to it + tasks = BackgroundTasks() + tasks.add_task(response.background.func, + *response.background.args, + **response.background.kwargs) + tasks.add_task(decrement_server_load, raw_request) + response.background = tasks + else: + raw_request.app.state.server_load_metrics -= 1 + + return response + + return wrapper From 977a16772c9d9717c4224fe7bd5b7d8699595449 Mon Sep 17 00:00:00 2001 From: Yajie Wang Date: Sat, 15 Mar 2025 00:55:14 +0800 Subject: [PATCH 030/169] [Bugfix][Kernel]: Fix AllSpark kernel compilation errors and enable for CUDA < 12.0 (#14430) Signed-off-by: wyj371990 --- CMakeLists.txt | 4 ++-- .../gptq_allspark/allspark_qgemm_w8a16.cu | 13 ++++++++----- csrc/quantization/gptq_allspark/allspark_utils.cuh | 8 +++++--- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5baa39b6f9e59..b7bfdc6c857b1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt 
@@ -319,7 +319,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Only build AllSpark kernels if we are building for at least some compatible archs. cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}") - if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND ALLSPARK_ARCHS) + if (ALLSPARK_ARCHS) set(ALLSPARK_SRCS "csrc/quantization/gptq_allspark/allspark_repack.cu" "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu") @@ -330,7 +330,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}") else() message(STATUS "Not building AllSpark kernels as no compatible archs found" - " in CUDA target architectures, or CUDA not >= 12.0") + " in CUDA target architectures") endif() diff --git a/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu b/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu index c4ed98ca64f8b..b520f8c32b95b 100644 --- a/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu +++ b/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu @@ -437,9 +437,10 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) { #pragma unroll for (int k_idx = 0; k_idx < 2; ++k_idx) { - FType low16 = static_cast(C_frag[m_idx][n_idx][k_idx * 2]); + FType low16 = + ScalarType::float2num(C_frag[m_idx][n_idx][k_idx * 2]); FType high16 = - static_cast(C_frag[m_idx][n_idx][k_idx * 2 + 1]); + ScalarType::float2num(C_frag[m_idx][n_idx][k_idx * 2 + 1]); uint32_t tmp = (reinterpret_cast(low16) & 0xffff) | (reinterpret_cast(high16) << 16); int sts_offset = @@ -793,7 +794,7 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel( FT scale_reg[4]; *(reinterpret_cast(scale_reg)) = *(reinterpret_cast(scales + params_nidx)); - FT zero_reg[4] = {0}; + FT zero_reg[4]; if (zeros != nullptr) { *(reinterpret_cast(zero_reg)) = *(reinterpret_cast(zeros + params_nidx)); @@ -809,8 +810,10 @@ __global__ void 
restore_N32_K16_dequantize_rhs_w8a16_perc_kernel( reinterpret_cast::T2*>(&(fval_reg[ni * 4]))); #pragma unroll for (int ki = 0; ki < 4; ++ki) { - fval_reg[ni * 4 + ki] = - (fval_reg[ni * 4 + ki] - zero_reg[ni]) * scale_reg[ni]; + if (zeros != nullptr) { + fval_reg[ni * 4 + ki] = __hsub(fval_reg[ni * 4 + ki], zero_reg[ni]); + } + fval_reg[ni * 4 + ki] = __hmul(fval_reg[ni * 4 + ki], scale_reg[ni]); int sts_offset = sts_base_offset + ((ki / 2) * 8 + (ki % 2)) * 32 + ((ni + lane_id % 4) % 4) * 8; smem[sts_offset] = fval_reg[ni * 4 + ki]; diff --git a/csrc/quantization/gptq_allspark/allspark_utils.cuh b/csrc/quantization/gptq_allspark/allspark_utils.cuh index 7aded9a17280d..80456c25590d0 100644 --- a/csrc/quantization/gptq_allspark/allspark_utils.cuh +++ b/csrc/quantization/gptq_allspark/allspark_utils.cuh @@ -7,6 +7,8 @@ #include #include #include +#include "../gptq_marlin/marlin_dtypes.cuh" +using marlin::ScalarType; namespace allspark { @@ -66,14 +68,14 @@ __global__ void f16_gemm_splitk_reduce_kernel(const FType* C_split, FType* C, return; } - FType sum(0); + float sum = 0.f; int n_mat = N_MATRIX > 0 ? N_MATRIX : (int)n_matrix; for (int i = 0; i < n_mat; ++i) { - sum += C_split[idx + i * matrix_size]; + sum += ScalarType::num2float(C_split[idx + i * matrix_size]); } - C[idx] = sum; + C[idx] = ScalarType::float2num(sum); } template From 7097b4cc1c131a418221721653684538ea8506f8 Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Fri, 14 Mar 2025 11:59:52 -0700 Subject: [PATCH 031/169] [release] Remove log cleanup commands from TPU job (#14838) --- .buildkite/release-pipeline.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 096a1c870c6ba..37cdab9e01ecb 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -57,8 +57,6 @@ steps: agents: queue: tpu_queue_postmerge commands: - - "rm -f /var/log/syslog" - - "rm -f /var/log/kern.log" - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ." - "docker push vllm/vllm-tpu:nightly" - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT" From 270a5da495d24e947a71e2fa0c56635f4fad2dc3 Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Fri, 14 Mar 2025 14:18:13 -0500 Subject: [PATCH 032/169] Re-enable the AMD Entrypoints Test (#14711) Signed-off-by: Alexei V. 
Ivanov --- .buildkite/run-amd-test.sh | 24 +++++++++++++++++++----- Dockerfile.rocm | 1 + requirements/rocm-test.txt | 23 +++++++++++++++++++++++ 3 files changed, 43 insertions(+), 5 deletions(-) create mode 100644 requirements/rocm-test.txt diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 955baa1ff8b3c..0680bae13ddbf 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -101,16 +101,30 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_permute_cols.py" fi -#ignore certain Entrypoints tests +#ignore certain Entrypoints/openai tests if [[ $commands == *" entrypoints/openai "* ]]; then commands=${commands//" entrypoints/openai "/" entrypoints/openai \ - --ignore=entrypoints/openai/test_accuracy.py \ --ignore=entrypoints/openai/test_audio.py \ - --ignore=entrypoints/openai/test_encoder_decoder.py \ - --ignore=entrypoints/openai/test_embedding.py \ - --ignore=entrypoints/openai/test_oot_registration.py "} + --ignore=entrypoints/openai/test_chat.py \ + --ignore=entrypoints/openai/test_shutdown.py \ + --ignore=entrypoints/openai/test_completion.py \ + --ignore=entrypoints/openai/test_sleep.py \ + --ignore=entrypoints/openai/test_models.py \ + --ignore=entrypoints/openai/test_prompt_validation.py "} fi +#ignore certain Entrypoints/llm tests +if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then + commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "} +fi + +# --ignore=entrypoints/openai/test_encoder_decoder.py \ +# --ignore=entrypoints/openai/test_embedding.py \ +# --ignore=entrypoints/openai/test_oot_registration.py +# --ignore=entrypoints/openai/test_accuracy.py \ +# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13 + + PARALLEL_JOB_COUNT=8 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
if [[ $commands == *"--shard-id="* ]]; then diff --git a/Dockerfile.rocm b/Dockerfile.rocm index e2d9ab37533e4..f852f3d69759f 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -61,6 +61,7 @@ RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ cd /install \ && pip install -U -r requirements/rocm.txt \ + && pip install -U -r requirements/rocm-test.txt \ && pip uninstall -y vllm \ && pip install *.whl diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt new file mode 100644 index 0000000000000..52fbf787f1dff --- /dev/null +++ b/requirements/rocm-test.txt @@ -0,0 +1,23 @@ + +# entrypoints test +# librosa==0.10.2.post1 # required by audio tests in entrypoints/openai +audioread==3.0.1 +cffi==1.17.1 +decorator==5.2.1 +lazy-loader==0.4 +platformdirs==4.3.6 +pooch==1.8.2 +#pycparse==2.22 +soundfile==0.13.1 +soxr==0.5.0.post1 +librosa==0.10.2.post1 + +# entrypoints test +#vllm[video] # required by entrypoints/openai/test_video.py +decord==0.6.0 + +# entrypoints test +#sentence-transformers # required by entrypoints/openai/test_score.py +sentence-transformers==3.4.1 + + From fe66b34728e5d383e3d19aefc544eeee808c99fb Mon Sep 17 00:00:00 2001 From: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com> Date: Fri, 14 Mar 2025 16:36:18 -0400 Subject: [PATCH 033/169] [Model] Mamba2 Prefill Performance Tweaks: Fixing Flurry of Unnecessary Memory Copies (#14778) Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com> --- .../layers/mamba/mamba_mixer2.py | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index b53a540ed6624..5b19e3f3554ac 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -466,10 +466,17 @@ class MambaMixer2(CustomOp): if has_prefill: 
initial_states = None - if has_initial_states is not None and any(has_initial_states): - for idx in mamba_cache_params.state_indices_tensor[ - ~has_initial_states]: - mamba_cache_params.ssm_state[idx].zero_() + + if has_initial_states is not None and torch.any( + has_initial_states): + + # vectorized ssm_state zero init + batched_zero_init_func = torch.vmap( + lambda idx: mamba_cache_params.ssm_state[idx].zero_()) + batched_zero_init_func( + mamba_cache_params. + state_indices_tensor[~has_initial_states].unsqueeze( + dim=-1), ) initial_states = mamba_cache_params.ssm_state[ mamba_cache_params.state_indices_tensor] @@ -493,10 +500,17 @@ class MambaMixer2(CustomOp): dt_limit=(0.0, float("inf")), ) - # update ssm states - # - varlen state is a (batch, nheads, headdim, dstate) tensor - for i, idx in enumerate(mamba_cache_params.state_indices_tensor): - mamba_cache_params.ssm_state[idx].copy_(varlen_state[i]) + # vectorized ssm state update using vmap + # the 1d state_indices_tensor needs to be unsqueezed to avoid vmap + # limitation which doesn't allow use of `item()` + # Note: the lambda capture can happen where ssm_state is initialized + # instead of here + batched_copy = torch.vmap( + lambda idx, source_state: mamba_cache_params.ssm_state[ + idx].copy_(source_state)) + batched_copy( + mamba_cache_params.state_indices_tensor.unsqueeze(dim=-1), + varlen_state) # - reshape hidden_states = scan_output.view(seq_len, -1) From 46f98893dd0c30365116563ab660c360b29c276b Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 14 Mar 2025 16:55:18 -0400 Subject: [PATCH 034/169] [V1] Fix model parameterization for structured output tests (#14833) Signed-off-by: Russell Bryant --- .../llm/test_struct_output_generate.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index b99fb6a778295..b4eb475c23baa 100644 --- 
a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -14,18 +14,15 @@ from vllm.outputs import RequestOutput from vllm.sampling_params import GuidedDecodingParams, SamplingParams GUIDED_DECODING_BACKENDS_V1 = ["xgrammar"] - - -@pytest.fixture -def model_name(): - return [ - "Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410" - ] +MODELS_TO_TEST = [ + "Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410" +] @pytest.mark.skip_global_cleanup @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS_V1) +@pytest.mark.parametrize("model_name", MODELS_TO_TEST) def test_guided_json_completion( monkeypatch: pytest.MonkeyPatch, sample_json_schema: dict[str, Any], @@ -63,6 +60,7 @@ def test_guided_json_completion( @pytest.mark.skip_global_cleanup @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS_V1) +@pytest.mark.parametrize("model_name", MODELS_TO_TEST) def test_guided_json_object( monkeypatch: pytest.MonkeyPatch, guided_decoding_backend: str, @@ -101,6 +99,7 @@ def test_guided_json_object( @pytest.mark.skip_global_cleanup @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS_V1) +@pytest.mark.parametrize("model_name", MODELS_TO_TEST) def test_guided_json_unsupported_schema( monkeypatch: pytest.MonkeyPatch, unsupported_json_schema: dict[str, Any], @@ -128,6 +127,7 @@ def test_guided_json_unsupported_schema( @pytest.mark.skip_global_cleanup @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS_V1) +@pytest.mark.parametrize("model_name", MODELS_TO_TEST) def test_guided_grammar_ebnf( monkeypatch: pytest.MonkeyPatch, sample_sql_ebnf: str, @@ -170,6 +170,7 @@ def test_guided_grammar_ebnf( @pytest.mark.skip_global_cleanup @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS_V1) +@pytest.mark.parametrize("model_name", MODELS_TO_TEST) def test_guided_grammar_lark( 
monkeypatch: pytest.MonkeyPatch, sample_sql_lark: str, @@ -217,6 +218,7 @@ def test_guided_grammar_lark( @pytest.mark.skip_global_cleanup @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS_V1) +@pytest.mark.parametrize("model_name", MODELS_TO_TEST) def test_guided_grammar_ebnf_invalid( monkeypatch: pytest.MonkeyPatch, guided_decoding_backend: str, @@ -244,6 +246,7 @@ def test_guided_grammar_ebnf_invalid( @pytest.mark.skip_global_cleanup @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS_V1) +@pytest.mark.parametrize("model_name", MODELS_TO_TEST) def test_guided_regex( monkeypatch: pytest.MonkeyPatch, sample_regex: str, @@ -280,6 +283,7 @@ def test_guided_regex( @pytest.mark.skip_global_cleanup @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS_V1) +@pytest.mark.parametrize("model_name", MODELS_TO_TEST) def test_guided_choice_completion( monkeypatch: pytest.MonkeyPatch, sample_guided_choice: str, From 14f301b541ffe95187ff90938c7f94f642bc5bfa Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 14 Mar 2025 16:58:30 -0400 Subject: [PATCH 035/169] Update to torch==2.6.0 (#12721) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: mgoin Signed-off-by: mgoin Signed-off-by: luka Signed-off-by: Tyler Michael Smith Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Luka Govedič Co-authored-by: DarkLight1337 Co-authored-by: Tyler Michael Smith Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- CMakeLists.txt | 4 ++-- Dockerfile | 2 +- pyproject.toml | 2 +- requirements/build.txt | 2 +- requirements/cuda.txt | 10 +++++----- requirements/test.in | 7 ++++--- requirements/test.txt | 18 ++++++++++-------- tests/compile/backend.py | 6 ++++-- vllm/config.py | 15 +++++++++++++++ 9 files changed, 43 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 
b7bfdc6c857b1..65d1ddbeee0b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,8 +46,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101") # requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1") -set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1") +set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0") +set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0") # # Try to find python package with an executable that exactly matches diff --git a/Dockerfile b/Dockerfile index ff4a0839f6e0f..79bca1cf9f8c1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -222,7 +222,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist RUN --mount=type=cache,target=/root/.cache/uv \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ - uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl ; \ + uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \ fi COPY examples examples diff --git a/pyproject.toml b/pyproject.toml index 836389bc9a646..ee4e2ed0b7ce2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "packaging", "setuptools>=61", "setuptools-scm>=8.0", - "torch == 2.5.1", + "torch == 2.6.0", "wheel", "jinja2", ] diff --git a/requirements/build.txt b/requirements/build.txt index fec01caaf25ef..364a16d80b71b 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -4,6 +4,6 @@ ninja packaging setuptools>=61 setuptools-scm>=8 -torch==2.5.1 +torch==2.6.0 wheel jinja2 diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 46bb17361b2f2..702d4b0bb320c 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -4,9 +4,9 @@ numba == 0.60.0 # v0.61 doesn't support Python 3.9. 
Required for N-gram speculative decoding # Dependencies for NVIDIA GPUs -ray[cgraph] >= 2.43.0 # Ray Compiled Graph, required for pipeline parallelism in V1. -torch == 2.5.1 -torchaudio==2.5.1 +ray[cgraph]>=2.43.0 # Ray Compiled Graph, required for pipeline parallelism in V1. +torch==2.6.0 +torchaudio==2.6.0 # These must be updated alongside torch -torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version -xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1 +torchvision==0.21.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +xformers==0.0.29.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.6.0 diff --git a/requirements/test.in b/requirements/test.in index de33f92b37b9c..cc89d518c7eec 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -21,8 +21,9 @@ sentence-transformers # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests timm # required for internvl test -torch==2.5.1 -torchaudio==2.5.1 +torch==2.6.0 +torchaudio==2.6.0 +torchvision==0.21.0 transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test mistral_common[opencv] >= 1.5.0 # required for pixtral test @@ -30,7 +31,7 @@ datamodel_code_generator # required for minicpm3 test lm-eval[api]==0.4.4 # required for model evaluation test transformers==4.48.2 # quantization -bitsandbytes>=0.45.0 +bitsandbytes>=0.45.3 buildkite-test-collector==0.1.9 genai_perf==0.0.8 diff --git a/requirements/test.txt b/requirements/test.txt index f112320725c60..a235c8b24eecf 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -33,7 +33,7 @@ audioread==3.0.1 # via librosa awscli==1.35.23 # via -r requirements/test.in -bitsandbytes==0.45.0 +bitsandbytes==0.45.3 
# via -r requirements/test.in black==24.10.0 # via datamodel-code-generator @@ -127,7 +127,6 @@ filelock==3.16.1 # ray # torch # transformers - # triton fonttools==4.54.1 # via matplotlib frozendict==2.4.6 @@ -320,6 +319,8 @@ nvidia-cusparse-cu12==12.3.1.170 # via # nvidia-cusolver-cu12 # torch +nvidia-cusparselt-cu12==0.6.2 + # via torch nvidia-nccl-cu12==2.21.5 # via torch nvidia-nvjitlink-cu12==12.4.127 @@ -591,7 +592,7 @@ timm==1.0.11 # via -r requirements/test.in tokenizers==0.21.0 # via transformers -torch==2.5.1 +torch==2.6.0 # via # -r requirements/test.in # accelerate @@ -607,13 +608,15 @@ torch==2.5.1 # torchvision # vector-quantize-pytorch # vocos -torchaudio==2.5.1 +torchaudio==2.6.0 # via # -r requirements/test.in # encodec # vocos -torchvision==0.20.1 - # via timm +torchvision==0.21.0 + # via + # -r requirements/test.in + # timm tqdm==4.66.6 # via # datasets @@ -638,7 +641,7 @@ transformers==4.48.2 # transformers-stream-generator transformers-stream-generator==0.0.5 # via -r requirements/test.in -triton==3.1.0 +triton==3.2.0 # via torch tritonclient==2.51.0 # via @@ -651,7 +654,6 @@ typepy==1.3.2 # tabledata typing-extensions==4.12.2 # via - # bitsandbytes # huggingface-hub # librosa # mistral-common diff --git a/tests/compile/backend.py b/tests/compile/backend.py index 64416eb136cf4..a21e8eca3a6e1 100644 --- a/tests/compile/backend.py +++ b/tests/compile/backend.py @@ -6,6 +6,7 @@ from typing import Callable, Union from torch import fx from vllm.compilation.inductor_pass import InductorPass +from vllm.config import get_current_vllm_config class TestBackend: @@ -17,13 +18,14 @@ class TestBackend: Inductor config can be modified directly by editing the inductor_config property. This can be helpful for adding passes like the 'pre_grad_custom_pass' and the 'post_grad_custom_pre_pass'. + Inductor config is default-initialized from VllmConfig.CompilationConfig. 
""" def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph], None]]): self.custom_passes = list(passes) - from torch._inductor import config - self.inductor_config = config.shallow_copy_dict() + compile_config = get_current_vllm_config().compilation_config + self.inductor_config = compile_config.inductor_compile_config self.inductor_config['force_disable_caches'] = True self.inductor_config['post_grad_custom_post_pass'] = self.post_pass diff --git a/vllm/config.py b/vllm/config.py index 429ec0dd51c13..40ea50cb083fb 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -52,6 +52,8 @@ if TYPE_CHECKING: else: QuantizationConfig = None +from packaging.version import Version + logger = init_logger(__name__) # This value is chosen to have a balance between ITL and TTFT. Note it is @@ -3126,6 +3128,19 @@ class CompilationConfig(BaseModel): count_all = self.custom_ops.count("all") assert count_none + count_all <= 1, "Can only specify 'none' or 'all'" + # TODO(zou3519/luka): There are 2 issues with auto-functionalization V2: + # 1. A bug in PyTorch, fixed in 2.7: + # https://github.com/pytorch/pytorch/issues/147924 + # 2. Custom passes (fusion) rely on auto-functionalization V1 and don't + # work with V2. Addressing this will take extra engineering effort + # and it is not yet a priority. 
RFC here: + # https://github.com/vllm-project/vllm/issues/14703 + + if Version(torch.__version__) >= Version("2.6"): + KEY = 'enable_auto_functionalized_v2' + if KEY not in self.inductor_compile_config: + self.inductor_compile_config[KEY] = False + if self.splitting_ops is None: if envs.VLLM_USE_V1: # v1 must split the graph on attention ops From 40677783aa1f59019424ff1828c54b696e4cfc3a Mon Sep 17 00:00:00 2001 From: Richard Liu <39319471+richardsliu@users.noreply.github.com> Date: Fri, 14 Mar 2025 14:13:30 -0700 Subject: [PATCH 036/169] [CI] Add TPU v1 test (#14834) Signed-off-by: Richard Liu --- .buildkite/run-tpu-v1-test.sh | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100755 .buildkite/run-tpu-v1-test.sh diff --git a/.buildkite/run-tpu-v1-test.sh b/.buildkite/run-tpu-v1-test.sh new file mode 100755 index 0000000000000..a6a14d0829d65 --- /dev/null +++ b/.buildkite/run-tpu-v1-test.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +set -e + +# Build the docker image. +docker build -f Dockerfile.tpu -t vllm-tpu . + +# Set up cleanup. +remove_docker_container() { docker rm -f tpu-test || true; } +trap remove_docker_container EXIT +# Remove the container that might not be cleaned up in the previous run. +remove_docker_container + +# For HF_TOKEN. +source /etc/environment +# Run a simple end-to-end example. 
+docker run --privileged --net host --shm-size=16G -it \ + -e "HF_TOKEN=$HF_TOKEN" -e "VLLM_USE_V1=1" --name tpu-test \ + vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ + && python3 -m pip install pytest \ + && python3 -m pip install lm_eval[api]==0.4.4 \ + && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ + && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \ + && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \ + && python3 /workspace/vllm/tests/tpu/test_compilation.py \ + && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \ + && python3 /workspace/vllm/examples/offline_inference/tpu.py" From 233ffce1ebd3a0389c8adfd413de715d4f351d6f Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 14 Mar 2025 17:25:28 -0400 Subject: [PATCH 037/169] [Build/CI] Move ninja to common deps (#14835) Signed-off-by: Russell Bryant --- requirements/common.txt | 1 + requirements/rocm-build.txt | 1 - requirements/tpu.txt | 1 - requirements/xpu.txt | 3 +-- 4 files changed, 2 insertions(+), 4 deletions(-) diff --git a/requirements/common.txt b/requirements/common.txt index 13a06011e4091..3cd933f347f59 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -38,3 +38,4 @@ cloudpickle # allows pickling lambda functions in model_executor/models/registry watchfiles # required for http server to monitor the updates of TLS files python-json-logger # Used by logging as per examples/other/logging_configuration.md scipy # Required for phi-4-multimodal-instruct +ninja # Required for xgrammar, rocm, tpu, xpu diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt index 4d4945b007ebc..f378663ade752 100644 --- a/requirements/rocm-build.txt +++ b/requirements/rocm-build.txt @@ -7,7 +7,6 @@ torchvision==0.20.1 torchaudio==2.5.1 cmake>=3.26 -ninja packaging setuptools>=61 setuptools-scm>=8 diff --git a/requirements/tpu.txt 
b/requirements/tpu.txt index e071c604b5c0b..06bcecfc00458 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -3,7 +3,6 @@ # Dependencies for TPU cmake>=3.26 -ninja packaging setuptools-scm>=8 wheel diff --git a/requirements/xpu.txt b/requirements/xpu.txt index 0e3252f02d35b..3fd0655904e4d 100644 --- a/requirements/xpu.txt +++ b/requirements/xpu.txt @@ -3,7 +3,6 @@ ray>=2.9 cmake>=3.26 -ninja packaging setuptools-scm>=8 setuptools>=75.8.0 @@ -21,4 +20,4 @@ pytorch-triton-xpu # FIXME: This will be fix in ipex 2.7. just leave this here for awareness. # intel-extension-for-pytorch==2.6.10+xpu oneccl_bind_pt==2.6.0+xpu ---extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \ No newline at end of file +--extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ From bbd94a19fcfab78599844ffcd5c227fa77a46b2e Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 14 Mar 2025 19:11:28 -0400 Subject: [PATCH 038/169] [Build/CI] Upgrade aiohttp to incldue CVE fix (#14840) Signed-off-by: Russell Bryant --- requirements/test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/test.txt b/requirements/test.txt index a235c8b24eecf..0a2b491669ac8 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -8,7 +8,7 @@ accelerate==1.0.1 # peft aiohappyeyeballs==2.4.3 # via aiohttp -aiohttp==3.10.10 +aiohttp==3.10.11 # via # datasets # fsspec From 54a8804455a14234ba246f7cbaf29fb5e8587d64 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Fri, 14 Mar 2025 19:12:36 -0400 Subject: [PATCH 039/169] [Doc] More neutral K8s deployment guide (#14084) Signed-off-by: Yuan Tang --- docs/source/deployment/k8s.md | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/docs/source/deployment/k8s.md b/docs/source/deployment/k8s.md index dd3769c47fc50..b31344b199663 100644 --- a/docs/source/deployment/k8s.md +++ b/docs/source/deployment/k8s.md @@ -4,17 +4,19 @@ Deploying vLLM on 
Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes. --------- - -Alternatively, you can also deploy Kubernetes using [helm chart](https://docs.vllm.ai/en/latest/deployment/frameworks/helm.html). There are also open-source projects available to make your deployment even smoother. - -* [vLLM production-stack](https://github.com/vllm-project/production-stack): Born out of a Berkeley-UChicago collaboration, vLLM production stack is a project that contains latest research and community effort, while still delivering production-level stability and performance. Checkout the [documentation page](https://docs.vllm.ai/en/latest/deployment/integrations/production-stack.html) for more details and examples. - --------- +Alternatively, you can deploy vLLM to Kubernetes using any of the following: +* [Helm](frameworks/helm.md) +* [InftyAI/llmaz](integrations/llmaz.md) +* [KServe](integrations/kserve.md) +* [kubernetes-sigs/lws](frameworks/lws.md) +* [meta-llama/llama-stack](integrations/llamastack.md) +* [substratusai/kubeai](integrations/kubeai.md) +* [vllm-project/aibrix](https://github.com/vllm-project/aibrix) +* [vllm-project/production-stack](integrations/production-stack.md) ## Pre-requisite -Ensure that you have a running Kubernetes environment with GPU (you can follow [this tutorial](https://github.com/vllm-project/production-stack/blob/main/tutorials/00-install-kubernetes-env.md) to install a Kubernetes environment on a bare-metal GPU machine). +Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/). 
## Deployment using native K8s From dd344e03425087ebcfc3f98f91821c7e5d316832 Mon Sep 17 00:00:00 2001 From: yarongmu-google <150371854+yarongmu-google@users.noreply.github.com> Date: Fri, 14 Mar 2025 17:41:15 -0700 Subject: [PATCH 040/169] =?UTF-8?q?[Bugfix]=20Fix=20torch=5Fxla=20in=20V0?= =?UTF-8?q?=20which=20can't=20handle=20None=20seed=20introduced=20?= =?UTF-8?q?=E2=80=A6=20(#14844)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Yarong Mu --- vllm/worker/tpu_worker.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 1a5eaba09b940..66911790662eb 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -51,6 +51,9 @@ class TPUWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase): self.model_runner: TPUModelRunner = TPUModelRunner( vllm_config=vllm_config, is_driver_worker=is_driver_worker) + if self.model_config.seed is None: + self.model_config.seed = 0 + def init_device(self) -> None: os.environ["PJRT_DEVICE"] = "TPU" torch.set_grad_enabled(False) From 9f37422779a17d8b2ebde300808166068f4ad4cc Mon Sep 17 00:00:00 2001 From: Liangfu Chen Date: Fri, 14 Mar 2025 18:51:35 -0700 Subject: [PATCH 041/169] [Neuron][CI] update docker run command (#14829) Signed-off-by: Liangfu Chen --- .buildkite/run-neuron-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index 06924fea6195e..ad5ae6f415748 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -44,7 +44,7 @@ remove_docker_container() { trap remove_docker_container EXIT # Run the image -docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \ +docker run --rm -it --device=/dev/neuron0 --network bridge \ -v "${HF_CACHE}:${HF_MOUNT}" \ -e "HF_HOME=${HF_MOUNT}" \ -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \ From 
acaea3bb07883c80b71643ebee1cd08d555797bc Mon Sep 17 00:00:00 2001 From: DefTruth <31974251+DefTruth@users.noreply.github.com> Date: Sat, 15 Mar 2025 11:42:38 +0800 Subject: [PATCH 042/169] [Bugfix][V1] Fix flashinfer sampling (#14815) --- vllm/v1/sample/ops/topk_topp_sampler.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index 7d70e839b6f4e..d461a80989337 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -24,7 +24,24 @@ class TopKTopPSampler(nn.Module): super().__init__() if current_platform.is_cuda(): if is_flashinfer_available: - if envs.VLLM_USE_FLASHINFER_SAMPLER is not False: + flashinfer_version = flashinfer.__version__ + if flashinfer_version >= "0.2.3": + # FIXME(DefTruth): Currently, we have errors when using + # FlashInfer>=v0.2.3 for top-p & top-k sampling. As a + # workaround, we disable FlashInfer for top-p & top-k + # sampling by default while FlashInfer>=v0.2.3. + # The sampling API removes the success return value + # of all sampling API, which is not compatible with + # earlier design. + # https://github.com/flashinfer-ai/flashinfer/releases/ + # tag/v0.2.3 + logger.info( + "Currently, FlashInfer top-p & top-k sampling sampler " + "is disabled because FlashInfer>=v0.2.3 is not " + "backward compatible. Falling back to the PyTorch-" + "native implementation of top-p & top-k sampling.") + self.forward = self.forward_native + elif envs.VLLM_USE_FLASHINFER_SAMPLER is not False: # NOTE(woosuk): The V0 sampler doesn't use FlashInfer for # sampling unless VLLM_USE_FLASHINFER_SAMPLER=1 (i.e., by # default it is unused). 
For backward compatibility, we set From ccf02fcbaebb1a5b59dfc6c7cb64aa7cc489f04c Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Fri, 14 Mar 2025 23:45:42 -0400 Subject: [PATCH 043/169] =?UTF-8?q?Revert=20"[Model]=20Mamba2=20Prefill=20?= =?UTF-8?q?Performance=20Tweaks:=20Fixing=20Flurry=20of=20U=E2=80=A6=20(#1?= =?UTF-8?q?4848)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../layers/mamba/mamba_mixer2.py | 30 +++++-------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 5b19e3f3554ac..b53a540ed6624 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -466,17 +466,10 @@ class MambaMixer2(CustomOp): if has_prefill: initial_states = None - - if has_initial_states is not None and torch.any( - has_initial_states): - - # vectorized ssm_state zero init - batched_zero_init_func = torch.vmap( - lambda idx: mamba_cache_params.ssm_state[idx].zero_()) - batched_zero_init_func( - mamba_cache_params. 
- state_indices_tensor[~has_initial_states].unsqueeze( - dim=-1), ) + if has_initial_states is not None and any(has_initial_states): + for idx in mamba_cache_params.state_indices_tensor[ + ~has_initial_states]: + mamba_cache_params.ssm_state[idx].zero_() initial_states = mamba_cache_params.ssm_state[ mamba_cache_params.state_indices_tensor] @@ -500,17 +493,10 @@ class MambaMixer2(CustomOp): dt_limit=(0.0, float("inf")), ) - # vectorized ssm state update using vmap - # the 1d state_indices_tensor needs to be unsqueezed to avoid vmap - # limitation which doesn't allow use of `item()` - # Note: the lambda capture can happen where ssm_state is initialized - # instead of here - batched_copy = torch.vmap( - lambda idx, source_state: mamba_cache_params.ssm_state[ - idx].copy_(source_state)) - batched_copy( - mamba_cache_params.state_indices_tensor.unsqueeze(dim=-1), - varlen_state) + # update ssm states + # - varlen state is a (batch, nheads, headdim, dstate) tensor + for i, idx in enumerate(mamba_cache_params.state_indices_tensor): + mamba_cache_params.ssm_state[idx].copy_(varlen_state[i]) # - reshape hidden_states = scan_output.view(seq_len, -1) From 776dcec8fe51860d7580001de86216406629df0f Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 14 Mar 2025 23:57:55 -0400 Subject: [PATCH 044/169] Disable outlines cache by default (#14837) --- vllm/envs.py | 7 +++++++ .../guided_decoding/outlines_logits_processors.py | 10 +++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index a36d20a4f8b50..0b1bcd9eb358b 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -95,6 +95,7 @@ if TYPE_CHECKING: VLLM_DP_MASTER_IP: str = "" VLLM_DP_MASTER_PORT: int = 0 VLLM_MARLIN_USE_ATOMIC_ADD: bool = False + VLLM_V0_USE_OUTLINES_CACHE: bool = False def get_default_cache_root(): @@ -623,6 +624,12 @@ environment_variables: dict[str, Callable[[], Any]] = { # Whether to use atomicAdd reduce in gptq/awq marlin kernel. 
"VLLM_MARLIN_USE_ATOMIC_ADD": lambda: os.environ.get("VLLM_MARLIN_USE_ATOMIC_ADD", "0") == "1", + + # Whether to turn on the outlines cache for V0 + # This cache is unbounded and on disk, so it's not safe to use in + # an environment with potentially malicious users. + "VLLM_V0_USE_OUTLINES_CACHE": + lambda: os.environ.get("VLLM_V0_USE_OUTLINES_CACHE", "0") == "1", } # end-env-vars-definition diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index de24eaa1fb6a3..8b2a0f4cfe64b 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -24,7 +24,7 @@ from typing import Callable, DefaultDict, Dict, List, Optional, Union import numpy as np import torch from outlines import grammars -from outlines.caching import cache +from outlines.caching import cache, disable_cache from outlines.fsm.guide import (CFGGuide, CFGState, Generate, Guide, RegexGuide, Write) from outlines.fsm.parsing import PartialLark @@ -32,12 +32,20 @@ from outlines_core.fsm.json_schema import build_regex_from_schema from pydantic import BaseModel from transformers import PreTrainedTokenizerBase +import vllm.envs as envs from vllm.logger import init_logger from vllm.model_executor.guided_decoding.reasoner import Reasoner from vllm.platforms import current_platform logger = init_logger(__name__) +if envs.VLLM_V0_USE_OUTLINES_CACHE: + logger.warning("Enabling outlines cache. This is an unbounded on-disk " + "cache. 
It may consume a lot of disk space and should " + "not be used with untrusted clients.") +else: + disable_cache() + class BaseLogitsProcessor: From 97ac781c6213f1774f6559585ed166852e2ecc78 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 15 Mar 2025 12:35:12 +0800 Subject: [PATCH 045/169] [Misc] Remove misleading message in gemma2 and gemma3 (#14850) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/gemma.py | 6 +----- vllm/model_executor/models/gemma2.py | 5 ----- vllm/model_executor/models/gemma3.py | 5 ----- 3 files changed, 1 insertion(+), 15 deletions(-) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index da17646c540fd..d741880c00d2d 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -424,9 +424,5 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) - unloaded_params = params_dict.keys() - loaded_params - if unloaded_params: - logger.warning( - "Some weights are not initialized from checkpoints: %s", - unloaded_params) + return loaded_params diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index cf744fc2b9d12..d125c666f3cd1 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -358,11 +358,6 @@ class Gemma2Model(nn.Module): weight_loader(param, loaded_weight) loaded_params.add(name) - unloaded_params = params_dict.keys() - loaded_params - if unloaded_params: - logger.warning( - "Some weights are not initialized from checkpoints: %s", - unloaded_params) return loaded_params diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index f1ecf7fa821d9..55c96f649fbeb 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -452,11 +452,6 @@ class Gemma3Model(nn.Module): weight_loader(param, loaded_weight) 
loaded_params.add(name) - unloaded_params = params_dict.keys() - loaded_params - if unloaded_params: - logger.warning( - "Some weights are not initialized from checkpoints: %s", - unloaded_params) return loaded_params From 8c0d15d5c5658b74a70694124af2ac250fdc4e23 Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Fri, 14 Mar 2025 21:40:09 -0700 Subject: [PATCH 046/169] [Misc][Easy] Annotate unused vars in the csrc files (#14798) Signed-off-by: Lu Fang --- csrc/prepare_inputs/advance_step.cu | 2 +- csrc/quantization/fp8/amd/quant_utils.cuh | 2 +- csrc/quantization/gptq/q_gemm.cu | 16 ++++++++-------- csrc/rocm/attention.cu | 7 ++++--- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu index c3902f4c2a163..fea4bc2ca0d8f 100644 --- a/csrc/prepare_inputs/advance_step.cu +++ b/csrc/prepare_inputs/advance_step.cu @@ -274,7 +274,7 @@ void advance_step_flashinfer( cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev); - int block_tables_stride = block_tables.stride(0); + [[maybe_unused]] int block_tables_stride = block_tables.stride(0); TORCH_CHECK((blocks * threads > num_queries), "multi-step: not enough threads to map to num_queries = ", num_queries, " block_tables.stride(0) = ", block_tables.stride(0), diff --git a/csrc/quantization/fp8/amd/quant_utils.cuh b/csrc/quantization/fp8/amd/quant_utils.cuh index feda497d0210e..c4ed1b4757928 100644 --- a/csrc/quantization/fp8/amd/quant_utils.cuh +++ b/csrc/quantization/fp8/amd/quant_utils.cuh @@ -446,7 +446,7 @@ scaled_vec_conversion(const uint8_t& a, float scale) { template <> __inline__ __device__ uint32_t scaled_vec_conversion(const uint16_t& a, float scale) { - __half2_raw h2r = + [[maybe_unused]] __half2_raw h2r = __hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret); union { __half2_raw h2r; diff --git 
a/csrc/quantization/gptq/q_gemm.cu b/csrc/quantization/gptq/q_gemm.cu index 785f1a09c1900..538cb5848e21f 100644 --- a/csrc/quantization/gptq/q_gemm.cu +++ b/csrc/quantization/gptq/q_gemm.cu @@ -206,8 +206,8 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel( int offset_m = blockIdx.y * m_count; int offset_k = blockIdx.z * BLOCK_KN_SIZE; - int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - int end_m = min(offset_m + m_count, size_m); + [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); + [[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int n = offset_n + t * 4; @@ -344,8 +344,8 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel( int offset_m = blockIdx.y * m_count; int offset_k = blockIdx.z * BLOCK_KN_SIZE; - int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - int end_m = min(offset_m + m_count, size_m); + [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); + [[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int n = offset_n + t * 4; @@ -465,8 +465,8 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel( int offset_m = blockIdx.y * m_count; int offset_k = blockIdx.z * BLOCK_KN_SIZE; - int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - int end_m = min(offset_m + m_count, size_m); + [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); + [[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int n = offset_n + t * 4; @@ -593,8 +593,8 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel( int offset_m = blockIdx.y * m_count; int offset_k = blockIdx.z * BLOCK_KN_SIZE; - int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - int end_m = min(offset_m + m_count, size_m); + [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); + [[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int 
end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int n = offset_n + t * 4; diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index 86029da141b36..90f0b54d2f006 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -308,8 +308,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( constexpr int GQA_RATIO4 = DIVIDE_ROUND_UP(GQA_RATIO, 4); - __shared__ float shared_qk_max[NWARPS][16 + 1]; - __shared__ float shared_exp_sum[NWARPS][16 + 1]; + [[maybe_unused]] __shared__ float shared_qk_max[NWARPS][16 + 1]; + [[maybe_unused]] __shared__ float shared_exp_sum[NWARPS][16 + 1]; // shared_logits is used for multiple purposes __shared__ _B16x4 shared_logits[NWARPS][4][16][4]; @@ -426,7 +426,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel( const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride; const int klocal_token_idx = TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; - const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx; + [[maybe_unused]] const int kglobal_token_idx = + partition_start_token_idx + klocal_token_idx; const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE; const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX; From d4d93db2c54ad989ea800004f491c8d116017a4c Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Sat, 15 Mar 2025 01:02:20 -0400 Subject: [PATCH 047/169] [V1] V1 Enablement Oracle (#13726) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: rshaw@neuralmagic.com Co-authored-by: rshaw@neuralmagic.com Co-authored-by: NicolΓ² Lucchesi Co-authored-by: Tyler Michael Smith Co-authored-by: Michael Goin --- .../configs/Minitron-4B-Base-FP8.yaml | 4 +- .../test_lm_eval_correctness.py | 5 + .buildkite/test-pipeline.yaml | 34 +- tests/async_engine/conftest.py | 11 + tests/async_engine/test_api_server.py | 6 +- 
tests/async_engine/test_async_llm_engine.py | 9 + .../basic_correctness/test_chunked_prefill.py | 9 + tests/basic_correctness/test_cpu_offload.py | 7 + tests/basic_correctness/test_preemption.py | 9 + tests/compile/conftest.py | 14 + tests/conftest.py | 20 + tests/core/conftest.py | 11 + .../__init__.py | 0 tests/detokenizer/conftest.py | 10 + .../test_disable_detokenization.py} | 1 + .../test_stop_checker.py | 0 .../test_stop_reason.py | 0 tests/detokenizer/test_stop_strings.py | 141 ++++++ tests/distributed/test_pipeline_parallel.py | 12 + tests/encoder_decoder/test_e2e_correctness.py | 9 + tests/engine/conftest.py | 11 + ...py => test_multi_step_output_processor.py} | 2 +- tests/engine/test_stop_strings.py | 165 ------- tests/entrypoints/llm/test_lazy_outlines.py | 9 + tests/entrypoints/openai/test_chat_echo.py | 3 - tests/entrypoints/openai/test_root_path.py | 3 - tests/kernels/test_attention_selector.py | 28 +- tests/kernels/test_encoder_decoder_attn.py | 10 + tests/kernels/test_rocm_attention_selector.py | 3 +- tests/lora/test_llama_tp.py | 4 + tests/lora/test_lora_functions.py | 4 +- tests/lora/test_lora_manager.py | 3 + tests/metrics/test_metrics.py | 9 + .../models/decoder_only/language/test_gguf.py | 20 +- .../decoder_only/language/test_hybrid.py | 14 +- .../decoder_only/language/test_mamba.py | 7 - .../decoder_only/language/test_mistral.py | 21 +- .../decoder_only/language/test_models.py | 16 +- .../decoder_only/vision_language/test_awq.py | 7 +- .../vision_language/test_models.py | 89 ++-- .../vision_language/test_qwen2_vl.py | 12 +- .../embedding/language/test_cls_models.py | 7 - .../embedding/language/test_embedding.py | 7 - tests/models/registry.py | 8 +- tests/models/test_initialization.py | 14 +- tests/models/test_oot_registration.py | 8 +- tests/mq_llm_engine/conftest.py | 11 + tests/plugins_tests/conftest.py | 11 + .../test_disable_sliding_window.py | 5 +- tests/prefix_caching/test_prefix_caching.py | 9 + 
tests/quantization/test_compressed_tensors.py | 8 + tests/quantization/test_cpu_offload.py | 7 + tests/quantization/test_fp8.py | 7 +- tests/quantization/test_gptq_dynamic.py | 6 +- tests/quantization/test_lm_head.py | 3 + tests/quantization/test_quark.py | 4 +- .../test_register_quantization_config.py | 4 +- tests/samplers/test_beam_search.py | 8 + tests/samplers/test_ignore_eos.py | 7 + tests/samplers/test_logits_processor.py | 8 + tests/samplers/test_logprobs.py | 9 + tests/samplers/test_no_bad_words.py | 7 + tests/samplers/test_ranks.py | 6 + tests/samplers/test_rejection_sampler.py | 9 + tests/samplers/test_sampler.py | 8 + tests/samplers/test_seeded_generate.py | 4 +- .../test_typical_acceptance_sampler.py | 8 + tests/spec_decode/conftest.py | 11 + tests/tensorizer_loader/conftest.py | 8 + tests/test_regression.py | 2 + tests/test_utils.py | 5 +- tests/tokenization/test_detokenize.py | 5 + tests/tool_use/utils.py | 30 +- tests/tracing/test_tracing.py | 10 + tests/v1/engine/test_engine_args.py | 26 +- tests/v1/sample/test_logprobs.py | 4 +- tests/v1/test_oracle.py | 169 +++++++ tests/weight_loading/test_weight_loading.py | 9 +- tests/worker/conftest.py | 10 + vllm/config.py | 27 +- vllm/engine/arg_utils.py | 426 ++++++++++++++---- vllm/engine/async_llm_engine.py | 53 ++- vllm/engine/llm_engine.py | 47 +- vllm/engine/multiprocessing/client.py | 5 +- vllm/engine/multiprocessing/engine.py | 55 ++- vllm/entrypoints/llm.py | 19 +- vllm/entrypoints/openai/api_server.py | 52 ++- vllm/envs.py | 20 +- vllm/model_executor/model_loader/utils.py | 3 +- vllm/model_executor/models/bloom.py | 4 +- vllm/model_executor/models/glm.py | 3 +- vllm/model_executor/models/ultravox.py | 5 +- vllm/v1/attention/backends/flash_attn.py | 3 +- vllm/v1/engine/async_llm.py | 47 +- vllm/v1/engine/llm_engine.py | 27 ++ vllm/v1/engine/processor.py | 19 +- 96 files changed, 1537 insertions(+), 512 deletions(-) create mode 100644 tests/async_engine/conftest.py create mode 100644 
tests/compile/conftest.py create mode 100644 tests/core/conftest.py rename tests/{engine/output_processor => detokenizer}/__init__.py (100%) create mode 100644 tests/detokenizer/conftest.py rename tests/{engine/test_detokenization.py => detokenizer/test_disable_detokenization.py} (98%) rename tests/{engine/output_processor => detokenizer}/test_stop_checker.py (100%) rename tests/{engine => detokenizer}/test_stop_reason.py (100%) create mode 100644 tests/detokenizer/test_stop_strings.py create mode 100644 tests/engine/conftest.py rename tests/engine/{output_processor/test_multi_step.py => test_multi_step_output_processor.py} (99%) delete mode 100644 tests/engine/test_stop_strings.py create mode 100644 tests/mq_llm_engine/conftest.py create mode 100644 tests/plugins_tests/conftest.py create mode 100644 tests/spec_decode/conftest.py create mode 100644 tests/v1/test_oracle.py create mode 100644 tests/worker/conftest.py diff --git a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml index 3ea0b7bb5cd66..4ef8b5c3709b3 100644 --- a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml +++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml @@ -4,8 +4,8 @@ tasks: - name: "gsm8k" metrics: - name: "exact_match,strict-match" - value: 0.233 + value: 0.231 - name: "exact_match,flexible-extract" - value: 0.236 + value: 0.22 limit: 1000 num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index 96e57dfd06475..4ae23eff62f37 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -13,6 +13,7 @@ from pathlib import Path import lm_eval import numpy +import pytest import yaml RTOL = 0.05 @@ -46,6 +47,10 @@ def test_lm_eval_correctness(): eval_config = yaml.safe_load( Path(TEST_DATA_FILE).read_text(encoding="utf-8")) + if eval_config[ + 
"model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform": #noqa: E501 + pytest.skip("FBGEMM is currently failing on main.") + # Launch eval requests. results = launch_lm_eval(eval_config) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 81a971390472d..93ac8a29c676c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -117,10 +117,10 @@ steps: - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process + - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/ - pytest -v -s entrypoints/test_chat_utils.py - - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - label: Distributed Tests (4 GPUs) # 10min working_dir: "/vllm-workspace/tests" @@ -136,7 +136,7 @@ steps: - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py commands: - - VLLM_USE_V1=1 python3 ../examples/offline_inference/data_parallel.py + - python3 ../examples/offline_inference/data_parallel.py - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py @@ -197,16 +197,17 @@ steps: - tests/v1 commands: # split the test to avoid interference - - VLLM_USE_V1=1 pytest -v -s v1/core - - VLLM_USE_V1=1 pytest -v -s v1/engine - - VLLM_USE_V1=1 pytest -v -s v1/sample - - VLLM_USE_V1=1 pytest -v -s v1/worker - - VLLM_USE_V1=1 pytest -v -s v1/structured_output 
- - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py - - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py + - pytest -v -s v1/core + - pytest -v -s v1/engine + - pytest -v -s v1/sample + - pytest -v -s v1/worker + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_stats.py + - pytest -v -s v1/test_utils.py + - pytest -v -s v1/test_oracle.py # TODO: accuracy does not match, whether setting # VLLM_USE_FLASHINFER_SAMPLER or not on H100. - - VLLM_USE_V1=1 pytest -v -s v1/e2e + - pytest -v -s v1/e2e # Integration test for streaming correctness (requires special branch). - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine @@ -226,12 +227,12 @@ steps: - python3 offline_inference/llm_engine_example.py - python3 offline_inference/vision_language.py - python3 offline_inference/vision_language_multi_image.py - - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/encoder_decoder.py - python3 offline_inference/basic/classify.py - python3 offline_inference/basic/embed.py - python3 offline_inference/basic/score.py - - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 + - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Prefix Caching Test # 9min mirror_hardwares: [amd] @@ -375,7 +376,8 @@ steps: commands: - pytest -v -s 
models/test_transformers.py - pytest -v -s models/test_registry.py - - pytest -v -s models/test_initialization.py + # V1 Test: https://github.com/vllm-project/vllm/issues/14531 + - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py - label: Language Models Test (Standard) # 32min #mirror_hardwares: [amd] @@ -518,8 +520,8 @@ steps: # this test fails consistently. # TODO: investigate and fix # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py + - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py + - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py - label: Plugin Tests (2 GPUs) # 40min working_dir: "/vllm-workspace/tests" diff --git a/tests/async_engine/conftest.py b/tests/async_engine/conftest.py new file mode 100644 index 0000000000000..1a20e2c135c2e --- /dev/null +++ b/tests/async_engine/conftest.py @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest + + +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + Since this module is V0 only, set VLLM_USE_V1=0 for + all tests in the module. + """ + monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 77f3fb0025a0f..410cece795e94 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 +import os import subprocess import sys import time @@ -44,7 +45,10 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str): distributed_executor_backend, ] - uvicorn_process = subprocess.Popen(commands) + # API Server Test Requires V0. 
+ my_env = os.environ.copy() + my_env["VLLM_USE_V1"] = "0" + uvicorn_process = subprocess.Popen(commands, env=my_env) yield uvicorn_process.terminate() diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 6307bd7d64627..48e2e31e5db88 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -151,6 +151,10 @@ def uid() -> str: @pytest_asyncio.fixture(scope="module") async def async_engine(): + # We cannot use monkeypatch since this is a module + # scoped fixture and monkeypatch is function scoped. + previous_value = os.getenv("VLLM_USE_V1", None) + os.environ["VLLM_USE_V1"] = "0" engine = await asyncio.get_event_loop().run_in_executor(executor=None, func=start_engine) try: @@ -161,6 +165,11 @@ async def async_engine(): await asyncio.sleep(0.1) cleanup_dist_env_and_memory() + if previous_value: + os.environ["VLLM_USE_V1"] = previous_value + else: + del os.environ["VLLM_USE_V1"] + @pytest.fixture() def should_do_global_cleanup_after_test(request) -> bool: diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index fd4a804183bf5..5bf48b5cced4a 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -23,6 +23,15 @@ MODELS = [ ] +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + Since this module is V0 only, set VLLM_USE_V1=0 for + all tests in the file. 
+ """ + monkeypatch.setenv('VLLM_USE_V1', '0') + + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py index be3ad12396b4b..436e43638a3dd 100644 --- a/tests/basic_correctness/test_cpu_offload.py +++ b/tests/basic_correctness/test_cpu_offload.py @@ -1,8 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 +import pytest + from ..utils import compare_two_settings +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + monkeypatch.setenv('VLLM_USE_V1', '0') + + def test_cpu_offload(): compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [], ["--cpu-offload-gb", "1"]) diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index a32b7cac080be..63dc0f8c8e3b2 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -21,6 +21,15 @@ MODELS = [ ] +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + We should enable this for V1, but VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT, + so use VLLM_USE_V1=0 for all tests in the file. + """ + monkeypatch.setenv('VLLM_USE_V1', '0') + + @pytest.fixture(scope="module", autouse=True) def check_settings(): assert ENABLE_ARTIFICIAL_PREEMPT is True, ( diff --git a/tests/compile/conftest.py b/tests/compile/conftest.py new file mode 100644 index 0000000000000..7118810a58614 --- /dev/null +++ b/tests/compile/conftest.py @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest + + +# TEST V1: this should be removed. Right now V1 overrides +# all the torch compile logic. We should re-enable this +# as we add torch compile support back to V1. +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + Since this module is V0 only, set VLLM_USE_V1=0 for + all tests in the module. 
+ """ + monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/conftest.py b/tests/conftest.py index 4fbb4132d385c..4716ca2e315b7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -111,6 +111,26 @@ VIDEO_ASSETS = _VideoAssets() """Singleton instance of :class:`_VideoAssets`.""" +@pytest.fixture(scope="function", autouse=True) +def cleanup_VLLM_USE_V1(monkeypatch): + """ + The V1 oracle sets "VLLM_USE_V1" during loading. This means + that each invocation of a test change the env variable. + + If we touch "VLLM_USE_V1" with monkeypatch, then any changes + made during the test run by vLLM will be cleaned up. + + This fixture is used by every test. + """ + + # If VLLM_USE_V1 is not set, set then delete. This will + # cause monkeypatch to clean up VLLM_USE_V1 upon exit + # if VLLM modifies the value of envs.VLLM_USE_V1. + if "VLLM_USE_V1" not in os.environ: + monkeypatch.setenv("VLLM_USE_V1", "") + monkeypatch.delenv("VLLM_USE_V1") + + @pytest.fixture(params=[True, False]) def run_with_both_engines(request, monkeypatch): # Automatically runs tests twice, once with V1 and once without diff --git a/tests/core/conftest.py b/tests/core/conftest.py new file mode 100644 index 0000000000000..1a20e2c135c2e --- /dev/null +++ b/tests/core/conftest.py @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest + + +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + Since this module is V0 only, set VLLM_USE_V1=0 for + all tests in the module. 
+ """ + monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/engine/output_processor/__init__.py b/tests/detokenizer/__init__.py similarity index 100% rename from tests/engine/output_processor/__init__.py rename to tests/detokenizer/__init__.py diff --git a/tests/detokenizer/conftest.py b/tests/detokenizer/conftest.py new file mode 100644 index 0000000000000..59394b0351bda --- /dev/null +++ b/tests/detokenizer/conftest.py @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest + + +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass diff --git a/tests/engine/test_detokenization.py b/tests/detokenizer/test_disable_detokenization.py similarity index 98% rename from tests/engine/test_detokenization.py rename to tests/detokenizer/test_disable_detokenization.py index 2b7ebf705bbdb..14f9babb8d8a6 100644 --- a/tests/engine/test_detokenization.py +++ b/tests/detokenizer/test_disable_detokenization.py @@ -6,6 +6,7 @@ from vllm.entrypoints.llm import LLM from vllm.sampling_params import SamplingParams +@pytest.mark.skip_v1 @pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) def test_computed_prefix_blocks(model: str): # This test checks if the engine generates completions both with and diff --git a/tests/engine/output_processor/test_stop_checker.py b/tests/detokenizer/test_stop_checker.py similarity index 100% rename from tests/engine/output_processor/test_stop_checker.py rename to tests/detokenizer/test_stop_checker.py diff --git a/tests/engine/test_stop_reason.py b/tests/detokenizer/test_stop_reason.py similarity index 100% rename from tests/engine/test_stop_reason.py rename to tests/detokenizer/test_stop_reason.py diff --git a/tests/detokenizer/test_stop_strings.py b/tests/detokenizer/test_stop_strings.py new file mode 100644 index 0000000000000..0607dd01a3395 --- /dev/null +++ 
b/tests/detokenizer/test_stop_strings.py @@ -0,0 +1,141 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Optional + +import pytest + +from vllm import LLM, SamplingParams, envs + +MODEL = "meta-llama/llama-2-7b-hf" +MAX_TOKENS = 200 + + +def _test_stopping(llm: LLM, + expected_output: str, + expected_reason: Any, + stop: Optional[list[str]] = None, + stop_token_ids: Optional[list[int]] = None, + include_in_output: bool = False) -> None: + output = llm.generate( + "A story about vLLM:\n", + SamplingParams( + temperature=0.0, + max_tokens=MAX_TOKENS, + stop=stop, + stop_token_ids=stop_token_ids, + include_stop_str_in_output=include_in_output, + ))[0].outputs[0] + + assert output is not None + assert output.text == expected_output + assert output.stop_reason == expected_reason + + +def _set_async_mode(llm, is_async): + llm.llm_engine.scheduler[0].use_async_output_proc = is_async + + +def _stop_basic(llm): + _test_stopping(llm, + stop=["."], + include_in_output=False, + expected_output="VLLM is a 100% volunteer organization", + expected_reason=".") + + _test_stopping(llm, + stop=["."], + include_in_output=True, + expected_output="VLLM is a 100% volunteer organization.", + expected_reason=".") + + +def _stop_multi_tokens(llm): + _test_stopping( + llm, + stop=["group of peo", "short"], + include_in_output=False, + expected_output="VLLM is a 100% volunteer organization. We are a ", + expected_reason="group of peo") + + _test_stopping( + llm, + stop=["group of peo", "short"], + include_in_output=True, + expected_output= + "VLLM is a 100% volunteer organization. 
We are a group of peo", + expected_reason="group of peo") + + +def _stop_partial_token(llm): + _test_stopping(llm, + stop=["gani"], + include_in_output=False, + expected_output="VLLM is a 100% volunteer or", + expected_reason="gani") + + _test_stopping(llm, + stop=["gani"], + include_in_output=True, + expected_output="VLLM is a 100% volunteer organi", + expected_reason="gani") + + +def _stop_token_id(llm): + # token id 13013 => " organization" + + _test_stopping(llm, + stop_token_ids=[13013], + include_in_output=False, + expected_output="VLLM is a 100% volunteer", + expected_reason=13013) + + _test_stopping(llm, + stop_token_ids=[13013], + include_in_output=True, + expected_output="VLLM is a 100% volunteer organization", + expected_reason=13013) + + +@pytest.mark.skip_global_cleanup +def test_stop_strings(): + # If V0, must set enforce_eager=False since we use + # async output processing below. + vllm_model = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1) + + if envs.VLLM_USE_V1: + _stop_basic(vllm_model) + else: + _set_async_mode(vllm_model, True) + _stop_basic(vllm_model) + + _set_async_mode(vllm_model, False) + _stop_basic(vllm_model) + + if envs.VLLM_USE_V1: + _stop_multi_tokens(vllm_model) + else: + _set_async_mode(vllm_model, True) + _stop_multi_tokens(vllm_model) + + _set_async_mode(vllm_model, False) + _stop_multi_tokens(vllm_model) + + if envs.VLLM_USE_V1: + _stop_partial_token(vllm_model) + else: + _set_async_mode(vllm_model, True) + _stop_partial_token(vllm_model) + + _set_async_mode(vllm_model, False) + _stop_partial_token(vllm_model) + + if envs.VLLM_USE_V1: + # FIXME: this does not respect include_in_output=False + # _stop_token_id(vllm_model) + pass + else: + _set_async_mode(vllm_model, True) + _stop_token_id(vllm_model) + + _set_async_mode(vllm_model, False) + _stop_token_id(vllm_model) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 4b479a0c93a9a..05b6ba40506a2 100644 --- 
a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -24,6 +24,18 @@ logger = init_logger("test_pipeline_parallel") VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + For PP, we fall back to V0 by default. This means + that the TP baseline runs with V1 while the PP engine + runs with V0. This gives divergent results with dummy + weights. Once we enable V1 by default for PP, we can + remove this. + """ + monkeypatch.setenv('VLLM_USE_V1', '0') + + class ParallelSetup(NamedTuple): tp_size: int pp_size: int diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index cb772fc760812..0f46fba3ac49f 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ b/tests/encoder_decoder/test_e2e_correctness.py @@ -21,6 +21,15 @@ LIST_ENC_DEC_SUPPORTED_BACKENDS = [ ] +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + Since this module is V0 only, set VLLM_USE_V1=0 for + all tests in the module. + """ + monkeypatch.setenv('VLLM_USE_V1', '0') + + def vllm_to_hf_output( vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], decoder_prompt_type: DecoderPromptType, diff --git a/tests/engine/conftest.py b/tests/engine/conftest.py new file mode 100644 index 0000000000000..1a20e2c135c2e --- /dev/null +++ b/tests/engine/conftest.py @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest + + +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + Since this module is V0 only, set VLLM_USE_V1=0 for + all tests in the module. 
+ """ + monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/engine/output_processor/test_multi_step.py b/tests/engine/test_multi_step_output_processor.py similarity index 99% rename from tests/engine/output_processor/test_multi_step.py rename to tests/engine/test_multi_step_output_processor.py index 3ba3c4ec53a5e..b67dd86bfdf0b 100644 --- a/tests/engine/output_processor/test_multi_step.py +++ b/tests/engine/test_multi_step_output_processor.py @@ -15,7 +15,7 @@ from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, from vllm.transformers_utils.detokenizer import Detokenizer from vllm.utils import Counter -from ...core.utils import create_seq_group +from ..core.utils import create_seq_group @pytest.mark.parametrize("seq_output_len", [128]) diff --git a/tests/engine/test_stop_strings.py b/tests/engine/test_stop_strings.py deleted file mode 100644 index 62d167aa14b45..0000000000000 --- a/tests/engine/test_stop_strings.py +++ /dev/null @@ -1,165 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from typing import Any, Optional - -import pytest - -from vllm import CompletionOutput, LLMEngine, SamplingParams - -MODEL = "meta-llama/llama-2-7b-hf" -MAX_TOKENS = 200 - -IS_ASYNC = False - - -@pytest.fixture(scope="session") -def vllm_model(vllm_runner): - with vllm_runner(MODEL) as vllm_model: - yield vllm_model - - -def _test_stopping(llm_engine: LLMEngine, - expected_output: str, - expected_reason: Any, - stop: Optional[list[str]] = None, - stop_token_ids: Optional[list[int]] = None, - include_in_output: bool = False, - use_async_output_proc: bool = False) -> None: - llm_engine.add_request( - "id", "A story about vLLM:\n", - SamplingParams( - temperature=0.0, - max_tokens=MAX_TOKENS, - stop=stop, - stop_token_ids=stop_token_ids, - include_stop_str_in_output=include_in_output, - ), None) - - output: Optional[CompletionOutput] = None - output_text = "" - stop_reason = None - - if use_async_output_proc: - llm_engine.step() - - while 
llm_engine.has_unfinished_requests(): - (request_output, ) = llm_engine.step() - (output, ) = request_output.outputs - - # Ensure we don't backtrack - assert output.text.startswith(output_text) - output_text = output.text - stop_reason = output.stop_reason - - assert output is not None - assert output_text == expected_output - assert stop_reason == expected_reason - - -def _set_async_mode(llm_engine, is_async): - llm_engine.scheduler[0].use_async_output_proc = is_async - - -def _stop_basic(llm_engine, is_async): - _test_stopping(llm_engine, - stop=["."], - include_in_output=False, - expected_output="VLLM is a 100% volunteer organization", - expected_reason=".", - use_async_output_proc=is_async) - - _test_stopping(llm_engine, - stop=["."], - include_in_output=True, - expected_output="VLLM is a 100% volunteer organization.", - expected_reason=".", - use_async_output_proc=is_async) - - -def _stop_multi_tokens(llm_engine, is_async): - _test_stopping( - llm_engine, - stop=["group of peo", "short"], - include_in_output=False, - expected_output="VLLM is a 100% volunteer organization. We are a ", - expected_reason="group of peo", - use_async_output_proc=is_async) - - _test_stopping( - llm_engine, - stop=["group of peo", "short"], - include_in_output=True, - expected_output= - "VLLM is a 100% volunteer organization. 
We are a group of peo", - expected_reason="group of peo", - use_async_output_proc=is_async) - - -def _stop_partial_token(llm_engine, is_async): - _test_stopping(llm_engine, - stop=["gani"], - include_in_output=False, - expected_output="VLLM is a 100% volunteer or", - expected_reason="gani", - use_async_output_proc=is_async) - - _test_stopping(llm_engine, - stop=["gani"], - include_in_output=True, - expected_output="VLLM is a 100% volunteer organi", - expected_reason="gani", - use_async_output_proc=is_async) - - -def _stop_token_id(llm_engine, is_async): - # token id 13013 => " organization" - - _test_stopping(llm_engine, - stop_token_ids=[13013], - include_in_output=False, - expected_output="VLLM is a 100% volunteer", - expected_reason=13013, - use_async_output_proc=is_async) - - _test_stopping(llm_engine, - stop_token_ids=[13013], - include_in_output=True, - expected_output="VLLM is a 100% volunteer organization", - expected_reason=13013, - use_async_output_proc=is_async) - - -@pytest.mark.skip_global_cleanup -def test_stop_basic(vllm_model): - _set_async_mode(vllm_model.model.llm_engine, True) - _stop_basic(vllm_model.model.llm_engine, is_async=True) - - _set_async_mode(vllm_model.model.llm_engine, False) - _stop_basic(vllm_model.model.llm_engine, is_async=False) - - -@pytest.mark.skip_global_cleanup -def test_stop_multi_tokens(vllm_model): - _set_async_mode(vllm_model.model.llm_engine, True) - _stop_multi_tokens(vllm_model.model.llm_engine, is_async=True) - - _set_async_mode(vllm_model.model.llm_engine, False) - _stop_multi_tokens(vllm_model.model.llm_engine, is_async=False) - - -@pytest.mark.skip_global_cleanup -def test_stop_partial_token(vllm_model): - _set_async_mode(vllm_model.model.llm_engine, True) - _stop_partial_token(vllm_model.model.llm_engine, is_async=True) - - _set_async_mode(vllm_model.model.llm_engine, False) - _stop_partial_token(vllm_model.model.llm_engine, is_async=False) - - -@pytest.mark.skip_global_cleanup -def 
test_stop_token_id(vllm_model): - _set_async_mode(vllm_model.model.llm_engine, True) - _stop_token_id(vllm_model.model.llm_engine, is_async=True) - - _set_async_mode(vllm_model.model.llm_engine, False) - _stop_token_id(vllm_model.model.llm_engine, is_async=False) diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index 0598e3990d868..f065f6564cd2f 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -3,12 +3,21 @@ import sys from contextlib import nullcontext +import pytest from vllm_test_utils import BlameResult, blame from vllm import LLM, SamplingParams from vllm.distributed import cleanup_dist_env_and_memory +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + V1 only supports xgrammar so this is irrelevant. + """ + monkeypatch.setenv('VLLM_USE_V1', '0') + + def run_normal_opt125m(): prompts = [ "Hello, my name is", diff --git a/tests/entrypoints/openai/test_chat_echo.py b/tests/entrypoints/openai/test_chat_echo.py index 3e76158a8c142..86ee17c6f4491 100644 --- a/tests/entrypoints/openai/test_chat_echo.py +++ b/tests/entrypoints/openai/test_chat_echo.py @@ -10,7 +10,6 @@ from ...utils import RemoteOpenAIServer # # any model with a chat template should work here MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" -DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 @pytest.fixture(scope="module") @@ -22,8 +21,6 @@ def server(): "--enforce-eager", "--max-model-len", "4080", - "--chat-template", - DUMMY_CHAT_TEMPLATE, ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: diff --git a/tests/entrypoints/openai/test_root_path.py b/tests/entrypoints/openai/test_root_path.py index c9fa192fb6aec..106d6b2c14f83 100644 --- a/tests/entrypoints/openai/test_root_path.py +++ b/tests/entrypoints/openai/test_root_path.py @@ -11,7 +11,6 @@ from ...utils 
import RemoteOpenAIServer # # any model with a chat template should work here MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" -DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 API_KEY = "abc-123" ERROR_API_KEY = "abc" ROOT_PATH = "llm" @@ -28,8 +27,6 @@ def server(): "4080", "--root-path", # use --root-path=/llm for testing "/" + ROOT_PATH, - "--chat-template", - DUMMY_CHAT_TEMPLATE, ] envs = os.environ.copy() diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 0e87437312eaa..570e643e0364d 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -23,12 +23,14 @@ def clear_cache(): @pytest.mark.parametrize( "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"]) +@pytest.mark.parametrize("use_v1", [True, False]) @pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"]) -def test_env(name: str, device: str, monkeypatch): +def test_env(name: str, use_v1: bool, device: str, monkeypatch): """Test that the attention selector can be set via environment variable. Note that we do not test FlashAttn because it is the default backend. 
""" + monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0") override_backend_env_variable(monkeypatch, name) if device == "cpu": @@ -40,7 +42,8 @@ def test_env(name: str, device: str, monkeypatch): with patch("vllm.attention.selector.current_platform", RocmPlatform()): backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) - assert backend.get_name() == "ROCM_FLASH" + EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" + assert backend.get_name() == EXPECTED elif device == "openvino": with patch("vllm.attention.selector.current_platform", OpenVinoPlatform()), patch.dict('sys.modules', @@ -54,7 +57,8 @@ def test_env(name: str, device: str, monkeypatch): CudaPlatform()): backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) - assert backend.get_name() == name + EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name + assert backend.get_name() == EXPECTED def test_flash_attn(monkeypatch): @@ -95,13 +99,23 @@ def test_flash_attn(monkeypatch): assert backend.get_name() != STR_FLASH_ATTN_VAL -def test_invalid_env(monkeypatch): +@pytest.mark.parametrize("use_v1", [True, False]) +def test_invalid_env(use_v1: bool, monkeypatch): """Ignore the invalid env variable if it is set.""" + monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0") override_backend_env_variable(monkeypatch, STR_INVALID_VAL) + with patch("vllm.attention.selector.current_platform", CudaPlatform()): backend = get_attn_backend(32, torch.float16, None, 16, False) - assert backend.get_name() == "FLASH_ATTN" + EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN" + assert backend.get_name() == EXPECTED # when block size == 16, backend will fall back to XFORMERS - backend = get_attn_backend(16, torch.float16, None, 16, False) - assert backend.get_name() == "XFORMERS" + # this behavior is not yet supported on V1. + if use_v1: + # TODO: support fallback on V1! 
+ # https://github.com/vllm-project/vllm/issues/14524 + pass + else: + backend = get_attn_backend(16, torch.float16, None, 16, False) + assert backend.get_name() == "XFORMERS" diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index 547a63499b260..c8ee46bc65d4d 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -22,6 +22,16 @@ from vllm.config import VllmConfig, set_current_vllm_config from vllm.forward_context import set_forward_context from vllm.platforms import current_platform + +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + Encoder-decoder is only supported on V0, so set + VLLM_USE_V1=0 for all tests in the module. + """ + monkeypatch.setenv('VLLM_USE_V1', '0') + + # List of support backends for encoder/decoder models LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN] HEAD_SIZES = [64, 256] diff --git a/tests/kernels/test_rocm_attention_selector.py b/tests/kernels/test_rocm_attention_selector.py index 5848dc014ca69..7cd6082486605 100644 --- a/tests/kernels/test_rocm_attention_selector.py +++ b/tests/kernels/test_rocm_attention_selector.py @@ -24,7 +24,8 @@ def test_selector(monkeypatch): with patch("vllm.attention.selector.current_platform", RocmPlatform()): backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) - assert backend.get_name() == "ROCM_FLASH" + assert (backend.get_name() == "ROCM_FLASH" + or backend.get_name() == "ROCM_ATTN_VLLM_V1") # mla test for deepseek related backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, False, True) diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index e84ff30ba9929..d497ae6b2bc1e 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -80,6 +80,8 @@ def v1(run_with_both_engines_lora): pass +# V1 Test: Failing due to numerics on V1. 
+@pytest.mark.skip_v1 @fork_new_process_for_each_test def test_llama_lora(sql_lora_files): @@ -123,6 +125,8 @@ def test_llama_lora_warmup(sql_lora_files): "less when using lora than when not using lora") +# V1 Test: Failing due to numerics on V1. +@pytest.mark.skip_v1 @multi_gpu_test(num_gpus=4) @fork_new_process_for_each_test def test_llama_lora_tp4(sql_lora_files): diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py index b279566c00f26..204624a0540af 100644 --- a/tests/lora/test_lora_functions.py +++ b/tests/lora/test_lora_functions.py @@ -8,7 +8,7 @@ import os import pytest from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.entrypoints.llm import LLM +from vllm.engine.llm_engine import LLMEngine from vllm.lora.request import LoRARequest MODEL_PATH = "meta-llama/Llama-2-7b-hf" @@ -43,7 +43,7 @@ def test_lora_functions_sync(): gpu_memory_utilization=0.8, enforce_eager=True) - llm = LLM.get_engine_class().from_engine_args(engine_args) + llm = LLMEngine.from_engine_args(engine_args) def run_check(fn, args, expected: list): fn(args) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 8d25833125950..db6a6ec78fa2f 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -7,6 +7,7 @@ import torch from safetensors.torch import load_file from torch import nn +from vllm import envs from vllm.config import LoRAConfig from vllm.lora.layers import (ColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA, @@ -410,6 +411,7 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device): assert manager.device == device +@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.") @pytest.mark.parametrize("device", DEVICES) def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, sql_lora_files, device): @@ -489,6 +491,7 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, device) 
+@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.") @pytest.mark.parametrize("device", DEVICES) def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, sql_lora_files, device): diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index e23ff43ebd7f8..8ddcefd9191ac 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -15,6 +15,15 @@ from vllm.engine.metrics import RayPrometheusStatLogger from vllm.sampling_params import SamplingParams from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET + +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + This module tests V0 internals, so set VLLM_USE_V1=0. + """ + monkeypatch.setenv('VLLM_USE_V1', '0') + + MODELS = [ "distilbert/distilgpt2", ] diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py index 804df4c4903e6..dd34a2577a084 100644 --- a/tests/models/decoder_only/language/test_gguf.py +++ b/tests/models/decoder_only/language/test_gguf.py @@ -110,16 +110,6 @@ def test_models( example_prompts = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True) - # Run unquantized model. - with vllm_runner( - model_name=model.original_model, - enforce_eager=True, # faster tests - dtype=dtype, - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=tp_size) as original_model: - original_outputs = original_model.generate_greedy_logprobs( - example_prompts[:-1], max_tokens, num_logprobs) - # Run gguf model. with vllm_runner(model_name=model.gguf_model, enforce_eager=True, @@ -130,6 +120,16 @@ def test_models( gguf_outputs = gguf_model.generate_greedy_logprobs( example_prompts[:-1], max_tokens, num_logprobs) + # Run unquantized model. 
+ with vllm_runner( + model_name=model.original_model, + enforce_eager=True, # faster tests + dtype=dtype, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=tp_size) as original_model: + original_outputs = original_model.generate_greedy_logprobs( + example_prompts[:-1], max_tokens, num_logprobs) + check_logprobs_close( outputs_0_lst=original_outputs, outputs_1_lst=gguf_outputs, diff --git a/tests/models/decoder_only/language/test_hybrid.py b/tests/models/decoder_only/language/test_hybrid.py index a39b11923582c..1a78b30930e36 100644 --- a/tests/models/decoder_only/language/test_hybrid.py +++ b/tests/models/decoder_only/language/test_hybrid.py @@ -9,7 +9,9 @@ from vllm.sampling_params import SamplingParams from ...utils import check_outputs_equal # This test is for the hybrid models -MODELS = ["ai21labs/Jamba-tiny-dev", "ibm-ai-platform/Bamba-9B"] +MODELS = ["ai21labs/Jamba-tiny-dev"] +# Bamba at Fp32 is too big for the CI (L4 GPU). +# MODELS = ["ai21labs/Jamba-tiny-dev", "ibm-ai-platform/Bamba-9B"] @pytest.mark.parametrize("model", MODELS) @@ -41,13 +43,6 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - # This test is for verifying whether the model's extra_repr - # can be printed correctly. 
- def print_model(model): - print(model) - - vllm_model.apply_model(print_model) - for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] vllm_output_ids, vllm_output_str = vllm_outputs[i] @@ -192,6 +187,7 @@ def test_parallel_sampling( ) +@pytest.mark.skip(reason="RE-ENABLE: test is currently failing on main.") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [20]) @@ -293,6 +289,7 @@ def test_state_cleanup( "could be related to finished_requests_ids") +@pytest.mark.skip(reason="RE-ENABLE: test is currently failing on main.") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) def test_multistep( @@ -308,6 +305,7 @@ def test_multistep( vllm_model.generate_greedy([example_prompts[0]] * 10, 1) +@pytest.mark.skip(reason="RE-ENABLE: test is currently failing on main.") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [64]) diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index 80d13b667bb56..47b9c0f69c36e 100644 --- a/tests/models/decoder_only/language/test_mamba.py +++ b/tests/models/decoder_only/language/test_mamba.py @@ -68,13 +68,6 @@ def test_models( with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - # This test is for verifying whether the model's extra_repr - # can be printed correctly. 
- def print_model(model): - print(model) - - vllm_model.apply_model(print_model) - for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] vllm_output_ids, vllm_output_str = vllm_outputs[i] diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 17923673023f4..7e1337b7d4876 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -213,16 +213,6 @@ def test_mistral_format( max_tokens: int, num_logprobs: int, ) -> None: - with vllm_runner( - model, - dtype=dtype, - tokenizer_mode="auto", - load_format="safetensors", - config_format="hf", - ) as hf_format_model: - hf_format_outputs = hf_format_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - with vllm_runner( model, dtype=dtype, @@ -233,6 +223,16 @@ def test_mistral_format( mistral_format_outputs = mistral_format_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) + with vllm_runner( + model, + dtype=dtype, + tokenizer_mode="auto", + load_format="safetensors", + config_format="hf", + ) as hf_format_model: + hf_format_outputs = hf_format_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + check_logprobs_close( outputs_0_lst=hf_format_outputs, outputs_1_lst=mistral_format_outputs, @@ -261,6 +261,7 @@ def test_mistral_symbolic_languages( assert "οΏ½" not in outputs[0].outputs[0].text.strip() +@pytest.mark.skip("RE-ENABLE: test is currently failing on main.") @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) # v1 can't do func calling diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 71e4a9f11ab82..a49926ea220e8 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -7,6 +7,12 @@ import pytest 
from ...utils import check_logprobs_close +# These have unsupported head_dim for FA. We do not +# not have a clean way to fall back, so we fail with +# a clear msg when it happens. +# https://github.com/vllm-project/vllm/issues/14524 +REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"] + @pytest.mark.parametrize( "model", @@ -71,7 +77,10 @@ def test_models( dtype: str, max_tokens: int, num_logprobs: int, + monkeypatch, ) -> None: + if model in REQUIRES_V0: + monkeypatch.setenv("VLLM_USE_V1", "0") with hf_runner(model, dtype=dtype) as hf_model: if model.startswith("THUDM/chatglm3"): @@ -85,13 +94,6 @@ def test_models( vllm_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - def print_model(model): - print(model) - - vllm_model.apply_model(print_model) - check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, diff --git a/tests/models/decoder_only/vision_language/test_awq.py b/tests/models/decoder_only/vision_language/test_awq.py index f4a6dd0f101fd..6cc81d2b9ed4a 100644 --- a/tests/models/decoder_only/vision_language/test_awq.py +++ b/tests/models/decoder_only/vision_language/test_awq.py @@ -108,7 +108,12 @@ def run_awq_test( @pytest.mark.parametrize("num_logprobs", [5]) @torch.inference_mode() def test_awq_models(vllm_runner, image_assets, source_model, quant_model, - size_factors, dtype, max_tokens, num_logprobs) -> None: + size_factors, dtype, max_tokens, num_logprobs, + monkeypatch) -> None: + + # Test V1: this test hangs during setup on single-scale input. + # TODO: fixure out why and re-enable this on V1. 
+ monkeypatch.setenv("VLLM_USE_V1", "0") run_awq_test( vllm_runner, image_assets, diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index a0f1229f0af5a..7cdd037d49acd 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -9,8 +9,7 @@ from pathlib import PosixPath import pytest from packaging.version import Version -from transformers import (AutoModelForImageTextToText, AutoModelForPreTraining, - AutoModelForVision2Seq) +from transformers import AutoModelForPreTraining, AutoModelForVision2Seq from transformers import __version__ as TRANSFORMERS_VERSION from vllm.platforms import current_platform @@ -33,6 +32,16 @@ from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs, if current_platform.is_rocm(): os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0" +REQUIRES_V0_MODELS = [ + # V1 Test: no way to fall back for head_dim = 80 + # https://github.com/vllm-project/vllm/issues/14524 + "qwen_vl", + "h2ovl", + "blip2", + # V1 Test: not enough KV cache space in C1. 
+ "fuyu", +] + # yapf: disable COMMON_BROADCAST_SETTINGS = { "test_type": VLMTestType.IMAGE, @@ -157,25 +166,25 @@ VLM_TEST_SETTINGS = { marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), #### Extended model tests - "aria": VLMTestInfo( - models=["rhymes-ai/Aria"], - test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501 - img_idx_to_prompt=lambda idx: "<|img|>\n", - max_model_len=4096, - max_num_seqs=2, - auto_cls=AutoModelForImageTextToText, - single_image_prompts=IMAGE_ASSETS.prompts({ - "stop_sign": "Please describe the image shortly.", - "cherry_blossom": "Please infer the season with reason.", - }), - multi_image_prompt="Describe the two images shortly.", # noqa: E501 - postprocess_inputs=model_utils.cast_dtype_post_processor("pixel_values"), - stop_str=["<|im_end|>"], - image_size_factors=[(0.10, 0.15)], - max_tokens=64, - marks=[large_gpu_mark(min_gb=64)], - ), + # "aria": VLMTestInfo( + # models=["rhymes-ai/Aria"], + # test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + # prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501 + # img_idx_to_prompt=lambda idx: "<|img|>\n", + # max_model_len=4096, + # max_num_seqs=2, + # auto_cls=AutoModelForImageTextToText, + # single_image_prompts=IMAGE_ASSETS.prompts({ + # "stop_sign": "Please describe the image shortly.", + # "cherry_blossom": "Please infer the season with reason.", # noqa: E501 + # }), + # multi_image_prompt="Describe the two images shortly.", # noqa: E501 + # postprocess_inputs=model_utils.cast_dtype_post_processor("pixel_values"), # noqa: E501 + # stop_str=["<|im_end|>"], + # image_size_factors=[(0.10, 0.15)], + # max_tokens=64, + # marks=[large_gpu_mark(min_gb=64)], + # ), "blip2": VLMTestInfo( models=["Salesforce/blip2-opt-2.7b"], test_type=VLMTestType.IMAGE, @@ -589,7 +598,9 @@ def 
test_single_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets): + image_assets: _ImageAssets, monkeypatch): + if model_type in REQUIRES_V0_MODELS: + monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_single_image_test( tmp_path=tmp_path, @@ -612,7 +623,9 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets): + image_assets: _ImageAssets, monkeypatch): + if model_type in REQUIRES_V0_MODELS: + monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_multi_image_test( tmp_path=tmp_path, @@ -635,7 +648,9 @@ def test_image_embedding_models(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets): + image_assets: _ImageAssets, monkeypatch): + if model_type in REQUIRES_V0_MODELS: + monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_embedding_test( model_test_info=model_test_info, @@ -655,7 +670,9 @@ def test_image_embedding_models(model_type: str, )) def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - video_assets: _VideoAssets): + video_assets: _VideoAssets, monkeypatch): + if model_type in REQUIRES_V0_MODELS: + monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_video_test( model_test_info=model_test_info, @@ -678,7 +695,10 @@ def test_custom_inputs_models( test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], + monkeypatch, ): + if model_type in REQUIRES_V0_MODELS: + monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = 
VLM_TEST_SETTINGS[model_type] runners.run_custom_inputs_test( model_test_info=model_test_info, @@ -701,7 +721,9 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets): + image_assets: _ImageAssets, monkeypatch): + if model_type in REQUIRES_V0_MODELS: + monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_single_image_test( tmp_path=tmp_path, @@ -725,7 +747,9 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets): + image_assets: _ImageAssets, monkeypatch): + if model_type in REQUIRES_V0_MODELS: + monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_multi_image_test( tmp_path=tmp_path, @@ -749,7 +773,9 @@ def test_image_embedding_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets): + image_assets: _ImageAssets, monkeypatch): + if model_type in REQUIRES_V0_MODELS: + monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_embedding_test( model_test_info=model_test_info, @@ -770,7 +796,9 @@ def test_image_embedding_models_heavy(model_type: str, def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - video_assets: _VideoAssets): + video_assets: _VideoAssets, monkeypatch): + if model_type in REQUIRES_V0_MODELS: + monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_video_test( model_test_info=model_test_info, @@ -794,7 +822,10 @@ def test_custom_inputs_models_heavy( test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], 
vllm_runner: type[VllmRunner], + monkeypatch, ): + if model_type in REQUIRES_V0_MODELS: + monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_custom_inputs_test( model_test_info=model_test_info, diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py index af494eb2e62bf..0b27a4caf6eb7 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -14,6 +14,15 @@ from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput, PromptVideoInput, VllmRunner) from ...utils import check_logprobs_close + +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + V1 Test: batch_make_xxxxx_embeddings calls a V0 internal + """ + monkeypatch.setenv('VLLM_USE_V1', '0') + + models = ["Qwen/Qwen2-VL-2B-Instruct"] target_dtype = "half" @@ -118,6 +127,7 @@ def batch_make_image_embeddings( return visual(pixel_values_on_device, grid_thw=image_grid_thw_on_device) + # V1 Test: this calls a V0 internal. image_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches @@ -201,6 +211,7 @@ def batch_make_video_embeddings( return visual(pixel_values_on_device, grid_thw=video_grid_thw_on_device) + # V1 Test: this calls a V0 internal. 
video_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches @@ -253,7 +264,6 @@ def run_embedding_input_test( processor = AutoProcessor.from_pretrained(model) - # NOTE: # max_model_len should be greater than image_feature_size with vllm_runner(model, task="generate", diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py index c6155da50b585..6a3cd8a5c594e 100644 --- a/tests/models/embedding/language/test_cls_models.py +++ b/tests/models/embedding/language/test_cls_models.py @@ -35,13 +35,6 @@ def test_classification_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.classify(example_prompts) - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - def print_model(model): - print(model) - - vllm_model.apply_model(print_model) - with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSequenceClassification) as hf_model: diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index 6c28ee91a50ad..5deb35fa32108 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -73,13 +73,6 @@ def test_models( **vllm_extra_kwargs) as vllm_model: vllm_outputs = vllm_model.encode(example_prompts) - # This test is for verifying whether the model's extra_repr - # can be printed correctly. 
- def print_model(model): - print(model) - - vllm_model.apply_model(print_model) - check_embeddings_close( embeddings_0_lst=hf_outputs, embeddings_1_lst=vllm_outputs, diff --git a/tests/models/registry.py b/tests/models/registry.py index 372ea33ba9fdc..6b0ac46b0c365 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -256,7 +256,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501 {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501 "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf", - extras={"mistral": "mistral-community/pixtral-12b"}), # noqa: E501 + extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501 + "mistral-fp8": "nm-testing/pixtral-12b-FP8-dynamic"}), # noqa: E501 "LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"), # noqa: E501 "LlavaNextVideoForConditionalGeneration": _HfExamplesInfo("llava-hf/LLaVA-NeXT-Video-7B-hf"), # noqa: E501 "LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 @@ -274,8 +275,9 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True), "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224", # noqa: E501 extras={"v2": "google/paligemma2-3b-ft-docci-448"}), # noqa: E501 - "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-vision-instruct", - trust_remote_code=True), + "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct", + trust_remote_code=True, + extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"}), # noqa: E501), "Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct", trust_remote_code=True), "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501 diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index c58c637231681..adb2d6d0a9907 
100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -6,6 +6,8 @@ import pytest from transformers import PretrainedConfig from vllm import LLM +from vllm.engine.llm_engine import LLMEngine as V0LLMEngine +from vllm.v1.engine.core import EngineCore as V1EngineCore from .registry import HF_EXAMPLE_MODELS @@ -36,12 +38,18 @@ def test_can_initialize(model_arch): return hf_config # Avoid calling model.forward() - def _initialize_kv_caches(self) -> None: + def _initialize_kv_caches_v0(self) -> None: self.cache_config.num_gpu_blocks = 0 self.cache_config.num_cpu_blocks = 0 - with patch.object(LLM.get_engine_class(), "_initialize_kv_caches", - _initialize_kv_caches): + def _initalize_kv_caches_v1(self, vllm_config): + # gpu_blocks (> 0), cpu_blocks + return 1, 0 + + with (patch.object(V0LLMEngine, "_initialize_kv_caches", + _initialize_kv_caches_v0), + patch.object(V1EngineCore, "_initialize_kv_caches", + _initalize_kv_caches_v1)): LLM( model_info.default, tokenizer=model_info.tokenizer, diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index f2a505596ce69..d3d07d0d9acfc 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -11,12 +11,14 @@ from ..utils import fork_new_process_for_each_test @fork_new_process_for_each_test -def test_plugin(dummy_opt_path): +def test_plugin(dummy_opt_path, monkeypatch): + # V1 shuts down rather than raising an error here. + monkeypatch.setenv("VLLM_USE_V1", "0") os.environ["VLLM_PLUGINS"] = "" with pytest.raises(Exception) as excinfo: LLM(model=dummy_opt_path, load_format="dummy") error_msg = "has no vLLM implementation and " \ - "the Transformers implementation is not compatible with vLLM." 
+ "the Transformers implementation is not compatible with vLLM" assert (error_msg in str(excinfo.value)) @@ -51,7 +53,7 @@ image = ImageAsset("cherry_blossom").pil_image.convert("RGB") @fork_new_process_for_each_test -def test_oot_registration_multimodal(dummy_llava_path): +def test_oot_registration_multimodal(dummy_llava_path, monkeypatch): os.environ["VLLM_PLUGINS"] = "register_dummy_model" prompts = [{ "prompt": "What's in the image?", diff --git a/tests/mq_llm_engine/conftest.py b/tests/mq_llm_engine/conftest.py new file mode 100644 index 0000000000000..1a20e2c135c2e --- /dev/null +++ b/tests/mq_llm_engine/conftest.py @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest + + +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + Since this module is V0 only, set VLLM_USE_V1=0 for + all tests in the module. + """ + monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/plugins_tests/conftest.py b/tests/plugins_tests/conftest.py new file mode 100644 index 0000000000000..8561f2ddfa266 --- /dev/null +++ b/tests/plugins_tests/conftest.py @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest + + +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + Since this module is V0 only, set VLLM_USE_V1=0 for + all tests in the module. 
+ """ + monkeypatch.setenv('VLLM_USE_V1', '0') \ No newline at end of file diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py index 19f393e07984c..4cc399175df41 100644 --- a/tests/prefix_caching/test_disable_sliding_window.py +++ b/tests/prefix_caching/test_disable_sliding_window.py @@ -34,7 +34,10 @@ def test_disable_sliding_window(model_len_len, ): del vllm_disabled_model cleanup_dist_env_and_memory() - vllm_enabled_model = LLM(model, disable_sliding_window=False) + vllm_enabled_model = LLM(model, + enforce_eager=True, + disable_sliding_window=False, + enable_prefix_caching=False) vllm_enabled_model.generate("Hi my name is") model_config = vllm_enabled_model.llm_engine.model_config assert model_config.max_model_len == full_len, ( diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index d7d84bdcf382a..7a4bc7aecc0f4 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -16,6 +16,15 @@ from vllm.platforms import current_platform from ..models.utils import check_outputs_equal + +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + This module relies on V0 internals, so set VLLM_USE_V1=0. + """ + monkeypatch.setenv('VLLM_USE_V1', '0') + + MODELS = [ "distilbert/distilgpt2", ] diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index b9b2b634e0bbb..133475a3e06aa 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -21,6 +21,14 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( from vllm.platforms import current_platform +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + This module relies on V0 internals, so set VLLM_USE_V1=0. 
+ """ + monkeypatch.setenv('VLLM_USE_V1', '0') + + @pytest.mark.parametrize( "model_args", [ diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py index de03d37a74bfb..79afcc916f2bb 100644 --- a/tests/quantization/test_cpu_offload.py +++ b/tests/quantization/test_cpu_offload.py @@ -10,6 +10,13 @@ from tests.quantization.utils import is_quant_method_supported from ..utils import compare_two_settings +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + # Fall back to V0 if cpu offloading is enabled. + # Fixture is required to that baseline uses V0. + monkeypatch.setenv('VLLM_USE_V1', '0') + + @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="fp8 is not supported on this GPU type.") def test_cpu_offload_fp8(): diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index b9a1d759b9a49..19cf29d3e6591 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -47,7 +47,9 @@ KV_CACHE_MODELS = [ @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="FP8 is not supported on this GPU type.") @pytest.mark.parametrize("model_id", KV_CACHE_MODELS) -def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): +def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, monkeypatch): + # vllm_runner.apply_model() relies on V0 internals. + monkeypatch.setenv("VLLM_USE_V1", "0") with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: def check_model(model): @@ -86,6 +88,9 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): @pytest.mark.parametrize("force_marlin", [False, True]) def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, monkeypatch) -> None: + # vllm_runner.apply_model() relies on V0 internals. 
+ monkeypatch.setenv("VLLM_USE_V1", "0") + if force_marlin: monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index c6f34fef2743b..22055c49ae296 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -28,8 +28,10 @@ MODEL_QUANT = [ @pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT) -def test_gptq_with_dynamic(vllm_runner, model_id: str, - use_marlin_kernel: bool): +def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool, + monkeypatch): + # vllm_runner.apply_model() relies on V0 internals. + monkeypatch.setenv("VLLM_USE_V1", "0") vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index 20435a287e37a..1c6bd18521c31 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -29,7 +29,10 @@ def test_lm_head( vllm_runner, model_id: str, lm_head_quantized: bool, + monkeypatch, ) -> None: + # vllm_runner.apply_model() relies on V0 internals. + monkeypatch.setenv("VLLM_USE_V1", "0") with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as vllm_model: diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index 491370c7cc24d..85dc695be6865 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -10,7 +10,9 @@ from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501 QuarkLinearMethod, QuarkW8A8Fp8) -def test_quark_fp8(vllm_runner): +def test_quark_fp8(vllm_runner, monkeypatch): + # vllm_runner.apply_model() relies on V0 internals. 
+ monkeypatch.setenv("VLLM_USE_V1", "0") model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test" with vllm_runner(model_path) as llm: diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index f64dca6e4bbf6..abc1c05de3c0c 100644 --- a/tests/quantization/test_register_quantization_config.py +++ b/tests/quantization/test_register_quantization_config.py @@ -101,8 +101,10 @@ def test_register_quantization_config(): argvalues=[ "meta-llama/Llama-3.2-1B-Instruct", ]) -def test_custom_quant(vllm_runner, model): +def test_custom_quant(vllm_runner, model, monkeypatch): """Test infer with the custom quantization method.""" + # vllm_runner.apply_model() relies on V0 internals. + monkeypatch.setenv("VLLM_USE_V1", "0") with vllm_runner(model_name=model, quantization="custom_quant", enforce_eager=True) as llm: diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index 39feb1895b094..a1a81b3891f65 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -6,6 +6,13 @@ Run `pytest tests/samplers/test_beam_search.py`. import pytest + +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + """We can run both engines for this test.""" + pass + + # FIXME(zhuohan): The test can not pass if we: # 1. Increase max_tokens to 256. # 2. Increase beam_width to 8. @@ -15,6 +22,7 @@ BEAM_WIDTHS = [4] MODELS = ["TinyLlama/TinyLlama-1.1B-Chat-v1.0"] +@pytest.mark.skip_v1 # FIXME: This fails on V1 right now. 
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", MAX_TOKENS) diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py index 673d1b9a7ef6f..2a124aa0c5960 100644 --- a/tests/samplers/test_ignore_eos.py +++ b/tests/samplers/test_ignore_eos.py @@ -8,6 +8,13 @@ import pytest from vllm import SamplingParams + +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + """We can run both engines for this test.""" + pass + + # We also test with llama because it has generation_config to specify EOS # (past regression). MODELS = ["distilbert/distilgpt2", "meta-llama/Llama-3.2-1B"] diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index f237b616077bf..74f1eb4a95477 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -8,6 +8,14 @@ from vllm import SamplingParams MODELS = ["distilbert/distilgpt2"] +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + This file tests V0 internals, so set VLLM_USE_V1=0. + """ + monkeypatch.setenv('VLLM_USE_V1', '0') + + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_logits_processor_force_generate( diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 58c7c256473e0..5cc646e76ec84 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -10,6 +10,15 @@ from ..conftest import VllmRunner MODELS = ["distilbert/distilgpt2"] +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + This module is V0 only since it uses dtype=float, so + set VLLM_USE_V1=0 for all tests in the module. 
+ """ + monkeypatch.setenv('VLLM_USE_V1', '0') + + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) # needed for comparing logprobs with HF diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py index 29e73eb1bead0..355e3adcf5f30 100644 --- a/tests/samplers/test_no_bad_words.py +++ b/tests/samplers/test_no_bad_words.py @@ -6,11 +6,18 @@ Run `pytest tests/samplers/test_no_bad_words.py`. """ from typing import Optional +import pytest from transformers import AutoTokenizer from vllm import LLM, SamplingParams +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + """We can run both engines for this test.""" + pass + + def _generate( model: LLM, prompt: str, diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py index 66779d97a92c1..ebe9b302148c0 100644 --- a/tests/samplers/test_ranks.py +++ b/tests/samplers/test_ranks.py @@ -7,6 +7,12 @@ from vllm import SamplingParams MODELS = ["distilbert/distilgpt2"] +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + """We can run both engines for this test.""" + pass + + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_ranks( diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 2b86dcac7f03c..8884f8ae70b8e 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -8,6 +8,15 @@ import torch.nn.functional as F from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.model_executor.utils import set_random_seed + +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + This file tests V0 internals, so set VLLM_USE_V1=0. 
+ """ + monkeypatch.setenv('VLLM_USE_V1', '0') + + CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 68944ac7e1efa..6924aba115764 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -18,6 +18,14 @@ from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata from vllm.utils import Counter, is_pin_memory_available +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + This file tests V0 internals, so set VLLM_USE_V1=0. + """ + monkeypatch.setenv('VLLM_USE_V1', '0') + + class MockLogitsSampler(Sampler): def __init__(self, fake_logits: torch.Tensor): diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index 4e828256130e9..efa2642dba971 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -17,7 +17,9 @@ RANDOM_SEEDS = list(range(5)) @pytest.fixture -def vllm_model(vllm_runner): +def vllm_model(vllm_runner, monkeypatch): + # This file relies on V0 internals. + monkeypatch.setenv("VLLM_USE_V1", "0") with vllm_runner(MODEL, dtype="half") as vllm_model: yield vllm_model diff --git a/tests/samplers/test_typical_acceptance_sampler.py b/tests/samplers/test_typical_acceptance_sampler.py index ecf98179ca21a..279e5ed100d97 100644 --- a/tests/samplers/test_typical_acceptance_sampler.py +++ b/tests/samplers/test_typical_acceptance_sampler.py @@ -11,6 +11,14 @@ from vllm.model_executor.utils import set_random_seed CUDA_DEVICES = [f"cuda:{i}" for i in range(1)] +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + This file tests V0 internals, so set VLLM_USE_V1=0. + """ + monkeypatch.setenv('VLLM_USE_V1', '0') + + def get_zero_temperature_prob_dist(batch_size, k, vocab_size): """ Generates a fake temperature zero probability distribution. 
diff --git a/tests/spec_decode/conftest.py b/tests/spec_decode/conftest.py new file mode 100644 index 0000000000000..1a20e2c135c2e --- /dev/null +++ b/tests/spec_decode/conftest.py @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest + + +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + Since this module is V0 only, set VLLM_USE_V1=0 for + all tests in the module. + """ + monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/tensorizer_loader/conftest.py b/tests/tensorizer_loader/conftest.py index 694bb5fbc3f71..a88ae8cda73d3 100644 --- a/tests/tensorizer_loader/conftest.py +++ b/tests/tensorizer_loader/conftest.py @@ -12,6 +12,14 @@ from vllm.distributed import cleanup_dist_env_and_memory from vllm.model_executor.model_loader.tensorizer import TensorizerConfig +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + Tensorizer only tested on V0 so far. + """ + monkeypatch.setenv('VLLM_USE_V1', '0') + + @pytest.fixture(autouse=True) def cleanup(): cleanup_dist_env_and_memory(shutdown_ray=True) diff --git a/tests/test_regression.py b/tests/test_regression.py index ce9498e8d7e84..b54dc6af3e9a6 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -7,11 +7,13 @@ will never happen again. """ import gc +import pytest import torch from vllm import LLM, SamplingParams +@pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len") def test_duplicated_ignored_sequence_group(): """https://github.com/vllm-project/vllm/issues/1655""" diff --git a/tests/test_utils.py b/tests/test_utils.py index 49fb02fd04039..dcca7d5965e9e 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -366,7 +366,10 @@ def test_bind_kv_cache_non_attention(): assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1] -def test_bind_kv_cache_encoder_decoder(): +def test_bind_kv_cache_encoder_decoder(monkeypatch): + # V1 TESTS: ENCODER_DECODER is not supported on V1 yet. 
+ monkeypatch.setenv("VLLM_USE_V1", "0") + from vllm.attention import Attention, AttentionType # example from bart diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 9aa2eea3154cc..b1860e0bb7083 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -279,7 +279,12 @@ def test_decode_prompt_logprobs_chunked_prefill( model, chunked_prefill_token_size: int, example_prompts, + monkeypatch, ): + # VLLM V1 does not use incremental detokenization for + # prompt logprobs, so this test strategy is irrelevant. + monkeypatch.setenv("VLLM_USE_V1", "0") + max_num_seqs = 256 enable_chunked_prefill = False max_num_batched_tokens = None diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index fd947bd7fed06..aad37eb9b8f3a 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -91,20 +91,22 @@ CONFIGS: dict[str, ServerConfig] = { "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT " "to the user's question - just respond to it normally." }, - "granite20b": { - "model": - "mbayser/granite-20b-functioncalling-FP8-KV", - "arguments": [ - "--tool-call-parser", "granite-20b-fc", "--chat-template", - str(VLLM_PATH / - "examples/tool_chat_template_granite_20b_fc.jinja"), - "--max_num_seqs", "1", "--enforce-eager", "--cpu-offload-gb", "20" - ], - "supports_parallel": - False, - "supports_rocm": - False, - }, + # V1 Test: Passing locally but failing in CI. This runs the + # V0 Engine because of CPU offloading. Need to debug why. 
+ # "granite20b": { + # "model": + # "mbayser/granite-20b-functioncalling-FP8-KV", + # "arguments": [ + # "--tool-call-parser", "granite-20b-fc", "--chat-template", + # str(VLLM_PATH / + # "examples/tool_chat_template_granite_20b_fc.jinja"), + # "--max_num_seqs", "1", "--enforce-eager", "--cpu-offload-gb", "20" + # ], + # "supports_parallel": + # False, + # "supports_rocm": + # False, + # }, "granite-3.0-8b": { "model": "ibm-granite/granite-3.0-8b-instruct", diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index 5fc5d08b327bb..97149884497af 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -19,6 +19,16 @@ from opentelemetry.sdk.environment_variables import ( from vllm import LLM, SamplingParams from vllm.tracing import SpanAttributes + +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + Since this module is V0 only, set VLLM_USE_V1=0 for + all tests in the module. + """ + monkeypatch.setenv('VLLM_USE_V1', '0') + + FAKE_TRACE_SERVER_ADDRESS = "localhost:4317" FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value', diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py index a3540582a397c..02470ca92f47f 100644 --- a/tests/v1/engine/test_engine_args.py +++ b/tests/v1/engine/test_engine_args.py @@ -18,19 +18,19 @@ if not envs.VLLM_USE_V1: def test_prefix_caching_from_cli(): parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) args = parser.parse_args([]) - engine_args = EngineArgs.from_cli_args(args=args) - assert (engine_args.enable_prefix_caching + vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config() + assert (vllm_config.cache_config.enable_prefix_caching ), "V1 turns on prefix caching by default." # Turn it off possible with flag. 
args = parser.parse_args(["--no-enable-prefix-caching"]) - engine_args = EngineArgs.from_cli_args(args=args) - assert not engine_args.enable_prefix_caching + vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config() + assert not vllm_config.cache_config.enable_prefix_caching # Turn it on with flag. args = parser.parse_args(["--enable-prefix-caching"]) - engine_args = EngineArgs.from_cli_args(args=args) - assert engine_args.enable_prefix_caching + vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config() + assert vllm_config.cache_config.enable_prefix_caching def test_defaults_with_usage_context(): @@ -38,11 +38,21 @@ def test_defaults_with_usage_context(): vllm_config: VllmConfig = engine_args.create_engine_config( UsageContext.LLM_CLASS) + from vllm.platforms import current_platform + device_name = current_platform.get_device_name().lower() + if "h100" in device_name or "h200" in device_name: + # For H100 and H200, we use larger default values. + default_llm_tokens = 16384 + default_server_tokens = 8192 + else: + default_llm_tokens = 8192 + default_server_tokens = 2048 + assert vllm_config.scheduler_config.max_num_seqs == 1024 - assert vllm_config.scheduler_config.max_num_batched_tokens == 8192 + assert vllm_config.scheduler_config.max_num_batched_tokens == default_llm_tokens # noqa: E501 engine_args = EngineArgs(model="facebook/opt-125m") vllm_config = engine_args.create_engine_config( UsageContext.OPENAI_API_SERVER) assert vllm_config.scheduler_config.max_num_seqs == 1024 - assert vllm_config.scheduler_config.max_num_batched_tokens == 2048 + assert vllm_config.scheduler_config.max_num_batched_tokens == default_server_tokens # noqa: E501 diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 9715573e3f142..e763aa2c86998 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -6,7 +6,6 @@ from collections.abc import Generator import pytest import torch -from 
tests.kernels.utils import override_backend_env_variable from tests.v1.sample.utils import ( BatchLogprobsComposition, BatchLogprobsSpecType, assert_incr_detok_str_matches_non_incr_detok_str, @@ -334,7 +333,7 @@ def test_get_logprobs_and_prompt_logprobs( do_apc=do_apc) -def test_max_logprobs(monkeypatch): +def test_max_logprobs(): """vLLM v1 engine should fail a request with `logprobs > max_logprobs` Should also fail for `prompt_logprobs > max_logprobs` @@ -344,7 +343,6 @@ def test_max_logprobs(monkeypatch): Args: monkeypatch """ - override_backend_env_variable(monkeypatch, "FLASH_ATTN") runner = VllmRunner("facebook/opt-125m", max_logprobs=1, diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py new file mode 100644 index 0000000000000..d74a96fbfa02f --- /dev/null +++ b/tests/v1/test_oracle.py @@ -0,0 +1,169 @@ +# SPDX-License-Identifier: Apache-2.0 +import os + +import pytest + +import vllm.envs as envs +from vllm import LLM +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine + +UNSUPPORTED_MODELS_V1 = [ + "openai/whisper-large-v3", # transcription + "facebook/bart-large-cnn", # encoder decoder + "mistralai/Mamba-Codestral-7B-v0.1", # mamba + "ibm-ai-platform/Bamba-9B", # hybrid + "BAAI/bge-m3", # embedding +] + +MODEL = "meta-llama/Llama-3.2-1B-Instruct" + + +@pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1) +def test_reject_unsupported_models(monkeypatch, model): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + args = AsyncEngineArgs(model=model) + + with pytest.raises(NotImplementedError): + _ = args.create_engine_config() + m.delenv("VLLM_USE_V1") + + +def test_reject_bad_config(monkeypatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + + +def test_unsupported_configs(monkeypatch): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + with pytest.raises(NotImplementedError): + AsyncEngineArgs( + model=MODEL, + 
kv_cache_dtype="fp8", + ).create_engine_config() + + with pytest.raises(NotImplementedError): + AsyncEngineArgs( + model=MODEL, + speculative_model=MODEL, + ).create_engine_config() + + with pytest.raises(NotImplementedError): + AsyncEngineArgs( + model=MODEL, + guided_decoding_backend="lm-format-enforcer:no-fallback", + ).create_engine_config() + + with pytest.raises(NotImplementedError): + AsyncEngineArgs( + model=MODEL, + preemption_mode="swap", + ).create_engine_config() + + with pytest.raises(NotImplementedError): + AsyncEngineArgs( + model=MODEL, + disable_async_output_proc=True, + ).create_engine_config() + + with pytest.raises(NotImplementedError): + AsyncEngineArgs( + model=MODEL, + scheduling_policy="priority", + ).create_engine_config() + + with pytest.raises(NotImplementedError): + AsyncEngineArgs( + model=MODEL, + num_scheduler_steps=5, + ).create_engine_config() + + with pytest.raises(NotImplementedError): + AsyncEngineArgs( + model=MODEL, + scheduler_delay_factor=1.2, + ).create_engine_config() + + +def test_enable_by_default_fallback(monkeypatch): + with monkeypatch.context() as m: + if os.getenv("VLLM_USE_V1", None): + m.delenv("VLLM_USE_V1") + + # Should default to V1 for supported config. + _ = AsyncEngineArgs( + model=MODEL, + enforce_eager=True, + ).create_engine_config() + assert envs.VLLM_USE_V1 + m.delenv("VLLM_USE_V1") + + # Should fall back to V0 for experimental config. + _ = AsyncEngineArgs( + model=MODEL, + enable_lora=True, + ).create_engine_config() + assert not envs.VLLM_USE_V1 + m.delenv("VLLM_USE_V1") + + # Should fall back to V0 for supported model. + _ = AsyncEngineArgs( + model=UNSUPPORTED_MODELS_V1[0]).create_engine_config() + assert not envs.VLLM_USE_V1 + m.delenv("VLLM_USE_V1") + + +def test_v1_llm_by_default(monkeypatch): + with monkeypatch.context() as m: + if os.getenv("VLLM_USE_V1", None): + m.delenv("VLLM_USE_V1") + + # Should default to V1 for supported config. 
+ model = LLM(MODEL, enforce_eager=True) + print(model.generate("Hello my name is")) + assert hasattr(model.llm_engine, "engine_core") + m.delenv("VLLM_USE_V1") + + +def test_v1_attn_backend(monkeypatch): + with monkeypatch.context() as m: + if os.getenv("VLLM_USE_V1", None): + m.delenv("VLLM_USE_V1") + m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") + + # Fall back to V0. + _ = AsyncEngineArgs(model=MODEL).create_engine_config() + assert not envs.VLLM_USE_V1 + m.delenv("VLLM_USE_V1") + + # Reject if V1. + m.setenv("VLLM_USE_V1", "1") + with pytest.raises(NotImplementedError): + AsyncEngineArgs(model=MODEL).create_engine_config() + m.delenv("VLLM_USE_V1") + + m.setenv("VLLM_ATTENTION_BACKEND", "FLASHMLA") + _ = AsyncEngineArgs(model=MODEL).create_engine_config() + assert envs.VLLM_USE_V1 + m.delenv("VLLM_USE_V1") + + +def test_reject_using_constructor_directly(monkeypatch): + with monkeypatch.context() as m: + if os.getenv("VLLM_USE_V1", None): + m.delenv("VLLM_USE_V1") + + # Sets VLLM_USE_V1=1. + vllm_config = AsyncEngineArgs(model=MODEL).create_engine_config() + + # This uses the V0 constructor directly. 
+ with pytest.raises(ValueError): + AsyncLLMEngine(vllm_config, + AsyncLLMEngine._get_executor_cls(vllm_config), + log_stats=True) + + m.delenv("VLLM_USE_V1") diff --git a/tests/weight_loading/test_weight_loading.py b/tests/weight_loading/test_weight_loading.py index 9d6b25da7e6d1..9f99b3725fe41 100644 --- a/tests/weight_loading/test_weight_loading.py +++ b/tests/weight_loading/test_weight_loading.py @@ -15,6 +15,9 @@ QUANTIZATION = os.environ.get("QUANTIZATION", "gptq_marlin") MIN_CAPABILITY = os.environ.get("MIN_CAPABILITY", "80") +@pytest.mark.skipif( + MODEL_NAME == "casperhansen/deepseek-coder-v2-instruct-awq", + reason="OOM in the CI") @pytest.mark.skipif( not current_platform.has_device_capability(int(MIN_CAPABILITY)), reason="Current system does not have minimum capability.") @@ -22,10 +25,14 @@ def test_weight_loading(vllm_runner): """ Test parameter weight loading with tp>1. """ + + # MoE models need fp16. + NEEDS_FP16 = (QUANTIZATION == "gptq" or MODEL_NAME + == "nm-testing/test-w4a16-mixtral-actorder-group") with vllm_runner( model_name=MODEL_NAME, revision=REVISION, - dtype=torch.half if QUANTIZATION == "gptq" else "auto", + dtype=torch.half if NEEDS_FP16 else "auto", quantization=None if QUANTIZATION == "None" else QUANTIZATION, max_model_len=MAX_MODEL_LEN, tensor_parallel_size=2) as model: diff --git a/tests/worker/conftest.py b/tests/worker/conftest.py new file mode 100644 index 0000000000000..372d71a78d0a7 --- /dev/null +++ b/tests/worker/conftest.py @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest + + +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + This module tests V0 internals, so set VLLM_USE_V1=0. 
+ """ + monkeypatch.setenv('VLLM_USE_V1', '0') \ No newline at end of file diff --git a/vllm/config.py b/vllm/config.py index 40ea50cb083fb..70cc0affe9982 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1140,6 +1140,10 @@ class CacheConfig: if self.cache_dtype == "auto": pass elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2"): + if envs.VLLM_USE_V1: + raise NotImplementedError( + "V1 does not yet support fp8 KV cache. " + "Set VLLM_USE_V1=0 to enable fp8 kv cache.") logger.info( "Using fp8 data type to store kv cache. It reduces the GPU " "memory footprint and boosts the performance. " @@ -3142,16 +3146,7 @@ class CompilationConfig(BaseModel): self.inductor_compile_config[KEY] = False if self.splitting_ops is None: - if envs.VLLM_USE_V1: - # v1 must split the graph on attention ops - # for piecewise cudagraph - self.splitting_ops = [ - "vllm.unified_attention", - "vllm.unified_attention_with_output", - ] - else: - # v0 uses full graph compilation - self.splitting_ops = [] + self.splitting_ops = [] for k, v in self.inductor_passes.items(): if not isinstance(v, str): @@ -3246,6 +3241,15 @@ class CompilationConfig(BaseModel): self.bs_to_padded_graph_size[ self.max_capture_size] = self.max_capture_size + def set_splitting_ops_for_v1(self): + # If default, override splitting ops for piecewise cudagraph on V1. + # NOTE: this function needs to be called + if not self.splitting_ops: + self.splitting_ops = [ + "vllm.unified_attention", + "vllm.unified_attention_with_output", + ] + @dataclass class VllmConfig: @@ -3297,6 +3301,7 @@ class VllmConfig: vllm_factors: list[Any] = [] from vllm import __version__ vllm_factors.append(__version__) + vllm_factors.append(envs.VLLM_USE_V1) if self.model_config: vllm_factors.append(self.model_config.compute_hash()) else: @@ -3460,6 +3465,7 @@ class VllmConfig: # CUDA graphs do not work properly with the custom CUDA kernels. 
# FIXME(woosuk): Disable inductor to reduce the compilation time # and avoid any potential issues with the inductor. + # FIXME(rob): Add function to set all of these. self.compilation_config.custom_ops = ["none"] self.compilation_config.use_cudagraph = True self.compilation_config.use_inductor = True @@ -3467,6 +3473,7 @@ class VllmConfig: self.compilation_config.pass_config.enable_fusion = False self.compilation_config.pass_config.enable_noop = False self.compilation_config.level = CompilationLevel.PIECEWISE + self.compilation_config.set_splitting_ops_for_v1() self._set_cudagraph_sizes() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cef3a3f78b0b9..31d567de0efa5 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -223,15 +223,6 @@ class EngineArgs: if not self.tokenizer: self.tokenizer = self.model - # Override the default value of enable_prefix_caching if it's not set - # by user. - if self.enable_prefix_caching is None: - self.enable_prefix_caching = bool(envs.VLLM_USE_V1) - - # Override max_num_seqs if it's not set by user. - if self.max_num_seqs is None: - self.max_num_seqs = 256 if not envs.VLLM_USE_V1 else 1024 - # support `EngineArgs(compilation_config={...})` # without having to manually construct a # CompilationConfig object @@ -246,7 +237,6 @@ class EngineArgs: @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: """Shared CLI arguments for vLLM engine.""" - # Model arguments parser.add_argument( '--model', @@ -1191,24 +1181,51 @@ class EngineArgs: use_tqdm_on_load=self.use_tqdm_on_load, ) - def create_engine_config(self, - usage_context: Optional[UsageContext] = None - ) -> VllmConfig: + def create_engine_config( + self, + usage_context: Optional[UsageContext] = None, + ) -> VllmConfig: + """ + Create the VllmConfig. + + NOTE: for autoselection of V0 vs V1 engine, we need to + create the ModelConfig first, since ModelConfig's attrs + (e.g. 
the model arch) are needed to make the decision. + + This function sets VLLM_USE_V1=X if VLLM_USE_V1 is + unspecified by the user. + + If VLLM_USE_V1 is specified by the user but the VllmConfig + is incompatible, we raise an error. + """ from vllm.platforms import current_platform current_platform.pre_register_and_update() - if envs.VLLM_USE_V1: - self._override_v1_engine_args(usage_context) - device_config = DeviceConfig(device=self.device) model_config = self.create_model_config() - if (model_config.is_multimodal_model and not envs.VLLM_USE_V1 - and self.enable_prefix_caching): - logger.warning("--enable-prefix-caching is currently not " - "supported for multimodal models in v0 and " - "has been disabled.") - self.enable_prefix_caching = False + # * If VLLM_USE_V1 is unset, we enable V1 for "supported features" + # and fall back to V0 for experimental or unsupported features. + # * If VLLM_USE_V1=1, we enable V1 for supported + experimental + # features and raise error for unsupported features. + # * If VLLM_USE_V1=0, we disable V1. + use_v1 = False + try_v1 = envs.VLLM_USE_V1 or not envs.is_set("VLLM_USE_V1") + if try_v1 and self._is_v1_supported_oracle(model_config): + use_v1 = True + + # If user explicitly set VLLM_USE_V1, sanity check we respect it. + if envs.is_set("VLLM_USE_V1"): + assert use_v1 == envs.VLLM_USE_V1 + # Otherwise, set the VLLM_USE_V1 variable globally. + else: + envs.set_vllm_use_v1(use_v1) + + # Set default arguments for V0 or V1 Engine. + if use_v1: + self._set_default_args_v1(usage_context) + else: + self._set_default_args_v0(model_config) cache_config = CacheConfig( block_size=self.block_size, @@ -1239,50 +1256,6 @@ class EngineArgs: worker_extension_cls=self.worker_extension_cls, ) - max_model_len = model_config.max_model_len - use_long_context = max_model_len > 32768 - if self.enable_chunked_prefill is None: - # If not explicitly set, enable chunked prefill by default for - # long context (> 32K) models. 
This is to avoid OOM errors in the - # initial memory profiling phase. - - # For multimodal models and models with MLA, chunked prefill is - # disabled by default in V0, but enabled by design in V1 - if model_config.is_multimodal_model or model_config.use_mla: - self.enable_chunked_prefill = bool(envs.VLLM_USE_V1) - - elif use_long_context: - is_gpu = device_config.device_type == "cuda" - use_sliding_window = (model_config.get_sliding_window() - is not None) - use_spec_decode = self.speculative_model is not None - from vllm.platforms import current_platform - if (is_gpu and not use_sliding_window and not use_spec_decode - and not self.enable_lora - and not self.enable_prompt_adapter - and model_config.runner_type != "pooling" - and not current_platform.is_rocm()): - self.enable_chunked_prefill = True - logger.warning( - "Chunked prefill is enabled by default for models with " - "max_model_len > 32K. Currently, chunked prefill might " - "not work with some features or models. If you " - "encounter any issues, please disable chunked prefill " - "by setting --enable-chunked-prefill=False.") - if self.enable_chunked_prefill is None: - self.enable_chunked_prefill = False - - if not self.enable_chunked_prefill and use_long_context: - logger.warning( - "The model has a long context length (%s). This may cause OOM " - "errors during the initial memory profiling phase, or result " - "in low performance due to small KV cache space. 
Consider " - "setting --max-model-len to a smaller value.", max_model_len) - elif (self.enable_chunked_prefill - and model_config.runner_type == "pooling"): - msg = "Chunked prefill is not supported for pooling models" - raise ValueError(msg) - speculative_config = SpeculativeConfig.maybe_create_spec_config( target_model_config=model_config, target_parallel_config=parallel_config, @@ -1425,18 +1398,282 @@ class EngineArgs: additional_config=self.additional_config, ) - if envs.VLLM_USE_V1: - self._override_v1_engine_config(config) return config - def _override_v1_engine_args(self, usage_context: UsageContext) -> None: - """ - Override the EngineArgs's args based on the usage context for V1. - """ - assert envs.VLLM_USE_V1, "V1 is not enabled" + def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: + """Oracle for whether to use V0 or V1 Engine by default.""" + + ############################################################# + # Unsupported Feature Flags on V1. + + if (self.load_format == LoadFormat.TENSORIZER.value + or self.load_format == LoadFormat.SHARDED_STATE.value): + _raise_or_fallback( + feature_name=f"--load_format {self.load_format}", + recommend_to_remove=False) + return False + + if (self.logits_processor_pattern + != EngineArgs.logits_processor_pattern): + _raise_or_fallback(feature_name="--logits-processor-pattern", + recommend_to_remove=False) + return False + + if self.preemption_mode != EngineArgs.preemption_mode: + _raise_or_fallback(feature_name="--preemption-mode", + recommend_to_remove=True) + return False + + if (self.disable_async_output_proc + != EngineArgs.disable_async_output_proc): + _raise_or_fallback(feature_name="--disable-async-output-proc", + recommend_to_remove=True) + return False + + if self.scheduling_policy != EngineArgs.scheduling_policy: + _raise_or_fallback(feature_name="--scheduling-policy", + recommend_to_remove=False) + return False + + if self.worker_cls != EngineArgs.worker_cls: + 
_raise_or_fallback(feature_name="--worker-cls", + recommend_to_remove=False) + return False + + if self.worker_extension_cls != EngineArgs.worker_extension_cls: + _raise_or_fallback(feature_name="--worker-extension-cls", + recommend_to_remove=False) + return False + + if self.num_scheduler_steps != EngineArgs.num_scheduler_steps: + _raise_or_fallback(feature_name="--num-scheduler-steps", + recommend_to_remove=True) + return False + + if self.scheduler_delay_factor != EngineArgs.scheduler_delay_factor: + _raise_or_fallback(feature_name="--scheduler-delay-factor", + recommend_to_remove=True) + return False + + if self.additional_config != EngineArgs.additional_config: + _raise_or_fallback(feature_name="--additional-config", + recommend_to_remove=False) + return False + + # Only support Xgrammar for guided decoding so far. + SUPPORTED_GUIDED_DECODING = ["xgrammar", "xgrammar:nofallback"] + if self.guided_decoding_backend not in SUPPORTED_GUIDED_DECODING: + _raise_or_fallback(feature_name="--guided-decoding-backend", + recommend_to_remove=False) + return False + + # Need at least Ampere for now (FA support required). + from vllm.platforms import current_platform + if (current_platform.is_cuda() + and current_platform.get_device_capability().major < 8): + _raise_or_fallback(feature_name="Compute Capability < 8.0", + recommend_to_remove=False) + return False + + # No Fp8 KV cache so far. + if self.kv_cache_dtype != "auto": + _raise_or_fallback(feature_name="--kv-cache-dtype", + recommend_to_remove=False) + return False + + # No Prompt Adapter so far. + if self.enable_prompt_adapter: + _raise_or_fallback(feature_name="--enable-prompt-adapter", + recommend_to_remove=False) + return False + + # No MistralTokenizer support so far (not compatible + # with xgrammar) + if model_config.tokenizer_mode == "mistral": + _raise_or_fallback(feature_name="--tokenizer-mode mistral", + recommend_to_remove=False) + return False + + # No CPU offloading yet. 
+ if self.cpu_offload_gb != EngineArgs.cpu_offload_gb: + _raise_or_fallback(feature_name="--cpu-offload-gb", + recommend_to_remove=False) + return False + + # Only Fp16 and Bf16 dtypes since we only support FA. + V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16] + if model_config.dtype not in V1_SUPPORTED_DTYPES: + _raise_or_fallback(feature_name=f"--dtype {model_config.dtype}", + recommend_to_remove=False) + return False + + # Some quantization is not compatible with torch.compile. + V1_UNSUPPORTED_QUANT = ["bitsandbytes", "gguf"] + if model_config.quantization in V1_UNSUPPORTED_QUANT: + _raise_or_fallback( + feature_name=f"--quantization {model_config.quantization}", + recommend_to_remove=False) + return False + + # No Embedding Models so far. + if model_config.task not in ["generate"]: + _raise_or_fallback(feature_name=f"--task {model_config.task}", + recommend_to_remove=False) + return False + + # No Mamba or Encoder-Decoder so far. + if not model_config.is_v1_compatible: + _raise_or_fallback(feature_name=model_config.architectures, + recommend_to_remove=False) + return False + + # No TransformersModel support so far. + if (model_config.model_impl == ModelImpl.TRANSFORMERS + or model_config.model_impl == "transformers"): + _raise_or_fallback( + feature_name=f"model_impl={model_config.model_impl}", + recommend_to_remove=False) + return False + + # No Concurrent Partial Prefills so far. + if (self.max_num_partial_prefills + != EngineArgs.max_num_partial_prefills + or self.max_long_partial_prefills + != EngineArgs.max_long_partial_prefills + or self.long_prefill_token_threshold + != EngineArgs.long_prefill_token_threshold): + _raise_or_fallback(feature_name="Concurrent Partial Prefill", + recommend_to_remove=False) + return False + + # No OTLP observability so far. 
+ if (self.otlp_traces_endpoint or self.collect_detailed_traces): + _raise_or_fallback(feature_name="--otlp-traces-endpoint", + recommend_to_remove=False) + return False + + # Only Ngram speculative decoding so far. + if (self.speculative_model is not None + or self.num_speculative_tokens is not None): + # This is supported but experimental (handled below). + if self.speculative_model == "[ngram]": + pass + else: + _raise_or_fallback(feature_name="Speculative Decoding", + recommend_to_remove=False) + return False + + # No Disaggregated Prefill so far. + if self.kv_transfer_config != EngineArgs.kv_transfer_config: + _raise_or_fallback(feature_name="--kv-transfer-config", + recommend_to_remove=False) + return False + + # No FlashInfer or XFormers so far. + V1_BACKENDS = [ + "FLASH_ATTN_VLLM_V1", "FLASH_ATTN", "PALLAS", "PALLAS_VLLM_V1", + "TRITON_MLA", "FLASHMLA" + ] + if (envs.is_set("VLLM_ATTENTION_BACKEND") + and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS): + name = f"VLLM_ATTENTION_BACKEND={envs.VLLM_ATTENTION_BACKEND}" + _raise_or_fallback(feature_name=name, recommend_to_remove=True) + return False + + ############################################################# + # Experimental Features - allow users to opt in. + + # MLA is supported on V1, but off by default for now. + if model_config.use_mla and _warn_or_fallback("MLA"): + return False + + # LoRA is supported on V1, but off by default for now. + if self.enable_lora and _warn_or_fallback("LORA"): + return False + + # PP is supported on V1, but off by default for now. + if self.pipeline_parallel_size > 1 and _warn_or_fallback("PP"): + return False + + # ngram is supported on V1, but off by default for now. + if self.speculative_model == "[ngram]" and _warn_or_fallback("ngram"): + return False + + # Non-CUDA is supported on V1, but off by default for now. 
+ not_cuda = not current_platform.is_cuda() + if not_cuda and _warn_or_fallback( # noqa: SIM103 + current_platform.device_type): + return False + ############################################################# + + return True + + def _set_default_args_v0(self, model_config: ModelConfig) -> None: + """Set Default Arguments for V0 Engine.""" + + max_model_len = model_config.max_model_len + use_long_context = max_model_len > 32768 + if self.enable_chunked_prefill is None: + # Chunked prefill not supported for Multimodal or MLA in V0. + if model_config.is_multimodal_model or model_config.use_mla: + self.enable_chunked_prefill = False + + # Enable chunked prefill by default for long context (> 32K) + # models to avoid OOM errors in initial memory profiling phase. + elif use_long_context: + from vllm.platforms import current_platform + is_gpu = current_platform.is_cuda() + use_sliding_window = (model_config.get_sliding_window() + is not None) + use_spec_decode = self.speculative_model is not None + + if (is_gpu and not use_sliding_window and not use_spec_decode + and not self.enable_lora + and not self.enable_prompt_adapter + and model_config.runner_type != "pooling"): + self.enable_chunked_prefill = True + logger.warning( + "Chunked prefill is enabled by default for models " + "with max_model_len > 32K. Chunked prefill might " + "not work with some features or models. If you " + "encounter any issues, please disable by launching " + "with --enable-chunked-prefill=False.") + + if self.enable_chunked_prefill is None: + self.enable_chunked_prefill = False + + if not self.enable_chunked_prefill and use_long_context: + logger.warning( + "The model has a long context length (%s). This may cause " + "OOM during the initial memory profiling phase, or result " + "in low performance due to small KV cache size. 
Consider " + "setting --max-model-len to a smaller value.", max_model_len) + elif (self.enable_chunked_prefill + and model_config.runner_type == "pooling"): + msg = "Chunked prefill is not supported for pooling models" + raise ValueError(msg) + + # Disable prefix caching for multimodal models for VLLM_V0. + if (model_config.is_multimodal_model and self.enable_prefix_caching): + logger.warning( + "--enable-prefix-caching is not supported for multimodal " + "models in V0 and has been disabled.") + self.enable_prefix_caching = False + + # Set max_num_seqs to 256 for VLLM_V0. + if self.max_num_seqs is None: + self.max_num_seqs = 256 + + def _set_default_args_v1(self, usage_context: UsageContext) -> None: + """Set Default Arguments for V1 Engine.""" # V1 always uses chunked prefills. self.enable_chunked_prefill = True + + # V1 enables prefix caching by default. + if self.enable_prefix_caching is None: + self.enable_prefix_caching = True + # V1 should use the new scheduler by default. # Swap it only if this arg is set to the original V0 default if self.scheduler_cls == EngineArgs.scheduler_cls: @@ -1471,19 +1708,21 @@ class EngineArgs: UsageContext.OPENAI_API_SERVER: 2048, } + use_context_value = usage_context.value if usage_context else None if (self.max_num_batched_tokens is None and usage_context in default_max_num_batched_tokens): self.max_num_batched_tokens = default_max_num_batched_tokens[ usage_context] - logger.warning( + logger.debug( "Setting max_num_batched_tokens to %d for %s usage context.", - self.max_num_batched_tokens, usage_context.value) + self.max_num_batched_tokens, use_context_value) - def _override_v1_engine_config(self, engine_config: VllmConfig) -> None: - """ - Override the EngineConfig's configs based on the usage context for V1. 
- """ - assert envs.VLLM_USE_V1, "V1 is not enabled" + default_max_num_seqs = 1024 + if self.max_num_seqs is None: + self.max_num_seqs = default_max_num_seqs + + logger.debug("Setting max_num_seqs to %d for %s usage context.", + self.max_num_seqs, use_context_value) @dataclass @@ -1508,6 +1747,33 @@ class AsyncEngineArgs(EngineArgs): return parser +def _raise_or_fallback(feature_name: str, recommend_to_remove: bool): + if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: + raise NotImplementedError( + f"VLLM_USE_V1=1 is not supported with {feature_name}.") + msg = f"{feature_name} is not supported by the V1 Engine. " + msg += "Falling back to V0. " + if recommend_to_remove: + msg += f"We recommend to remove {feature_name} from your config " + msg += "in favor of the V1 Engine." + logger.warning(msg) + + +def _warn_or_fallback(feature_name: str) -> bool: + if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: + logger.warning( + "Detected VLLM_USE_V1=1 with %s. Usage should " + "be considered experimental. Please report any " + "issues on Github.", feature_name) + should_exit = False + else: + logger.info( + "%s is experimental on VLLM_USE_V1=1. " + "Falling back to V0 Engine.", feature_name) + should_exit = True + return should_exit + + # These functions are used by sphinx to build the documentation def _engine_args_parser(): return EngineArgs.add_cli_args(FlexibleArgumentParser()) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index ebba34c5c8677..84f5528a06d02 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -595,6 +595,13 @@ class AsyncLLMEngine(EngineClient): log_requests: bool = True, start_engine_loop: bool = True, **kwargs) -> None: + if envs.VLLM_USE_V1: + raise ValueError( + "Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. " + "This should not happen. As a workaround, try using " + "AsyncLLMEngine.from_vllm_config(...) 
or explicitly set " + "VLLM_USE_V1=0 or 1 and report this issue on Github.") + self.log_requests = log_requests self.engine = self._engine_class(*args, **kwargs) @@ -629,33 +636,53 @@ class AsyncLLMEngine(EngineClient): engine_config: VllmConfig) -> Type[ExecutorBase]: return LLMEngine._get_executor_cls(engine_config) + @classmethod + def from_vllm_config( + cls, + vllm_config: VllmConfig, + start_engine_loop: bool = True, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[dict[str, StatLoggerBase]] = None, + disable_log_requests: bool = False, + disable_log_stats: bool = False, + ) -> "AsyncLLMEngine": + """Create an AsyncLLMEngine from the EngineArgs.""" + + return cls( + vllm_config=vllm_config, + executor_class=cls._get_executor_cls(vllm_config), + start_engine_loop=start_engine_loop, + log_requests=not disable_log_requests, + log_stats=not disable_log_stats, + usage_context=usage_context, + stat_loggers=stat_loggers, + ) + @classmethod def from_engine_args( cls, engine_args: AsyncEngineArgs, - engine_config: Optional[VllmConfig] = None, start_engine_loop: bool = True, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, ) -> "AsyncLLMEngine": """Creates an async LLM engine from the engine arguments.""" - # Create the engine configs. - if engine_config is None: - engine_config = engine_args.create_engine_config(usage_context) - executor_class = cls._get_executor_cls(engine_config) + vllm_config = engine_args.create_engine_config(usage_context) - # Create the async LLM engine. 
- engine = cls( - vllm_config=engine_config, - executor_class=executor_class, - log_requests=not engine_args.disable_log_requests, - log_stats=not engine_args.disable_log_stats, + async_engine_cls = cls + if envs.VLLM_USE_V1: + from vllm.v1.engine.async_llm import AsyncLLM as V1AsyncLLMEngine + async_engine_cls = V1AsyncLLMEngine + + return async_engine_cls.from_vllm_config( + vllm_config=vllm_config, start_engine_loop=start_engine_loop, usage_context=usage_context, stat_loggers=stat_loggers, + disable_log_stats=engine_args.disable_log_stats, + disable_log_requests=engine_args.disable_log_requests, ) - return engine @property def is_running(self) -> bool: @@ -1203,7 +1230,7 @@ class AsyncLLMEngine(EngineClient): # TODO(v1): Remove this class proxy when V1 goes default. -if envs.VLLM_USE_V1: +if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: from vllm.v1.engine.async_llm import AsyncLLM AsyncLLMEngine = AsyncLLM # type: ignore diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 783275ab41d26..94687a13c5280 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -216,6 +216,12 @@ class LLMEngine: mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, use_cached_outputs: bool = False, ) -> None: + if envs.VLLM_USE_V1: + raise ValueError( + "Using V0 LLMEngine, but envs.VLLM_USE_V1=True. " + "This should not happen. As a workaround, try using " + "LLMEngine.from_vllm_config(...) 
or explicitly set " + "VLLM_USE_V1=0 or 1 and report this issue on Github.") self.vllm_config = vllm_config self.model_config = vllm_config.model_config @@ -479,6 +485,22 @@ class LLMEngine: f"{distributed_executor_backend}") return executor_class + @classmethod + def from_vllm_config( + cls, + vllm_config: VllmConfig, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + disable_log_stats: bool = False, + ) -> "LLMEngine": + return cls( + vllm_config=vllm_config, + executor_class=cls._get_executor_cls(vllm_config), + log_stats=(not disable_log_stats), + usage_context=usage_context, + stat_loggers=stat_loggers, + ) + @classmethod def from_engine_args( cls, @@ -488,19 +510,20 @@ class LLMEngine: ) -> "LLMEngine": """Creates an LLM engine from the engine arguments.""" # Create the engine configs. - engine_config = engine_args.create_engine_config(usage_context) - executor_class = cls._get_executor_cls(engine_config) - # Create the LLM engine. - engine = cls( - vllm_config=engine_config, - executor_class=executor_class, - log_stats=not engine_args.disable_log_stats, + vllm_config = engine_args.create_engine_config(usage_context) + + engine_cls = cls + if envs.VLLM_USE_V1: + from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine + engine_cls = V1LLMEngine + + return engine_cls.from_vllm_config( + vllm_config=vllm_config, usage_context=usage_context, stat_loggers=stat_loggers, + disable_log_stats=engine_args.disable_log_stats, ) - return engine - def __reduce__(self): # This is to ensure that the LLMEngine is not referenced in # the closure used to initialize Ray worker actors @@ -2097,6 +2120,6 @@ class LLMEngine: return sampling_params -# TODO(v1): Remove this class proxy when V1 goes default. 
-if envs.VLLM_USE_V1: - from vllm.v1.engine.llm_engine import LLMEngine # type: ignore +if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: + from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine + LLMEngine = V1LLMEngine # type: ignore diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 005ba81cd2264..b1bb0fd53d67a 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -18,7 +18,6 @@ from zmq.asyncio import Socket from vllm import PoolingParams from vllm.config import DecodingConfig, ModelConfig, VllmConfig from vllm.core.scheduler import SchedulerOutputs -from vllm.engine.arg_utils import AsyncEngineArgs # yapf conflicts with isort for this block # yapf: disable from vllm.engine.async_llm_engine import ( @@ -133,9 +132,9 @@ class MQLLMEngineClient(EngineClient): self._engine_process = psutil.Process(engine_pid) @staticmethod - def is_unsupported_config(engine_args: AsyncEngineArgs): + def is_unsupported_config(vllm_config: VllmConfig): # Pipeline parallel not yet supported - return engine_args.pipeline_parallel_size > 1 + return vllm_config.parallel_config.pipeline_parallel_size > 1 @contextmanager def get_data_socket(self) -> Iterator[Socket]: diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 5d14b4112a862..312e0e98d56b4 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -9,6 +9,7 @@ import cloudpickle import zmq from vllm import AsyncEngineArgs, SamplingParams +from vllm.config import VllmConfig from vllm.engine.llm_engine import LLMEngine # yapf conflicts with isort for this block # yapf: disable @@ -110,25 +111,39 @@ class MQLLMEngine: return ENGINE_DEAD_ERROR() @classmethod - def from_engine_args(cls, engine_args: AsyncEngineArgs, - usage_context: UsageContext, ipc_path: str): - """Creates an MQLLMEngine from the engine arguments.""" + def from_vllm_config(cls, 
vllm_config: VllmConfig, + usage_context: UsageContext, + disable_log_requests: bool, disable_log_stats: bool, + ipc_path: str) -> "MQLLMEngine": # Setup plugins for each process from vllm.plugins import load_general_plugins load_general_plugins() - engine_config = engine_args.create_engine_config(usage_context) - executor_class = LLMEngine._get_executor_cls(engine_config) + use_async_sockets = vllm_config.model_config.use_async_output_proc - use_async_sockets = engine_config.model_config.use_async_output_proc + return cls( + vllm_config=vllm_config, + executor_class=LLMEngine._get_executor_cls(vllm_config), + ipc_path=ipc_path, + usage_context=usage_context, + use_async_sockets=use_async_sockets, + log_requests=(not disable_log_requests), + log_stats=(not disable_log_stats), + ) - return cls(ipc_path=ipc_path, - use_async_sockets=use_async_sockets, - vllm_config=engine_config, - executor_class=executor_class, - log_requests=not engine_args.disable_log_requests, - log_stats=not engine_args.disable_log_stats, - usage_context=usage_context) + @staticmethod + def from_engine_args(engine_args: AsyncEngineArgs, + usage_context: UsageContext, ipc_path: str): + """Creates an MQLLMEngine from the engine arguments.""" + + vllm_config = engine_args.create_engine_config(usage_context) + return MQLLMEngine.from_vllm_config( + ipc_path=ipc_path, + vllm_config=vllm_config, + usage_context=usage_context, + disable_log_requests=engine_args.disable_log_requests, + disable_log_stats=engine_args.disable_log_stats, + ) def start(self): try: @@ -396,12 +411,16 @@ def signal_handler(*_) -> None: raise KeyboardInterrupt("MQLLMEngine terminated") -def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext, - ipc_path: str, engine_alive): +def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext, + ipc_path: str, disable_log_stats: bool, + disable_log_requests: bool, engine_alive): try: - engine = MQLLMEngine.from_engine_args(engine_args=engine_args, - 
usage_context=usage_context, - ipc_path=ipc_path) + engine = MQLLMEngine.from_vllm_config( + vllm_config=vllm_config, + usage_context=usage_context, + disable_log_stats=disable_log_stats, + disable_log_requests=disable_log_requests, + ipc_path=ipc_path) signal.signal(signal.SIGTERM, signal_handler) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index e8f3c1f4e50b6..a0e2fa2918bd1 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -11,7 +11,6 @@ import torch.nn as nn from tqdm import tqdm from typing_extensions import TypeVar, deprecated -from vllm import envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, get_beam_search_score) from vllm.config import CompilationConfig @@ -238,23 +237,15 @@ class LLM: compilation_config=compilation_config_instance, **kwargs, ) - # Logic to switch between engines is done at runtime instead of import - # to avoid import order issues - self.engine_class = self.get_engine_class() - self.llm_engine = self.engine_class.from_engine_args( - engine_args, usage_context=UsageContext.LLM_CLASS) + + # Create the Engine (autoselects V0 vs V1) + self.llm_engine = LLMEngine.from_engine_args( + engine_args=engine_args, usage_context=UsageContext.LLM_CLASS) + self.engine_class = type(self.llm_engine) self.request_counter = Counter() self.default_sampling_params: Union[dict[str, Any], None] = None - @staticmethod - def get_engine_class() -> type[LLMEngine]: - if envs.VLLM_USE_V1: - # Lazy import: the v1 package isn't distributed - from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine - return V1LLMEngine # type: ignore - return LLMEngine - def get_tokenizer(self) -> AnyTokenizer: return self.llm_engine.get_tokenizer_group(TokenizerGroup).tokenizer diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 52e65fc214bc7..694d4f9cf1121 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ 
-154,21 +154,47 @@ async def build_async_engine_client_from_engine_args( Returns the Client or None if the creation failed. """ - # AsyncLLMEngine. - if (MQLLMEngineClient.is_unsupported_config(engine_args) - or envs.VLLM_USE_V1 or disable_frontend_multiprocessing): + # Create the EngineConfig (determines if we can use V1). + usage_context = UsageContext.OPENAI_API_SERVER + vllm_config = engine_args.create_engine_config(usage_context=usage_context) + + # V1 AsyncLLM. + if envs.VLLM_USE_V1: + if disable_frontend_multiprocessing: + logger.warning( + "V1 is enabled, but got --disable-frontend-multiprocessing. " + "To disable frontend multiprocessing, set VLLM_USE_V1=0.") + + from vllm.v1.engine.async_llm import AsyncLLM + async_llm: Optional[AsyncLLM] = None + try: + async_llm = AsyncLLM.from_vllm_config( + vllm_config=vllm_config, + usage_context=usage_context, + disable_log_requests=engine_args.disable_log_requests, + disable_log_stats=engine_args.disable_log_stats) + yield async_llm + finally: + if async_llm: + async_llm.shutdown() + + # V0 AsyncLLM. + elif (MQLLMEngineClient.is_unsupported_config(vllm_config) + or disable_frontend_multiprocessing): engine_client: Optional[EngineClient] = None try: - engine_client = AsyncLLMEngine.from_engine_args( - engine_args=engine_args, - usage_context=UsageContext.OPENAI_API_SERVER) + engine_client = AsyncLLMEngine.from_vllm_config( + vllm_config=vllm_config, + usage_context=usage_context, + disable_log_requests=engine_args.disable_log_requests, + disable_log_stats=engine_args.disable_log_stats) yield engine_client finally: if engine_client and hasattr(engine_client, "shutdown"): engine_client.shutdown() - # MQLLMEngine. + # V0MQLLMEngine. else: if "PROMETHEUS_MULTIPROC_DIR" not in os.environ: # Make TemporaryDirectory for prometheus multiprocessing @@ -199,10 +225,11 @@ async def build_async_engine_client_from_engine_args( # not actually result in an exitcode being reported. 
As a result # we use a shared variable to communicate the information. engine_alive = multiprocessing.Value('b', True, lock=False) - engine_process = context.Process(target=run_mp_engine, - args=(engine_args, - UsageContext.OPENAI_API_SERVER, - ipc_path, engine_alive)) + engine_process = context.Process( + target=run_mp_engine, + args=(vllm_config, UsageContext.OPENAI_API_SERVER, ipc_path, + engine_args.disable_log_stats, + engine_args.disable_log_requests, engine_alive)) engine_process.start() engine_pid = engine_process.pid assert engine_pid is not None, "Engine process failed to start." @@ -217,8 +244,7 @@ async def build_async_engine_client_from_engine_args( atexit.register(_cleanup_ipc_path) # Build RPCClient, which conforms to EngineClient Protocol. - engine_config = engine_args.create_engine_config() - build_client = partial(MQLLMEngineClient, ipc_path, engine_config, + build_client = partial(MQLLMEngineClient, ipc_path, vllm_config, engine_pid) mq_engine_client = await asyncio.get_running_loop().run_in_executor( None, build_client) diff --git a/vllm/envs.py b/vllm/envs.py index 0b1bcd9eb358b..7e079006b273c 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -74,7 +74,7 @@ if TYPE_CHECKING: VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_SKIP_P2P_CHECK: bool = False VLLM_DISABLED_KERNELS: list[str] = [] - VLLM_USE_V1: bool = False + VLLM_USE_V1: bool = True VLLM_ROCM_FP8_PADDING: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 @@ -522,7 +522,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # If set, use the V1 code path. 
"VLLM_USE_V1": - lambda: bool(int(os.getenv("VLLM_USE_V1", "0"))), + lambda: bool(int(os.getenv("VLLM_USE_V1", "1"))), # Pad the fp8 weights to 256 bytes for ROCm "VLLM_ROCM_FP8_PADDING": @@ -644,3 +644,19 @@ def __getattr__(name: str): def __dir__(): return list(environment_variables.keys()) + + +def is_set(name: str): + """Check if an environment variable is explicitly set.""" + if name in environment_variables: + return name in os.environ + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def set_vllm_use_v1(use_v1: bool): + if is_set("VLLM_USE_V1"): + raise ValueError( + "Should not call set_vllm_use_v1() if VLLM_USE_V1 is set " + "explicitly by the user. Please raise this as a Github " + "Issue and explicitly set VLLM_USE_V1=0 or 1.") + os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0" diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 9686231fb4bd1..0b4872827319b 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -74,7 +74,8 @@ def resolve_transformers_fallback(model_config: ModelConfig, if not is_transformers_impl_compatible(arch, custom_model_module): raise ValueError( f"{arch} has no vLLM implementation and the Transformers " - "implementation is not compatible with vLLM.") + "implementation is not compatible with vLLM. Try setting " + "VLLM_USE_V1=0.") logger.warning( "%s has no vLLM implementation, falling back to Transformers " "implementation. 
Some features may not be supported and " diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 84b79613abc47..50f48f91798ac 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -42,7 +42,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsPP +from .interfaces import SupportsPP, SupportsV0Only from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -279,7 +279,7 @@ class BloomModel(nn.Module): return hidden_states -class BloomForCausalLM(nn.Module, SupportsPP): +class BloomForCausalLM(nn.Module, SupportsPP, SupportsV0Only): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/glm.py b/vllm/model_executor/models/glm.py index 5f1903345f0d7..8d52da8b7482c 100644 --- a/vllm/model_executor/models/glm.py +++ b/vllm/model_executor/models/glm.py @@ -3,10 +3,11 @@ from vllm.config import VllmConfig from vllm.model_executor.models.llama import LlamaForCausalLM +from .interfaces import SupportsV0Only from .utils import PPMissingLayer -class GlmForCausalLM(LlamaForCausalLM): +class GlmForCausalLM(LlamaForCausalLM, SupportsV0Only): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 51b1c33cfbdec..d368c145d55f9 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -36,7 +36,7 @@ from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.ultravox import UltravoxConfig from .interfaces import (MultiModalEmbeddings, SupportsLoRA, - SupportsMultiModal, SupportsPP) 
+ SupportsMultiModal, SupportsPP, SupportsV0Only) from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings, @@ -405,7 +405,8 @@ class ModifiedWhisperEncoder(WhisperEncoder): UltravoxMultiModalProcessor, info=UltravoxProcessingInfo, dummy_inputs=UltravoxDummyInputsBuilder) -class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): +class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, + SupportsV0Only): packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index db80e52bf0738..ad44f256a7b91 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -196,7 +196,8 @@ class FlashAttentionImpl(AttentionImpl): if head_size not in support_head_sizes: raise ValueError( f"Head size {head_size} is not supported by FlashAttention. " - f"Supported head sizes are: {support_head_sizes}.") + f"Supported head sizes are: {support_head_sizes}. " + "Set VLLM_USE_V1=0 to use another attention backend.") if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 05633352be6c0..7188f10b18856 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -8,6 +8,7 @@ from typing import Optional, Union import numpy as np +import vllm.envs as envs from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient @@ -49,6 +50,12 @@ class AsyncLLM(EngineClient): log_requests: bool = True, start_engine_loop: bool = True, ) -> None: + if not envs.VLLM_USE_V1: + raise ValueError( + "Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. " + "This should not happen. 
As a workaround, try using " + "AsyncLLMEngine.from_vllm_config(...) or explicitly set " + "VLLM_USE_V1=0 or 1 and report this issue on Github.") assert start_engine_loop @@ -92,22 +99,50 @@ class AsyncLLM(EngineClient): self.output_handler: Optional[asyncio.Task] = None + @classmethod + def from_vllm_config( + cls, + vllm_config: VllmConfig, + start_engine_loop: bool = True, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[dict[str, StatLoggerBase]] = None, + disable_log_requests: bool = False, + disable_log_stats: bool = False, + ) -> "AsyncLLM": + if not envs.VLLM_USE_V1: + raise ValueError( + "Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. " + "This should not happen. As a workaround, try using " + "AsyncLLMEngine.from_vllm_config(...) or explicitly set " + "VLLM_USE_V1=0 or 1 and report this issue on Github.") + + # FIXME(rob): refactor VllmConfig to include the StatLoggers + # include StatLogger in the Oracle decision. + if stat_loggers is not None: + raise ValueError("Custom StatLoggers are not yet supported on V1. " + "Explicitly set VLLM_USE_V1=0 to disable V1.") + + # Create the LLMEngine. + return cls( + vllm_config=vllm_config, + executor_class=Executor.get_class(vllm_config), + start_engine_loop=start_engine_loop, + log_requests=not disable_log_requests, + log_stats=not disable_log_stats, + usage_context=usage_context, + ) + @classmethod def from_engine_args( cls, engine_args: AsyncEngineArgs, - engine_config: Optional[VllmConfig] = None, start_engine_loop: bool = True, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, ) -> "AsyncLLM": """Create an AsyncLLM from the EngineArgs.""" # Create the engine configs. - if engine_config is None: - vllm_config = engine_args.create_engine_config(usage_context) - else: - vllm_config = engine_config - + vllm_config = engine_args.create_engine_config(usage_context) executor_class = Executor.get_class(vllm_config) # Create the AsyncLLM. 
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index d56aee1accc2d..cbd19d4d637be 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -46,6 +46,13 @@ class LLMEngine: use_cached_outputs: bool = False, multiprocess_mode: bool = False, ) -> None: + if not envs.VLLM_USE_V1: + raise ValueError( + "Using V1 LLMEngine, but envs.VLLM_USE_V1=False. " + "This should not happen. As a workaround, try using " + "LLMEngine.from_vllm_config(...) or explicitly set " + "VLLM_USE_V1=0 or 1 and report this issue on Github.") + self.vllm_config = vllm_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config @@ -88,6 +95,26 @@ class LLMEngine: # for v0 compatibility self.model_executor = self.engine_core.engine_core.model_executor # type: ignore + @classmethod + def from_vllm_config( + cls, + vllm_config: VllmConfig, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[dict[str, StatLoggerBase]] = None, + disable_log_stats: bool = False, + ) -> "LLMEngine": + if stat_loggers is not None: + raise NotImplementedError( + "Passing StatLoggers to V1 is not yet supported. " + "Set VLLM_USE_V1=0 and file and issue on Github.") + + return cls(vllm_config=vllm_config, + executor_class=Executor.get_class(vllm_config), + log_stats=(not disable_log_stats), + usage_context=usage_context, + stat_loggers=stat_loggers, + multiprocess_mode=envs.VLLM_ENABLE_V1_MULTIPROCESSING) + @classmethod def from_engine_args( cls, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 56846030ac49f..663e1e36f7561 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -184,7 +184,7 @@ class Processor: # Only applicable to multimodal models with legacy input processor. 
processed_inputs = self.input_processor(preprocessed_inputs) - self._validate_model_inputs(processed_inputs) + self._validate_model_inputs(processed_inputs, lora_request) if is_encoder_decoder_inputs(processed_inputs): decoder_inputs = SingletonInputsAdapter( @@ -200,8 +200,12 @@ class Processor: raise NotImplementedError assert isinstance(params, SamplingParams) - # TODO: can we avoid cloning here in multiproc case + # TODO: can we avoid cloning here in multiproc case? sampling_params = params.clone() + # If unset max tokens, then generate up to the max_model_len. + if sampling_params.max_tokens is None: + sampling_params.max_tokens = (self.model_config.max_model_len - + len(decoder_inputs.prompt_token_ids)) sampling_params.update_from_generation_config( self.generation_config_fields, eos_token_id) sampling_params.update_from_tokenizer( @@ -296,7 +300,9 @@ class Processor: lora_request=lora_request, ) - def _validate_model_inputs(self, inputs: ProcessorInputs): + def _validate_model_inputs(self, + inputs: ProcessorInputs, + lora_request: Optional[LoRARequest] = None): if is_encoder_decoder_inputs(inputs): # For encoder-decoder multimodal models, the max_prompt_len # restricts the decoder prompt length @@ -310,6 +316,13 @@ class Processor: if prompt_ids is None or len(prompt_ids) == 0: raise ValueError("Prompt cannot be empty") + max_input_id = max(prompt_ids) + max_allowed = self.tokenizer.get_lora_tokenizer( + lora_request).max_token_id + if max_input_id > max_allowed: + raise ValueError( + "Token id {} is out of vocabulary".format(max_input_id)) + if len(prompt_ids) >= self.model_config.max_model_len: raise ValueError( f"Prompt length of {len(prompt_ids)} is longer than the " From 877e3522624e0b4ba4172f28d159edeb9c8a5c52 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Fri, 14 Mar 2025 22:06:38 -0700 Subject: [PATCH 048/169] [Docs] Add new East Coast vLLM Meetup slides to README and meetups.md (#14852) --- README.md | 1 + docs/source/community/meetups.md | 1 + 2 
files changed, 2 insertions(+) diff --git a/README.md b/README.md index 405e3a257f768..bfab7faf598b6 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ Join us to connect with the **vLLM team** and explore how vLLM is leveraged in * *Latest News* πŸ”₯ +- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0). - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted. - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing). diff --git a/docs/source/community/meetups.md b/docs/source/community/meetups.md index c57f27b49b88a..efb4f692972b5 100644 --- a/docs/source/community/meetups.md +++ b/docs/source/community/meetups.md @@ -4,6 +4,7 @@ We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. 
Please find the materials of our previous meetups below: +- [The East Coast vLLM Meetup](https://lu.ma/7mu4k4xx), March 11th 2025. [[Slides]](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0) - [The ninth vLLM meetup](https://lu.ma/h7g3kuj9), with Meta, February 27th 2025. [[Slides]](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) - [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing) - [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. [[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing) From a2ae4965890105957984f55c3156608132327a48 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Sat, 15 Mar 2025 13:07:36 +0800 Subject: [PATCH 049/169] [CPU] Support FP8 KV cache (#14741) Signed-off-by: jiang1.li --- csrc/cpu/cache.cpp | 38 ++++++------ csrc/cpu/cpu_types_x86.hpp | 9 +++ .../getting_started/installation/cpu.md | 2 +- .../basic_correctness/test_chunked_prefill.py | 4 +- .../models/decoder_only/language/test_fp8.py | 61 +++++++++++++++++++ vllm/attention/backends/torch_sdpa.py | 9 +-- vllm/platforms/cpu.py | 30 +++++---- vllm/worker/cpu_worker.py | 5 +- 8 files changed, 122 insertions(+), 36 deletions(-) diff --git a/csrc/cpu/cache.cpp b/csrc/cpu/cache.cpp index e3809acad7453..d726ee9307fe0 100644 --- a/csrc/cpu/cache.cpp +++ b/csrc/cpu/cache.cpp @@ -3,6 +3,12 @@ #include "cpu_types.hpp" +#if defined(__x86_64__) + #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2 +#else + #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES +#endif + namespace { template void copy_blocks_cpu_impl(std::vector const& key_caches, @@ -95,13 +101,12 @@ void copy_blocks(std::vector const& key_caches, } const int 
element_num_per_block = key_caches[0][0].numel(); - VLLM_DISPATCH_FLOATING_TYPES( - key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] { - CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl) - copy_blocks_cpu_impl(key_caches, value_caches, block_mapping, - element_num_per_block, num_layers); - CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl) - }); + DISPATCH_MACRO(key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] { + CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl) + copy_blocks_cpu_impl(key_caches, value_caches, block_mapping, + element_num_per_block, num_layers); + CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl) + }); } void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, @@ -118,16 +123,15 @@ void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, int key_stride = key.stride(0); int value_stride = value.stride(0); - VLLM_DISPATCH_FLOATING_TYPES( - key.scalar_type(), "reshape_and_cache_cpu_impl", [&] { - CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl) - reshape_and_cache_cpu_impl( - key.data_ptr(), value.data_ptr(), - key_cache.data_ptr(), value_cache.data_ptr(), - slot_mapping.data_ptr(), num_tokens, key_stride, - value_stride, num_heads, head_size, block_size, x); - CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl) - }); + DISPATCH_MACRO(key.scalar_type(), "reshape_and_cache_cpu_impl", [&] { + CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl) + reshape_and_cache_cpu_impl( + key.data_ptr(), value.data_ptr(), + key_cache.data_ptr(), value_cache.data_ptr(), + slot_mapping.data_ptr(), num_tokens, key_stride, value_stride, + num_heads, head_size, block_size, x); + CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl) + }); } void swap_blocks(torch::Tensor& src, torch::Tensor& dst, diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index a4ef2be2a58ca..a9369e1fd1016 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -16,9 +16,18 @@ namespace vec_op { AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ 
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) +#define VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) + #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) +#define VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, \ + VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(__VA_ARGS__)) + #ifndef CPU_OP_GUARD #define CPU_KERNEL_GUARD_IN(NAME) #define CPU_KERNEL_GUARD_OUT(NAME) diff --git a/docs/source/getting_started/installation/cpu.md b/docs/source/getting_started/installation/cpu.md index 43c9187f072e1..65af7b50bdc15 100644 --- a/docs/source/getting_started/installation/cpu.md +++ b/docs/source/getting_started/installation/cpu.md @@ -189,7 +189,7 @@ vLLM CPU backend supports the following vLLM features: - Model Quantization (`INT8 W8A8, AWQ, GPTQ`) - Chunked-prefill - Prefix-caching -- FP8-E5M2 KV-Caching (TODO) +- FP8-E5M2 KV cache ## Related runtime environment variables diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index 5bf48b5cced4a..be007de321c8a 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -266,7 +266,7 @@ def test_with_prefix_caching( @pytest.mark.parametrize("model", ["facebook/opt-125m"]) -@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("dtype", ["bfloat16", "half"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16]) @pytest.mark.parametrize("enforce_eager", [False]) @@ -303,7 +303,7 @@ def test_models_cpu( @pytest.mark.parametrize("max_tokens", [16]) @pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("chunk_size", [30, 
32]) -@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("dtype", ["bfloat16", "half"]) @pytest.mark.cpu_model @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") def test_with_prefix_caching_cpu( diff --git a/tests/models/decoder_only/language/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py index 27c125160aa19..faca7a566e79c 100644 --- a/tests/models/decoder_only/language/test_fp8.py +++ b/tests/models/decoder_only/language/test_fp8.py @@ -11,6 +11,7 @@ import pytest from tests.kernels.utils import override_backend_env_variable from tests.quantization.utils import is_quant_method_supported +from vllm.platforms import current_platform from ...utils import check_logprobs_close @@ -93,3 +94,63 @@ def test_models( name_0="fp16_kv_cache", name_1="fp8_kv_cache", ) + + +@pytest.mark.cpu_model +@pytest.mark.skipif(not current_platform.is_cpu(), + reason="test for the CPU backend.") +@pytest.mark.parametrize( + "kv_cache_dtype,base_model,test_model", + [ + # Test BF16 checkpoint w. fp8_e5m2 kv-cache. + ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct", + "meta-llama/Llama-3.2-1B-Instruct"), + ]) +# Due to low-precision numerical divergence, we only test logprob of 4 tokens +@pytest.mark.parametrize("max_tokens", [4]) +# Due to low-precision numerical divergence, this test is too sensitive for +# the async postprocessor +@pytest.mark.parametrize("disable_async_output_proc", [True]) +def test_cpu_models( + vllm_runner, + example_prompts, + kv_cache_dtype: str, + base_model: str, + test_model: str, + max_tokens: int, + disable_async_output_proc: bool, +) -> None: + """ + Only checks log probs match to cover the discrepancy in + numerical sensitive kernels. 
+ """ + + MAX_MODEL_LEN = 1024 + NUM_LOG_PROBS = 8 + + with vllm_runner( + base_model, + max_model_len=MAX_MODEL_LEN, + dtype="bfloat16", + kv_cache_dtype="auto", + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + baseline_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) + + with vllm_runner( + test_model, + max_model_len=MAX_MODEL_LEN, + dtype="bfloat16", + kv_cache_dtype=kv_cache_dtype, + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + test_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) + + check_logprobs_close( + outputs_0_lst=baseline_outputs, + outputs_1_lst=test_outputs, + name_0="bf16_kv_cache", + name_1="fp8_kv_cache", + ) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 37dd75da27596..afe2acff4ab3d 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -17,7 +17,7 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, is_quantized_kv_cache) # yapf: enable from vllm.attention.backends.utils import CommonAttentionState -from vllm.attention.ops.ipex_attn import PagedAttention +from vllm.attention.ops.ipex_attn import PagedAttention, _use_ipex from vllm.attention.ops.paged_attn import PagedAttentionMetadata from vllm.logger import init_logger from vllm.utils import make_tensor_with_pad @@ -431,10 +431,11 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): raise ValueError( f"Head size {head_size} is not supported by PagedAttention. " f"Supported head sizes are: {supported_head_sizes}.") - if is_quantized_kv_cache(kv_cache_dtype): + + if is_quantized_kv_cache(kv_cache_dtype) and not _use_ipex: raise NotImplementedError( - "Torch SDPA backend does not support FP8 KV cache. 
" - "Please use xFormers backend instead.") + "Torch SDPA backend FP8 KV cache requires " + "intel_extension_for_pytorch support.") self.attn_type = attn_type def forward( diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 140335dfb64a6..40eacfd080e13 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -60,9 +60,6 @@ class CpuPlatform(Platform): # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid if not model_config.enforce_eager: - logger.warning( - "CUDA graph is not supported on CPU, fallback to the eager " - "mode.") model_config.enforce_eager = True cache_config = vllm_config.cache_config @@ -70,6 +67,25 @@ class CpuPlatform(Platform): if cache_config and cache_config.block_size is None: cache_config.block_size = 16 + scheduler_config = vllm_config.scheduler_config + if ((scheduler_config.chunked_prefill_enabled + or cache_config.enable_prefix_caching) + and cache_config.cache_dtype != "auto"): + raise RuntimeError("Chunked-prefill and prefix-cache on the CPU " + "backend is not compatible with FP8 KV cache.") + + if cache_config.cache_dtype == "fp8_e4m3": + cache_config.cache_dtype = "fp8_e5m2" + logger.warning( + "CPU backend doesn't support fp8_e4m3 KV cache type, " + "cast to fp8_e5m2.") + + if (cache_config.cache_dtype != "auto" + and model_config.dtype == torch.half): + logger.warning("FP8 KV cache on the CPU backend only does not" + " support fp16 for now, cast to bf16.") + model_config.dtype = torch.bfloat16 + kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE if kv_cache_space >= 0: @@ -85,14 +101,6 @@ class CpuPlatform(Platform): "Invalid environment variable VLLM_CPU_KVCACHE_SPACE" f" {kv_cache_space}, expect a positive integer value.") - scheduler_config = vllm_config.scheduler_config - if ((scheduler_config.chunked_prefill_enabled - or cache_config.enable_prefix_caching) - and model_config.dtype == torch.half): - logger.warning("Chunked-prefill on the CPU backend only 
does not" - " support fp16 for now, cast to bf16.") - model_config.dtype = torch.bfloat16 - parallel_config = vllm_config.parallel_config if (parallel_config.distributed_executor_backend is not None and parallel_config.distributed_executor_backend != "mp"): diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 27b1a2dd1be8c..70d2924a045be 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -53,8 +53,11 @@ class CPUCacheEngine: if cache_config.cache_dtype == "auto": self.dtype = model_config.dtype + elif cache_config.cache_dtype in ["fp8", "fp8_e5m2"]: + self.dtype = torch.float8_e5m2 else: - self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] + raise NotImplementedError(f"Unsupported KV cache type " + f"{cache_config.cache_dtype}.") # Get attention backend. self.attn_backend = get_attn_backend( From 5952d8ab61a39eefd3617b7d46b7a6bd87f51259 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Sat, 15 Mar 2025 01:08:25 -0400 Subject: [PATCH 050/169] [Attention] Get rid of mla cache alignment (#14842) Signed-off-by: Lucas Wilkinson --- tests/kernels/test_cache.py | 39 ++++++++++------------------------ vllm/envs.py | 10 --------- vllm/utils.py | 6 ------ vllm/worker/cache_engine.py | 42 +++---------------------------------- 4 files changed, 14 insertions(+), 83 deletions(-) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index b55ebd967fd7c..f7936989c9639 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -8,7 +8,6 @@ import torch from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck from vllm import _custom_ops as ops from vllm.platforms import current_platform -from vllm.utils import align_to_256bytes COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] DTYPES = [torch.half, torch.bfloat16, torch.float] @@ -450,22 +449,13 @@ def _create_mla_cache( dtype: torch.dtype, kv_cache_dtype: str, device: str, - align_cache: bool, ) -> 
torch.Tensor: cache_dtype = torch.uint8 if kv_cache_dtype == "fp8" else dtype - - if align_cache: - alloc_entry_size = align_to_256bytes(entry_size, cache_dtype) - alloc_shape = (num_blocks, block_size, alloc_entry_size) - cache_full = torch.zeros(alloc_shape, dtype=cache_dtype, device=device) - cache = cache_full[..., :entry_size] - else: - cache = torch.zeros(num_blocks, - block_size, - entry_size, - dtype=cache_dtype, - device=device) - return cache + return torch.zeros(num_blocks, + block_size, + entry_size, + dtype=cache_dtype, + device=device) def _fill_mla_cache(cache: torch.Tensor, kv_cache_dtype: str): @@ -488,7 +478,6 @@ def _fill_mla_cache(cache: torch.Tensor, kv_cache_dtype: str): @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) -@pytest.mark.parametrize("align_cache", [False]) @torch.inference_mode() def test_concat_and_cache_mla( kv_lora_rank: int, @@ -500,7 +489,6 @@ def test_concat_and_cache_mla( seed: int, device: str, kv_cache_dtype: str, - align_cache: bool, ) -> None: current_platform.seed_everything(seed) torch.set_default_device(device) @@ -520,7 +508,7 @@ def test_concat_and_cache_mla( scale = torch.tensor(0.1, dtype=torch.float32, device=device) kv_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, - kv_cache_dtype, device, align_cache) + kv_cache_dtype, device) ref_temp = torch.zeros(*kv_cache.shape, dtype=dtype, device=device) for i in range(num_tokens): @@ -576,7 +564,6 @@ def test_concat_and_cache_mla( @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) -@pytest.mark.parametrize("align_cache", [False, True]) @torch.inference_mode() def test_copy_blocks_mla( kv_lora_rank: int, @@ -588,7 +575,6 @@ def test_copy_blocks_mla( seed: int, device: str, kv_cache_dtype: str, - align_cache: bool, ) -> None: 
current_platform.seed_everything(seed) torch.set_default_device(device) @@ -598,7 +584,7 @@ def test_copy_blocks_mla( kv_caches = [] for _ in range(num_layers): kv_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, - kv_cache_dtype, device, align_cache) + kv_cache_dtype, device) _fill_mla_cache(kv_cache, kv_cache_dtype=kv_cache_dtype) kv_caches.append(kv_cache) @@ -642,7 +628,6 @@ def test_copy_blocks_mla( @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) -@pytest.mark.parametrize("align_cache", [False, True]) @torch.inference_mode() def test_swap_blocks_mla( kv_lora_rank: int, @@ -653,7 +638,6 @@ def test_swap_blocks_mla( seed: int, device: str, kv_cache_dtype: str, - align_cache: bool, ) -> None: current_platform.seed_everything(seed) torch.set_default_device(device) @@ -661,9 +645,9 @@ def test_swap_blocks_mla( entry_size = kv_lora_rank + qk_rope_head_dim src_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, - kv_cache_dtype, device, align_cache) + kv_cache_dtype, device) dst_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, - kv_cache_dtype, device, align_cache) + kv_cache_dtype, device) _fill_mla_cache(src_cache, kv_cache_dtype) _fill_mla_cache(dst_cache, kv_cache_dtype) @@ -704,15 +688,14 @@ def test_swap_blocks_mla( @pytest.mark.parametrize("dtype", [torch.float32]) @pytest.mark.parametrize("kv_cache_dtype", ["auto"]) # You can also test "fp8" if needed. 
-@pytest.mark.parametrize("align_cache", [True, False]) @pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_gather_cache_mla(kv_lora_rank, qk_rope_head_dim, block_size, num_blocks, max_seq_len, batch_size, dtype, - kv_cache_dtype, align_cache, device): + kv_cache_dtype, device): entry_size = kv_lora_rank + qk_rope_head_dim src_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, - kv_cache_dtype, device, align_cache) + kv_cache_dtype, device) _fill_mla_cache(src_cache, kv_cache_dtype=kv_cache_dtype) seq_len_tensor = torch.randint(0, diff --git a/vllm/envs.py b/vllm/envs.py index 7e079006b273c..463059dc06704 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -84,7 +84,6 @@ if TYPE_CHECKING: VLLM_SERVER_DEV_MODE: bool = False VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128 VLLM_MLA_DISABLE: bool = False - VLLM_MLA_CUDA_MEM_ALIGN_KV_CACHE: bool = True VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False VLLM_RAY_PER_WORKER_GPUS: float = 1.0 VLLM_RAY_BUNDLE_INDICES: str = "" @@ -580,15 +579,6 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_RAY_BUNDLE_INDICES": lambda: os.getenv("VLLM_RAY_BUNDLE_INDICES", ""), - # When on a Nvidia GPU aligns single entries (within a page) so they are 256 - # byte aligned for better performance, this increases the memory usage of - # the cache. Currently this only affects MLA that results in non-256 - # byte aligned entries. This matches the alignment the CUDA runtime uses - # for all allocations. Currently this primarily affects MLA, for most other - # models the alignment is already naturally aligned to 256 bytes. - "VLLM_CUDA_MEM_ALIGN_KV_CACHE": - lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))), - # In some system, find_loaded_library() may not work. So we allow users to # specify the path through environment variable VLLM_CUDART_SO_PATH. 
"VLLM_CUDART_SO_PATH": diff --git a/vllm/utils.py b/vllm/utils.py index a8eba27dbcdbd..9334741225008 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -827,12 +827,6 @@ def get_dtype_size(dtype: torch.dtype) -> int: return torch.tensor([], dtype=dtype).element_size() -def align_to_256bytes(extent: int, dtype: torch.dtype) -> int: - dtype_size = get_dtype_size(dtype) - eles_per_256bytes = 256 // dtype_size - return round_up(extent, eles_per_256bytes) - - # `collections` helpers def is_list_of( value: object, diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 004b4e4b757fd..85ebe8121e524 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -1,18 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 """CacheEngine class for managing the KV cache.""" -from math import prod from typing import List import torch -from vllm import envs from vllm.attention import get_attn_backend from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger -from vllm.platforms import current_platform from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, - align_to_256bytes, get_dtype_size, - is_pin_memory_available) + get_dtype_size, is_pin_memory_available) logger = init_logger(__name__) @@ -42,7 +38,6 @@ class CacheEngine: self.num_attention_layers = model_config.get_num_layers_by_block_type( parallel_config, LayerBlockType.attention) self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) - self.align_cache = self._align_cache(model_config) self.block_size = cache_config.block_size self.num_gpu_blocks = cache_config.num_gpu_blocks @@ -81,38 +76,18 @@ class CacheEngine: pin_memory = is_pin_memory_available() if device == "cpu" else False kv_cache: List[torch.Tensor] = [] - # Align entries so they are 256 byte aligned for better performance - # Primarily targets MLA as this typically only ends up having entries - # be 128 byte aligned. 
- if self.align_cache: - # We assume the cache shape is: - # (TOTAL_PAGES, PAGE_SIZE, entry_shape...) - # NOTE this assumption currently only holds for MLA so we only apply - # this optimization when `use_mla` is true - entry_shape = kv_cache_shape[2:] - entry_size = prod(entry_shape) - alloc_entry_size = align_to_256bytes(entry_size, self.dtype) - alloc_shape = (*kv_cache_shape[:2], alloc_entry_size) - else: - alloc_shape = kv_cache_shape - for _ in range(self.num_attention_layers): # null block in CpuGpuBlockAllocator requires at least that # block to be zeroed-out. # We zero-out everything for simplicity. - layer_kv_cache = torch.zeros(alloc_shape, + layer_kv_cache = torch.zeros(kv_cache_shape, dtype=self.dtype, pin_memory=pin_memory, device=device) - # If we allocated with padding for alignment reasons truncate the - # shape while preserving the aligned stride - if self.align_cache: - layer_kv_cache = layer_kv_cache[..., :entry_size] - # view back to (TOTAL_PAGES, PAGE_SIZE, entry_shape...) 
for cases # when entry_shape is higher than 1D - kv_cache.append(layer_kv_cache.view(kv_cache_shape)) + kv_cache.append(layer_kv_cache) return kv_cache def swap_in(self, src_to_dst: torch.Tensor) -> None: @@ -128,14 +103,6 @@ class CacheEngine: def copy(self, src_to_dsts: torch.Tensor) -> None: self.attn_backend.copy_blocks(self.gpu_cache, src_to_dsts) - @staticmethod - def _align_cache(model_config: ModelConfig): - # Currently align_cache only applies to MLA models since the other - # cache kernels haven't been updated yet to support non-continguous - # tensors - return model_config.use_mla and current_platform.is_cuda() \ - and envs.VLLM_CUDA_MEM_ALIGN_KV_CACHE - @staticmethod def get_cache_block_size( cache_config: CacheConfig, @@ -153,9 +120,6 @@ class CacheEngine: dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] key_cache_entry = num_heads * head_size - if CacheEngine._align_cache(model_config): - key_cache_entry = align_to_256bytes(key_cache_entry, - model_config.dtype) # For MLA there is no value cache, since the latent vector # is joint keys and values. 
From e0fdfa1608b988f3b6767117a9adf2f9a3831cb9 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 15 Mar 2025 13:09:25 +0800 Subject: [PATCH 051/169] [CI/Build] Delete LoRA bias test (#14849) Signed-off-by: Jee Jee Li --- tests/lora/conftest.py | 5 --- tests/lora/test_lora_bias_e2e.py | 63 -------------------------------- 2 files changed, 68 deletions(-) delete mode 100644 tests/lora/test_lora_bias_e2e.py diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 25665517fee28..ee01a1a524f82 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -173,11 +173,6 @@ def sql_lora_files(sql_lora_huggingface_id): return snapshot_download(repo_id=sql_lora_huggingface_id) -@pytest.fixture(scope="session") -def lora_bias_files(): - return snapshot_download(repo_id="followumesh/granite-3b-lora8-bias") - - @pytest.fixture(scope="session") def mixtral_lora_files(): # Note: this module has incorrect adapter_config.json to test diff --git a/tests/lora/test_lora_bias_e2e.py b/tests/lora/test_lora_bias_e2e.py deleted file mode 100644 index d4245a89dff08..0000000000000 --- a/tests/lora/test_lora_bias_e2e.py +++ /dev/null @@ -1,63 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -import vllm -from vllm.lora.request import LoRARequest - -MODEL_PATH = "ibm-granite/granite-3b-code-base" - - -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: - prompts = [ - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. 
[/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 - ] - sampling_params = vllm.SamplingParams(temperature=0, - max_tokens=256, - stop=["[/assistant]"]) - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) - generated_texts: list[str] = [] - for output in outputs: - generated_text = output.outputs[0].text - generated_texts.append(generated_text) - return generated_texts - - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - -# Skipping for V1 for now as we are hitting, -# "Head size 80 is not supported by FlashAttention." error. 
-@pytest.mark.skip_v1 -@pytest.mark.parametrize("lora_bias", [True]) -@pytest.mark.parametrize("fully_sharded", [True, False]) -def test_lora_bias(lora_bias_files: str, lora_bias: bool, fully_sharded: bool): - llm = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_lora_rank=8, - max_loras=1, - enable_lora_bias=lora_bias, - tensor_parallel_size=1, - fully_sharded_loras=fully_sharded) - - print("lora adapter created") - output1 = do_sample(llm, lora_bias_files, lora_id=0) - - print("lora") - output2 = do_sample(llm, lora_bias_files, lora_id=1) - - if lora_bias: - assert output1 != output2 - else: - assert output1 == output2 From 4c7629cae94d1a4a8ba91d16946bbc283ecd3413 Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Sat, 15 Mar 2025 01:09:51 -0400 Subject: [PATCH 052/169] [V1][Structured Output] calculate vocab_size eagerly (#14851) Signed-off-by: Aaron Pham --- vllm/v1/structured_output/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 32ea1852d0ac2..77bafdee85ce2 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -40,7 +40,7 @@ class StructuredOutputManager: tokenizer_group.ping() tokenizer = tokenizer_group.get_lora_tokenizer(None) - self.vocab_size = tokenizer.max_token_id + 1 + self.vocab_size = len(tokenizer.get_vocab()) if isinstance(tokenizer, MistralTokenizer): # NOTE: ideally, xgrammar should handle this accordingly. 
# refer to https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98 From aaacf173243d7700a7a245489198a5f22d96f745 Mon Sep 17 00:00:00 2001 From: Jennifer Zhao Date: Fri, 14 Mar 2025 22:17:59 -0700 Subject: [PATCH 053/169] [Doc] V1 user guide (#13991) Signed-off-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com> Signed-off-by: Roger Wang Signed-off-by: Jennifer Zhao Co-authored-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com> Co-authored-by: Jennifer Zhao Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> Co-authored-by: Roger Wang Co-authored-by: Cyrus Leung --- docs/source/getting_started/v1_user_guide.md | 159 +++++++++++++++++++ docs/source/index.md | 2 + 2 files changed, 161 insertions(+) create mode 100644 docs/source/getting_started/v1_user_guide.md diff --git a/docs/source/getting_started/v1_user_guide.md b/docs/source/getting_started/v1_user_guide.md new file mode 100644 index 0000000000000..533324f9174f8 --- /dev/null +++ b/docs/source/getting_started/v1_user_guide.md @@ -0,0 +1,159 @@ +# vLLM V1 User Guide + +V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack). + +## Why vLLM V1? + +vLLM V0 successfully supported a wide range of models and hardware, but as new features were developed independently, the system grew increasingly complex. This complexity made it harder to integrate new capabilities and introduced technical debt, revealing the need for a more streamlined and unified design. + +Building on V0’s success, vLLM V1 retains the stable and proven components from V0 +(such as the models, GPU kernels, and utilities). 
At the same time, it significantly +re-architects the core systems, covering the scheduler, KV cache manager, worker, +sampler, and API server, to provide a cohesive, maintainable framework that better +accommodates continued growth and innovation. + +Specifically, V1 aims to: + +- Provide a **simple, modular, and easy-to-hack codebase**. +- Ensure **high performance** with near-zero CPU overhead. +- **Combine key optimizations** into a unified architecture. +- Require **zero configs** by enabling features/optimizations by default. + +We see significant performance improvements from upgrading to V1 core engine, in +particular for long context scenarios. Please see performance benchmark (To be +added). + +For more details, check out the vLLM V1 blog post [vLLM V1: A Major +Upgrade to vLLM’s Core Architecture](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html) (published Jan 27, 2025). + +This living user guide outlines a few known **important changes and limitations** introduced by vLLM V1. The team has been working actively to bring V1 as the default engine, therefore this guide will be updated constantly as more features get supported on vLLM V1. 
+ +### Supports Overview +#### Hardware + +| Hardware | Status | +|----------|------------------------------------------| +| **NVIDIA** | πŸš€ Natively Supported | +| **AMD** | 🚧 WIP | +| **TPU** | 🚧 WIP | +#### Feature / Model + +| Feature / Model | Status | +|-----------------|-----------------------------------------------------------------------------------| +| **Prefix Caching** | πŸš€ Optimized | +| **Chunked Prefill** | πŸš€ Optimized | +| **Logprobs Calculation** | 🟒 Functional | +| **LoRA** | 🟒 Functional ([PR #13096](https://github.com/vllm-project/vllm/pull/13096))| +| **Multimodal Models** | 🟒 Functional | +| **Spec Decode** | 🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))| +| **Prompt Logprobs with Prefix Caching** | 🟑 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))| +| **FP8 KV Cache** | 🟑 Planned | +| **Structured Output Alternative Backends** | 🟑 Planned | +| **Embedding Models** | 🟑 Planned ([RFC #12249](https://github.com/vllm-project/vllm/issues/12249)) | +| **Mamba Models** | 🟑 Planned | +| **Encoder-Decoder Models** | 🟑 Planned | +| **Request-level Structured Output Backend** | πŸ”΄ Deprecated | +| **best_of** | πŸ”΄ Deprecated ([RFC #13361](https://github.com/vllm-project/vllm/issues/13361))| +| **Per-Request Logits Processors** | πŸ”΄ Deprecated ([RFC #13360](https://github.com/vllm-project/vllm/pull/13360)) | +| **GPU <> CPU KV Cache Swapping** | πŸ”΄ Deprecated | + +- **πŸš€ Optimized**: Nearly fully optimized, with no further work currently planned. +- **🟒 Functional**: Fully operational, with ongoing optimizations. +- **🚧 WIP**: Under active development. +- **🟑 Planned**: Scheduled for future implementation (some may have open PRs/RFCs). +- **πŸ”΄ Deprecated**: Not planned for v1 unless there is strong demand. 
+ +**Note**: vLLM V1’s unified scheduler treats both prompt and output tokens the same +way by using a simple dictionary (e.g., {request_id: num_tokens}) to dynamically +allocate a fixed token budget per request, enabling features like chunked prefills, +prefix caching, and speculative decoding without a strict separation between prefill +and decode phases. + +### Semantic Changes and Deprecated Features + +#### Logprobs + +vLLM V1 supports logprobs and prompt logprobs. However, there are some important semantic +differences compared to V0: + +**Logprobs Calculation** + +Logprobs in V1 are now returned immediately once computed from the model’s raw output (i.e. +before applying any logits post-processing such as temperature scaling or penalty +adjustments). As a result, the returned logprobs do not reflect the final adjusted +probabilities used during sampling. + +Support for logprobs with post-sampling adjustments is in progress and will be added in future updates. + +**Prompt Logprobs with Prefix Caching** + +Currently prompt logprobs are only supported when prefix caching is turned off via `--no-enable-prefix-caching`. In a future release, prompt logprobs will be compatible with prefix caching, but a recomputation will be triggered to recover the full prompt logprobs even upon a prefix cache hit. See details in [RFC #13414](https://github.com/vllm-project/vllm/issues/13414). + +#### Deprecated Features + +As part of the major architectural rework in vLLM V1, several legacy features have been deprecated. + +**Sampling features** + +- **best_of**: This feature has been deprecated due to limited usage. See details at [RFC #13361](https://github.com/vllm-project/vllm/issues/13361). +- **Per-Request Logits Processors**: In V0, users could pass custom + processing functions to adjust logits on a per-request basis. In vLLM V1, this + feature has been deprecated. 
Instead, the design is moving toward supporting **global logits + processors**, a feature the team is actively working on for future releases. See details at [RFC #13360](https://github.com/vllm-project/vllm/pull/13360). + +**KV Cache features** + +- **GPU <> CPU KV Cache Swapping**: with the new simplified core architecture, vLLM V1 no longer requires KV cache swapping +to handle request preemptions. + +**Structured Output features** + +- **Request-level Structured Output Backend**: Deprecated, alternative backends + (outlines, guidance) with fallbacks is WIP. +### Feature & Model Support in Progress + +Although we have re-implemented and partially optimized many features and models from V0 in vLLM V1, optimization work is still ongoing for some, and others remain unsupported. + +#### Features to Be Optimized + +These features are already supported in vLLM V1, but their optimization is still +in progress. + +- **LoRA**: LoRA is functionally working on vLLM V1 but its performance is + inferior to that of V0. The team is actively working on improving its + performance +(e.g., see [PR #13096](https://github.com/vllm-project/vllm/pull/13096)). + +- **Spec Decode**: Currently, only ngram-based spec decode is supported in V1. There + will be follow-up work to support other types of spec decode (e.g., see [PR #13933](https://github.com/vllm-project/vllm/pull/13933)). We will prioritize the support for Eagle, MTP compared to draft model based spec decode. + +#### Features to Be Supported + +- **FP8 KV Cache**: While vLLM V1 introduces new FP8 kernels for model weight quantization, support for an FP8 key–value cache is not yet available. Users must continue using FP16 (or other supported precisions) for the KV cache. + +- **Structured Output Alternative Backends**: Structured output alternative backends (outlines, guidance) support is planned. 
V1 currently + supports only the `xgrammar:no_fallback` mode, meaning that it will error out if the output schema is unsupported by xgrammar. + Details about the structured outputs can be found + [here](https://docs.vllm.ai/en/latest/features/structured_outputs.html). + +#### Models to Be Supported + +vLLM V1 currently excludes model architectures with the `SupportsV0Only` protocol, +and the majority fall into the following categories. V1 support for these models will be added eventually. + +**Embedding Models** +Instead of having a separate model runner, hidden states processor [RFC #12249](https://github.com/vllm-project/vllm/issues/12249), which is based on global logits processor [RFC #13360](https://github.com/vllm-project/vllm/pull/13360), has been proposed to enable simultaneous generation and embedding using the same engine instance in V1. It is still in the planning stage. + +**Mamba Models** +Models using selective state-space mechanisms (instead of standard transformer attention) +are not yet supported (e.g., `MambaForCausalLM`, `JambaForCausalLM`). + +**Encoder-Decoder Models** +vLLM V1 is currently optimized for decoder-only transformers. Models requiring + cross-attention between separate encoder and decoder are not yet supported (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`). + +For a complete list of supported models, see the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html). + +## FAQ + +TODO diff --git a/docs/source/index.md b/docs/source/index.md index 52c4622d3e5a3..1624d5cf5aae7 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -67,6 +67,8 @@ getting_started/quickstart getting_started/examples/examples_index getting_started/troubleshooting getting_started/faq +getting_started/v1_user_guide + ::: % What does vLLM support? 
From ee3778d5fc0a075dc04dada5a3bbf2af5275a243 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Sat, 15 Mar 2025 01:38:19 -0400 Subject: [PATCH 054/169] [Build/CI] Upgrade jinja2 to get 3 moderate CVE fixes (#14839) Signed-off-by: Russell Bryant --- requirements/build.txt | 2 +- requirements/rocm-build.txt | 2 +- requirements/test.txt | 2 +- requirements/tpu.txt | 2 +- requirements/xpu.txt | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements/build.txt b/requirements/build.txt index 364a16d80b71b..13d643bcaff10 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -6,4 +6,4 @@ setuptools>=61 setuptools-scm>=8 torch==2.6.0 wheel -jinja2 +jinja2>=3.1.6 diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt index f378663ade752..a0731c51d46bd 100644 --- a/requirements/rocm-build.txt +++ b/requirements/rocm-build.txt @@ -11,5 +11,5 @@ packaging setuptools>=61 setuptools-scm>=8 wheel -jinja2 +jinja2>=3.1.6 amdsmi==6.2.4 diff --git a/requirements/test.txt b/requirements/test.txt index 0a2b491669ac8..c2cdd2c8664d8 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -181,7 +181,7 @@ iniconfig==2.0.0 # via pytest isort==5.13.2 # via datamodel-code-generator -jinja2==3.1.4 +jinja2==3.1.6 # via # datamodel-code-generator # torch diff --git a/requirements/tpu.txt b/requirements/tpu.txt index 06bcecfc00458..97a39bcd4a6d6 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -6,7 +6,7 @@ cmake>=3.26 packaging setuptools-scm>=8 wheel -jinja2 +jinja2>=3.1.6 ray[default] ray[data] diff --git a/requirements/xpu.txt b/requirements/xpu.txt index 3fd0655904e4d..fa09004d0a9cb 100644 --- a/requirements/xpu.txt +++ b/requirements/xpu.txt @@ -7,7 +7,7 @@ packaging setuptools-scm>=8 setuptools>=75.8.0 wheel -jinja2 +jinja2>=3.1.6 datasets # for benchmark scripts torch==2.6.0+xpu From 9ed6ee92d6f7a335995f1fb634b15254840d9ad4 Mon Sep 17 00:00:00 2001 From: Bryan Lu <55512809+luyuzhe111@users.noreply.github.com> 
Date: Fri, 14 Mar 2025 23:50:33 -0700 Subject: [PATCH 055/169] [Bugfix] EAGLE output norm bug (#14464) Signed-off-by: Bryan Lu --- docs/source/features/spec_decode.md | 2 +- examples/offline_inference/eagle.py | 93 ++++++++++++++++++++++ vllm/engine/llm_engine.py | 7 +- vllm/engine/output_processor/multi_step.py | 5 ++ vllm/model_executor/models/eagle.py | 2 +- vllm/sequence.py | 44 ++++++---- vllm/spec_decode/spec_decode_worker.py | 2 +- vllm/spec_decode/util.py | 32 ++++---- 8 files changed, 152 insertions(+), 35 deletions(-) create mode 100644 examples/offline_inference/eagle.py diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md index cc8d6fceb7d66..852248e418ca0 100644 --- a/docs/source/features/spec_decode.md +++ b/docs/source/features/spec_decode.md @@ -162,7 +162,7 @@ A variety of speculative models of this type are available on HF hub: ## Speculating using EAGLE based draft models The following code configures vLLM to use speculative decoding where proposals are generated by -an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. +an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py). 
```python from vllm import LLM, SamplingParams diff --git a/examples/offline_inference/eagle.py b/examples/offline_inference/eagle.py new file mode 100644 index 0000000000000..baa91b2d0364d --- /dev/null +++ b/examples/offline_inference/eagle.py @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: Apache-2.0 +import argparse +import json +import os + +from transformers import AutoTokenizer + +from vllm import LLM, SamplingParams + +parser = argparse.ArgumentParser() + +parser.add_argument( + "--dataset", + type=str, + default="./examples/data/gsm8k.jsonl", + help="downloaded from the eagle repo " \ + "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/" +) +parser.add_argument("--max_num_seqs", type=int, default=8) +parser.add_argument("--num_prompts", type=int, default=80) +parser.add_argument("--num_spec_tokens", type=int, default=2) +parser.add_argument("--tp", type=int, default=1) +parser.add_argument("--draft_tp", type=int, default=1) +parser.add_argument("--enforce_eager", action='store_true') +parser.add_argument("--enable_chunked_prefill", action='store_true') +parser.add_argument("--max_num_batched_tokens", type=int, default=2048) +parser.add_argument("--temp", type=float, default=0) + +args = parser.parse_args() + +print(args) + +model_dir = "meta-llama/Meta-Llama-3-8B-Instruct" +eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm" + +max_model_len = 2048 + +tokenizer = AutoTokenizer.from_pretrained(model_dir) + +if os.path.exists(args.dataset): + prompts = [] + num_prompts = args.num_prompts + with open(args.dataset) as f: + for line in f: + data = json.loads(line) + prompts.append(data["turns"][0]) +else: + prompts = ["The future of AI is", "The president of the United States is"] + +prompts = prompts[:args.num_prompts] +num_prompts = len(prompts) + +prompt_ids = [ + tokenizer.apply_chat_template([{ + "role": "user", + "content": prompt + }], + add_generation_prompt=True) + for prompt in prompts +] + +llm = LLM( + model=model_dir, + 
trust_remote_code=True, + tensor_parallel_size=args.tp, + enable_chunked_prefill=args.enable_chunked_prefill, + max_num_batched_tokens=args.max_num_batched_tokens, + enforce_eager=args.enforce_eager, + max_model_len=max_model_len, + max_num_seqs=args.max_num_seqs, + gpu_memory_utilization=0.8, + speculative_model=eagle_dir, + num_speculative_tokens=args.num_spec_tokens, + speculative_draft_tensor_parallel_size=args.draft_tp, + speculative_max_model_len=max_model_len, + disable_log_stats=False, +) + +sampling_params = SamplingParams(temperature=args.temp, max_tokens=256) + +outputs = llm.generate(prompt_token_ids=prompt_ids, + sampling_params=sampling_params) + +# calculate the average number of accepted tokens per forward pass, +1 is +# to account for the token from the target model that's always going to be +# accepted +acceptance_counts = [0] * (args.num_spec_tokens + 1) +for output in outputs: + for step, count in enumerate(output.metrics.spec_token_acceptance_counts): + acceptance_counts[step] += count + +print(f"mean acceptance length: \ + {sum(acceptance_counts) / acceptance_counts[0]:.2f}") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 94687a13c5280..6dc0055bdfb49 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -853,6 +853,10 @@ class LLMEngine: self.generation_config_fields, seq.eos_token_id) # Create the sequence group. 
+ draft_size = 1 + if self.vllm_config.speculative_config is not None: + draft_size = \ + self.vllm_config.speculative_config.num_speculative_tokens + 1 seq_group = SequenceGroup( request_id=request_id, seqs=[seq], @@ -862,7 +866,8 @@ class LLMEngine: trace_headers=trace_headers, prompt_adapter_request=prompt_adapter_request, encoder_seq=encoder_seq, - priority=priority) + priority=priority, + draft_size=draft_size) return seq_group diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 8ceef855e020f..4c5d78a43df6c 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -100,6 +100,11 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor): seqs = sequence_group.get_seqs( status=SequenceStatus.FINISHED_ABORTED) + for output in outputs: + if output.samples[0].output_token != VLLM_INVALID_TOKEN_ID: + sequence_group.metrics.spec_token_acceptance_counts[ + output.step_index] += 1 + assert seqs, "Expected RUNNING or FINISHED_ABORTED sequences" assert len(seqs) == 1, ( "Beam search not supported in multi-step decoding.") diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index f2a2935e6c694..010e51a3b9f28 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -38,7 +38,7 @@ class DummyOutputNorm(nn.Module): if residual is None: return x else: - return x, residual + return x + residual, None class EAGLE(nn.Module): diff --git a/vllm/sequence.py b/vllm/sequence.py index 6a7b1e62a6045..61867b0253159 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -111,6 +111,13 @@ class RequestMetrics: model_execute_time: The time spent in the model execute function. This will include model forward, block/sync across workers, cpu-gpu sync time and sampling time. 
+ spec_token_acceptance_counts: number of accepted speculative tokens at + each position; the first token is from + the target model and is always accepted; + e.g., when it's [10, 8, 4, 2] for a req, + it means there were 10 forward passes in + total, and there were 8, 4, 2 accepted + tokens at 1st, 2nd, 3rd speculation step. """ arrival_time: float last_token_time: float @@ -121,6 +128,7 @@ class RequestMetrics: scheduler_time: Optional[float] = None model_forward_time: Optional[float] = None model_execute_time: Optional[float] = None + spec_token_acceptance_counts: Optional[list[int]] = None class SequenceDataDelta( @@ -639,22 +647,25 @@ class SequenceGroup: trace_headers: OpenTelemetry trace headers. prompt_adapter_request: Prompt Adapter request. priority: User-defined priority of the request. + draft_size: The number of speculative tokens plus one from the target + model; equal to max number of tokens a step can generate + for single-draft speculative decoding but larger than + that for multi-draft SD (currently not supported). 
""" - def __init__( - self, - request_id: str, - seqs: list[Sequence], - arrival_time: float, - sampling_params: Optional[SamplingParams] = None, - lora_request: Optional[LoRARequest] = None, - pooling_params: Optional[PoolingParams] = None, - pooled_data: Optional[torch.Tensor] = None, - encoder_seq: Optional[Sequence] = None, - trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, - priority: int = 0, - ) -> None: + def __init__(self, + request_id: str, + seqs: list[Sequence], + arrival_time: float, + sampling_params: Optional[SamplingParams] = None, + lora_request: Optional[LoRARequest] = None, + pooling_params: Optional[PoolingParams] = None, + pooled_data: Optional[torch.Tensor] = None, + encoder_seq: Optional[Sequence] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, + draft_size: int = 1) -> None: self.request_id = request_id self.seqs = seqs self.first_seq = seqs[0] @@ -667,7 +678,9 @@ class SequenceGroup: last_token_time=arrival_time, first_scheduled_time=None, first_token_time=None, - time_in_queue=None) + time_in_queue=None, + spec_token_acceptance_counts=[0] * + draft_size) self.last_token_latency = 0.0 self.lora_request = lora_request self.prompt_logprobs: Optional[PromptLogprobs] = None @@ -1079,6 +1092,7 @@ class CompletionSequenceGroupOutput( samples: list[SequenceOutput] # Prompt logprob for each prompt query token. 
prompt_logprobs: Optional[PromptLogprobs] + step_index: Optional[int] = 0 def __repr__(self) -> str: return (f"CompletionSequenceGroupOutput(samples={self.samples}, " diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 8909a41bc99fc..5bf4f67d35bdb 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -1080,7 +1080,7 @@ class SpecDecodeWorker(LoRANotSupportedWorkerBase): [sequence_index][:num_logprobs], topk_logprobs=topk_logprobs_by_step[step_index] [sequence_index][:num_logprobs], - )) + step_index=step_index)) sampler_output_list.append( SamplerOutput(outputs=step_output_token_ids)) diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 9c04680a6a7ab..466269b2107f5 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -93,14 +93,14 @@ def create_logprobs_output( def create_sequence_group_output( - token_id: int, - token_id_logprob_rank: int, - token_id_logprob: float, - seq_id: SeqId, - topk_token_ids: List[Optional[int]], - topk_logprobs: List[Optional[float]], - prompt_logprobs: Optional[PromptLogprobs] = None, -) -> CompletionSequenceGroupOutput: + token_id: int, + token_id_logprob_rank: int, + token_id_logprob: float, + seq_id: SeqId, + topk_token_ids: List[Optional[int]], + topk_logprobs: List[Optional[float]], + prompt_logprobs: Optional[PromptLogprobs] = None, + step_index: Optional[int] = 0) -> CompletionSequenceGroupOutput: """Create a SequenceGroupOutput given the sampling results. Args: @@ -110,6 +110,7 @@ def create_sequence_group_output( seq_id (int): The sequence id. topk_token_ids (List[Optional[int]]): The list of top-k token ids. topk_logprobs (List[Optional[float]]): The list of top-k logprobs. + step_index: (Optional[int]): The index of the speculative token. 
""" logprobs = create_logprobs_output( @@ -120,14 +121,13 @@ def create_sequence_group_output( topk_logprobs, ) - return CompletionSequenceGroupOutput( - samples=[ - SequenceOutput(parent_seq_id=seq_id, - output_token=token_id, - logprobs=logprobs) - ], - prompt_logprobs=prompt_logprobs, - ) + return CompletionSequenceGroupOutput(samples=[ + SequenceOutput(parent_seq_id=seq_id, + output_token=token_id, + logprobs=logprobs) + ], + prompt_logprobs=prompt_logprobs, + step_index=step_index) def split_batch_by_proposal_len( From 3556a414341033aad1bbb84674ec16b235324b25 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 15 Mar 2025 17:52:05 +0800 Subject: [PATCH 056/169] [VLM] Limit multimodal input cache by memory (#14805) Signed-off-by: DarkLight1337 --- .pre-commit-config.yaml | 2 +- requirements/common.txt | 1 + requirements/docs.txt | 1 + .../multimodal/processing/test_common.py | 2 +- vllm/envs.py | 11 ++- vllm/jsontree.py | 79 +++++++++++++++++++ vllm/model_executor/models/llava.py | 3 +- vllm/model_executor/models/molmo.py | 3 +- vllm/multimodal/inputs.py | 3 +- vllm/multimodal/processing.py | 51 ++++++++++-- vllm/multimodal/registry.py | 4 +- vllm/utils.py | 16 ---- vllm/v1/engine/mm_input_cache.py | 38 ++++----- 13 files changed, 159 insertions(+), 55 deletions(-) create mode 100644 vllm/jsontree.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 074ac9d122bfe..484cd171f5f52 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -53,7 +53,7 @@ repos: entry: tools/mypy.sh 0 "local" language: python types: [python] - additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests] + additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests] stages: [pre-commit] # Don't run in CI - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.9 diff --git 
a/requirements/common.txt b/requirements/common.txt index 3cd933f347f59..bb021d9e45499 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -1,3 +1,4 @@ +cachetools psutil sentencepiece # Required for LLaMA tokenizer. numpy < 2.0.0 diff --git a/requirements/docs.txt b/requirements/docs.txt index 1d669699f4b2a..7a9b921a11715 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -9,6 +9,7 @@ msgspec cloudpickle # packages to install to build the documentation +cachetools pydantic >= 2.8 -f https://download.pytorch.org/whl/cpu torch diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index aef5db9bc06bb..0e0d3711357e4 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -48,7 +48,7 @@ def _test_processing_correctness( tokenizer=cached_tokenizer_from_config(model_config), ) # Ensure that it can fit all of the data - cache = ProcessingCache(capacity=1 << 30) + cache = ProcessingCache(capacity_gb=2048) processing_info = factories.info(ctx) supported_mm_limits = processing_info.get_supported_mm_limits() diff --git a/vllm/envs.py b/vllm/envs.py index 463059dc06704..bf214f314c458 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -56,7 +56,7 @@ if TYPE_CHECKING: VLLM_IMAGE_FETCH_TIMEOUT: int = 5 VLLM_VIDEO_FETCH_TIMEOUT: int = 30 VLLM_AUDIO_FETCH_TIMEOUT: int = 10 - VLLM_MM_INPUT_CACHE_SIZE: int = 256 + VLLM_MM_INPUT_CACHE_GIB: int = 8 VLLM_TARGET_DEVICE: str = "cuda" MAX_JOBS: Optional[str] = None NVCC_THREADS: Optional[str] = None @@ -432,11 +432,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_AUDIO_FETCH_TIMEOUT": lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")), - # Cache size for multimodal feature/input cache for multimodal models - # in unit of number of multimodal data items (e.g. image, video, audio). - # Default is 256 multimodal data items. 
- "VLLM_MM_INPUT_CACHE_SIZE": - lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_SIZE", "256")), + # Cache size (in GiB) for multimodal input cache + # Default is 8GiB + "VLLM_MM_INPUT_CACHE_GIB": + lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_GIB", "8")), # Path to the XLA persistent cache directory. # Only used for XLA devices such as TPUs. diff --git a/vllm/jsontree.py b/vllm/jsontree.py new file mode 100644 index 0000000000000..91cd7cb216d77 --- /dev/null +++ b/vllm/jsontree.py @@ -0,0 +1,79 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Helper functions to work with nested JSON structures.""" +from collections.abc import Iterable +from functools import reduce +from typing import Callable, TypeVar, Union, overload + +_T = TypeVar("_T") +_U = TypeVar("_U") + +JSONTree = Union[dict[str, "JSONTree[_T]"], list["JSONTree[_T]"], + tuple["JSONTree[_T]", ...], _T] +"""A nested JSON structure where the leaves need not be JSON-serializable.""" + + +def json_iter_leaves(value: JSONTree[_T]) -> Iterable[_T]: + """Iterate through each leaf in a nested JSON structure.""" + if isinstance(value, dict): + for v in value.values(): + yield from json_iter_leaves(v) + elif isinstance(value, (list, tuple)): + for v in value: + yield from json_iter_leaves(v) + else: + yield value + + +def json_map_leaves( + func: Callable[[_T], _U], + value: JSONTree[_T], +) -> JSONTree[_U]: + """Apply a function to each leaf in a nested JSON structure.""" + if isinstance(value, dict): + return {k: json_map_leaves(func, v) for k, v in value.items()} + elif isinstance(value, list): + return [json_map_leaves(func, v) for v in value] + elif isinstance(value, tuple): + return tuple(json_map_leaves(func, v) for v in value) + else: + return func(value) + + +@overload +def json_reduce_leaves( + func: Callable[[_T, _T], _T], + value: JSONTree[_T], + /, +) -> _T: + ... + + +@overload +def json_reduce_leaves( + func: Callable[[_U, _T], _U], + value: JSONTree[_T], + initial: _U, + /, +) -> _U: + ... 
+ + +def json_reduce_leaves( + func: Callable[..., Union[_T, _U]], + value: JSONTree[_T], + initial: _U = ..., # type: ignore[assignment] + /, +) -> Union[_T, _U]: + """ + Apply a function of two arguments cumulatively to each leaf in a + nested JSON structure, from left to right, so as to reduce the + sequence to a single value. + """ + if initial is ...: + return reduce(func, json_iter_leaves(value)) # type: ignore[arg-type] + + return reduce( + func, # type: ignore[arg-type] + json_iter_leaves(value), + initial, + ) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 478dbd83d3002..42bf6a5b2979a 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -18,6 +18,7 @@ from transformers.models.pixtral import PixtralProcessor from vllm.config import VllmConfig from vllm.inputs import InputProcessingContext +from vllm.jsontree import JSONTree, json_map_leaves from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -35,7 +36,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, PromptReplacement, PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from vllm.utils import JSONTree, flatten_2d_lists, json_map_leaves +from vllm.utils import flatten_2d_lists from .clip import CLIPVisionModel from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 444b619437a09..e709b08815eaf 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -24,6 +24,7 @@ from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, tensor_model_parallel_all_gather) +from vllm.jsontree import 
JSONTree, json_map_leaves from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import (MulAndSilu, QuickGELU, SiluAndMul) @@ -50,7 +51,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, PromptInsertion, PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from vllm.utils import JSONTree, flatten_2d_lists, json_map_leaves +from vllm.utils import flatten_2d_lists from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP, SupportsQuant) diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 7b186d89dad4a..3c609fd967650 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -16,7 +16,8 @@ from PIL.Image import Image from transformers import BatchFeature from typing_extensions import NotRequired, TypeAlias -from vllm.utils import JSONTree, full_groupby, is_list_of, json_map_leaves +from vllm.jsontree import JSONTree, json_map_leaves +from vllm.utils import full_groupby, is_list_of if TYPE_CHECKING: from .hasher import MultiModalHashDict diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 080a2362aac52..cdbbed27a5218 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 - import re +import sys from abc import ABC, abstractmethod from collections import defaultdict from collections.abc import (Callable, Generator, ItemsView, Iterable, Mapping, @@ -11,14 +11,17 @@ from functools import lru_cache from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol, TypeVar, Union, cast) +import torch +from cachetools import LRUCache from transformers import BatchFeature, PretrainedConfig, ProcessorMixin from typing_extensions import assert_never from vllm.inputs import InputProcessingContext +from vllm.jsontree import json_map_leaves, 
json_reduce_leaves from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens, encode_tokens) -from vllm.utils import LRUCache, flatten_2d_lists, full_groupby +from vllm.utils import GiB_bytes, flatten_2d_lists, full_groupby from .hasher import MultiModalHasher from .inputs import (MultiModalDataDict, MultiModalEncDecInputs, @@ -812,25 +815,50 @@ def find_mm_placeholders( return dict(full_groupby_modality(it)) +_V = TypeVar("_V", bound="Union[MultiModalKwargs, MultiModalKwargsItem]") + + class ProcessingCache: - def __init__(self, capacity: int) -> None: + @staticmethod + def get_lru_cache( + capacity_gb: int, + value_type: type[_V], + ) -> LRUCache[str, _V]: + + def get_size(leaf: object) -> int: + if isinstance(leaf, torch.Tensor): + return leaf.nbytes # sys.getsizeof doesn't work for tensors + + return sys.getsizeof(leaf) + + return LRUCache[str, _V]( + GiB_bytes * capacity_gb, + getsizeof=lambda x: json_reduce_leaves( + lambda a, b: a + b, + json_map_leaves(get_size, x), + ), + ) + + def __init__(self, capacity_gb: int) -> None: super().__init__() # DEBUG: Set to None to disable self.debug_cache_hit_ratio_steps: Optional[int] = None + self.debug_cache_hits = 0 + self.debug_cache_total = 0 - self._cache = LRUCache[str, MultiModalKwargsItem](capacity) + self._cache = self.get_lru_cache(capacity_gb, MultiModalKwargsItem) def _maybe_log_cache_stats(self) -> None: steps = self.debug_cache_hit_ratio_steps if not steps: return - cache_stats = self._cache.stat() - if cache_stats.total % steps == 0: + total = self.debug_cache_total + if total > 0 and total % steps == 0: logger.debug("ProcessingCache: hit_ratio = %.2f", - cache_stats.hit_ratio) + self.debug_cache_hits / total) def get( self, @@ -853,6 +881,13 @@ class ProcessingCache: cache_key = MultiModalHasher.hash_kwargs(model_id=model_id, **{modality: input_item}, **input_kwargs) + + if self.debug_cache_hit_ratio_steps: + if cache_key in self._cache: + 
self.debug_cache_hits += 1 + + self.debug_cache_total += 1 + return self._cache.get(cache_key) def put( @@ -870,7 +905,7 @@ class ProcessingCache: cache_key = MultiModalHasher.hash_kwargs(model_id=model_id, **{modality: input_item}, **input_kwargs) - self._cache.put(cache_key, output_kwargs) + self._cache[cache_key] = output_kwargs class BaseProcessingInfo: diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index febf3ad9eea42..24b8358982797 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -8,7 +8,7 @@ from typing import TYPE_CHECKING, Any, Generic, Optional, Protocol, TypeVar import torch.nn as nn -from vllm.envs import VLLM_MM_INPUT_CACHE_SIZE +from vllm.envs import VLLM_MM_INPUT_CACHE_GIB from vllm.inputs import InputProcessingContext from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import (AnyTokenizer, @@ -119,7 +119,7 @@ class MultiModalRegistry: self._limits_by_model = _MultiModalLimits() - self._processing_cache = ProcessingCache(VLLM_MM_INPUT_CACHE_SIZE) + self._processing_cache = ProcessingCache(VLLM_MM_INPUT_CACHE_GIB) def register_plugin(self, plugin: MultiModalPlugin) -> None: """ diff --git a/vllm/utils.py b/vllm/utils.py index 9334741225008..632b3666e959c 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -845,22 +845,6 @@ def is_list_of( assert_never(check) -JSONTree = Union[dict[str, "JSONTree[T]"], list["JSONTree[T]"], - tuple["JSONTree[T]", ...], T] -"""A nested JSON structure where the leaves need not be JSON-serializable.""" - - -def json_map_leaves(func: Callable[[T], U], value: JSONTree[T]) -> JSONTree[U]: - if isinstance(value, dict): - return {k: json_map_leaves(func, v) for k, v in value.items()} - elif isinstance(value, list): - return [json_map_leaves(func, v) for v in value] - elif isinstance(value, tuple): - return tuple(json_map_leaves(func, v) for v in value) - else: - return func(value) - - def flatten_2d_lists(lists: list[list[T]]) -> list[T]: """Flatten a 
list of lists to a single list.""" return [item for sublist in lists for item in sublist] diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index 0f66f68109b17..e2dda73ba4299 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -3,11 +3,11 @@ from typing import Any, Optional from vllm.config import ModelConfig -from vllm.envs import VLLM_MM_INPUT_CACHE_SIZE +from vllm.envs import VLLM_MM_INPUT_CACHE_GIB from vllm.logger import init_logger from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalKwargs, MultiModalRegistry) -from vllm.utils import LRUCache +from vllm.multimodal.processing import ProcessingCache logger = init_logger(__name__) @@ -30,7 +30,7 @@ logger = init_logger(__name__) # Both Client and Server must use the same cache size # (to perform mirrored caching). This cache size is set by the environment -# variable VLLM_MM_INPUT_CACHE_SIZE. +# variable VLLM_MM_INPUT_CACHE_GIB. # TODO(ywang96): Deprecate this class once all multimodal models migrate to use @@ -50,18 +50,20 @@ class MMInputCacheClient: # Init cache self.use_cache = not model_config.disable_mm_preprocessor_cache - self.mm_cache = LRUCache[str, - MultiModalKwargs](VLLM_MM_INPUT_CACHE_SIZE) + self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB, + MultiModalKwargs) # DEBUG: Set to None to disable self.mm_debug_cache_hit_ratio_steps = None - self.mm_cache_hits = 0 - self.mm_cache_total = 0 + self.mm_debug_cache_hits = 0 + self.mm_debug_cache_total = 0 def cache_hit_ratio(self, steps): - if self.mm_cache_total > 0 and self.mm_cache_total % steps == 0: + total = self.mm_debug_cache_total + + if total > 0 and total % steps == 0: logger.debug("MMInputMapper: cache_hit_ratio = %.2f ", - self.mm_cache_hits / self.mm_cache_total) + self.mm_debug_cache_hits / total) # NOTE: process_inputs only supports image inputs since all multimodal # models with other modalities have migrated to use merged 
preprocessor. @@ -71,7 +73,7 @@ class MMInputCacheClient: mm_hashes: Optional[list[str]], mm_processor_kwargs: Optional[dict[str, Any]], precomputed_mm_inputs: Optional[list[MultiModalKwargs]], - ) -> list[MultiModalKwargs]: + ) -> list[Optional[MultiModalKwargs]]: if precomputed_mm_inputs is None: image_inputs = mm_data["image"] if not isinstance(image_inputs, list): @@ -88,7 +90,7 @@ class MMInputCacheClient: # Process each image input separately, so that later we can schedule # them in a fine-grained manner. # Apply caching (if enabled) and reuse precomputed inputs (if provided) - ret_inputs: list[MultiModalKwargs] = [] + ret_inputs: list[Optional[MultiModalKwargs]] = [] for input_id in range(num_inputs): if self.mm_debug_cache_hit_ratio_steps is not None: self.cache_hit_ratio(self.mm_debug_cache_hit_ratio_steps) @@ -99,7 +101,7 @@ class MMInputCacheClient: mm_hash = mm_hashes[input_id] mm_input = self.mm_cache.get(mm_hash) - self.mm_cache_total += 1 + self.mm_debug_cache_total += 1 if mm_input is None: if precomputed_mm_inputs is not None: # Reuse precomputed input (for merged preprocessor) @@ -114,9 +116,9 @@ class MMInputCacheClient: if self.use_cache: # Add to cache assert mm_hash is not None - self.mm_cache.put(mm_hash, mm_input) + self.mm_cache[mm_hash] = mm_input else: - self.mm_cache_hits += 1 + self.mm_debug_cache_hits += 1 mm_input = None # Avoids sending mm_input to Server ret_inputs.append(mm_input) @@ -128,14 +130,14 @@ class MMInputCacheServer: def __init__(self, model_config): self.use_cache = not model_config.disable_mm_preprocessor_cache - self.mm_cache = LRUCache[str, - MultiModalKwargs](VLLM_MM_INPUT_CACHE_SIZE) + self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB, + MultiModalKwargs) def get_and_update( self, mm_inputs: list[Optional[MultiModalKwargs]], mm_hashes: list[str], - ) -> list[MultiModalKwargs]: + ) -> list[Optional[MultiModalKwargs]]: assert len(mm_inputs) == len(mm_hashes) if not self.use_cache: @@ -148,7 +150,7 
@@ class MMInputCacheServer: mm_input = self.mm_cache.get(mm_hash) assert mm_input is not None else: - self.mm_cache.put(mm_hash, mm_input) + self.mm_cache[mm_hash] = mm_input full_mm_inputs.append(mm_input) From f58aea002c585d1957d46f3b9ab23642cab88d82 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Sat, 15 Mar 2025 04:58:53 -0700 Subject: [PATCH 057/169] [CI][Intel GPU] refine intel GPU ci docker build (#14860) Signed-off-by: Kunshang Ji --- .buildkite/run-xpu-test.sh | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh index d48639e5720c5..a9c71201a745f 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/run-xpu-test.sh @@ -4,16 +4,27 @@ # It serves a sanity check for compilation and basic model usage. set -ex +image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}" +container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" + # Try building the docker image -docker build -t xpu-test -f Dockerfile.xpu . +docker build -t ${image_name} -f Dockerfile.xpu . 
# Setup cleanup -remove_docker_container() { docker rm -f xpu-test || true; } +remove_docker_container() { + docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true; +} trap remove_docker_container EXIT remove_docker_container # Run the image and test offline inference/tensor parallel -docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c ' +docker run \ + --device /dev/dri \ + -v /dev/dri/by-path:/dev/dri/by-path \ + --entrypoint="" \ + --name "${container_name}" \ + "${image_name}" \ + sh -c ' python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2 ' From 74bc397b0a8c092089bdd21e3ec9130336797471 Mon Sep 17 00:00:00 2001 From: Jun Duan Date: Sat, 15 Mar 2025 09:28:14 -0400 Subject: [PATCH 058/169] [Core] Expose API endpoint `/is_sleeping` (#14312) Signed-off-by: Jun Duan --- tests/entrypoints/openai/test_sleep.py | 7 +++++++ vllm/engine/async_llm_engine.py | 3 +++ vllm/engine/llm_engine.py | 3 +++ vllm/engine/multiprocessing/__init__.py | 16 +++++++++++++-- vllm/engine/multiprocessing/client.py | 27 +++++++++++++++++++++++-- vllm/engine/multiprocessing/engine.py | 13 ++++++++++++ vllm/engine/protocol.py | 5 +++++ vllm/entrypoints/openai/api_server.py | 6 ++++++ vllm/v1/engine/async_llm.py | 3 +++ vllm/v1/engine/core.py | 3 +++ vllm/v1/engine/core_client.py | 15 ++++++++++++++ vllm/v1/engine/llm_engine.py | 3 +++ 12 files changed, 100 insertions(+), 4 deletions(-) diff --git a/tests/entrypoints/openai/test_sleep.py b/tests/entrypoints/openai/test_sleep.py index 1caa743c40185..8bdf00bcee126 100644 --- a/tests/entrypoints/openai/test_sleep.py +++ b/tests/entrypoints/openai/test_sleep.py @@ -28,5 +28,12 @@ def test_sleep_mode(): response = requests.post(remote_server.url_for("/sleep"), data={"level": "1"}) assert response.status_code == 200 + response = 
requests.get(remote_server.url_for("/is_sleeping")) + assert response.status_code == 200 + assert response.json().get("is_sleeping") is True + response = requests.post(remote_server.url_for("/wake_up")) assert response.status_code == 200 + response = requests.get(remote_server.url_for("/is_sleeping")) + assert response.status_code == 200 + assert response.json().get("is_sleeping") is False diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 84f5528a06d02..63787590bf47a 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1225,6 +1225,9 @@ class AsyncLLMEngine(EngineClient): async def wake_up(self) -> None: self.engine.wake_up() + async def is_sleeping(self) -> bool: + return self.engine.is_sleeping() + async def add_lora(self, lora_request: LoRARequest) -> None: self.engine.add_lora(lora_request) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 6dc0055bdfb49..ca50f08a38048 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1948,6 +1948,9 @@ class LLMEngine: "Sleep mode is not enabled in the model config") self.model_executor.wake_up() + def is_sleeping(self) -> bool: + return self.model_executor.is_sleeping + def check_health(self) -> None: if self.tokenizer: self.tokenizer.check_health() diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py index 26dfb63c3dbf3..144dd822a177c 100644 --- a/vllm/engine/multiprocessing/__init__.py +++ b/vllm/engine/multiprocessing/__init__.py @@ -136,6 +136,18 @@ class RPCWakeUpRequest(Enum): WAKE_UP = 1 +@dataclass +class RPCIsSleepingRequest: + # Set the default value of request_id to a new UUID + request_id: str = field(default_factory=lambda: str(uuid.uuid4())) + + +@dataclass +class RPCIsSleepingResponse: + request_id: str + is_sleeping: bool + + @dataclass class RPCLoadAdapterRequest: lora_request: LoRARequest @@ -151,10 +163,10 @@ class RPCAdapterLoadedResponse: 
RPC_REQUEST_T = Union[RPCProcessRequest, RPCAbortRequest, RPCStartupRequest, RPCUProfileRequest, RPCLoadAdapterRequest, RPCResetPrefixCacheRequest, RPCSleepRequest, - RPCWakeUpRequest] + RPCWakeUpRequest, RPCIsSleepingRequest] REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCAdapterLoadedResponse, - RPCError] + RPCIsSleepingResponse, RPCError] def ENGINE_DEAD_ERROR( diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index b1bb0fd53d67a..e2ae9486e4351 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -27,6 +27,8 @@ from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT, IPC_OUTPUT_EXT, RPC_REQUEST_T, VLLM_RPC_SUCCESS_STR, RPCAbortRequest, RPCAdapterLoadedResponse, RPCError, + RPCIsSleepingRequest, + RPCIsSleepingResponse, RPCLoadAdapterRequest, RPCProcessRequest, RPCResetPrefixCacheRequest, @@ -246,7 +248,9 @@ class MQLLMEngineClient(EngineClient): if queue is not None: queue.put_nowait(exception) # Put each output into the appropriate queue. 
- elif isinstance(request_outputs, RPCAdapterLoadedResponse): + elif isinstance( + request_outputs, + (RPCAdapterLoadedResponse, RPCIsSleepingResponse)): self._add_output(request_outputs) else: for request_output in request_outputs: @@ -256,7 +260,8 @@ class MQLLMEngineClient(EngineClient): logger.debug("Shutting down MQLLMEngineClient output handler.") def _add_output(self, request_output: Union[RequestOutput, - RPCAdapterLoadedResponse]): + RPCAdapterLoadedResponse, + RPCIsSleepingResponse]): queue = self.output_queues.get(request_output.request_id) if queue is not None: queue.put_nowait(request_output) @@ -696,6 +701,24 @@ class MQLLMEngineClient(EngineClient): return await self._send_one_way_rpc_request( request=RPCWakeUpRequest.WAKE_UP, socket=self.input_socket) + async def is_sleeping(self) -> bool: + """Check whether the engine is sleeping""" + request = RPCIsSleepingRequest() + + queue: asyncio.Queue[Union[BaseException, + RPCIsSleepingResponse]] = asyncio.Queue() + self.output_queues[request.request_id] = queue + + request_bytes = pickle.dumps(request) + await self.input_socket.send_multipart((request_bytes, ), copy=False) + + request_output = await queue.get() + self.output_queues.pop(request.request_id) + + if isinstance(request_output, BaseException): + raise request_output + return request_output.is_sleeping + async def add_lora(self, lora_request: LoRARequest) -> None: """Load a new LoRA adapter into the engine for future requests.""" # Uses the same I/O as generate requests diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 312e0e98d56b4..33b96af3018a3 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -18,6 +18,8 @@ from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT, IPC_OUTPUT_EXT, REQUEST_OUTPUTS_T, VLLM_RPC_SUCCESS_STR, RPCAbortRequest, RPCAdapterLoadedResponse, RPCError, + RPCIsSleepingRequest, + RPCIsSleepingResponse, 
RPCLoadAdapterRequest, RPCProcessRequest, RPCResetPrefixCacheRequest, @@ -271,6 +273,8 @@ class MQLLMEngine: self.sleep(request.value) elif isinstance(request, RPCWakeUpRequest): self.wake_up() + elif isinstance(request, RPCIsSleepingRequest): + self._handle_is_sleeping_request(request) else: raise ValueError("Unknown RPCRequest Type: " f"{type(request)}") @@ -337,6 +341,12 @@ class MQLLMEngine: self._send_outputs( RPCAdapterLoadedResponse(request_id=request.request_id)) + def _handle_is_sleeping_request(self, request: RPCIsSleepingRequest): + is_sleeping = self.is_sleeping() + self._send_outputs( + RPCIsSleepingResponse(request_id=request.request_id, + is_sleeping=is_sleeping)) + def _health_check(self): # Send unhealthy if engine has already errored if self._errored_with is not None: @@ -406,6 +416,9 @@ class MQLLMEngine: def wake_up(self) -> None: self.engine.wake_up() + def is_sleeping(self) -> bool: + return self.engine.is_sleeping() + def signal_handler(*_) -> None: raise KeyboardInterrupt("MQLLMEngine terminated") diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index ee9accd32f218..f314075b166e2 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -288,6 +288,11 @@ class EngineClient(ABC): """Wake up the engine""" ... + @abstractmethod + async def is_sleeping(self) -> bool: + """Check whether the engine is sleeping""" + ... + @abstractmethod async def add_lora(self, lora_request: LoRARequest) -> None: """Load a new LoRA adapter into the engine for future requests.""" diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 694d4f9cf1121..bc74ebd205d15 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -694,6 +694,12 @@ if envs.VLLM_SERVER_DEV_MODE: # is sent but does not finish yet when we return a response. 
return Response(status_code=200) + @router.get("/is_sleeping") + async def is_sleeping(raw_request: Request): + logger.info("check whether the engine is sleeping") + is_sleeping = await engine_client(raw_request).is_sleeping() + return JSONResponse(content={"is_sleeping": is_sleeping}) + @router.post("/invocations", dependencies=[Depends(validate_json_request)]) async def invocations(raw_request: Request): diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 7188f10b18856..d4ac9c066d50d 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -407,6 +407,9 @@ class AsyncLLM(EngineClient): async def wake_up(self) -> None: await self.engine_core.wake_up_async() + async def is_sleeping(self) -> bool: + return await self.engine_core.is_sleeping_async() + async def add_lora(self, lora_request: LoRARequest) -> bool: """Load a new LoRA adapter into the engine for future requests.""" return await self.engine_core.add_lora_async(lora_request) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 174d96ec43776..8f93d3c71cdf3 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -253,6 +253,9 @@ class EngineCore: def wake_up(self): self.model_executor.wake_up() + def is_sleeping(self) -> bool: + return self.model_executor.is_sleeping + def execute_dummy_batch(self): self.model_executor.collective_rpc("execute_dummy_batch") diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 0f92adcc86375..5ed4645797846 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -89,6 +89,9 @@ class EngineCoreClient(ABC): def wake_up(self) -> None: raise NotImplementedError + def is_sleeping(self) -> bool: + raise NotImplementedError + def execute_dummy_batch(self) -> None: raise NotImplementedError @@ -128,6 +131,9 @@ class EngineCoreClient(ABC): async def wake_up_async(self) -> None: raise NotImplementedError + async def is_sleeping_async(self) -> bool: + raise 
NotImplementedError + async def abort_requests_async(self, request_ids: list[str]) -> None: raise NotImplementedError @@ -182,6 +188,9 @@ class InprocClient(EngineCoreClient): def wake_up(self) -> None: self.engine_core.wake_up() + def is_sleeping(self) -> bool: + return self.engine_core.is_sleeping() + def execute_dummy_batch(self) -> None: self.engine_core.execute_dummy_batch() @@ -433,6 +442,9 @@ class SyncMPClient(MPClient): def wake_up(self) -> None: self._call_utility("wake_up") + def is_sleeping(self) -> bool: + return self._call_utility("is_sleeping") + def execute_dummy_batch(self) -> None: self._call_utility("execute_dummy_batch") @@ -523,6 +535,9 @@ class AsyncMPClient(MPClient): async def wake_up_async(self) -> None: await self._call_utility_async("wake_up") + async def is_sleeping_async(self) -> bool: + return await self._call_utility_async("is_sleeping") + async def execute_dummy_batch_async(self) -> None: await self._call_utility_async("execute_dummy_batch") diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index cbd19d4d637be..63b0a8fca32bd 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -235,6 +235,9 @@ class LLMEngine: def wake_up(self): self.engine_core.wake_up() + def is_sleeping(self) -> bool: + return self.engine_core.is_sleeping() + def get_tokenizer_group( self, group_type: type[_G] = BaseTokenizerGroup, From 61c6a5a79664882a8ab1c9af3ff78677911516dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Delacourt?= <54138269+Flechman@users.noreply.github.com> Date: Sat, 15 Mar 2025 14:28:27 +0100 Subject: [PATCH 059/169] [VLM] Merged multi-modal processor for Pixtral (#12211) Signed-off-by: remi Signed-off-by: DarkLight1337 Co-authored-by: DarkLight1337 --- examples/offline_inference/pixtral.py | 24 +- .../multimodal/processing/test_common.py | 190 ++++-- vllm/model_executor/models/llava.py | 126 ++-- vllm/model_executor/models/molmo.py | 6 +- vllm/model_executor/models/paligemma.py | 9 +- 
vllm/model_executor/models/pixtral.py | 588 +++++++++++------- vllm/multimodal/processing.py | 14 +- vllm/transformers_utils/tokenizer.py | 19 +- vllm/utils.py | 2 +- 9 files changed, 620 insertions(+), 358 deletions(-) diff --git a/examples/offline_inference/pixtral.py b/examples/offline_inference/pixtral.py index 760de114508cd..03e6eea891088 100644 --- a/examples/offline_inference/pixtral.py +++ b/examples/offline_inference/pixtral.py @@ -43,12 +43,18 @@ from vllm.sampling_params import SamplingParams # python demo.py advanced -def run_simple_demo(): +def run_simple_demo(args: argparse.Namespace): model_name = "mistralai/Pixtral-12B-2409" sampling_params = SamplingParams(max_tokens=8192) - # Lower max_num_seqs or max_model_len on low-VRAM GPUs. - llm = LLM(model=model_name, tokenizer_mode="mistral") + # Lower max_model_len and/or max_num_seqs on low-VRAM GPUs. + llm = LLM( + model=model_name, + tokenizer_mode="mistral", + max_model_len=4096, + max_num_seqs=2, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) prompt = "Describe this image in one sentence." image_url = "https://picsum.photos/id/237/200/300" @@ -76,7 +82,7 @@ def run_simple_demo(): print(outputs[0].outputs[0].text) -def run_advanced_demo(): +def run_advanced_demo(args: argparse.Namespace): model_name = "mistralai/Pixtral-12B-2409" max_img_per_msg = 5 max_tokens_per_img = 4096 @@ -87,6 +93,7 @@ def run_advanced_demo(): tokenizer_mode="mistral", limit_mm_per_prompt={"image": max_img_per_msg}, max_model_len=max_img_per_msg * max_tokens_per_img, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) prompt = "Describe the following image." 
@@ -153,14 +160,19 @@ def main(): help="Specify the demo mode: 'simple' or 'advanced'", ) + parser.add_argument( + '--disable-mm-preprocessor-cache', + action='store_true', + help='If True, disables caching of multi-modal preprocessor/mapper.') + args = parser.parse_args() if args.mode == "simple": print("Running simple demo...") - run_simple_demo() + run_simple_demo(args) elif args.mode == "advanced": print("Running advanced demo...") - run_advanced_demo() + run_advanced_demo(args) if __name__ == "__main__": diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 0e0d3711357e4..f761190a8d097 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -2,17 +2,23 @@ import copy from functools import partial -from typing import Optional +from typing import Optional, Union import numpy as np import pytest +from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk, + UserMessage) +from mistral_common.protocol.instruct.request import ChatCompletionRequest from PIL import Image +from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast from vllm.config import ModelConfig from vllm.inputs import InputProcessingContext -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.processing import ProcessingCache -from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict +from vllm.multimodal.inputs import MultiModalInputs +from vllm.multimodal.processing import BaseMultiModalProcessor, ProcessingCache +from vllm.transformers_utils.tokenizer import (MistralTokenizer, + cached_tokenizer_from_config) from ....multimodal.utils import random_audio, random_image, random_video from ...registry import HF_EXAMPLE_MODELS @@ -85,14 +91,6 @@ def _test_processing_correctness( partial(random_audio, rng, min_len=512, max_len=1024, 
sr=16000), } - tokenizer_encode_kwargs = {} - if model_config.hf_config.model_type in ("mllama", "whisper", "ultravox"): - # For some multimodal models, tokenizer will always add bos_token - # at the beginning of prompt by default, causing hf_processor outputs - # incorrect token ids. So we need use `add_special_tokens=False` here - # to leave bos_token to be added by the processor. - tokenizer_encode_kwargs = {"add_special_tokens": False} - for batch_idx in range(num_batches): mm_data = { k: @@ -115,43 +113,131 @@ def _test_processing_correctness( elif len(mm_data[k]) == 1: mm_data[k] = mm_data[k][0] - baseline_result = baseline_processor.apply( - prompt, - mm_data=mm_data, - hf_processor_mm_kwargs={}, - ) - cached_result = cached_processor.apply( - prompt, - mm_data=mm_data, - hf_processor_mm_kwargs={}, - ) + if isinstance(tokenizer, MistralTokenizer): + _test_processing_correctness_mistral( + model_config, + tokenizer, + prompt, + mm_data, + baseline_processor, + cached_processor, + batch_idx, + ignore_mm_keys=ignore_mm_keys, + ) + else: + _test_processing_correctness_hf( + model_config, + tokenizer, + prompt, + mm_data, + baseline_processor, + cached_processor, + batch_idx, + ignore_mm_keys=ignore_mm_keys, + ) - assert _drop_mm_kwargs_keys( - baseline_result, ignore_mm_keys) == _drop_mm_kwargs_keys( - cached_result, ignore_mm_keys), ( - f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") - baseline_tokenized_result = baseline_processor.apply( - tokenizer.encode(prompt, **tokenizer_encode_kwargs), - mm_data=mm_data, - hf_processor_mm_kwargs={}, - ) +def _test_processing_correctness_hf( + model_config: ModelConfig, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + prompt: str, + mm_data: MultiModalDataDict, + baseline_processor: BaseMultiModalProcessor, + cached_processor: BaseMultiModalProcessor, + batch_idx: int, + ignore_mm_keys: Optional[list[str]] = None, +): + if model_config.hf_config.model_type in ("mllama", "whisper", "ultravox"): + # For 
some multimodal models, tokenizer will always add bos_token + # at the beginning of prompt by default, causing hf_processor outputs + # incorrect token ids. So we need use `add_special_tokens=False` here + # to leave bos_token to be added by the processor. + token_prompt = tokenizer.encode(prompt, add_special_tokens=False) + else: + token_prompt = tokenizer.encode(prompt) - assert _drop_mm_kwargs_keys( - baseline_result, ignore_mm_keys) == _drop_mm_kwargs_keys( - baseline_tokenized_result, ignore_mm_keys), ( - f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") + baseline_result = baseline_processor.apply( + prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + cached_result = cached_processor.apply( + prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) - cached_tokenized_result = cached_processor.apply( - tokenizer.encode(prompt, **tokenizer_encode_kwargs), - mm_data=mm_data, - hf_processor_mm_kwargs={}, - ) + assert _inputs_equal( + baseline_result, + cached_result, + ignore_mm_keys, + ), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})" - assert _drop_mm_kwargs_keys( - cached_result, ignore_mm_keys) == _drop_mm_kwargs_keys( - cached_tokenized_result, ignore_mm_keys), ( - f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") + baseline_tokenized_result = baseline_processor.apply( + token_prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + assert _inputs_equal( + baseline_result, + baseline_tokenized_result, + ignore_mm_keys, + ), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})" + + cached_tokenized_result = cached_processor.apply( + token_prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + assert _inputs_equal( + cached_result, + cached_tokenized_result, + ignore_mm_keys, + ), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})" + + +def _test_processing_correctness_mistral( + model_config: ModelConfig, + tokenizer: MistralTokenizer, + prompt: str, + mm_data: MultiModalDataDict, + baseline_processor: BaseMultiModalProcessor, + 
cached_processor: BaseMultiModalProcessor, + batch_idx: int, + ignore_mm_keys: Optional[list[str]] = None, +): + images = mm_data.get("image", []) + if not isinstance(images, list): + images = [images] + + request = ChatCompletionRequest(messages=[ + UserMessage(content=[ + TextChunk(text=prompt), + *(ImageChunk(image=image) for image in images), + ]), + ]) + res = tokenizer.mistral.encode_chat_completion(request) + token_prompt = res.tokens + + # Mistral chat outputs tokens directly, rather than text prompts + baseline_tokenized_result = baseline_processor.apply( + token_prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + cached_tokenized_result = cached_processor.apply( + token_prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + assert _inputs_equal( + baseline_tokenized_result, + cached_tokenized_result, + ignore_mm_keys, + ), f"Failed ({batch_idx=}, {prompt=}, {mm_data=})" # yapf: disable @@ -173,6 +259,7 @@ def _test_processing_correctness( "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", "meta-llama/Llama-3.2-11B-Vision-Instruct", "TIGER-Lab/Mantis-8B-siglip-llama3", + "mistralai/Pixtral-12B-2409", "mistral-community/pixtral-12b", "openbmb/MiniCPM-o-2_6", "openbmb/MiniCPM-V-2_6", @@ -241,8 +328,19 @@ def test_processing_correctness_phi3v( ) -def _drop_mm_kwargs_keys(result: dict, - ignore_mm_keys: Optional[list[str]] = None) -> dict: +def _inputs_equal( + a: MultiModalInputs, + b: MultiModalInputs, + ignore_mm_keys: Optional[list[str]] = None, +): + return _drop_mm_kwargs_keys(a, ignore_mm_keys) == _drop_mm_kwargs_keys( + b, ignore_mm_keys) + + +def _drop_mm_kwargs_keys( + result: MultiModalInputs, + ignore_mm_keys: Optional[list[str]] = None, +) -> MultiModalInputs: """Drop specified keys from result['mm_kwargs']. This is mainly to avoid doing exact match of audio_features in ultravox. 
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 42bf6a5b2979a..3a8d184528d8b 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -68,23 +68,15 @@ class PixtralHFImagePixelInputs(TypedDict): in which case the data is passed as a list instead of a batched tensor. """ - feat_is_patch: Union[torch.Tensor, list[torch.Tensor]] - """ - A boolean mask indicating which image features correspond - to patch tokens. - - Shape: `(batch_size, num_crops, num_patch)` - """ - embed_is_patch: Union[torch.Tensor, list[torch.Tensor]] """ A boolean mask indicating which image embeddings correspond to patch tokens. - Shape: `(batch_size, num_embeds)` + Shape: `(batch_size, num_images, num_embeds)` """ - num_crops: Union[torch.Tensor, list[torch.Tensor]] + num_patches: Union[torch.Tensor, list[torch.Tensor]] """Shape: `(batch_size, num_images)`""" @@ -360,16 +352,16 @@ class PixtralHFMultiModalProcessor( image_height=pixel_value.shape[-2], ) for pixel_value in processed_outputs["pixel_values"] ] - num_crops = torch.tensor([(ncols + 1) * nrows - for ncols, nrows in tile_sizes]) + num_patches = torch.tensor([(ncols + 1) * nrows + for ncols, nrows in tile_sizes]) # Each image may result to masks of different sizes, so we need to - # flatten the list and later use `num_crops` to get per-image masks. - embed_is_patch = torch.tensor( - flatten_2d_lists([([True] * ncols + [False]) * nrows - for ncols, nrows in tile_sizes])) - processed_outputs["num_crops"] = num_crops + # later use `num_patches` to get per-image masks. 
+ embed_is_patch = [ + torch.tensor(([True] * ncols + [False]) * nrows) + for ncols, nrows in tile_sizes + ] + processed_outputs["num_patches"] = num_patches processed_outputs["embed_is_patch"] = embed_is_patch - processed_outputs["feat_is_patch"] = embed_is_patch return processed_outputs @@ -378,14 +370,10 @@ class PixtralHFMultiModalProcessor( hf_inputs: BatchFeature, hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: - num_crops = hf_inputs.get("num_crops", torch.empty(0)).view(-1) return dict( - feat_is_patch=MultiModalFieldConfig.flat_from_sizes( - "image", num_crops), - embed_is_patch=MultiModalFieldConfig.flat_from_sizes( - "image", num_crops), - num_crops=MultiModalFieldConfig.batched("image"), pixel_values=MultiModalFieldConfig.batched("image"), + num_patches=MultiModalFieldConfig.batched("image"), + embed_is_patch=MultiModalFieldConfig.batched("image"), image_embeds=MultiModalFieldConfig.batched("image"), ) @@ -628,27 +616,21 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): f"Got type: {type(pixel_values)}") if self.config.vision_config.model_type == "pixtral": - feat_is_patch = kwargs.pop("feat_is_patch") - if not isinstance(feat_is_patch, (torch.Tensor, list)): - raise ValueError("Incorrect type of feat_is_patch. " - f"Got type: {type(feat_is_patch)}") - embed_is_patch = kwargs.pop("embed_is_patch") if not isinstance(embed_is_patch, (torch.Tensor, list)): raise ValueError("Incorrect type of embed_is_patch. " f"Got type: {type(embed_is_patch)}") - num_crops = kwargs.pop("num_crops") - if not isinstance(num_crops, (torch.Tensor, list)): - raise ValueError("Incorrect type of num_crops. " - f"Got type: {type(num_crops)}") + num_patches = kwargs.pop("num_patches") + if not isinstance(num_patches, (torch.Tensor, list)): + raise ValueError("Incorrect type of num_patches. 
" + f"Got type: {type(num_patches)}") return PixtralHFImagePixelInputs( type="pixel_values_pixtral", pixel_values=flatten_bn(pixel_values), - feat_is_patch=feat_is_patch, embed_is_patch=embed_is_patch, - num_crops=num_crops, + num_patches=num_patches, ) return LlavaImagePixelInputs( @@ -687,21 +669,26 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): vision_tower: Union[CLIPVisionModel, SiglipVisionModel, PixtralHFVisionModel], pixel_values: Union[torch.Tensor, list[torch.Tensor]], - ) -> torch.Tensor: - + ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: # NOTE: we skip the step to select the vision feature layer since # this is already done inside the vision tower image_features = vision_tower(pixel_values) - return self._select_image_features( - image_features, - strategy=self.config.vision_feature_select_strategy, + def select_features(leaf: torch.Tensor): + return self._select_image_features( + leaf, + strategy=self.config.vision_feature_select_strategy, + ) + + return cast( + Union[torch.Tensor, tuple[torch.Tensor, ...]], + json_map_leaves(select_features, image_features), ) def _process_image_pixels( self, inputs: Union[LlavaImagePixelInputs, PixtralHFImagePixelInputs], - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: assert self.vision_tower is not None pixel_values = inputs["pixel_values"] @@ -731,45 +718,30 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): def _get_mm_embeds( self, - features: torch.Tensor, # Shape: (num_crop, num_patch, d) - feat_is_patch: torch.Tensor, # Shape: (num_crop, num_patch) - num_crops: torch.Tensor, # Shape: (num_images,) - embed_is_patch: torch.Tensor, # Shape: (num_embeds,) - ) -> list[torch.Tensor]: + features: torch.Tensor, # Shape: (num_patch, d) + num_patches: torch.Tensor, # Shape: (num_images,) + embed_is_patch: torch.Tensor, # Shape: (num_images, num_embeds) + ) -> tuple[torch.Tensor, ...]: """Scatter the patch features into 
a contiguous tensor that corresponds to the embedding tokens defined by the multimodal processor. Mostly copied from `Molmo._get_mm_embeds`. See following fixme comment. """ - - # Insert columns of nan values according to `feat_is_patch`. This work + # Insert columns of nan values according to `embed_is_patch`. This work # ideally should be done in `_process_image_input`, but # `_process_image_input` is used in both V0 and V1 path. It's safer to # put the logic here. # FIXME: Move this logic to `_process_image_input` when v0 is # deprecated. Merge this function with `Molmo._get_mm_embeds`. - feat_is_patch = feat_is_patch.view(-1) - embed_is_patch = embed_is_patch.view(-1) - expanded_embedding = torch.full( - (sum(num_crops), *features.shape[1:]), - torch.nan, - dtype=features.dtype).to(features.device) - expanded_embedding[feat_is_patch] = features + num_patches_per_image: list[int] = num_patches.tolist() - num_crops_per_image = num_crops.tolist() - feats_per_image = expanded_embedding.split(num_crops_per_image) - f_is_patch_per_image = feat_is_patch.split(num_crops_per_image) + embeds_flat = features.new_full( + (sum(num_patches_per_image), *features.shape[1:]), + fill_value=torch.nan, + ) + embeds_flat[embed_is_patch.view(-1)] = features - embed_dim = expanded_embedding.shape[-1] - num_embeds = embed_is_patch.shape[0] - - embeds_in_batch = list[torch.Tensor]() - for feats, f_is_patch in zip(feats_per_image, f_is_patch_per_image): - embeds = feats.new_full((num_embeds, embed_dim), torch.nan) - embeds[embed_is_patch] = feats[f_is_patch] - embeds_in_batch.append(embeds) - - return embeds_in_batch + return embeds_flat.split(num_patches_per_image) def get_multimodal_embeddings( self, **kwargs: object) -> Optional[MultiModalEmbeddings]: @@ -784,12 +756,12 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): # The path is used for pixtral (V0 only) and llava (V0/V1) return vision_embeddings - nested_emb = [ + return flatten_2d_lists( 
self._get_mm_embeds(*args) for args in zip( - vision_embeddings, image_input["feat_is_patch"], - image_input["num_crops"], image_input["embed_is_patch"]) - ] - return flatten_2d_lists(nested_emb) + vision_embeddings, + image_input["num_patches"], + image_input["embed_is_patch"], + )) def get_input_embeddings( self, @@ -805,9 +777,11 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): ) inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, cast(NestedTensors, - patch_embeddings), - self.config.image_token_index) + input_ids, + inputs_embeds, + cast(NestedTensors, patch_embeddings), + self.config.image_token_index, + ) return inputs_embeds def forward( diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index e709b08815eaf..c7f6cf461d523 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1585,15 +1585,13 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, image_features = self._process_image_input(image_input) - nested_embeds = [ + return flatten_2d_lists( self._get_mm_embeds(*args) for args in zip( image_features, image_input["feat_is_patch"], image_input["num_crops"], image_input["embed_is_patch"], - ) - ] - return flatten_2d_lists(nested_embeds) + )) def get_input_embeddings( self, diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 88a6226d21448..8a773607ce4ed 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 - -from typing import (Iterable, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from collections.abc import Iterable, Mapping, Sequence +from typing import Literal, Optional, Set, Tuple, TypedDict, Union import torch from torch import nn @@ -17,7 +16,7 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from 
vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptIndexTargets, - PromptInsertion, PromptReplacement, + PromptInsertion, PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors @@ -144,7 +143,7 @@ class PaliGemmaMultiModalProcessor( mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, - ) -> list[PromptReplacement]: + ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 2e71390623fdf..fff630056e405 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -1,26 +1,28 @@ # SPDX-License-Identifier: Apache-2.0 import math -from collections.abc import Iterable, Mapping +from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass, fields from functools import cached_property -from typing import List, Optional, Set, Tuple, Union +from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union, cast import torch import torch.nn as nn import torch.nn.functional as F from mistral_common.protocol.instruct.messages import ImageChunk +from mistral_common.tokens.tokenizers.multimodal import ImageEncoder from PIL import Image -from transformers import PixtralVisionConfig +from transformers import PixtralVisionConfig, TensorType +from transformers.image_utils import ImageInput from transformers.models.pixtral.image_processing_pixtral import ( _num_image_tokens as _get_pixtral_hf_num_image_tokens) from transformers.models.pixtral.modeling_pixtral import ( PixtralRotaryEmbedding, apply_rotary_pos_emb, position_ids_in_meshgrid) +from transformers.tokenization_utils_base import TextInput from vllm.config 
import VllmConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) +from vllm.jsontree import JSONTree, json_map_leaves from vllm.model_executor.layers.activation import get_act_and_mul_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -31,13 +33,20 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.inputs import PlaceholderRange -from vllm.multimodal.utils import consecutive_placeholder_ranges -from vllm.sequence import IntermediateTensors, SequenceData -from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config +from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors +from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, + MultiModalDataItems) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement, + PromptUpdate) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.tokenizer import (MistralTokenizer, + cached_tokenizer_from_config) +from vllm.utils import flatten_2d_lists from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP -from .utils import (init_vllm_registered_model, maybe_prefix, +from .utils import (flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs @@ -48,132 +57,275 @@ except ImportError: USE_XFORMERS_OPS = False -def get_max_pixtral_image_tokens(ctx: InputContext): - 
tokenizer = cached_tokenizer_from_config(ctx.model_config) - mm_encoder = tokenizer.instruct.mm_encoder +class PixtralImagePixelInputs(TypedDict): + type: Literal["pixel_values"] - image_config = mm_encoder.mm_config if hasattr( - mm_encoder, "mm_config") else mm_encoder.image_config - - max_image_size = image_config.max_image_size - image_patch_size = image_config.image_patch_size - - return ((max_image_size // image_patch_size)**2) - - -def dummy_data_for_pixtral(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - tokenizer = cached_tokenizer_from_config(ctx.model_config) - - mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder - image_token_id = mm_encoder.special_ids.img - - mm_config = ctx.get_mm_config() - num_images = mm_config.get_limit_per_prompt("image") - - # dummy size - size = 256 - image = Image.new("RGB", (size, size), color=0) - - encoding = tokenizer.instruct.mm_encoder(ImageChunk(image=image)) - image_feature_size = len(encoding.tokens) - num_image_tokens = image_feature_size * num_images - seq_data = SequenceData.from_prompt_token_counts( - (image_token_id, num_image_tokens), - (0, seq_len - num_image_tokens), - ) - - mm_data = {"image": num_images * [image]} - mm_placeholders = { - "image": - consecutive_placeholder_ranges(num_items=num_images, - item_size=image_feature_size) - } - return DummyData(seq_data, mm_data, mm_placeholders) - - -def input_mapper_for_pixtral(ctx: InputContext, - data: object) -> MultiModalKwargs: - """Maps the input data to its MultiModalKwargs (if any). - - Args: - ctx: Context of the loaded model. - data: data potentially containing PIL images to be processed - and mapped to `images`. - - Returns: - MultiModalKwargs containing the stacked normalized images tensor or - image embeddings. 
+    images: Union[torch.Tensor, list[torch.Tensor]]
     """
-    tokenizer = cached_tokenizer_from_config(ctx.model_config)
+    Shape: `(batch_size * num_images, num_channels, image_width, image_height)`
 
-    data_list = data if isinstance(data, list) else [data]
+    The result of stacking :attr:`ImageEncoding.image` from each prompt.
+    """
 
-    images = []
-    image_tokens_list = []
-    for image_data in data_list:
-        image = ImageChunk(image=image_data)
-        encoding = tokenizer.instruct.mm_encoder(image)
-        image = torch.from_numpy(encoding.image).to(dtype=torch.float16)
-        images.append(image)
-        image_tokens_list.append(encoding.tokens)
+    embed_is_patch: Union[torch.Tensor, list[torch.Tensor]]
+    """
+    A boolean mask indicating which image embeddings correspond
+    to patch tokens.
+
+    Shape: `(batch_size, num_images, num_embeds)`
+    """
 
-    image_tokens = torch.tensor([
-        token_id for image_tokens in image_tokens_list
-        for token_id in image_tokens
-    ])
-    return MultiModalKwargs({"images": images, "image_tokens": image_tokens})
+    num_patches: Union[torch.Tensor, list[torch.Tensor]]
+    """Shape: `(batch_size, num_images)`"""
 
-def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs):
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None or "image" not in multi_modal_data:
-        return inputs
+class PixtralProcessorAdapter:
+    """
+    Provide a HF-compatible interface for
+    :class:`mistral_common.tokens.tokenizers.multimodal.ImageEncoder`.
+ """ - prompt_token_ids = inputs.get("prompt_token_ids") - prompt = inputs.get("prompt") - tokenizer = cached_tokenizer_from_config(ctx.model_config) + def __init__(self, tokenizer: MistralTokenizer) -> None: + super().__init__() - mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder - image_token_id = mm_encoder.special_ids.img - image_break_id = mm_encoder.special_ids.img_break - image_end_id = mm_encoder.special_ids.img_end + self.tokenizer = tokenizer - if image_token_id not in inputs['prompt_token_ids']: - raise ValueError( - f"You've passed {inputs=} without {image_token_id=}" - " Make sure to process your input via mistral_common's" - " tokenizer or pass a chat completion request. For more" - " For more info, see: " - "https://github.com/vllm-project/vllm/issues/8411.") + @property + def image_processor(self) -> ImageEncoder: + image_encoder = self.tokenizer.instruct.mm_encoder + assert isinstance(image_encoder, ImageEncoder) + return image_encoder - # Get precise tracking of placeholder positions - placeholder_ranges = [] - curr_offset = -1 - curr_length = 0 - for i in range(len(prompt_token_ids)): - if prompt_token_ids[i] in (image_token_id, image_break_id): - if curr_offset < 0: - curr_offset = i - curr_length += 1 - elif prompt_token_ids[i] == image_end_id: - curr_length += 1 - placeholder_ranges.append( - PlaceholderRange(offset=curr_offset, length=curr_length)) - curr_offset = -1 - curr_length = 0 - else: - pass - return token_inputs(prompt=prompt, - prompt_token_ids=prompt_token_ids, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"image": placeholder_ranges}) + @cached_property + def image_break_id(self) -> int: + return self.image_processor.special_ids.img_break + + @cached_property + def image_token_id(self) -> int: + return self.image_processor.special_ids.img + + @cached_property + def image_end_id(self) -> int: + return self.image_processor.special_ids.img_end + + @cached_property + def image_size(self) -> int: + return 
self.image_processor.mm_config.max_image_size + + @cached_property + def patch_size(self) -> int: + return self.image_processor.mm_config.image_patch_size + + def __call__( + self, + text: Optional[Union[TextInput, list[TextInput]]] = None, + images: Optional[Union[ImageInput, list[ImageInput]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> Mapping[str, NestedTensors]: + if text is None: + text = [] + if not isinstance(text, list): + text = [text] + if images is None: + images = [] + if not isinstance(images, list): + images = [images] + + if not images: + input_ids = self.tokenizer(text).input_ids + + return {"input_ids": torch.tensor(input_ids)} + + # Allow dummy text, which is used for profiling as well as token inputs + if any(len(t) > 0 for t in text): + raise ValueError( + "You've passed text inputs instead of token inputs. " + "Make sure to process your input via `mistral_common`'s " + "tokenizer or pass a chat completion request. " + "For more info, see: " + "https://github.com/vllm-project/vllm/issues/8411.") + + image_token_id = self.image_token_id + + images_processed = list[torch.Tensor]() + images_tokens = list[torch.Tensor]() + images_embed_is_patch = list[torch.Tensor]() + images_num_patches = list[int]() + + for image in images: + image_inputs = self.image_processor(ImageChunk(image=image)) + + image_processed = torch.tensor(image_inputs.image) + image_tokens = torch.tensor(image_inputs.tokens) + + images_processed.append(image_processed) + images_tokens.append(image_tokens) + images_embed_is_patch.append(image_tokens == image_token_id) + images_num_patches.append(len(image_tokens)) + + return { + "input_ids": torch.cat(images_tokens)[None].expand(len(text), -1), + "images": images_processed, + "embed_is_patch": images_embed_is_patch, + "num_patches": torch.tensor(images_num_patches), + } -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_pixtral) 
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_pixtral_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_pixtral) -@INPUT_REGISTRY.register_input_processor(input_processor_for_pixtral) +class PixtralProcessingInfo(BaseProcessingInfo): + + def get_tokenizer(self) -> MistralTokenizer: + tokenizer = cached_tokenizer_from_config(self.ctx.model_config) + if not isinstance(tokenizer, MistralTokenizer): + raise ValueError("This model requires `--tokenizer-mode mistral`") + + return tokenizer + + def get_hf_processor(self) -> PixtralProcessorAdapter: + return PixtralProcessorAdapter(self.get_tokenizer()) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + return {"image": self.get_max_image_tokens()} + + def get_vision_config( + self, + processor: Optional[PixtralProcessorAdapter] = None, + ): + if processor is None: + processor = self.get_hf_processor() + + return PixtralVisionConfig( + image_size=processor.image_size, + patch_size=processor.patch_size, + ) + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + processor: Optional[PixtralProcessorAdapter] = None, + ) -> int: + if processor is None: + processor = self.get_hf_processor() + + ncols, nrows = processor.image_processor._image_to_num_tokens( + Image.new("RGB", (image_width, image_height))) + + return (ncols + 1) * nrows + + def get_image_size_with_most_features(self) -> ImageSize: + image_processor = self.get_hf_processor().image_processor + max_image_size = image_processor.mm_config.max_image_size + + return ImageSize(width=max_image_size, height=max_image_size) + + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + ) + + +class 
PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="", + mm_data=mm_data, + ) + + +class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo] + ): + + def _get_mm_fields_config( + self, + hf_inputs: Mapping[str, NestedTensors], + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + images=MultiModalFieldConfig.batched("image"), + embed_is_patch=MultiModalFieldConfig.batched("image"), + num_patches=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + image_break_id = processor.image_break_id + image_token_id = processor.image_token_id + image_end_id = processor.image_end_id + + def get_replacement(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + + ncols, nrows = processor.image_processor._image_to_num_tokens( + Image.new("RGB", (image_size.width, image_size.height))) + + tokens = ([image_token_id] * ncols + [image_break_id]) * nrows + tokens[-1] = image_end_id + + return tokens + + return [ + PromptReplacement( + modality="image", + target="", # Never match the prompt (see below note) + replacement=get_replacement, + ), + ] + + def _cached_apply_hf_processor( + self, + prompt: Union[str, list[int]], + mm_data_items: MultiModalDataItems, + 
hf_processor_mm_kwargs: Mapping[str, object], + ) -> tuple[list[int], MultiModalKwargs, bool]: + prompt_ids, mm_kwargs, _ = super()._cached_apply_hf_processor( + prompt=prompt, + mm_data_items=mm_data_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) + + # NOTE: The tokens are already inserted by the chat template + return prompt_ids, mm_kwargs, True + + +@MULTIMODAL_REGISTRY.register_processor(PixtralMultiModalProcessor, + info=PixtralProcessingInfo, + dummy_inputs=PixtralDummyInputsBuilder) class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): @@ -191,13 +343,6 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, if key in dataclass_fields } - if not ("image_break_token_id" in vision_args - and "image_end_token_id" in vision_args): - raise ValueError( - "'image_break_token_id' and 'image_end_token_id' not found " - "in the vision_encoder arguments. Please download the latest " - "version of 'params.json' from the model repository.") - self.vision_args = VisionEncoderArgs(**vision_args) # init MistralForCausalLM @@ -221,36 +366,92 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, return get_sampler() + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[PixtralImagePixelInputs]: + images = kwargs.pop("images", None) + if images is None: + return None + + if not isinstance(images, (torch.Tensor, list)): + raise ValueError("Incorrect type of images. " + f"Got type: {type(images)}") + + embed_is_patch = kwargs.pop("embed_is_patch") + if not isinstance(embed_is_patch, (torch.Tensor, list)): + raise ValueError("Incorrect type of embed_is_patch. " + f"Got type: {type(embed_is_patch)}") + + num_patches = kwargs.pop("num_patches") + if not isinstance(num_patches, (torch.Tensor, list)): + raise ValueError("Incorrect type of num_patches. 
" + f"Got type: {type(num_patches)}") + + return PixtralImagePixelInputs( + type="pixel_values", + images=flatten_bn(images), + embed_is_patch=embed_is_patch, + num_patches=num_patches, + ) + + def _process_image_input( + self, + image_input: PixtralImagePixelInputs, + ) -> tuple[torch.Tensor, ...]: + images = image_input["images"] + + image_features = self.vision_encoder(images) + feature_sizes = [ + image_feature.shape[0] for image_feature in image_features + ] + + image_embeds = self.vision_language_adapter(torch.cat(image_features)) + image_embeds = torch.split(image_embeds, feature_sizes) + return image_embeds + + def _get_mm_embeds( + self, + features: torch.Tensor, # Shape: (num_patch, d) + num_patches: torch.Tensor, # Shape: (num_images,) + embed_is_patch: torch.Tensor, # Shape: (num_images, num_embeds) + ) -> tuple[torch.Tensor, ...]: + """Scatter the patch features into a contiguous tensor that corresponds + to the embedding tokens defined by the multimodal processor. + + Mostly copied from `Molmo._get_mm_embeds`. See following fixme comment. + """ + # Insert columns of nan values according to `embed_is_patch`. This work + # ideally should be done in `_process_image_input`, but + # `_process_image_input` is used in both V0 and V1 path. It's safer to + # put the logic here. + # FIXME: Move this logic to `_process_image_input` when v0 is + # deprecated. Merge this function with `Molmo._get_mm_embeds`. 
+ num_patches_per_image: list[int] = num_patches.tolist() + + embeds_flat = features.new_full( + (sum(num_patches_per_image), *features.shape[1:]), + fill_value=torch.nan, + ) + embeds_flat[embed_is_patch.view(-1)] = features + + return embeds_flat.split(num_patches_per_image) + def get_multimodal_embeddings( self, **kwargs: object) -> Optional[MultiModalEmbeddings]: - image_input, image_tokens = self._parse_and_validate_image_input( - **kwargs) + image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None - vision_embeddings = self._process_image_input(image_input) + image_features = self._process_image_input(image_input) - # NOTE: We patch the outputs of the vision encoder with embeddings - # from `[IMG_BREAK]` and `[IMG_END]` tokens. - image_embeds = self.language_model.get_input_embeddings(image_tokens) - image_token_mask = image_tokens == self.vision_args.image_token_id - image_embeds[image_token_mask] = vision_embeddings + if kwargs.get("v0_path", False): + return image_features - # NOTE: Image embeddings are split into separate tensors for each image - # by the indices of `[IMG_END]` token. 
- image_end_mask = image_tokens == self.vision_args.image_end_token_id - split_indices = torch.where(image_end_mask)[0] + 1 - if len(split_indices) <= 1: - # Do not split, return as tensor of shape [1, fs, hs] - return image_embeds.unsqueeze(0) - - # If the last split index is the last index in image_tokens, we - # ignore it to avoid empty split tensor - if split_indices[-1] == len(image_tokens): - split_indices = split_indices[:-1] - - image_embeds = image_embeds.tensor_split(split_indices.cpu()) - return image_embeds + return flatten_2d_lists( + self._get_mm_embeds(*args) for args in zip( + image_features, + image_input["num_patches"], + image_input["embed_is_patch"], + )) def get_input_embeddings( self, @@ -259,12 +460,17 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: + # Extract the patch tokens + patch_embeddings = json_map_leaves( + lambda x: x[~x.isnan()].view(-1, *x.shape[1:]), + cast(JSONTree[torch.Tensor], multimodal_embeddings), + ) inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, [ - self.vision_args.image_token_id, - self.vision_args.image_break_token_id, - self.vision_args.image_end_token_id, - ]) + input_ids, + inputs_embeds, + cast(NestedTensors, patch_embeddings), + self.vision_args.image_token_id, + ) return inputs_embeds def forward( @@ -275,14 +481,14 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: - """Run forward pass for pixtral. - """ + """Run forward pass for pixtral.""" if intermediate_tensors is not None: inputs_embeds = None # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. 
elif inputs_embeds is None: + kwargs.update({"v0_path": True}) vision_embeddings = self.get_multimodal_embeddings(**kwargs) inputs_embeds = self.get_input_embeddings(input_ids, vision_embeddings) @@ -295,47 +501,6 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, return hidden_states - def _parse_and_validate_image_input( - self, - images: Optional[Union[List[List[torch.Tensor]], List[torch.Tensor], - torch.Tensor]] = None, - image_tokens: Optional[torch.Tensor] = None, - ) -> Tuple[Optional[List[torch.Tensor]], Optional[torch.Tensor]]: - if images is None: - return None, None - - if isinstance(images, torch.Tensor): - # if passed as batch take all images - N, B, C, W, H = images.shape - images = images.reshape(N * B, C, W, H) - images = [images[i] for i in range(images.size(0))] - elif isinstance(images, list): - # if passed as list flatten lists of tensors - flatten_images = [] - for imgs_per_req in images: - imgs_per_req = [ - imgs_per_req[i] for i in range(imgs_per_req.size(0)) - ] if isinstance(imgs_per_req, torch.Tensor) else imgs_per_req - - flatten_images.extend(imgs_per_req) - - images = flatten_images - - if isinstance(image_tokens, torch.Tensor): - # image_tokens are batched - image_tokens = image_tokens.flatten() - elif isinstance(image_tokens, list): - # image_tokens are of different lengths thus passed as a list - image_tokens = torch.cat(image_tokens) - - assert image_tokens.dim() == 1 - - return images, image_tokens - - def _process_image_input(self, - image_input: List[torch.Tensor]) -> torch.Tensor: - return self.vision_language_adapter(self.vision_encoder(image_input)) - def compute_logits( self, hidden_states: torch.Tensor, @@ -400,8 +565,6 @@ class VisionEncoderArgs: num_attention_heads: int rope_theta: float # for rope-2D image_token_id: int - image_break_token_id: int - image_end_token_id: int adapter_bias: bool = True @@ -637,9 +800,13 @@ class VisionTransformer(nn.Module): 
self.patch_conv(img.unsqueeze(0).to(self.dtype)) for img in images ] + patch_embeds = [ + p.flatten(2).permute(0, 2, 1) for p in patch_embeds_list + ] + embed_sizes = [p.shape[1] for p in patch_embeds] + # flatten to a single sequence - patch_embeds = torch.cat( - [p.flatten(2).permute(0, 2, 1) for p in patch_embeds_list], dim=1) + patch_embeds = torch.cat(patch_embeds, dim=1) patch_embeds = self.ln_pre(patch_embeds) # positional embeddings @@ -655,8 +822,8 @@ class VisionTransformer(nn.Module): "with the Mistral format") out = self.transformer(patch_embeds, mask=mask, freqs_cis=freqs_cis) - # remove batch dimension of the single sequence - return out.squeeze(0) + # squeeze dim 0 and split into separate tensors for each image + return torch.split(out.squeeze(0), embed_sizes) class VisionLanguageAdapter(nn.Module): @@ -978,9 +1145,9 @@ class PixtralHFVisionModel(nn.Module): def forward( self, - pixel_values: List[torch.Tensor], + pixel_values: list[torch.Tensor], feature_sample_layers: Optional[list[int]] = None, - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, ...]: """ Args: pixel_values: Each image to be processed will be a separate tensor @@ -1039,8 +1206,7 @@ class PixtralHFVisionModel(nn.Module): self.config.num_hidden_layers) # squeeze dim 0 and split into separate tensors for each image - out = torch.split(torch.squeeze(out), embed_sizes) - return out + return torch.split(out.squeeze(0), embed_sizes) # (TODO) Add prefix argument for filtering out weights to be loaded # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index cdbbed27a5218..10c53dfb2c66e 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -77,7 +77,9 @@ class PromptIndexTargets: else: if isinstance(prefix, str): # Make both `list[int]` - prefix = encode_tokens(tokenizer, prefix) + prefix = encode_tokens(tokenizer, + prefix, + add_special_tokens=False) match_idx = 
len(prefix) return match_idx if prompt[:match_idx] == prefix else None @@ -318,7 +320,7 @@ def _cached_encode( tokenizer: AnyTokenizer, text: str, *, - add_special_tokens: bool = False, + add_special_tokens: Optional[bool] = None, ) -> list[int]: return encode_tokens(tokenizer, text, @@ -330,7 +332,7 @@ def _cached_decode( tokenizer: AnyTokenizer, token_ids: tuple[int, ...], *, - skip_special_tokens: bool = False, + skip_special_tokens: Optional[bool] = None, ) -> str: return decode_tokens(tokenizer, list(token_ids), @@ -395,7 +397,9 @@ class _BoundPromptSequence: def token_ids(self) -> list[int]: if self._token_ids is None: assert self._text is not None - self._token_ids = _cached_encode(self.tokenizer, self._text) + self._token_ids = _cached_encode(self.tokenizer, + self._text, + add_special_tokens=False) return self._token_ids @@ -1046,7 +1050,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, - ) -> list[PromptUpdate]: + ) -> Sequence[PromptUpdate]: """ Given the original multi-modal items for this modality and HF-processed data, output the updates to perform. diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 2c34f2f5d44d5..1bfb50328338f 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -34,13 +34,20 @@ def decode_tokens( tokenizer: AnyTokenizer, token_ids: list[int], *, - skip_special_tokens: bool = False, + skip_special_tokens: Optional[bool] = None, ) -> str: """ Backend-agnostic equivalent of HF's - :code:`tokenizer.decode(token_ids, skip_special_tokens=...)`. + :code:`tokenizer.decode(token_ids, ...)`. + + :code:`skip_special_tokens=None` means to use the backend's default + settings. 
""" - return tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) + if skip_special_tokens is not None: + return tokenizer.decode(token_ids, + skip_special_tokens=skip_special_tokens) + + return tokenizer.decode(token_ids) def encode_tokens( @@ -51,10 +58,14 @@ def encode_tokens( ) -> list[int]: """ Backend-agnostic equivalent of HF's - :code:`tokenizer.encode(text, add_special_tokens=...)`. + :code:`tokenizer.encode(text, ...)`. + + :code:`add_special_tokens=None` means to use the backend's default + settings. """ if add_special_tokens is not None: return tokenizer.encode(text, add_special_tokens=add_special_tokens) + return tokenizer.encode(text) diff --git a/vllm/utils.py b/vllm/utils.py index 632b3666e959c..79787303af5bc 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -845,7 +845,7 @@ def is_list_of( assert_never(check) -def flatten_2d_lists(lists: list[list[T]]) -> list[T]: +def flatten_2d_lists(lists: Iterable[Iterable[T]]) -> list[T]: """Flatten a list of lists to a single list.""" return [item for sublist in lists for item in sublist] From 3453b964a3ed84d99c9ae33bc0fae00790df36ef Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sat, 15 Mar 2025 18:46:17 -0700 Subject: [PATCH 060/169] [Misc][Doc] Minor benchmark README update (#14874) Signed-off-by: Roger Wang --- benchmarks/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index c64c24fd3ad05..3225a4b0db3a0 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -82,10 +82,10 @@ Then run the benchmarking script # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B" NUM_PROMPTS=10 -BACKEND="openai-chat" +BACKEND="vllm" DATASET_NAME="sharegpt" DATASET_PATH="/ShareGPT_V3_unfiltered_cleaned_split.json" -python3 
vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/chat/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS} +python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS} ``` If successful, you will see the following output From def232e122624504e49f1e5ff0ae01a7285de1a3 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 16 Mar 2025 09:53:52 +0800 Subject: [PATCH 061/169] [VLM] Clean up Phi-4-MM ViT implementation (#14812) Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Cyrus Leung --- requirements/test.in | 1 + requirements/test.txt | 2 + .../vision_language/test_phi4mm.py | 229 ++ vllm/model_executor/models/aria.py | 4 +- .../models/idefics2_vision_model.py | 57 +- vllm/model_executor/models/phi4mm.py | 45 +- .../models/vision_siglip_navit.py | 1966 ----------------- 7 files changed, 316 insertions(+), 1988 deletions(-) create mode 100644 tests/models/decoder_only/vision_language/test_phi4mm.py delete mode 100644 vllm/model_executor/models/vision_siglip_navit.py diff --git a/requirements/test.in b/requirements/test.in index cc89d518c7eec..c171e8d41ddc2 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -8,6 +8,7 @@ pytest-shard # testing utils awscli +backoff # required for phi4mm test decord # required for video tests einops # required for MPT, qwen-vl and Mamba httpx diff --git a/requirements/test.txt b/requirements/test.txt index c2cdd2c8664d8..10fb1f14c3a18 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -33,6 +33,8 @@ audioread==3.0.1 # via librosa awscli==1.35.23 # via -r requirements/test.in +backoff==2.2.1 + # via -r requirements/test.in bitsandbytes==0.45.3 # via -r requirements/test.in black==24.10.0 diff --git a/tests/models/decoder_only/vision_language/test_phi4mm.py 
# SPDX-License-Identifier: Apache-2.0
"""End-to-end consistency tests for microsoft/Phi-4-multimodal-instruct.

Runs the same image prompts through vLLM (with the bundled vision LoRA
adapter applied) and through the HuggingFace reference implementation
(eager attention), then compares greedy logprobs with
``check_logprobs_close``.
"""

import os
import re
from typing import Optional

import pytest
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer

from vllm.lora.request import LoRARequest
from vllm.multimodal.image import rescale_image_size
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs

from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test
from ...utils import check_logprobs_close

# Chat-formatted prompts for each image asset, using Phi-4-MM's
# <|image_N|> placeholder convention.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
    "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
    "cherry_blossom":
    "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
})
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501

# Downloaded once at import time so every test shares the same snapshot.
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
models = [model_path]


def vllm_to_hf_output(
    vllm_output: tuple[list[int], str, Optional[SampleLogprobs]],
    model: str,
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
    """Sanitize vllm output to be comparable with hf output.

    Strips the expanded ``<|image_N|>`` placeholder tokens and the leading
    space from the vLLM text, re-appends HF's end-of-turn markers, and
    re-tokenizes so the token ids line up with the HF runner's output.
    """
    _, output_str, out_logprobs = vllm_output

    output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", "", output_str)
    assert output_str_without_image[0] == " "
    output_str_without_image = output_str_without_image[1:]

    hf_output_str = output_str_without_image + "<|end|><|endoftext|>"

    tokenizer = AutoTokenizer.from_pretrained(model)
    hf_output_ids = tokenizer.encode(output_str_without_image)
    # Drop the first encoded id so the ids align with the HF generation
    # output. NOTE(review): assumes the tokenizer always prepends a
    # BOS-like token with id 1 — confirm against the tokenizer config.
    assert hf_output_ids[0] == 1
    hf_output_ids = hf_output_ids[1:]

    return hf_output_ids, hf_output_str, out_logprobs


target_dtype = "half"

# ROCm Triton FA can run into shared memory issues with these models,
# use other backends in the meantime
# FIXME (mattwong, gshtrasb, hongxiayan)
if current_platform.is_rocm():
    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"


def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: list[tuple[list[str], PromptImageInput]],
    model: str,
    *,
    max_model_len: int,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Inference result should be the same between hf and vllm.

    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    # max_model_len should be greater than image_feature_size
    with vllm_runner(
            model,
            task="generate",
            max_model_len=max_model_len,
            max_num_seqs=2,
            dtype=dtype,
            limit_mm_per_prompt={"image": mm_limit},
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend=distributed_executor_backend,
            enable_lora=True,
            # Rank of the bundled vision LoRA adapter.
            # NOTE(review): 320 presumably matches the adapter's
            # configuration — confirm against vision-lora/adapter_config.
            max_lora_rank=320,
            lora_extra_vocab_size=0,
            gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
            enforce_eager=True,
    ) as vllm_model:
        # Register and apply the vision LoRA before generating.
        lora_request = LoRARequest("vision", 1, vision_lora_path)
        vllm_model.model.llm_engine.add_lora(lora_request=lora_request)
        vllm_outputs_per_case = [
            vllm_model.generate_greedy_logprobs(prompts,
                                                max_tokens,
                                                num_logprobs=num_logprobs,
                                                images=images)
            for prompts, images in inputs
        ]

    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
    hf_model_kwargs = {"_attn_implementation": "eager"}
    with hf_runner(model, dtype=dtype,
                   model_kwargs=hf_model_kwargs) as hf_model:
        eos_token_id = hf_model.processor.tokenizer.eos_token_id
        hf_outputs_per_case = [
            hf_model.generate_greedy_logprobs_limit(prompts,
                                                    max_tokens,
                                                    num_logprobs=num_logprobs,
                                                    images=images,
                                                    eos_token_id=eos_token_id,
                                                    num_logits_to_keep=0)
            for prompts, images in inputs
        ]

    # Compare per-case; check_logprobs_close tolerates small numerical
    # differences between the two implementations.
    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
                                        vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )


# Since we use _attn_implementation="eager" for hf_runner, there is more
# significant numerical difference. The basic `logprobs=5` fails to pass.
+@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.7, 0.75, 1.0], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_model_len", [4096]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + dtype: str, max_model_len: int, max_tokens: int, + num_logprobs: int) -> None: + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + run_test( + hf_runner, + vllm_runner, + inputs_per_image, + model, + dtype=dtype, + max_model_len=max_model_len, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=1, + tensor_parallel_size=1, + ) + + +@large_gpu_test(min_gb=48) +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + # [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_model_len", [10000]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +@pytest.mark.xfail( + reason="Phi-4-MM multi-image inference is divergent with hf model.") +def test_multi_images_models(hf_runner, vllm_runner, image_assets, model, + size_factors, dtype: str, max_model_len: int, + max_tokens: int, num_logprobs: int) -> None: + images = [asset.pil_image for asset in image_assets] + + inputs_per_case = [ + ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors], + [[rescale_image_size(image, factor) for image in images] + for factor in size_factors]) + ] + + run_test( + 
hf_runner, + vllm_runner, + inputs_per_case, + model, + dtype=dtype, + max_model_len=max_model_len, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=2, + tensor_parallel_size=1, + ) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index ecd0a04b1dff7..8cd3be90ca8da 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -60,7 +60,7 @@ class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant): quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: - super().__init__(config, quant_config, prefix) + super().__init__(config, quant_config=quant_config, prefix=prefix) # Unlike Idefics3VisionTransformer which uses LayerNorm after the # final layer, Aria omits this normalization, so we replace it with an # Identity layer @@ -512,7 +512,7 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): self.config = config self.vision_tower = AriaVisionTransformer( config.vision_config, - quant_config, + quant_config=quant_config, prefix=f"{prefix}.vision_tower", ) self.multi_modal_projector = AriaProjector(config) diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index f9c2175b29881..cb0379c10f3a6 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -113,7 +113,7 @@ class Idefics2VisionAttention(nn.Module): def __init__( self, - config: Idefics2Config, + config: Idefics2VisionConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: @@ -164,7 +164,7 @@ class Idefics2VisionMLP(nn.Module): def __init__( self, - config: Idefics2Config, + config: Idefics2VisionConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: @@ -249,16 +249,24 @@ class Idefics2Encoder(nn.Module): self, config: Idefics2Config, quant_config: Optional[QuantizationConfig] = None, 
+ *, + num_hidden_layers_override: Optional[int] = None, prefix: str = "", ) -> None: super().__init__() self.config = config + + if num_hidden_layers_override is None: + num_hidden_layers = config.num_hidden_layers + else: + num_hidden_layers = num_hidden_layers_override + self.layers = nn.ModuleList([ Idefics2EncoderLayer(config, quant_config=quant_config, prefix=f"{prefix}.layers.{layer_idx}") - for layer_idx in range(config.num_hidden_layers) + for layer_idx in range(num_hidden_layers) ]) def forward( @@ -287,6 +295,9 @@ class Idefics2VisionTransformer(nn.Module): self, config: Idefics2VisionConfig, quant_config: Optional[QuantizationConfig] = None, + *, + num_hidden_layers_override: Optional[int] = None, + require_post_norm: bool = True, prefix: str = "", ) -> None: super().__init__() @@ -294,11 +305,24 @@ class Idefics2VisionTransformer(nn.Module): embed_dim = config.hidden_size self.config = config self.embeddings = Idefics2VisionEmbeddings(config) - self.encoder = Idefics2Encoder(config, - quant_config=quant_config, - prefix=f"{prefix}.encoder") - self.post_layernorm = nn.LayerNorm(embed_dim, - eps=config.layer_norm_eps) + self.encoder = Idefics2Encoder( + config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers_override, + prefix=f"{prefix}.encoder") + + num_hidden_layers = config.num_hidden_layers + if len(self.encoder.layers) > config.num_hidden_layers: + raise ValueError( + f"The original encoder only has {num_hidden_layers} " + f"layers, but you requested {len(self.encoder.layers)} layers." 
+ ) + + self.require_post_norm = require_post_norm + self.post_layernorm = nn.LayerNorm( + embed_dim, + eps=config.layer_norm_eps, + ) if require_post_norm else nn.Identity() def get_input_embeddings(self): return self.embeddings @@ -328,7 +352,24 @@ class Idefics2VisionTransformer(nn.Module): ] params_dict = dict(self.named_parameters()) loaded_params: Set[str] = set() + layer_count = len(self.encoder.layers) + for name, loaded_weight in weights: + # skip pooling header + if name.startswith("head."): + continue + + # post_layernorm is optional + if (name.startswith("post_layernorm.") + and not self.require_post_norm): + continue + + # omit layers when num_hidden_layers_override is set + if name.startswith("encoder.layers."): + layer_idx = int(name.split(".")[2]) + if layer_idx >= layer_count: + continue + for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 2a839f3a50317..7250aaba557eb 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -11,7 +11,7 @@ import torch import torch.nn as nn import torchvision.transforms as T from PIL import Image -from transformers import PretrainedConfig +from transformers import PretrainedConfig, SiglipVisionConfig from transformers.utils import logging from vllm.config import VllmConfig @@ -32,10 +32,10 @@ from vllm.multimodal.inputs import MultiModalInputs, NestedTensors from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config +from .idefics2_vision_model import Idefics2VisionTransformer from .interfaces import SupportsLoRA, SupportsMultiModal from .phi4mm_audio import AudioEmbedding from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix -from .vision_siglip_navit import get_siglip_vision_model # <|endoftext10|> (see vocab.json in hf model) 
_IMAGE_PLACEHOLDER_TOKEN_ID = 200010 @@ -339,6 +339,33 @@ def preprocess(images, dynamic_hd_size, vit_resolution, vit_patch_size): return data +def get_navit_vision_model(layer_idx: int = -1, **kwargs): + vision_config = { + "hidden_size": 1152, + "image_size": 448, + "intermediate_size": 4304, + "model_type": "siglip_vision_model", + "num_attention_heads": 16, + "num_hidden_layers": 27, + "patch_size": 14, + } + + model_config = SiglipVisionConfig(**vision_config, **kwargs) + if layer_idx < 0: + num_hidden_layers = model_config.num_hidden_layers \ + + layer_idx + 1 + else: + num_hidden_layers = layer_idx + 1 + + vision_model = Idefics2VisionTransformer( + config=model_config, + require_post_norm=False, + num_hidden_layers_override=num_hidden_layers, + ) + + return vision_model + + class Phi4MMImageEncoder(nn.Module): """Image embedding.""" @@ -362,8 +389,7 @@ class Phi4MMImageEncoder(nn.Module): self.layer_idx = -2 self.type_feature = 'patch' - self.img_processor = get_siglip_vision_model( - _flash_attn_2_enabled=True) + self.img_processor = get_navit_vision_model(layer_idx=self.layer_idx) pe_weight = self.img_processor.embeddings.position_embedding.weight L, D = pe_weight.size() @@ -430,16 +456,11 @@ class Phi4MMImageEncoder(nn.Module): def get_img_features(self, img_embeds: torch.FloatTensor, attention_mask=None) -> torch.FloatTensor: - LAYER_IDX = self.layer_idx - TYPE_FEATURE = self.type_feature - img_processor_output = self.img_processor( - img_embeds, - output_hidden_states=True, - patch_attention_mask=attention_mask) - img_feature = img_processor_output.hidden_states[LAYER_IDX] + img_feature = self.img_processor(img_embeds, + patch_attention_mask=attention_mask) - if TYPE_FEATURE == "patch": + if self.type_feature == "patch": patch_feature = img_feature use_token_compression = self.image_token_compression is not None diff --git a/vllm/model_executor/models/vision_siglip_navit.py b/vllm/model_executor/models/vision_siglip_navit.py deleted file mode 100644 
index 3a9597a845ff9..0000000000000 --- a/vllm/model_executor/models/vision_siglip_navit.py +++ /dev/null @@ -1,1966 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Siglip model configuration""" - -import math -import os -import warnings -from dataclasses import dataclass -from typing import Any, Optional, Tuple, Union - -import numpy as np -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn.init import _calculate_fan_in_and_fan_out -from transformers.activations import ACT2FN -from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask -from transformers.modeling_outputs import (BaseModelOutput, - BaseModelOutputWithPooling) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import (ModelOutput, add_start_docstrings, - add_start_docstrings_to_model_forward, logging, - replace_return_docstrings) - -from vllm.platforms import _Backend - -from .vision import get_vit_attn_backend - -logger = logging.get_logger(__name__) - -SIGLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "google/siglip-base-patch16-224": - "https://huggingface.co/google/siglip-base-patch16-224/"\ - "resolve/main/config.json", -} - - -class SiglipTextConfig(PretrainedConfig): - r""" - This is the configuration class to store 
the configuration of a - [`SiglipTextModel`]. It is used to instantiate a Siglip text encoder - according to the specified arguments, defining the model architecture. - Instantiating a configuration with the defaults will yield a similar - configuration to that of the text encoder of the Siglip [google/ - siglip-base-patch16-224](https://huggingface.co/google/siglip-base - -patch16-224) architecture. - Configuration objects inherit from [`PretrainedConfig`] and can be used - to control the model outputs. Read the documentation from - [`PretrainedConfig`] for more information. - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the Siglip text model. Defines the number of - different tokens that can be represented by the `inputs_ids` - passed when calling [`SiglipModel`]. - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (i.e., feed-forward) layer - in the Transformer encoder. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the - Transformer encoder. - max_position_embeddings (`int`, *optional*, defaults to 64): - The maximum sequence length that this model might ever be used - with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - hidden_act (`str` or `function`, *optional*, defaults to - `"gelu_pytorch_tanh"`): - The non-linear activation function (function or string) in the - encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the layer normalization layers. 
- attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - pad_token_id (`int`, *optional*, defaults to 1): - The id of the padding token in the vocabulary. - bos_token_id (`int`, *optional*, defaults to 49406): - The id of the beginning-of-sequence token in the vocabulary. - eos_token_id (`int`, *optional*, defaults to 49407): - The id of the end-of-sequence token in the vocabulary. - Example: - ```python - >>> from transformers import SiglipTextConfig, SiglipTextModel - >>> # Initializing a SiglipTextConfig with google/siglip-base-patch16-224 - style configuration - >>> configuration = SiglipTextConfig() - >>> # Initializing a SiglipTextModel (with random weights) from the - google/siglip-base-patch16-224 style configuration - >>> model = SiglipTextModel(configuration) - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "siglip_text_model" - - def __init__( - self, - vocab_size=32000, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - max_position_embeddings=64, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - # This differs from `CLIPTokenizer`'s default and from openai/siglip - # See https://github.com/huggingface/transformers/pull/24773# - # issuecomment-1632287538 - pad_token_id=1, - bos_token_id=49406, - eos_token_id=49407, - _flash_attn_2_enabled=True, - **kwargs, - ): - super().__init__(pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - **kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.attention_dropout = attention_dropout - 
self._flash_attn_2_enabled = _flash_attn_2_enabled - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, - os.PathLike], - **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict( - pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from SiglipConfig - if config_dict.get("model_type") == "siglip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr( - cls, - "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - "You are using a model of type %s to instantiate a model of " - "type %s. This is not supported for all configurations of " - "models and can yield errors.", config_dict['model_type'], - cls.model_type) - - return cls.from_dict(config_dict, **kwargs) - - -class SiglipVisionConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a - [`SiglipVisionModel`]. It is used to instantiate a - Siglip vision encoder according to the specified arguments, defining the - model architecture. Instantiating a configuration with the defaults will - yield a similar configuration to that of the vision encoder of the Siglip - [google/siglip-base-patch16-224](https://huggingface.co/google/ - siglip-base-patch16-224) architecture. - Configuration objects inherit from [`PretrainedConfig`] and can be used - to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - Args: - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (i.e., feed-forward) layer - in the Transformer encoder. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. 
- num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the - Transformer encoder. - num_channels (`int`, *optional*, defaults to 3): - Number of channels in the input images. - image_size (`int`, *optional*, defaults to 224): - The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to 16): - The size (resolution) of each patch. - hidden_act (`str` or `function`, *optional*, defaults to - `"gelu_pytorch_tanh"`): - The non-linear activation function (function or string) in the - encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and - `"gelu_new"` ``"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the layer normalization layers. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - Example: - ```python - >>> from transformers import SiglipVisionConfig, SiglipVisionModel - >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 - style configuration - >>> configuration = SiglipVisionConfig() - >>> # Initializing a SiglipVisionModel (with random weights) from the - google/siglip-base-patch16-224 style configuration - >>> model = SiglipVisionModel(configuration) - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "siglip_vision_model" - - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=224, - patch_size=16, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - _flash_attn_2_enabled=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = 
patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self._flash_attn_2_enabled = _flash_attn_2_enabled - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, - os.PathLike], - **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict( - pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from SiglipConfig - if config_dict.get("model_type") == "siglip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr( - cls, - "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - "You are using a model of type %s to " - "instantiate a model of type %s. This is not" - " supported for all configurations of models and can yield" - " errors.", config_dict['model_type'], cls.model_type) - - return cls.from_dict(config_dict, **kwargs) - - -class SiglipConfig(PretrainedConfig): - r""" - [`SiglipConfig`] is the configuration class to store the configuration of a - [`SiglipModel`]. It is used to instantiate a Siglip model according to the - specified arguments, defining the text model and vision model configs. - Instantiating a configuration with the defaults will yield a similar - configuration to that of the Siglip [google/siglip-base-patch16-224]( - https://huggingface.co/google/siglip-base-patch16-224) architecture. - Configuration objects inherit from [`PretrainedConfig`] and can be used to - control the model outputs. Read the documentation from - [`PretrainedConfig`] for more information. - Args: - text_config (`dict`, *optional*): - Dictionary of configuration options used to initialize - [`SiglipTextConfig`]. - vision_config (`dict`, *optional*): - Dictionary of configuration options used to initialize - [`SiglipVisionConfig`]. 
- kwargs (*optional*): - Dictionary of keyword arguments. - Example: - ```python - >>> from transformers import SiglipConfig, SiglipModel - >>> # Initializing a SiglipConfig with google/siglip-base-patch16-224 - style configuration - >>> configuration = SiglipConfig() - >>> # Initializing a SiglipModel (with random weights) from the - google/siglip-base-patch16-224 style configuration - >>> model = SiglipModel(configuration) - >>> # Accessing the model configuration - >>> configuration = model.config - >>> # We can also initialize a SiglipConfig from a SiglipTextConfig - and a SiglipVisionConfig - >>> from transformers import SiglipTextConfig, SiglipVisionConfig - >>> # Initializing a SiglipText and SiglipVision configuration - >>> config_text = SiglipTextConfig() - >>> config_vision = SiglipVisionConfig() - >>> config = SiglipConfig.from_text_vision_configs(config_text, - config_vision) - ```""" - - model_type = "siglip" - - def __init__(self, text_config=None, vision_config=None, **kwargs): - super().__init__(**kwargs) - - if text_config is None: - text_config = {} - logger.info( - "`text_config` is `None`. Initializing the `SiglipTextConfig`" - " with default values.") - - if vision_config is None: - vision_config = {} - logger.info("`vision_config` is `None`. initializing the " - "`SiglipVisionConfig` with default values.") - - self.text_config = SiglipTextConfig(**text_config) - self.vision_config = SiglipVisionConfig(**vision_config) - - self.initializer_factor = 1.0 - - @classmethod - def from_text_vision_configs(cls, text_config: SiglipTextConfig, - vision_config: SiglipVisionConfig, **kwargs): - r""" - Instantiate a [`SiglipConfig`] (or a derived class) from siglip text - model configuration and siglip vision - model configuration. 
- Returns: - [`SiglipConfig`]: An instance of a configuration object - """ - - return cls(text_config=text_config.to_dict(), - vision_config=vision_config.to_dict(), - **kwargs) - - -# coding=utf-8 -# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch Siglip model.""" - -_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224" - -SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "google/siglip-base-patch16-224", - # See all SigLIP models at https://huggingface.co/models?filter=siglip -] - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad( - torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -def _trunc_normal_(tensor, mean, std, a, b): - # Cut & paste from PyTorch official master until it's in a few official - # releases - RW - # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/ - # truncated_normal.pdf - def norm_cdf(x): - # Computes standard normal cumulative distribution function - return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 - - if (mean < a - 2 * std) or (mean > b + 2 * std): - warnings.warn( - "mean is 
more than 2 std from [a, b] in nn.init.trunc_normal_. " - "The distribution of values may be incorrect.", - stacklevel=2, - ) - - # Values are generated by using a truncated uniform distribution and - # then using the inverse CDF for the normal distribution. - # Get upper and lower cdf values - l = norm_cdf((a - mean) / std) # noqa - u = norm_cdf((b - mean) / std) # noqa - - # Uniformly fill tensor with values from [l, u], then translate to - # [2l-1, 2u-1]. - tensor.uniform_(2 * l - 1, 2 * u - 1) - - # Use inverse cdf transform for normal distribution to get truncated - # standard normal - if tensor.dtype in [torch.float16, torch.bfloat16]: - # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu - og_dtype = tensor.dtype - tensor = tensor.to(torch.float32) - tensor.erfinv_() - tensor = tensor.to(og_dtype) - else: - tensor.erfinv_() - - # Transform to proper mean, std - tensor.mul_(std * math.sqrt(2.0)) - tensor.add_(mean) - - # Clamp to ensure it's in the proper range - if tensor.dtype == torch.float16: - # The `clamp_` op is not (yet?) defined in float16+cpu - tensor = tensor.to(torch.float32) - tensor.clamp_(min=a, max=b) - tensor = tensor.to(torch.float16) - else: - tensor.clamp_(min=a, max=b) - - -def trunc_normal_tf_(tensor: torch.Tensor, - mean: float = 0.0, - std: float = 1.0, - a: float = -2.0, - b: float = 2.0) -> torch.Tensor: - """Fills the input Tensor with values drawn from a truncated - normal distribution. The values are effectively drawn from the - normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)` - with values outside :math:`[a, b]` redrawn until they are within - the bounds. The method used for generating the random values works - best when :math:`a \\leq \text{mean} \\leq b`. 
- NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where - the bounds [a, b] are applied when sampling the normal distribution with - mean=0, std=1.0 and the result is subsequently scaled and shifted by the - mean and std args. - Args: - tensor: an n-dimensional `torch.Tensor` - mean: the mean of the normal distribution - std: the standard deviation of the normal distribution - a: the minimum cutoff value - b: the maximum cutoff value - """ - with torch.no_grad(): - _trunc_normal_(tensor, 0, 1.0, a, b) - tensor.mul_(std).add_(mean) - - -def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"): - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) - if mode == "fan_in": - denom = fan_in - elif mode == "fan_out": - denom = fan_out - elif mode == "fan_avg": - denom = (fan_in + fan_out) / 2 - - variance = scale / denom - - if distribution == "truncated_normal": - # constant is stddev of standard normal truncated to (-2, 2) - trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978) - elif distribution == "normal": - with torch.no_grad(): - tensor.normal_(std=math.sqrt(variance)) - elif distribution == "uniform": - bound = math.sqrt(3 * variance) - with torch.no_grad(): - tensor.uniform_(-bound, bound) - else: - raise ValueError(f"invalid distribution {distribution}") - - -def lecun_normal_(tensor): - variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal") - - -def default_flax_embed_init(tensor): - variance_scaling_(tensor, mode="fan_in", distribution="normal") - - -@dataclass -# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with -# CLIP->Siglip -class SiglipVisionModelOutput(ModelOutput): - """ - Base class for vision model's outputs that also contains image embeddings - of the pooling of the last hidden states. 
- Args: - image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` - *optional* returned when model is initialized with - `with_projection=True`): - The image embeddings obtained by applying the projection layer to - the pooler_output. - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the - model. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, - if the model has an embedding layer, + one for the output of each - layer) of shape `(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the - optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_attentions=True` is passed or when - `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape - `(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the - weighted average in the self-attention heads. - """ - - image_embeds: Optional[torch.FloatTensor] = None - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -# Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with -# CLIP->Siglip -class SiglipTextModelOutput(ModelOutput): - """ - Base class for text model's outputs that also contains a pooling of the - last hidden states. - Args: - text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` - *optional* returned when model is initialized with - `with_projection=True`): - The text embeddings obtained by applying the projection layer to - model. 
- the pooler_output. - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the - embeddings, if the model has an embedding layer, + one for the - output of each layer) of shape `(batch_size, sequence_length, - hidden_size)`. - Hidden-states of the model at the output of each layer plus the - optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_attentions=True` is passed or when - `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape - `(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute - the weighted average in the self-attention heads. - """ - - text_embeds: Optional[torch.FloatTensor] = None - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -# Copied from transformers.models.clip.modeling_clip.CLIPOutput with -# CLIP->Siglip -class SiglipOutput(ModelOutput): - """ - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when - `return_loss` is `True`): - Contrastive loss for image-text similarity. - logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, - text_batch_size)`): - The scaled dot product scores between `image_embeds` and - `text_embeds`. This represents the image-text similarity scores. - logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, - image_batch_size)`): - The scaled dot product scores between `text_embeds` and - `image_embeds`. This represents the text-image similarity scores. 
- text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): - The text embeddings obtained by applying the projection layer to - the pooled output of [`SiglipTextModel`]. - image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): - The image embeddings obtained by applying the projection layer to - the pooled output of [`SiglipVisionModel`]. - text_model_output(`BaseModelOutputWithPooling`): - The output of the [`SiglipTextModel`]. - vision_model_output(`BaseModelOutputWithPooling`): - The output of the [`SiglipVisionModel`]. - """ - - loss: Optional[torch.FloatTensor] = None - logits_per_image: torch.FloatTensor = None - logits_per_text: torch.FloatTensor = None - text_embeds: torch.FloatTensor = None - image_embeds: torch.FloatTensor = None - text_model_output: BaseModelOutputWithPooling = None - vision_model_output: BaseModelOutputWithPooling = None - - def to_tuple(self) -> Tuple[Any]: - return tuple( - self[k] if k not in ["text_model_output", "vision_model_output" - ] else getattr(self, k).to_tuple() - for k in self.keys()) - - -class SiglipVisionEmbeddings(nn.Module): - - def __init__(self, config: SiglipVisionConfig): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.image_size = config.image_size - self.patch_size = config.patch_size - - self.patch_embedding = nn.Conv2d( - in_channels=config.num_channels, - out_channels=self.embed_dim, - kernel_size=self.patch_size, - stride=self.patch_size, - padding="valid", - ) - - self.num_patches_per_side = self.image_size // self.patch_size - self.num_patches = self.num_patches_per_side**2 - self.num_positions = self.num_patches - self.position_embedding = nn.Embedding(self.num_positions, - self.embed_dim) - - def forward(self, pixel_values: torch.FloatTensor, - patch_attention_mask: torch.BoolTensor) -> torch.Tensor: - batch_size = pixel_values.size(0) - - patch_embeds = self.patch_embedding(pixel_values) - embeddings = 
patch_embeds.flatten(2).transpose(1, 2) - - max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3) - max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, \ - max_im_w // self.patch_size - boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, - 1 / self.num_patches_per_side) - position_ids = torch.full( - size=( - batch_size, - max_nb_patches_h * max_nb_patches_w, - ), - fill_value=0, - ) - - for batch_idx, p_attn_mask in enumerate(patch_attention_mask): - nb_patches_h = p_attn_mask[:, 0].sum() - nb_patches_w = p_attn_mask[0].sum() - - fractional_coords_h = torch.linspace(0, 1 - 1 / nb_patches_h, - nb_patches_h) - fractional_coords_w = torch.linspace(0, 1 - 1 / nb_patches_w, - nb_patches_w) - - bucket_coords_h = torch.bucketize(fractional_coords_h, - boundaries, - right=True) - bucket_coords_w = torch.bucketize(fractional_coords_w, - boundaries, - right=True) - - pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + - bucket_coords_w).flatten() - position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids - - position_ids = position_ids.to(self.position_embedding.weight.device) - - embeddings = embeddings + self.position_embedding(position_ids) - return embeddings - - -# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with -# CLIP->Siglip -class SiglipTextEmbeddings(nn.Module): - - def __init__(self, config: SiglipTextConfig): - super().__init__() - embed_dim = config.hidden_size - - self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) - self.position_embedding = nn.Embedding(config.max_position_embeddings, - embed_dim) - - # position_ids (1, len position emb) is contiguous in memory and - # exported when serialized - self.register_buffer( - "position_ids", - torch.arange(config.max_position_embeddings).expand((1, -1)), - persistent=False) - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - inputs_embeds: 
Optional[torch.FloatTensor] = None, - ) -> torch.Tensor: - seq_length = input_ids.shape[ - -1] if input_ids is not None else inputs_embeds.shape[-2] - - if position_ids is None: - position_ids = self.position_ids[:, :seq_length] - - if inputs_embeds is None: - inputs_embeds = self.token_embedding(input_ids) - - position_embeddings = self.position_embedding(position_ids) - embeddings = inputs_embeds + position_embeddings - - return embeddings - - -class SiglipAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__ - def __init__(self, config): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`:" - f" {self.embed_dim} and `num_heads`: {self.num_heads}).") - self.scale = self.head_dim**-0.5 - self.dropout = config.attention_dropout - - self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - batch_size, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - key_states = key_states.view(batch_size, q_len, self.num_heads, - 
self.head_dim).transpose(1, 2) - value_states = value_states.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - - k_v_seq_len = key_states.shape[-2] - attn_weights = torch.matmul(query_states, key_states.transpose( - 2, 3)) * self.scale - - if attn_weights.size() != (batch_size, self.num_heads, q_len, - k_v_seq_len): - raise ValueError( - f"Attention weights should be of size " - f"{(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" - f" {attn_weights.size()}") - - if attention_mask is not None: - if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len): - raise ValueError(f"Attention mask should be of size " - f"{(batch_size, 1, q_len, k_v_seq_len)}, " - f"but is {attention_mask.size()}") - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, - dtype=torch.float32).to( - query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, - p=self.dropout, - training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (batch_size, self.num_heads, q_len, - self.head_dim): - raise ValueError( - f"`attn_output` should be of size " - f"{(batch_size, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}") - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights - - -class SiglipFlashAttention2(SiglipAttention): - """ - Llama flash attention module. This module inherits from `LlamaAttention` as - the weights of the module stays untouched. The only required change would - be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any - of them. 
- """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.is_causal = False # Hack to make sure we don't use a causal mask - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - output_attentions = False - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # Flash attention requires the input to have the shape - # batch_size x seq_length x head_dim x hidden_dim - # therefore we just need to keep the original shape - query_states = query_states.view(bsz, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length( - kv_seq_len, self.layer_idx) - - # TODO: These transpose are quite inefficient but Flash Attention - # requires the layout [batch_size, sequence_length, num_heads, - # head_dim]. We would need to refactor the KV cache - # to be able to avoid many of these transpose/reshape/view. - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - dropout_rate = self.dropout if self.training else 0.0 - - # In PEFT, usually we cast the layer norms in float32 for training - # stability reasons therefore the input hidden states gets silently - # casted in float32. 
Hence, we need cast them back in the correct - # dtype just to be sure everything works as expected. - # This might slowdown training & inference so it is recommended to - # not cast the LayerNorms in fp32. (LlamaRMSNorm handles it correctly) - - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - "The input hidden states seems to be silently casted in " - "float32, this might be related to the fact you have upcasted " - "embedding or layer norm layers in float32. We will cast " - f"back the input in {target_dtype}.") - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - attn_output = self._flash_attention_forward(query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate) - - attn_output = attn_output.reshape(bsz, q_len, - self.embed_dim).contiguous() - attn_output = self.out_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights - - def _flash_attention_forward(self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None): - """ - Calls the forward method of Flash Attention - if the input hidden - states contain at least one padding token first unpad the input, - then computes the attention scores and pad the final attention - scores. 
- Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size - `(batch_size, seq_len)` where 0 stands for the position - of padding tokens and 1 for the position of non-padding - tokens. - dropout (`int`, *optional*): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / - sqrt(head_dim) - """ - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import pad_input # noqa - - # TODO: Remove the `query_length != 1` check once Flash Attention for - # RoCm is bumped to 2.1. For details, please see the comment in - # LlamaFlashAttention2 __init__. - causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, \ - max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, - query_length) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, - query_length) - else: - attn_output = flash_attn_func(query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal) - - return attn_output - - def _upad_input(self, 
query_layer, key_layer, value_layer, attention_mask, - query_length): - from flash_attn.bert_padding import index_first_axis, unpad_input - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data( - attention_mask) - batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape - - key_layer = index_first_axis( - key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, - head_dim), indices_k) - value_layer = index_first_axis( - value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, - head_dim), indices_k) - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, self.num_heads, - head_dim), indices_k) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. 
- attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = \ - unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip -class SiglipMLP(nn.Module): - - def __init__(self, config): - super().__init__() - self.config = config - self.activation_fn = ACT2FN[config.hidden_act] - self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) - self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.fc1(hidden_states) - hidden_states = self.activation_fn(hidden_states) - hidden_states = self.fc2(hidden_states) - return hidden_states - - -# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with -# CLIP->Siglip -class SiglipEncoderLayer(nn.Module): - - def __init__(self, config: SiglipConfig): - super().__init__() - self.embed_dim = config.hidden_size - self.self_attn = (SiglipAttention(config) if - not getattr(config, "_flash_attn_2_enabled", False) - else SiglipFlashAttention2(config)) - self.layer_norm1 = nn.LayerNorm(self.embed_dim, - eps=config.layer_norm_eps) - self.mlp = SiglipMLP(config) - self.layer_norm2 = nn.LayerNorm(self.embed_dim, - eps=config.layer_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: torch.Tensor, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor]: - """ - Args: - hidden_states (`torch.FloatTensor`): - Input to the layer of shape `(batch, seq_len, embed_dim)`. - attention_mask (`torch.FloatTensor`): - Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where - padding elements are indicated by very large negative values. 
- output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all - attention layers. See `attentions` under returned tensors for - more detail. - """ - residual = hidden_states - - hidden_states = self.layer_norm1(hidden_states) - hidden_states, attn_weights = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - output_attentions=output_attentions, - ) - hidden_states = residual + hidden_states - - residual = hidden_states - hidden_states = self.layer_norm2(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states, ) - - if output_attentions: - outputs += (attn_weights, ) - - return outputs - - -class SiglipPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface - for downloading and loading pretrained models. - """ - - config_class = SiglipConfig - base_model_prefix = "siglip" - supports_gradient_checkpointing = True - - def _init_weights(self, module): - """Initialize the weights""" - - if isinstance(module, SiglipVisionEmbeddings): - width = (self.config.vision_config.hidden_size if isinstance( - self.config, SiglipConfig) else self.config.hidden_size) - nn.init.normal_(module.position_embedding.weight, - std=1 / np.sqrt(width)) - elif isinstance(module, nn.Embedding): - default_flax_embed_init(module.weight) - elif isinstance(module, SiglipAttention): - nn.init.normal_(module.q_proj.weight) - nn.init.normal_(module.k_proj.weight) - nn.init.normal_(module.v_proj.weight) - nn.init.normal_(module.out_proj.weight) - nn.init.zeros_(module.q_proj.bias) - nn.init.zeros_(module.k_proj.bias) - nn.init.zeros_(module.v_proj.bias) - nn.init.zeros_(module.out_proj.bias) - elif isinstance(module, SiglipMLP): - nn.init.normal_(module.fc1.weight) - nn.init.normal_(module.fc2.weight) - nn.init.normal_(module.fc1.bias, std=1e-6) - nn.init.normal_(module.fc2.bias, 
std=1e-6) - elif isinstance(module, SiglipMultiheadAttentionPoolingHead): - nn.init.normal_(module.probe.data) - nn.init.normal_(module.attention.in_proj_weight.data) - nn.init.zeros_(module.attention.in_proj_bias.data) - elif isinstance(module, SiglipModel): - logit_scale_init = torch.tensor(0.0) - module.logit_scale.data.fill_(logit_scale_init) - module.logit_bias.data.zero_() - elif isinstance(module, (nn.Linear, nn.Conv2d)): - lecun_normal_(module.weight) - if module.bias is not None: - nn.init.zeros_(module.bias) - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - -SIGLIP_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass - documentation for the generic methods the library implements for all - its model (such as downloading or saving, resizing the input embeddings, - pruning heads etc.) - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/ - stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation - for all matter related to general usage and behavior. - Parameters: - config ([`SiglipConfig`]): Model configuration class with all the - parameters of the model. - Initializing with a config file does not load the weights - associated with the model, only the configuration. Check out - the [`~PreTrainedModel.from_pretrained`] method to load the - model weights. -""" - -SIGLIP_TEXT_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length) - `): - Indices of input sequence tokens in the vocabulary. Padding will - be ignored by default should you provide it. - Indices can be obtained using [`AutoTokenizer`]. See - [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] - for details. 
[What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, - sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask - values selected in `[0, 1]`: - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - [What are attention masks?](../glossary#attention-mask) - position_ids (`torch.LongTensor` of shape `(batch_size, - sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position - embeddings. Selected in the range `[0, - config.max_position_embeddings - 1]`. - [What are position IDs?](../glossary#position-ids) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention - layers. See `attentions` under returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See - `hidden_states` under returned tensors for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a - plain tuple. -""" - -SIGLIP_VISION_INPUTS_DOCSTRING = r""" - Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, - num_channels, height, width)`): - Pixel values. Padding will be ignored by default should you - provide it. Pixel values can be obtained using - [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] - for details. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention - layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See - `hidden_states` under returned tensors for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a - plain tuple. 
-""" - -SIGLIP_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, - sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding - will be ignored by default should you provide it. - Indices can be obtained using [`AutoTokenizer`]. See - [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] - for details. [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)` - , *optional*): - Mask to avoid performing attention on padding token indices. Mask - values selected in `[0, 1]`: - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - [What are attention masks?](../glossary#attention-mask) - position_ids (`torch.LongTensor` of shape `(batch_size, - sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position - embeddings. Selected in the range `[0, - config.max_position_embeddings - 1]`. - [What are position IDs?](../glossary#position-ids) - pixel_values (`torch.FloatTensor` of shape `(batch_size, - num_channels, height, width)`): - Pixel values. Padding will be ignored by default should you - provide it. Pixel values can be obtained using - [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] - for details. - return_loss (`bool`, *optional*): - Whether or not to return the contrastive loss. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention - layers. See `attentions` under returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See - `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a - plain tuple. 
-""" - - -# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with -# CLIP->Siglip -class SiglipEncoder(nn.Module): - """ - Transformer encoder consisting of `config.num_hidden_layers` - self attention layers. Each layer is a [`SiglipEncoderLayer`]. - Args: - config: SiglipConfig - """ - - def __init__(self, config: SiglipConfig): - super().__init__() - self.config = config - self.layers = nn.ModuleList([ - SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers) - ]) - self.gradient_checkpointing = False - - # Ignore copy - def forward( - self, - inputs_embeds, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: - r""" - Args: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`): - Optionally, instead of passing `input_ids` you can choose to - directly pass an embedded representation. - This is useful if you want more control over how to convert - `input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. - attention_mask (`torch.Tensor` of shape `(batch_size, - sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. - Mask values selected in `[0, 1]`: - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all - attention layers. See `attentions` under returned tensors for - more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See - `hidden_states` under returned tensors for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a - plain tuple. 
- """ - output_attentions = output_attentions if output_attentions \ - is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else \ - self.config.use_return_dict - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - - hidden_states = inputs_embeds - for encoder_layer in self.layers: - if output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - encoder_layer.__call__, - hidden_states, - attention_mask, - output_attentions, - ) - else: - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - output_attentions=output_attentions, - ) - - hidden_states = layer_outputs[0] - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1], ) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) - - if not return_dict: - return tuple( - v for v in [hidden_states, encoder_states, all_attentions] - if v is not None) - return BaseModelOutput(last_hidden_state=hidden_states, - hidden_states=encoder_states, - attentions=all_attentions) - - -class SiglipTextTransformer(nn.Module): - - def __init__(self, config: SiglipTextConfig): - super().__init__() - self.config = config - embed_dim = config.hidden_size - self.embeddings = SiglipTextEmbeddings(config) - self.encoder = SiglipEncoder(config) - self.final_layer_norm = nn.LayerNorm(embed_dim, - eps=config.layer_norm_eps) - - self.head = nn.Linear(embed_dim, embed_dim) - - @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, - config_class=SiglipTextConfig) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: 
Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - """ - output_attentions = output_attentions if output_attentions \ - is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states \ - is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else \ - self.config.use_return_dict - - if input_ids is None: - raise ValueError("You have to specify input_ids") - - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - - hidden_states = self.embeddings(input_ids=input_ids, - position_ids=position_ids) - - # note: SigLIP's text model does not use a causal mask, unlike the - # original CLIP model. - # expand attention_mask - if attention_mask is not None: - # [batch_size, seq_len] -> - # [batch_size, 1, tgt_seq_len, src_seq_len] - attention_mask = _prepare_4d_attention_mask( - attention_mask, hidden_states.dtype) - - encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - last_hidden_state = encoder_outputs[0] - last_hidden_state = self.final_layer_norm(last_hidden_state) - - # Assuming "sticky" EOS tokenization, last token is always EOS. 
- pooled_output = last_hidden_state[:, -1, :] - pooled_output = self.head(pooled_output) - - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPooling( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - -@add_start_docstrings( - """The text model from SigLIP without any head or projection on top.""", - SIGLIP_START_DOCSTRING, -) -class SiglipTextModel(SiglipPreTrainedModel): - config_class = SiglipTextConfig - - _no_split_modules = ["SiglipTextEmbeddings", "SiglipEncoderLayer"] - - def __init__(self, config: SiglipTextConfig): - super().__init__(config) - self.text_model = SiglipTextTransformer(config) - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self) -> nn.Module: - return self.text_model.embeddings.token_embedding - - def set_input_embeddings(self, value): - self.text_model.embeddings.token_embedding = value - - @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, - config_class=SiglipTextConfig) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - Examples: - ```python - >>> from transformers import AutoTokenizer, SiglipTextModel - >>> model = SiglipTextModel. - from_pretrained("google/siglip-base-patch16-224") - >>> tokenizer = AutoTokenizer. 
- from_pretrained("google/siglip-base-patch16-224") - >>> # important: make sure to set padding="max_length" - as that's how the model was trained - >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], - padding="max_length", return_tensors="pt") - >>> outputs = model(**inputs) - >>> last_hidden_state = outputs.last_hidden_state - >>> pooled_output = outputs.pooler_output # pooled (EOS token) - states - ```""" - return_dict = return_dict if return_dict is not None else \ - self.config.use_return_dict - - return self.text_model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - -class SiglipVisionTransformer(nn.Module): - - def __init__(self, config: SiglipVisionConfig): - super().__init__() - self.config = config - embed_dim = config.hidden_size - - self.embeddings = SiglipVisionEmbeddings(config) - self.encoder = SiglipEncoder(config) - self.post_layernorm = nn.LayerNorm(embed_dim, - eps=config.layer_norm_eps) - self.head = SiglipMultiheadAttentionPoolingHead(config) - - @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, - config_class=SiglipVisionConfig) - def forward( - self, - pixel_values, - patch_attention_mask: Optional[torch.BoolTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - """ - output_attentions = output_attentions if output_attentions is not None\ - else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None \ - else self.config.use_return_dict - - batch_size = pixel_values.size(0) - if 
patch_attention_mask is None: - patch_attention_mask = torch.ones( - size=( - batch_size, - pixel_values.size(2) // self.config.patch_size, - pixel_values.size(3) // self.config.patch_size, - ), - dtype=torch.bool, - device=pixel_values.device, - ) - - hidden_states = self.embeddings( - pixel_values=pixel_values, - patch_attention_mask=patch_attention_mask) - - patch_attention_mask = patch_attention_mask.view(batch_size, -1) - # The call to `_upad_input` in `_flash_attention_forward` is expensive - # So when the `patch_attention_mask` is full of 1s (i.e. attending - # to the whole sequence), avoiding passing the attention_mask, which - # is equivalent to attending to the full sequence - if not torch.any(~patch_attention_mask): - attention_mask = None - else: - attention_mask = (_prepare_4d_attention_mask( - patch_attention_mask, hidden_states.dtype) - if not self.config._flash_attn_2_enabled else - patch_attention_mask) - - encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - last_hidden_state = encoder_outputs[0] - last_hidden_state = self.post_layernorm(last_hidden_state) - - pooled_output = self.head( - hidden_state=last_hidden_state, - attention_mask=patch_attention_mask, - ) - - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPooling( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - -class SiglipMultiheadAttentionPoolingHead(nn.Module): - """Multihead Attention Pooling.""" - - def __init__(self, config: SiglipVisionConfig): - super().__init__() - - self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size)) - self.attention = torch.nn.MultiheadAttention( - config.hidden_size, config.num_attention_heads, 
batch_first=True) - self.layernorm = nn.LayerNorm(config.hidden_size, - eps=config.layer_norm_eps) - self.mlp = SiglipMLP(config) - - def forward(self, hidden_state, attention_mask): - batch_size = hidden_state.shape[0] - probe = self.probe.repeat(batch_size, 1, 1) - - hidden_state = self.attention(query=probe, - key=hidden_state, - value=hidden_state, - key_padding_mask=~attention_mask)[0] - - residual = hidden_state - hidden_state = self.layernorm(hidden_state) - hidden_state = residual + self.mlp(hidden_state) - - return hidden_state[:, 0] - - -@add_start_docstrings( - """The vision model from SigLIP without any head or projection on top.""", - SIGLIP_START_DOCSTRING, -) -class SiglipVisionModel(SiglipPreTrainedModel): - config_class = SiglipVisionConfig - main_input_name = "pixel_values" - - def __init__(self, config: SiglipVisionConfig): - super().__init__(config) - - self.vision_model = SiglipVisionTransformer(config) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self) -> nn.Module: - return self.vision_model.embeddings.patch_embedding - - @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, - config_class=SiglipVisionConfig) - def forward( - self, - pixel_values, - patch_attention_mask: Optional[torch.BoolTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - Examples: - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, SiglipVisionModel - >>> model = SiglipVisionModel.from_pretrained( - "google/siglip-base-patch16-224") - >>> processor = AutoProcessor.from_pretrained( - "google/siglip-base-patch16-224") - >>> url = - "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = 
Image.open(requests.get(url, stream=True).raw) - >>> inputs = processor(images=image, return_tensors="pt") - >>> outputs = model(**inputs) - >>> last_hidden_state = outputs.last_hidden_state - >>> pooled_output = outputs.pooler_output # pooled features - ```""" - return_dict = return_dict if return_dict is not None \ - else self.config.use_return_dict - - return self.vision_model( - pixel_values=pixel_values, - patch_attention_mask=patch_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - -@add_start_docstrings(SIGLIP_START_DOCSTRING) -class SiglipModel(SiglipPreTrainedModel): - config_class = SiglipConfig - - def __init__(self, config: SiglipConfig): - super().__init__(config) - - if not isinstance(config.text_config, SiglipTextConfig): - raise ValueError("config.text_config is expected to be of type " - f"SiglipTextConfig but is of type" - f" {type(config.text_config)}.") - - if not isinstance(config.vision_config, SiglipVisionConfig): - raise ValueError("config.vision_config is expected to be of type " - "SiglipVisionConfig but is of type" - f" {type(config.vision_config)}.") - - text_config = config.text_config - vision_config = config.vision_config - - self.text_model = SiglipTextTransformer(text_config) - self.vision_model = SiglipVisionTransformer(vision_config) - - self.logit_scale = nn.Parameter(torch.randn(1)) - self.logit_bias = nn.Parameter(torch.randn(1)) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING) - def get_text_features( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> torch.FloatTensor: - r""" - Returns: - text_features 
(`torch.FloatTensor` of shape `(batch_size, - output_dim`): The text embeddings obtained by - applying the projection layer to the pooled output - of [`SiglipTextModel`]. - Examples: - ```python - >>> from transformers import AutoTokenizer, AutoModel - >>> import torch - >>> model = AutoModel.from_pretrained( - "google/siglip-base-patch16-224") - >>> tokenizer = AutoTokenizer.from_pretrained( - "google/siglip-base-patch16-224") - >>> # important: make sure to set padding="max_length" as that's - how the model was trained - >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], - padding="max_length", return_tensors="pt") - >>> with torch.no_grad(): - ... text_features = model.get_text_features(**inputs) - ```""" - # Use SigLIP model's config for some fields (if specified) instead - # of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None\ - else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None \ - else self.config.use_return_dict - - text_outputs = self.text_model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = text_outputs[1] - - return pooled_output - - @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING) - def get_image_features( - self, - pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> torch.FloatTensor: - r""" - Returns: - image_features (`torch.FloatTensor` of shape `(batch_size, - output_dim`): The image embeddings obtained by applying the - projection layer to the pooled output of [`SiglipVisionModel`]. 
- Examples: - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, AutoModel - >>> import torch - >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224") - >>> processor = AutoProcessor.from_pretrained( - "google/siglip-base-patch16-224") - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - >>> inputs = processor(images=image, return_tensors="pt") - >>> with torch.no_grad(): - ... image_features = model.get_image_features(**inputs) - ```""" - # Use SiglipModel's config for some fields (if specified) instead - # of those of vision & text components. - output_attentions = output_attentions if output_attentions \ - is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else \ - self.config.use_return_dict - - vision_outputs = self.vision_model( - pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = vision_outputs[1] - - return pooled_output - - @add_start_docstrings_to_model_forward(SIGLIP_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=SiglipOutput, - config_class=SiglipConfig) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - pixel_values: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - return_loss: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SiglipOutput]: - r""" - Returns: - Examples: - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, 
AutoModel - >>> import torch - >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224") - >>> processor = AutoProcessor.from_pretrained( - "google/siglip-base-patch16-224") - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"] - >>> # important: we pass `padding=max_length` since the model was - trained with this - >>> inputs = processor(text=texts, images=image, - padding="max_length", return_tensors="pt") - >>> with torch.no_grad(): - ... outputs = model(**inputs) - >>> logits_per_image = outputs.logits_per_image - >>> probs = torch.sigmoid(logits_per_image) # these are the - probabilities - >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'") - 31.9% that image 0 is 'a photo of 2 cats' - ```""" - # Use SigLIP model's config for some fields (if specified) instead of - # those of vision & text components. - output_attentions = output_attentions if output_attentions \ - is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else \ - self.config.use_return_dict - - vision_outputs = self.vision_model( - pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - text_outputs = self.text_model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - image_embeds = vision_outputs[1] - text_embeds = text_outputs[1] - - # normalized features - image_embeds = image_embeds / image_embeds.norm( - p=2, dim=-1, keepdim=True) - text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) - - # cosine similarity as 
logits - logits_per_text = torch.matmul(text_embeds, image_embeds.t( - )) * self.logit_scale.exp() + self.logit_bias - logits_per_image = logits_per_text.t() - - loss = None - if return_loss: - raise NotImplementedError("SigLIP loss to be implemented") - - if not return_dict: - output = (logits_per_image, logits_per_text, text_embeds, - image_embeds, text_outputs, vision_outputs) - return ((loss, ) + output) if loss is not None else output - - return SiglipOutput( - loss=loss, - logits_per_image=logits_per_image, - logits_per_text=logits_per_text, - text_embeds=text_embeds, - image_embeds=image_embeds, - text_model_output=text_outputs, - vision_model_output=vision_outputs, - ) - - -def get_siglip_vision_model(_flash_attn_2_enabled=True, **kwargs): - siglip_vision_config = { - "hidden_size": 1152, - "image_size": 448, - "intermediate_size": 4304, - "model_type": "siglip_vision_model", - "num_attention_heads": 16, - "num_hidden_layers": 27, - "patch_size": 14, - } - - # Detect attention implementation. 
- attn_backend: _Backend = get_vit_attn_backend(support_fa=True) - if attn_backend != _Backend.FLASH_ATTN: - _flash_attn_2_enabled = False - - model_config = SiglipVisionConfig( - **siglip_vision_config, - _flash_attn_2_enabled=_flash_attn_2_enabled, - **kwargs) - - vision_model = SiglipVisionModel(model_config).vision_model - - return vision_model From b30c75dda4f6c5e0d8b3d2b39134da38b72ea96e Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sat, 15 Mar 2025 20:21:11 -0700 Subject: [PATCH 062/169] [V1] Remove V0 fallback for mistral-tokenizer (#14873) Signed-off-by: Roger Wang --- vllm/engine/arg_utils.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 31d567de0efa5..4e695da4ef765 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1487,13 +1487,6 @@ class EngineArgs: recommend_to_remove=False) return False - # No MistralTokenizer support so far (not compatible - # with xgrammar) - if model_config.tokenizer_mode == "mistral": - _raise_or_fallback(feature_name="--tokenizer-mode mistral", - recommend_to_remove=False) - return False - # No CPU offloading yet. 
if self.cpu_offload_gb != EngineArgs.cpu_offload_gb: _raise_or_fallback(feature_name="--cpu-offload-gb", From 71c1e0710783e1b0427610ba9e32bed7724fa36f Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Sat, 15 Mar 2025 20:25:03 -0700 Subject: [PATCH 063/169] [Kernel] Add more tuned configs (#14877) Signed-off-by: simon-mo --- ...192,device_name=NVIDIA_A800-SXM4-80GB.json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...=64,device_name=NVIDIA_A800-SXM4-80GB.json | 146 ++++++++++++++++++ ...280,device_name=NVIDIA_A800-SXM4-80GB.json | 146 ++++++++++++++++++ ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=64,N=1280,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=64,N=2560,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ ...320,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=64,N=320,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...640,device_name=NVIDIA_A800-SXM4-80GB.json | 146 ++++++++++++++++++ ...VIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 
146 ++++++++++++++++++ .../E=64,N=640,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=8,N=14336,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=8,N=1792,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=8,N=2048,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...VIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=8,N=3584,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=8,N=4096,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=8,N=7168,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../layers/fused_moe/configs/README | 3 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ 
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ 
...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ 105 files changed, 13627 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json 
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 
vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 
vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 
vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 
vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json new file mode 100644 index 0000000000000..0611620eb3362 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 
64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..4dd00d110e486 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, 
+ "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json new file mode 100644 index 0000000000000..48f9697af2639 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 
+ }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..a8c05712ba587 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json new file mode 100644 index 0000000000000..f1244c61efb01 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..a2ee05da1d7c6 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..fc573cd6e8561 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..c6d7e96c7f0ae --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + 
}, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json new file mode 100644 index 0000000000000..21f60229ff875 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { 
+ "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json new file mode 100644 index 0000000000000..39a9912fa4bdd --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..05b54639d234e --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 
128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..c17a4ec346915 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..170ae7f3fff1d --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { 
+ "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..1d9d352edebc3 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + 
"num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..9ad5b31675005 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 
4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + 
"2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..2883dfd11e7f3 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..8abfd84a776b7 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, 
+ "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, 
+ "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000000000..2fc18a5e43d29 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 
1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..be8d4a7fd23d9 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + 
"num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + 
"num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..71fdd88643c6f --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 
128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json new file mode 100644 index 0000000000000..c02de2f628b71 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + 
"num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..3e0bc75ff87c4 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..9f7ed6726f44e --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 
128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..21b72557e365d --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..eaf32f6d76c0a --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, 
+ "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + 
"num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..841044a4fc6e2 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..59be497fc4287 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 
128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..e4110a5d2e70f --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..0883ef40582ea --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + 
"num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..1a0aa33193329 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 
4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..9952be6ba4abe --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file 
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..32bbadbb9eae8 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..e6f753cdba35b --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of 
file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..53f3394693f06 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..4dd475c02a19b --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git 
a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..2ed15f30fe603 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + 
"num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..eb817268d4120 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + 
"num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git 
a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..0c7062aea6c4e --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + 
"num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..96cbc111c7fff --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + 
"num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/README 
b/vllm/model_executor/layers/fused_moe/configs/README index 45d40cbfb1a2e..787bd06116646 100644 --- a/vllm/model_executor/layers/fused_moe/configs/README +++ b/vllm/model_executor/layers/fused_moe/configs/README @@ -8,3 +8,6 @@ the JSON file contains a mapping from M (batch size) to the chosen configuration The example configurations provided are for the Mixtral model for TP2 on H100 and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have N = 7168 and for TP4 we have N = 3584. + +Please feel free to tune the configurations using the script `benchmarks/kernels/benchmark_moe.py`. +Some of the configuration files are copied from the SGLang repository. Thank you! diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..3e8ebf3f7301c --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..2bb5b457d774a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..6e2aeee9b75c2 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + 
"num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No 
newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..b0f9442a6aaa8 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..bee8d03ba47cf --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..9da876d3ccb43 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..0a1a252a5e032 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..d6279a1e37b6f --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..3bc003647cda8 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 
64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..310dff4635c28 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + 
"num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..206c8a2bac667 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + 
"1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..edc23530ea745 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..43b5bdbdff5db --- /dev/null +++ 
b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..bffa749724ad3 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..f96f12787f6fb --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, 
+ "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..fe3e18cf01aa1 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git 
a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..e4d5b2dd02a8c --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..137b9ddaca305 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..38cac4690a8a6 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + 
"96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..8e6ebe21fc3c6 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..1225d847b7d5e --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..d44e38438c9f6 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..c559a69a77eed --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + 
"48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..8ec2005f02e88 --- /dev/null +++ 
b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..65840aa538bc6 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 
0000000000000..4e120d6d08432 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, 
+ "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..5c298746788d9 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..4990268b2a9eb --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..18afdd96fbfb2 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..51d10bb0ee1a4 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..1480e09293213 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..6bd350c388972 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..2b9f0d1ec64ed --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..d979c6b66d048 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..6eb22deb8dd2b --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 
5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, 
+ "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..c746e7080522d --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..0b4746ceeb61d --- /dev/null +++ 
b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..8ec2005f02e88 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..202acf23f8ca7 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, 
+ "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..11a9bceb77c85 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git 
a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..386ee59beae38 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..60df5e33eed5d --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..4f1747b81f58e --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + 
}, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..53bbaca407af6 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..ffe67dcf48c23 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..2a17e164e9ec7 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..b259993b617c3 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 
3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..a71ab88d43c1e --- /dev/null +++ 
b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..eda96e76cb6d9 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + 
"32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..bd0767b5ef66f --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 
64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..29f7651876940 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + 
"num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + } +} \ 
No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..6db13852c9d4e --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..1a47cae9e17bd --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..8dd5ae5c49715 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..6d1a8b56a2831 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 
4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { 
+ "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..e77abaf396831 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..01327b2c4f907 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..12eea5fb6687a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + 
"num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..9db9daece8c18 --- /dev/null +++ 
b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + 
"1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..365f8d0d8abc0 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..f080ea5da7dd1 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..e9bf04442a91f --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git 
a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..c37aced26e8d5 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + 
"num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..d6bef7f60c614 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { 
+ "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..8df6e4b6e5dc8 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file From b82662d9523d9aa1386d8d1de410426781a1fa3b Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sat, 15 Mar 2025 20:26:19 -0700 Subject: [PATCH 064/169] [BugFix] Fix torch distributed stateless PG backend init (#14870) Signed-off-by: Nick Hill --- examples/offline_inference/data_parallel.py | 5 +++++ vllm/distributed/utils.py | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index b00519314d8bd..b73770ce382cf 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -76,5 +76,10 @@ if __name__ == "__main__": GPUs_per_dp_rank)) proc.start() procs.append(proc) + exit_code = 0 for proc in procs: proc.join() + if 
proc.exitcode: + exit_code = proc.exitcode + + exit(exit_code) diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 25202062e9757..84899358a6d66 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -299,13 +299,10 @@ def stateless_init_torch_distributed_process_group( # different systems (e.g. RPC) in case the store is multi-tenant. prefix_store = PrefixStore(init_method, store) - pg_options = ProcessGroup.Options(backend=backend, timeout=timeout) - pg: ProcessGroup = ProcessGroup( prefix_store, group_rank, group_size, - pg_options, ) if backend == "gloo": @@ -327,7 +324,10 @@ def stateless_init_torch_distributed_process_group( backend_options) backend_type = ProcessGroup.BackendType.NCCL device = torch.device("cuda") + else: + raise RuntimeError(f"Unsupported torch distributed backend: {backend}") + pg._set_default_backend(backend_type) backend_class._set_sequence_number_for_group() pg._register_backend(device, backend_type, backend_class) From d1ad2a57af72fb4c9bb4b6c7cfc58e0159693fc6 Mon Sep 17 00:00:00 2001 From: Lily Liu Date: Sun, 16 Mar 2025 00:29:22 -0700 Subject: [PATCH 065/169] [V1] [Spec Decode] Fix ngram tests (#14878) --- tests/v1/spec_decode/test_ngram.py | 55 ++++++++++++++++-------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py index ec663c84d0d2a..2c2e125ade48c 100644 --- a/tests/v1/spec_decode/test_ngram.py +++ b/tests/v1/spec_decode/test_ngram.py @@ -1,32 +1,37 @@ # SPDX-License-Identifier: Apache-2.0 -import pytest -from vllm.v1.spec_decode.ngram_proposer import NgramProposer -from vllm.v1.utils import ConstantList +import numpy as np + +from vllm.v1.spec_decode.ngram_proposer import (_find_subarray_kmp, + _kmp_lps_array) -@pytest.fixture -def proposer(): - return NgramProposer() +def test_kmp_lps_array(): + np.testing.assert_array_equal(_kmp_lps_array(np.array([])), np.array([])) + 
np.testing.assert_array_equal(_kmp_lps_array(np.array([1])), np.array([0])) + np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 1, 1])), + np.array([0, 1, 2])) + np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 2, 3, 4])), + np.array([0, 0, 0, 0])) + np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 2, 1, 2, 3])), + np.array([0, 0, 1, 2, 0])) -def test_kmp_lps_array(proposer): - assert proposer._kmp_lps_array([]) == [] - assert proposer._kmp_lps_array([1]) == [0] - assert proposer._kmp_lps_array([1, 1, 1]) == [0, 1, 2] - assert proposer._kmp_lps_array([1, 2, 3, 4]) == [0, 0, 0, 0] - assert proposer._kmp_lps_array([1, 2, 1, 2, 3]) == [0, 0, 1, 2, 0] - - -def test_find_subarray_kmp(proposer): - X = ConstantList([1, 2, 3, 4, 1, 2, 3, 5, 6]) - assert proposer._find_subarray_kmp(X, 2, 2) is None - X = ConstantList([1, 2, 3, 4, 1, 2, 3]) - assert proposer._find_subarray_kmp(X, 2, 3) == [4, 1, 2] - assert proposer._find_subarray_kmp(X, 2, 2) == [4, 1] - assert proposer._find_subarray_kmp(X, 1, 3) == [4, 1, 2] - assert proposer._find_subarray_kmp(X, 1, 2) == [4, 1] - X = ConstantList([1, 3, 6, 2, 3, 4, 1, 2, 3]) - assert proposer._find_subarray_kmp(X, 2, 3) == [4, 1, 2] +def test_find_subarray_kmp(): + X = np.array([1, 2, 3, 4, 1, 2, 3, 5, 6]) + assert _find_subarray_kmp(X, 2, 2) is None + X = np.array([1, 2, 3, 4, 1, 2, 3]) + np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 3), + np.array([4, 1, 2])) + np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 2), np.array([4, + 1])) + np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 3), + np.array([4, 1, 2])) + np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 2), np.array([4, + 1])) + X = np.array([1, 3, 6, 2, 3, 4, 1, 2, 3]) + np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 3), + np.array([4, 1, 2])) # Return on the first match - assert proposer._find_subarray_kmp(X, 1, 3) == [6, 2, 3] \ No newline at end of file + np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 3), + 
np.array([6, 2, 3])) From d30aa7e9e6afd6147865c8c9fae8cd21f5ddce3d Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Sun, 16 Mar 2025 10:44:19 -0400 Subject: [PATCH 066/169] [Bugfix] Limit profiling run sequence length by max_model_len (#14785) Signed-off-by: Kyle Sayers --- vllm/inputs/registry.py | 5 +++++ vllm/worker/enc_dec_model_runner.py | 1 + vllm/worker/model_runner.py | 1 + vllm/worker/openvino_model_runner.py | 1 + vllm/worker/xpu_model_runner.py | 1 + 5 files changed, 9 insertions(+) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index b6ceb5fb82d70..24980833864b0 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -330,6 +330,11 @@ class InputRegistry: from vllm.multimodal import MultiModalKwargs from vllm.multimodal.profiling import MultiModalProfiler + if seq_len > model_config.max_model_len: + raise AssertionError( + f"Profiling attempted with sequence length ({seq_len}) " + f"greater than model length ({model_config.max_model_len})") + if mm_registry.has_processor(model_config): tokenizer = cached_tokenizer_from_config(model_config) processor = mm_registry.create_processor(model_config, diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 5f39f2fa4947c..f34597ac05db4 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -281,6 +281,7 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): for group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) + seq_len = min(seq_len, self.model_config.max_model_len) batch_size += seq_len decoder_dummy_data = self.input_registry \ diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 473bd901b5b23..3181483fe8390 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1302,6 +1302,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): for 
group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) + seq_len = min(seq_len, self.model_config.max_model_len) batch_size += seq_len dummy_data = self.input_registry \ diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index aa1d2cbb2df29..9b484a9f543fe 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -148,6 +148,7 @@ class OpenVINOModelRunner(ModelRunnerBase): seq_len = min( seq_data.get_len(), computed_len + seq_group_metadata.token_chunk_size, + self.model_config.max_model_len, ) if is_prompt: tokens = seq_data.get_token_ids()[computed_len:seq_len] diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 39957e661c474..2103260d8900c 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -466,6 +466,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): for group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) + seq_len = min(seq_len, self.model_config.max_model_len) batch_size += seq_len dummy_data = self.input_registry \ From e53b1350f289d65011d9251fd826646c169018df Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 17 Mar 2025 00:05:40 +0800 Subject: [PATCH 067/169] [Bugfix] Explicitly disable Phi-4-multimodal in V1 (#14889) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/phi4mm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 7250aaba557eb..3d4505d556e2c 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -33,7 +33,7 @@ from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from 
.idefics2_vision_model import Idefics2VisionTransformer -from .interfaces import SupportsLoRA, SupportsMultiModal +from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsV0Only from .phi4mm_audio import AudioEmbedding from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix @@ -1433,7 +1433,8 @@ def cat_with_pad(tensors, dim, padding_value=0): "image", get_max_phi4mm_image_tokens) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi4mm) @INPUT_REGISTRY.register_input_processor(input_processor_for_phi4mm) -class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): +class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal, + SupportsV0Only): """ Implements the Phi-4-multimodal-instruct model in vLLM. """ From f6137adbcbbdea8b5023a66480de921b558bef83 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 17 Mar 2025 00:13:46 +0800 Subject: [PATCH 068/169] Revert "[Bugfix] Limit profiling run sequence length by max_model_len (#14785) (#14892) Signed-off-by: DarkLight1337 --- vllm/inputs/registry.py | 5 ----- vllm/worker/enc_dec_model_runner.py | 1 - vllm/worker/model_runner.py | 1 - vllm/worker/openvino_model_runner.py | 1 - vllm/worker/xpu_model_runner.py | 1 - 5 files changed, 9 deletions(-) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 24980833864b0..b6ceb5fb82d70 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -330,11 +330,6 @@ class InputRegistry: from vllm.multimodal import MultiModalKwargs from vllm.multimodal.profiling import MultiModalProfiler - if seq_len > model_config.max_model_len: - raise AssertionError( - f"Profiling attempted with sequence length ({seq_len}) " - f"greater than model length ({model_config.max_model_len})") - if mm_registry.has_processor(model_config): tokenizer = cached_tokenizer_from_config(model_config) processor = mm_registry.create_processor(model_config, diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 
f34597ac05db4..5f39f2fa4947c 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -281,7 +281,6 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): for group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) - seq_len = min(seq_len, self.model_config.max_model_len) batch_size += seq_len decoder_dummy_data = self.input_registry \ diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 3181483fe8390..473bd901b5b23 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1302,7 +1302,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): for group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) - seq_len = min(seq_len, self.model_config.max_model_len) batch_size += seq_len dummy_data = self.input_registry \ diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index 9b484a9f543fe..aa1d2cbb2df29 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -148,7 +148,6 @@ class OpenVINOModelRunner(ModelRunnerBase): seq_len = min( seq_data.get_len(), computed_len + seq_group_metadata.token_chunk_size, - self.model_config.max_model_len, ) if is_prompt: tokens = seq_data.get_token_ids()[computed_len:seq_len] diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 2103260d8900c..39957e661c474 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -466,7 +466,6 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): for group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) - seq_len = min(seq_len, self.model_config.max_model_len) batch_size += seq_len 
dummy_data = self.input_registry \ From fc1f67715d95f24885288b75c736cc1fc1be0103 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sun, 16 Mar 2025 14:53:34 -0700 Subject: [PATCH 069/169] [BugFix][V1] Fix overhead related to bad_words sampling when not in use (#14894) Signed-off-by: Nick Hill --- tests/v1/worker/test_gpu_input_batch.py | 5 +++-- vllm/sampling_params.py | 7 ++++--- vllm/v1/worker/gpu_input_batch.py | 5 +++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 192ddefe102d2..2486c26c6071a 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -124,8 +124,9 @@ def _construct_expected_sampling_metadata( if req.sampling_params.allowed_token_ids: allowed_token_ids_mask[index_in_input_batch][ req.sampling_params.allowed_token_ids] = True - bad_words_token_ids[ - index_in_input_batch] = req.sampling_params.bad_words_token_ids + if req.sampling_params.bad_words_token_ids: + bad_words_token_ids[ + index_in_input_batch] = req.sampling_params.bad_words_token_ids return SamplingMetadata( temperature=torch.tensor(temperature, dtype=torch.float, diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index b0a5777cc8d56..9b474a37b96b6 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -235,7 +235,7 @@ class SamplingParams( # Fields used for bad words bad_words: Optional[list[str]] = None - _bad_words_token_ids: list[list[int]] = msgspec.field(default_factory=list) + _bad_words_token_ids: Optional[list[list[int]]] = None @staticmethod def from_optional( @@ -464,8 +464,9 @@ class SamplingParams( self.stop_token_ids = list(eos_ids) def update_from_tokenizer(self, tokenizer: AnyTokenizer) -> None: - if self.bad_words is None: + if not self.bad_words: return + self._bad_words_token_ids = [] for bad_word in self.bad_words: # To prohibit words both at the beginning # and in the middle of text @@ -516,7 
+517,7 @@ class SamplingParams( return self._all_stop_token_ids @property - def bad_words_token_ids(self) -> list[list[int]]: + def bad_words_token_ids(self) -> Optional[list[list[int]]]: # For internal use only. Backward compatibility not guaranteed return self._bad_words_token_ids diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 9707cb5774cd0..55d5429a8935d 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -324,8 +324,9 @@ class InputBatch: self.allowed_token_ids_mask_cpu_tensor[req_index][ sampling_params.allowed_token_ids] = False - self.bad_words_token_ids[ - req_index] = sampling_params.bad_words_token_ids + if sampling_params.bad_words_token_ids: + self.bad_words_token_ids[ + req_index] = sampling_params.bad_words_token_ids # Add request lora ID if request.lora_request: From 31060b2757fb19ec67894b7c441383ceec9f1272 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 16 Mar 2025 14:53:53 -0700 Subject: [PATCH 070/169] [V1][BugFix] Detect interleaved sliding window attention (#14896) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c2a976108e4d4..8dd7521ff49a2 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -82,8 +82,15 @@ class GPUModelRunner(LoRAModelRunnerMixin): self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ cache_config.cache_dtype] - self.is_multimodal_model = model_config.is_multimodal_model + # NOTE(woosuk): sliding_window is None for models with interleaved + # attention. Use interleaved_sliding_window instead. 
self.sliding_window = model_config.get_sliding_window() + self.interleaved_sliding_window = getattr( + model_config.hf_text_config, "interleaved_sliding_window", None) + self.window_size = (self.sliding_window + or self.interleaved_sliding_window) + + self.is_multimodal_model = model_config.is_multimodal_model self.block_size = cache_config.block_size self.max_model_len = model_config.max_model_len self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size) @@ -674,7 +681,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): num_query_heads=self.num_query_heads, num_kv_heads=self.num_kv_heads, use_alibi=False, # FIXME - use_sliding_window=self.sliding_window is not None, + use_sliding_window=self.window_size is not None, num_sms=self.num_sms, ) return common_prefix_len if use_cascade else 0 From b9b5bdfc7d5cd0f8610a4de7a79327d10a09dfab Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Sun, 16 Mar 2025 15:46:42 -0700 Subject: [PATCH 071/169] [Misc] Catching Ray Compiled Graph PP test failures for V1 (#14847) --- tests/distributed/test_pipeline_parallel.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 05b6ba40506a2..4d3306509c8f2 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -350,6 +350,10 @@ def _compare_tp( else: pp_env = None + tp_env = { + "VLLM_USE_V1": vllm_major_version, + } + pp_args = [ *common_args, "--pipeline-parallel-size", @@ -374,14 +378,20 @@ def _compare_tp( ] try: - compare_two_settings(model_id, pp_args, tp_args, pp_env, method=method) + compare_two_settings(model_id, + pp_args, + tp_args, + pp_env, + tp_env, + method=method) except Exception: - if pp_env is None: - raise - else: - # Ray Compiled Graph tests are flaky, + testing_ray_compiled_graph = pp_env is not None + if testing_ray_compiled_graph 
and vllm_major_version == "0": + # Ray Compiled Graph tests are flaky for V0, # so we don't want to fail the test logger.exception("Ray Compiled Graph tests failed") + else: + raise @pytest.mark.parametrize( From 90df7f23aadad4aafc509fa950bd9b967a996e84 Mon Sep 17 00:00:00 2001 From: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com> Date: Mon, 17 Mar 2025 03:10:04 +0400 Subject: [PATCH 072/169] [Doc] Add guidance for using `ccache` with `pip install -e .` in doc (#14901) --- docs/source/getting_started/installation/gpu/cuda.inc.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/source/getting_started/installation/gpu/cuda.inc.md index 7e3b884c2ab1e..d3e375aec10cb 100644 --- a/docs/source/getting_started/installation/gpu/cuda.inc.md +++ b/docs/source/getting_started/installation/gpu/cuda.inc.md @@ -131,6 +131,8 @@ Building from source requires a lot of compilation. If you are building from sou For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` . As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. +When using `ccache` with `pip install -e .`, you should run `CCACHE_NOHASHDIR="true" pip install --no-build-isolation -e .`. This is because `pip` creates a new folder with a random name for each build, preventing `ccache` from recognizing that the same files are being built. + [sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments. The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. 
::: From aecc780dba30db6b503754926564642374cb2c2e Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Sun, 16 Mar 2025 20:56:16 -0400 Subject: [PATCH 073/169] [V1] Enable Entrypoints Tests (#14903) --- .buildkite/test-pipeline.yaml | 1 + tests/v1/entrypoints/llm/test_struct_output_generate.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 93ac8a29c676c..a6616d7b41480 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -198,6 +198,7 @@ steps: commands: # split the test to avoid interference - pytest -v -s v1/core + - pytest -v -s v1/entrypoints - pytest -v -s v1/engine - pytest -v -s v1/sample - pytest -v -s v1/worker diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index b4eb475c23baa..98983fa05b83f 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -18,6 +18,9 @@ MODELS_TO_TEST = [ "Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410" ] +# Undo after https://github.com/vllm-project/vllm/pull/14868 +pytest.skip(allow_module_level=True) + @pytest.mark.skip_global_cleanup @pytest.mark.parametrize("guided_decoding_backend", From bb3aeddfaf338a9bbac10e3c75027b7f8c5c08e0 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Sun, 16 Mar 2025 22:06:43 -0400 Subject: [PATCH 074/169] [CI] Nightly Tests (#14898) Signed-off-by: rshaw@neuralmagic.com Signed-off-by: rshaw@neuralmagic.com Co-authored-by: rshaw@neuralmagic.com --- .../models/decoder_only/language/test_mistral.py | 1 + tests/tool_use/utils.py | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 
7e1337b7d4876..4c2055361d445 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -201,6 +201,7 @@ def test_models( ) +@pytest.mark.skip("RE-ENABLE: test is currently failing on main.") @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index aad37eb9b8f3a..df117b96cd07b 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -46,6 +46,7 @@ CONFIGS: dict[str, ServerConfig] = { "model": "NousResearch/Hermes-3-Llama-3.1-8B", "arguments": [ + "--enforce-eager", "--no-enable-prefix-caching", "--tool-call-parser", "hermes", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja") ], @@ -60,6 +61,7 @@ CONFIGS: dict[str, ServerConfig] = { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "arguments": [ + "--enforce-eager", "--no-enable-prefix-caching", "--tool-call-parser", "llama3_json", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_llama3.1_json.jinja") ], @@ -70,6 +72,7 @@ CONFIGS: dict[str, ServerConfig] = { "model": "meta-llama/Llama-3.2-3B-Instruct", "arguments": [ + "--enforce-eager", "--no-enable-prefix-caching", "--tool-call-parser", "llama3_json", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_llama3.2_json.jinja") ], @@ -80,6 +83,7 @@ CONFIGS: dict[str, ServerConfig] = { "model": "mistralai/Mistral-7B-Instruct-v0.3", "arguments": [ + "--enforce-eager", "--no-enable-prefix-caching", "--tool-call-parser", "mistral", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_mistral.jinja"), "--ignore-patterns=\"consolidated.safetensors\"" @@ -111,22 +115,28 @@ CONFIGS: dict[str, ServerConfig] = { "model": "ibm-granite/granite-3.0-8b-instruct", "arguments": [ + "--enforce-eager", "--no-enable-prefix-caching", "--tool-call-parser", "granite", "--chat-template", 
str(VLLM_PATH / "examples/tool_chat_template_granite.jinja") ], }, "granite-3.1-8b": { - "model": "ibm-granite/granite-3.1-8b-instruct", + "model": + "ibm-granite/granite-3.1-8b-instruct", "arguments": [ + "--enforce-eager", + "--no-enable-prefix-caching", "--tool-call-parser", "granite", ], - "supports_parallel": True, + "supports_parallel": + True, }, "internlm": { "model": "internlm/internlm2_5-7b-chat", "arguments": [ + "--enforce-eager", "--no-enable-prefix-caching", "--tool-call-parser", "internlm", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_internlm2_tool.jinja"), @@ -139,6 +149,7 @@ CONFIGS: dict[str, ServerConfig] = { "model": "Team-ACE/ToolACE-8B", "arguments": [ + "--enforce-eager", "--no-enable-prefix-caching", "--tool-call-parser", "pythonic", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_toolace.jinja") ], From 8a5a9b70d702feb17e79691870c638b0f1e71192 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 17 Mar 2025 10:38:15 +0800 Subject: [PATCH 075/169] [CI/Build] Update defaults for test reproducibility (#14893) Signed-off-by: DarkLight1337 --- tests/conftest.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 4716ca2e315b7..41c0e62ce14f3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -681,6 +681,17 @@ def hf_runner(): class VllmRunner: + """ + The default value of some arguments have been modified from + :class:`~vllm.LLM` as follows: + - `trust_remote_code`: Set to `True` instead of `False` for convenience. + - `seed`: Set to `0` instead of `None` for test reproducibility. + - `max_model_len`: Set to `1024` instead of `None` to reduce memory usage. + - `block_size`: Set to `16` instead of `None` to reduce memory usage. + - `enable_chunked_prefill`: Set to `False` instead of `None` for + test reproducibility. + - `enforce_eager`: Set to `False` instead of `None` to test CUDA graph. 
+ """ def __init__( self, @@ -688,6 +699,8 @@ class VllmRunner: task: TaskOption = "auto", tokenizer_name: Optional[str] = None, tokenizer_mode: str = "auto", + trust_remote_code: bool = True, + seed: Optional[int] = 0, # Use smaller max model length, otherwise bigger model cannot run due # to kv cache size limit. max_model_len: int = 1024, @@ -695,7 +708,7 @@ class VllmRunner: disable_log_stats: bool = True, tensor_parallel_size: int = 1, block_size: int = 16, - enable_chunked_prefill: bool = False, + enable_chunked_prefill: Optional[bool] = False, swap_space: int = 4, enforce_eager: Optional[bool] = False, **kwargs, @@ -705,8 +718,9 @@ class VllmRunner: task=task, tokenizer=tokenizer_name, tokenizer_mode=tokenizer_mode, - trust_remote_code=True, + trust_remote_code=trust_remote_code, dtype=dtype, + seed=seed, swap_space=swap_space, enforce_eager=enforce_eager, disable_log_stats=disable_log_stats, From faa02757307583f2c5557ff23cb41f1db4f1f29c Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 16 Mar 2025 20:19:30 -0700 Subject: [PATCH 076/169] [V1] Optimize the overhead of rewinding (#14905) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8dd7521ff49a2..4059d5b17b71b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1032,17 +1032,16 @@ class GPUModelRunner(LoRAModelRunnerMixin): # TODO(woosuk): The following loop can be slow since it iterates over # the requests one by one. Optimize. - for i, req_id in enumerate(self.input_batch.req_ids): + for i, generator in self.input_batch.generators.items(): + req_id = self.input_batch.req_ids[i] req_state = self.requests[req_id] seq_len = (req_state.num_computed_tokens + scheduler_output.num_scheduled_tokens[req_id]) if seq_len < req_state.num_tokens: - # Ignore the sampled token. 
+ # Ignore the sampled token for partial prefills. # Rewind the generator state as if the token was not sampled. - generator = self.input_batch.generators.get(i) - if generator is not None: - # This relies on cuda-specific torch-internal impl details - generator.set_offset(generator.get_offset() - 4) + # This relies on cuda-specific torch-internal impl details + generator.set_offset(generator.get_offset() - 4) # NOTE: GPU -> CPU Sync happens here. # Move as many CPU operations as possible before this sync point. From 7f6c5ee06c4861ae1310f4ea5caaa2104efb4d22 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 16 Mar 2025 20:20:15 -0700 Subject: [PATCH 077/169] [V1][Minor] Add __repr__ to ConstantList (#14907) Signed-off-by: Woosuk Kwon --- vllm/v1/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 8e1fb18cca05b..6c01ed3de52d7 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -86,6 +86,9 @@ class ConstantList(Generic[T], Sequence): def __len__(self): return len(self._x) + def __repr__(self): + return f"ConstantList({self._x})" + class BackgroundProcHandle: """ From 1e799b7ec1b1c61952d2ae24c85ecf3fcb0f6de3 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Sun, 16 Mar 2025 23:35:37 -0400 Subject: [PATCH 078/169] [BugFix] Fix MLA + V1 + TP==1 causing reinitialization of cuda context (#14910) --- vllm/platforms/cuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 3897584307e91..8a53337ebc087 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -152,7 +152,7 @@ class CudaPlatformBase(Platform): # here use_flashmla = (envs.VLLM_ATTENTION_BACKEND is None \ or envs.VLLM_ATTENTION_BACKEND == "FLASHMLA") - from vllm.attention.backends.flashmla import is_flashmla_supported + from vllm.attention.ops.flashmla import is_flashmla_supported if use_flashmla and is_flashmla_supported()[0] \ and cache_config.block_size != 64: 
cache_config.block_size = 64 From a73e183e36a818ea95f442ae1751bc66cf4f135d Mon Sep 17 00:00:00 2001 From: Sibi <85477603+t-sibiraj@users.noreply.github.com> Date: Mon, 17 Mar 2025 11:35:57 +0800 Subject: [PATCH 079/169] [Misc] Replace os environ to monkeypatch in test suite (#14516) Signed-off-by: sibi <85477603+t-sibiraj@users.noreply.github.com> Signed-off-by: Aaron Pham Co-authored-by: Cyrus Leung Co-authored-by: Aaron Pham --- .buildkite/test-pipeline.yaml | 2 +- .../test_basic_correctness.py | 105 +++--- .../basic_correctness/test_chunked_prefill.py | 168 +++++----- tests/basic_correctness/test_cumem.py | 62 ++-- tests/compile/test_basic_correctness.py | 207 ++++++------ tests/compile/test_full_graph.py | 115 ++++++- tests/compile/utils.py | 93 ------ tests/conftest.py | 2 +- tests/distributed/test_comm_ops.py | 85 +++-- tests/distributed/test_custom_all_reduce.py | 173 +++++----- tests/distributed/test_pipeline_partition.py | 60 ++-- tests/distributed/test_pp_cudagraph.py | 38 ++- tests/entrypoints/llm/test_accuracy.py | 4 +- .../offline_mode/test_offline_mode.py | 49 +-- .../openai/correctness/test_lmeval.py | 5 +- tests/kernels/test_attention_selector.py | 129 +++++--- tests/kernels/test_awq.py | 60 ++-- tests/kernels/test_rocm_attention_selector.py | 18 +- tests/kernels/utils.py | 64 ++-- .../{disagg_test.py => test_disagg.py} | 0 .../{module_test.py => test_module.py} | 0 .../models/decoder_only/language/test_fp8.py | 120 +++---- .../models/embedding/language/test_gritlm.py | 96 +++--- tests/models/test_oot_registration.py | 130 ++++---- tests/mq_llm_engine/test_error_handling.py | 31 +- .../multi_step/test_correctness_async_llm.py | 202 ++++++------ tests/multi_step/test_correctness_llm.py | 299 ++++++++--------- tests/neuron/1_core/test_block_table.py | 80 ++--- tests/neuron/1_core/test_prefix_prefill.py | 306 +++++++++--------- tests/plugins_tests/test_platform_plugins.py | 13 +- tests/plugins_tests/test_scheduler_plugins.py | 62 ++-- 
tests/prefix_caching/test_prefix_caching.py | 111 ++++--- tests/test_regression.py | 16 +- tests/test_utils.py | 63 ++-- tests/tpu/test_custom_dispatcher.py | 25 +- tests/tracing/test_tracing.py | 277 ++++++++-------- tests/utils.py | 11 +- tests/v1/e2e/test_ngram_spec_decode.py | 11 +- tests/v1/engine/test_async_llm.py | 11 +- tests/v1/engine/test_engine_core.py | 10 +- tests/v1/engine/test_engine_core_client.py | 5 +- tests/v1/sample/test_logprobs.py | 224 +++++++------ tests/v1/tpu/test_basic.py | 16 +- 43 files changed, 1900 insertions(+), 1658 deletions(-) delete mode 100644 tests/compile/utils.py rename tests/kv_transfer/{disagg_test.py => test_disagg.py} (100%) rename tests/kv_transfer/{module_test.py => test_module.py} (100%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a6616d7b41480..f85572e7c234c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -522,7 +522,7 @@ steps: # TODO: investigate and fix # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py + - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py - label: Plugin Tests (2 GPUs) # 40min working_dir: "/vllm-workspace/tests" diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 0cb3b739b7245..1458f0893a93c 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -47,6 +47,7 @@ def test_vllm_gc_ed(): @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("enforce_eager", [False]) def test_models( + monkeypatch: pytest.MonkeyPatch, hf_runner, model: str, backend: str, @@ -63,31 +64,33 @@ def test_models( pytest.skip( f"{backend} does not support gemma2 with full context length.") - 
os.environ["VLLM_ATTENTION_BACKEND"] = backend + with monkeypatch.context() as m: + m.setenv("VLLM_ATTENTION_BACKEND", backend) - # 5042 tokens for gemma2 - # gemma2 has alternating sliding window size of 4096 - # we need a prompt with more than 4096 tokens to test the sliding window - prompt = "The following numbers of the sequence " + ", ".join( - str(i) for i in range(1024)) + " are:" - example_prompts = [prompt] + # 5042 tokens for gemma2 + # gemma2 has alternating sliding window size of 4096 + # we need a prompt with more than 4096 tokens to test the sliding window + prompt = "The following numbers of the sequence " + ", ".join( + str(i) for i in range(1024)) + " are:" + example_prompts = [prompt] - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - with VllmRunner(model, - max_model_len=8192, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + with VllmRunner(model, + max_model_len=8192, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @multi_gpu_test(num_gpus=2) @@ -104,6 +107,7 @@ def test_models( ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"), ]) def test_models_distributed( + monkeypatch: pytest.MonkeyPatch, hf_runner, vllm_runner, example_prompts, @@ -116,34 +120,41 @@ def test_models_distributed( if test_suite != TARGET_TEST_SUITE: pytest.skip(f"Skip test for {test_suite}") - if model == 
"meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa - # test Ray Compiled Graph - os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" - os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" + with monkeypatch.context() as monkeypatch_context: + if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa + # test Ray Compiled Graph + monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1") + monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1") - if attention_backend: - os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend + if attention_backend: + monkeypatch_context.setenv( + "VLLM_ATTENTION_BACKEND", + attention_backend, + ) - dtype = "half" - max_tokens = 5 + dtype = "half" + max_tokens = 5 - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). - with vllm_runner(model, - dtype=dtype, - tensor_parallel_size=2, - distributed_executor_backend=distributed_executor_backend - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method + # (the default method). 
+ with vllm_runner( + model, + dtype=dtype, + tensor_parallel_size=2, + distributed_executor_backend=distributed_executor_backend, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index be007de321c8a..06c9e25ed8dd8 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -7,16 +7,22 @@ prefill requests are chunked. Run `pytest tests/models/test_chunked_prefill.py`. """ -import os + +from __future__ import annotations + +from typing import TYPE_CHECKING import pytest -from tests.kernels.utils import override_backend_env_variable from vllm.platforms import current_platform +from vllm.utils import STR_BACKEND_ENV_VAR from ..models.utils import check_logprobs_close, check_outputs_equal from ..utils import multi_gpu_test +if TYPE_CHECKING: + from .conftest import HfRunner, VllmRunner + MODELS = [ "facebook/opt-125m", "meta-llama/Llama-3.2-1B-Instruct", @@ -24,12 +30,14 @@ MODELS = [ @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): +def use_v0_only(monkeypatch: pytest.MonkeyPatch): """ Since this module is V0 only, set VLLM_USE_V1=0 for all tests in the file. 
""" - monkeypatch.setenv('VLLM_USE_V1', '0') + with monkeypatch.context() as m: + m.setenv('VLLM_USE_V1', '0') + yield @pytest.mark.parametrize("model", MODELS) @@ -42,8 +50,8 @@ def use_v0_only(monkeypatch): @pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) def test_models( - hf_runner, - vllm_runner, + hf_runner: HfRunner, + vllm_runner: VllmRunner, example_prompts, model: str, dtype: str, @@ -52,37 +60,39 @@ def test_models( enforce_eager: bool, tensor_parallel_size: int, attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """ Checks exact match decode between huggingface model and vllm runner with chunked prefill. """ - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - max_num_seqs = chunked_prefill_token_size - max_num_batched_tokens = chunked_prefill_token_size + max_num_seqs = chunked_prefill_token_size + max_num_batched_tokens = chunked_prefill_token_size - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - with vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=True, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + with vllm_runner( + model, + dtype=dtype, + max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=True, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + max_num_seqs=max_num_seqs, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) - check_outputs_equal( - 
outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @multi_gpu_test(num_gpus=2) @@ -90,57 +100,61 @@ def test_models( @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) def test_models_distributed( - hf_runner, - vllm_runner, + hf_runner: HfRunner, + vllm_runner: VllmRunner, example_prompts, model: str, distributed_executor_backend: str, attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) + if (model == "meta-llama/Llama-3.2-1B-Instruct" + and distributed_executor_backend == "ray"): + # test Ray Compiled Graph + m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1") + m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1") - if (model == "meta-llama/Llama-3.2-1B-Instruct" - and distributed_executor_backend == "ray"): - # test Ray Compiled Graph - os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" - os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" + dtype = "half" + max_tokens = 5 + chunked_prefill_token_size = 16 - dtype = "half" - max_tokens = 5 - chunked_prefill_token_size = 16 + # Add a chunked prefill config. + max_num_seqs = min(chunked_prefill_token_size, 256) + assert chunked_prefill_token_size != -1 + enable_chunked_prefill = True + max_num_batched_tokens = chunked_prefill_token_size - # Add a chunked prefill config. - max_num_seqs = min(chunked_prefill_token_size, 256) - assert chunked_prefill_token_size != -1 - enable_chunked_prefill = True - max_num_batched_tokens = chunked_prefill_token_size + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. 
+ # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with + # fork method (the default method). - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). + with vllm_runner( + model, + dtype=dtype, + tensor_parallel_size=2, + max_num_seqs=max_num_seqs, + enable_chunked_prefill=enable_chunked_prefill, + max_num_batched_tokens=max_num_batched_tokens, + distributed_executor_backend=distributed_executor_backend, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy( + example_prompts, + max_tokens, + ) - with vllm_runner( - model, - dtype=dtype, - tensor_parallel_size=2, - max_num_seqs=max_num_seqs, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - distributed_executor_backend=distributed_executor_backend, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize( @@ -158,7 +172,7 @@ def test_models_distributed( # the async postprocessor @pytest.mark.parametrize("disable_async_output_proc", [True]) def test_models_with_fp8_kv_cache( - vllm_runner, + vllm_runner: VllmRunner, example_prompts, kv_cache_dtype: str, model: str, @@ -218,7 +232,7 @@ def test_models_with_fp8_kv_cache( 
@pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("dtype", ["half"]) def test_with_prefix_caching( - vllm_runner, + vllm_runner: VllmRunner, max_tokens: int, enforce_eager: bool, chunk_size: int, @@ -254,8 +268,10 @@ def test_with_prefix_caching( ) as vllm_model: outputs[enable] = [] for prompt in full_prompts: - outputs[enable] += vllm_model.generate_greedy([prompt], - max_tokens) + outputs[enable] += vllm_model.generate_greedy( + [prompt], + max_tokens, + ) check_outputs_equal( outputs_0_lst=outputs[False], @@ -274,8 +290,8 @@ def test_with_prefix_caching( @pytest.mark.cpu_model @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") def test_models_cpu( - hf_runner, - vllm_runner, + hf_runner: HfRunner, + vllm_runner: VllmRunner, example_prompts, model: str, dtype: str, @@ -283,7 +299,7 @@ def test_models_cpu( chunked_prefill_token_size: int, enforce_eager: bool, attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: test_models( hf_runner, @@ -307,7 +323,7 @@ def test_models_cpu( @pytest.mark.cpu_model @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") def test_with_prefix_caching_cpu( - vllm_runner, + vllm_runner: VllmRunner, max_tokens: int, enforce_eager: bool, chunk_size: int, diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index ba81f2bb79d11..f5ee469fb00a9 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -123,40 +123,38 @@ def test_cumem_with_cudagraph(): # sleep mode with pytorch checkpoint ("facebook/opt-125m", False), ]) -def test_end_to_end(model: str, use_v1: bool): - import os - os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0" - free, total = torch.cuda.mem_get_info() - used_bytes_baseline = total - free # in case other process is running - llm = LLM(model, enable_sleep_mode=True) - prompt = "How are you?" 
- sampling_params = SamplingParams(temperature=0, max_tokens=10) - output = llm.generate(prompt, sampling_params) +def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + free, total = torch.cuda.mem_get_info() + used_bytes_baseline = total - free # in case other process is running + llm = LLM(model, enable_sleep_mode=True) + prompt = "How are you?" + sampling_params = SamplingParams(temperature=0, max_tokens=10) + output = llm.generate(prompt, sampling_params) - # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, - # which is difficult to measure in the test. therefore, we only - # test sleep level 1 here. - llm.sleep(level=1) + # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, + # which is difficult to measure in the test. therefore, we only + # test sleep level 1 here. + llm.sleep(level=1) - free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() - used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline - # now the memory usage is mostly cudagraph memory pool, - # and it should be less than the model weights (1B model, 2GiB weights) + free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() + used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline + # now the memory usage is mostly cudagraph memory pool, + # and it should be less than the model weights (1B model, 2GiB weights) - # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size) - # is captured but cannot be releasesd from PyTorch due to a known bug, - # therefore high memory usage after `llm.sleep` is called is expected. - # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode - # in V1. 
- if use_v1: - assert used_bytes < 7 * GiB_bytes - else: - assert used_bytes < 2 * GiB_bytes + # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size) + # is captured but cannot be released from PyTorch due to a known bug, + # therefore high memory usage after `llm.sleep` is called is expected. + # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode + # in V1. + if use_v1: + assert used_bytes < 7 * GiB_bytes + else: + assert used_bytes < 2 * GiB_bytes - llm.wake_up() - output2 = llm.generate(prompt, sampling_params) + llm.wake_up() + output2 = llm.generate(prompt, sampling_params) - # cmp output - assert output[0].outputs[0].text == output2[0].outputs[0].text - - del os.environ["VLLM_USE_V1"] + # cmp output + assert output[0].outputs[0].text == output2[0].outputs[0].text diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 48323b21a8c42..b639fd719ca0a 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations import dataclasses -from typing import Optional import pytest @@ -22,75 +22,76 @@ class TestSetting: fullgraph: bool 
model_args=["--task", "embed"], - pp_size=1, - tp_size=1, - attn_backend="FLASH_ATTN", - method="encode", - fullgraph=True, - ), - # encoder-based embedding model (BERT) - TestSetting( - model="BAAI/bge-base-en-v1.5", - model_args=["--task", "embed"], - pp_size=1, - tp_size=1, - attn_backend="XFORMERS", - method="encode", - fullgraph=True, - ), - # vision language model - TestSetting( - model="microsoft/Phi-3.5-vision-instruct", - model_args=["--trust-remote-code", "--max-model-len", "2048"], - pp_size=2, - tp_size=1, - attn_backend="FLASH_ATTN", - method="generate_with_image", - fullgraph=False, - ), -] - - # we cannot afford testing the full Catesian product # of all models and all levels -@pytest.mark.parametrize("test_setting", test_settings) -def test_compile_correctness(test_setting: TestSetting): +@pytest.mark.parametrize( + "test_setting", + [ + # basic llama model + TestSetting( + model="meta-llama/Llama-3.2-1B-Instruct", + model_args=[], + pp_size=2, + tp_size=2, + attn_backend="FLASHINFER", + method="generate", + fullgraph=True, + ), + # llama model with quantization + TestSetting( + model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", + model_args=["--quantization", "gptq"], + pp_size=1, + tp_size=1, + attn_backend="FLASH_ATTN", + method="generate", + fullgraph=True, + ), + # MoE model + TestSetting( + model="ibm/PowerMoE-3b", + model_args=[], + pp_size=1, + tp_size=2, + attn_backend="FLASH_ATTN", + method="generate", + fullgraph=True, + ), + # embedding model + TestSetting( + model="BAAI/bge-multilingual-gemma2", + model_args=["--task", "embed"], + pp_size=1, + tp_size=1, + attn_backend="FLASH_ATTN", + method="encode", + fullgraph=True, + ), + # encoder-based embedding model (BERT) + TestSetting( + model="BAAI/bge-base-en-v1.5", + model_args=["--task", "embed"], + pp_size=1, + tp_size=1, + attn_backend="XFORMERS", + method="encode", + fullgraph=True, + ), + # vision language model + TestSetting( + model="microsoft/Phi-3.5-vision-instruct", + 
model_args=["--trust-remote-code", "--max-model-len", "2048"], + pp_size=2, + tp_size=1, + attn_backend="FLASH_ATTN", + method="generate_with_image", + fullgraph=False, + ), + ]) +def test_compile_correctness( + monkeypatch: pytest.MonkeyPatch, + test_setting: TestSetting, +): # this test is run under multiple suits, with different GPUs. # make sure we only run the test with correct CUDA devices. # don't use "<", as it will duplicate the tests. @@ -103,41 +104,45 @@ def test_compile_correctness(test_setting: TestSetting): fullgraph = test_setting.fullgraph if cuda_device_count_stateless() != pp_size * tp_size: pytest.skip("Not correct CUDA devices for the test.") - import os - os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend - final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \ - ["-tp", str(tp_size)] - all_args: list[list[str]] = [] - all_envs: list[Optional[dict[str, str]]] = [] + with monkeypatch.context() as m: + m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) + final_args = [ + "--enforce-eager", *model_args, "-pp", + str(pp_size), "-tp", + str(tp_size) + ] - for level in [ - CompilationLevel.NO_COMPILATION, - CompilationLevel.PIECEWISE, - ]: - all_args.append(final_args + [f"-O{level}"]) - all_envs.append({}) + all_args: list[list[str]] = [] + all_envs: list[dict[str, str] | None] = [] - # inductor will change the output, so we only compare if the output - # is close, not exactly the same. 
- compare_all_settings( - model, - all_args, - all_envs, - method=method if method != "generate" else "generate_close") - all_envs.clear() - all_args.clear() + for level in [ + CompilationLevel.NO_COMPILATION, + CompilationLevel.PIECEWISE, + ]: + all_args.append(final_args + [f"-O{level}"]) + all_envs.append({}) - for level in [ - CompilationLevel.NO_COMPILATION, - CompilationLevel.DYNAMO_AS_IS, - CompilationLevel.DYNAMO_ONCE, - ]: - all_args.append(final_args + [f"-O{level}"]) - all_envs.append({}) - if level != CompilationLevel.DYNAMO_ONCE and not fullgraph: - # "DYNAMO_ONCE" will always use fullgraph - all_envs[-1][ - "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore + # inductor will change the output, so we only compare if the output + # is close, not exactly the same. + compare_all_settings( + model, + all_args, + all_envs, + method=method if method != "generate" else "generate_close") + all_envs.clear() + all_args.clear() - compare_all_settings(model, all_args * 3, all_envs, method=method) + for level in [ + CompilationLevel.NO_COMPILATION, + CompilationLevel.DYNAMO_AS_IS, + CompilationLevel.DYNAMO_ONCE, + ]: + all_args.append(final_args + [f"-O{level}"]) + all_envs.append({}) + if level != CompilationLevel.DYNAMO_ONCE and not fullgraph: + # "DYNAMO_ONCE" will always use fullgraph + all_envs[-1][ + "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore + + compare_all_settings(model, all_args * 3, all_envs, method=method) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 6e83fa36881e4..cf463f3e75254 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -1,22 +1,115 @@ # SPDX-License-Identifier: Apache-2.0 -import pytest +from __future__ import annotations +from typing import Any + +import pytest +import torch + +from tests.quantization.utils import is_quant_method_supported +from vllm import LLM, SamplingParams from vllm.config import CompilationLevel +from vllm.platforms 
import current_platform from ..utils import fork_new_process_for_each_test -from .utils import TEST_MODELS, check_full_graph_support -@pytest.mark.parametrize("model_info", TEST_MODELS) +@pytest.fixture(params=None, name="model_info") +def models_list_fixture(request): + TEST_MODELS: list[tuple[str, dict[str, Any]]] = [ + ("facebook/opt-125m", {}), + ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { + "dtype": torch.float16, + "quantization": "compressed-tensors" + }), + ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", { + "dtype": torch.float16, + "quantization": "compressed-tensors" + }), + ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", { + "quantization": "compressed-tensors" + }), + ("meta-llama/Llama-3.2-1B-Instruct", {}), + ] + + if is_quant_method_supported("aqlm"): + TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { + "quantization": "aqlm" + })) + + # TODO: figure out why this fails. + if False and is_quant_method_supported("gguf"): # noqa: SIM223 + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", { + "quantization": "gguf" + })) + + if is_quant_method_supported("gptq"): + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", { + "quantization": "gptq" + })) + + if is_quant_method_supported("gptq_marlin"): + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", { + "quantization": "gptq_marlin" + })) + + if is_quant_method_supported("gptq_marlin_24"): + TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", { + "quantization": "gptq_marlin_24" + })) + + if is_quant_method_supported("marlin"): + TEST_MODELS.append( + ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", { + "quantization": "marlin" + })) + + if not current_platform.is_rocm() and is_quant_method_supported("awq"): + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { + "quantization": "AWQ" + })) + + return TEST_MODELS + + @pytest.mark.parametrize( "optimization_level", - [CompilationLevel.DYNAMO_ONCE, 
CompilationLevel.PIECEWISE]) + [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE], +) +@pytest.mark.parametrize("model_info", "", indirect=True) @fork_new_process_for_each_test -def test_full_graph(model_info, optimization_level): - model = model_info[0] - model_kwargs = model_info[1] - check_full_graph_support(model, - model_kwargs, - optimization_level, - tp_size=1) +def test_full_graph( + monkeypatch: pytest.MonkeyPatch, + model_info: tuple[str, dict[str, Any]], + optimization_level: int, +): + model, model_kwargs = model_info + + with monkeypatch.context() as m: + # make sure these models can be captured in full graph mode + m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") + print(f"MODEL={model}") + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0) + llm = LLM( + model=model, + enforce_eager=True, + tensor_parallel_size=1, + disable_custom_all_reduce=True, + compilation_config=optimization_level, + **model_kwargs, + ) + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. 
+ for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/tests/compile/utils.py b/tests/compile/utils.py deleted file mode 100644 index fb8270c26b1b0..0000000000000 --- a/tests/compile/utils.py +++ /dev/null @@ -1,93 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import os - -import torch - -from tests.quantization.utils import is_quant_method_supported -from vllm import LLM, SamplingParams -from vllm.platforms import current_platform - -TEST_MODELS = [ - ("facebook/opt-125m", {}), - ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { - "dtype": torch.float16, - "quantization": "compressed-tensors" - }), - ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", { - "dtype": torch.float16, - "quantization": "compressed-tensors" - }), - ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", { - "quantization": "compressed-tensors" - }), - ("meta-llama/Llama-3.2-1B-Instruct", {}), -] - -if is_quant_method_supported("aqlm"): - TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { - "quantization": "aqlm" - })) - -# TODO: figure out why this fails. 
-if False and is_quant_method_supported("gguf"): # noqa: SIM223 - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", { - "quantization": "gguf" - })) - -if is_quant_method_supported("gptq"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", { - "quantization": "gptq" - })) - -if is_quant_method_supported("gptq_marlin"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", { - "quantization": "gptq_marlin" - })) - -if is_quant_method_supported("gptq_marlin_24"): - TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", { - "quantization": "gptq_marlin_24" - })) - -if is_quant_method_supported("marlin"): - TEST_MODELS.append(("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", { - "quantization": "marlin" - })) - -if not current_platform.is_rocm() and is_quant_method_supported("awq"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { - "quantization": "AWQ" - })) - - -def check_full_graph_support(model, - model_kwargs, - optimization_level, - tp_size=1): - # make sure these models can be captured in full graph mode - os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1" - - print(f"MODEL={model}") - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0) - llm = LLM(model=model, - enforce_eager=True, - tensor_parallel_size=tp_size, - disable_custom_all_reduce=True, - compilation_config=optimization_level, - **model_kwargs) - - outputs = llm.generate(prompts, sampling_params) - - # Print the outputs. 
- for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/tests/conftest.py b/tests/conftest.py index 41c0e62ce14f3..30e5ca2eb137a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1110,4 +1110,4 @@ def pytest_collection_modifyitems(config, items): skip_optional = pytest.mark.skip(reason="need --optional option to run") for item in items: if "optional" in item.keywords: - item.add_marker(skip_optional) + item.add_marker(skip_optional) \ No newline at end of file diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index 7b0346b8ab50f..ac6d6aae30063 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -3,7 +3,10 @@ Run `pytest tests/distributed/test_comm_ops.py`. """ -import os + +from __future__ import annotations + +from typing import Any, Callable import pytest import ray @@ -17,12 +20,18 @@ from ..utils import init_test_distributed_environment, multi_process_parallel @ray.remote(num_gpus=1, max_calls=1) -def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int, - distributed_init_port: str): +def all_reduce_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): # it is important to delete the CUDA_VISIBLE_DEVICES environment variable # so that each worker can see all the GPUs # they will be able to set the device to the correct GPU - os.environ.pop("CUDA_VISIBLE_DEVICES", None) + monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) + device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -39,12 +48,17 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int, @ray.remote(num_gpus=1, max_calls=1) -def all_gather_test_worker(tp_size: int, pp_size: int, rank: int, - distributed_init_port: str): +def 
all_gather_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): # it is important to delete the CUDA_VISIBLE_DEVICES environment variable # so that each worker can see all the GPUs # they will be able to set the device to the correct GPU - os.environ.pop("CUDA_VISIBLE_DEVICES", None) + monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -67,12 +81,17 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int, @ray.remote(num_gpus=1, max_calls=1) -def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, - distributed_init_port: str): +def broadcast_tensor_dict_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): # it is important to delete the CUDA_VISIBLE_DEVICES environment variable # so that each worker can see all the GPUs # they will be able to set the device to the correct GPU - os.environ.pop("CUDA_VISIBLE_DEVICES", None) + monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -106,9 +125,14 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, @ray.remote(num_gpus=1, max_calls=1) -def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, - distributed_init_port: str): - os.environ.pop("CUDA_VISIBLE_DEVICES", None) +def send_recv_tensor_dict_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): + monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -146,9 +170,14 @@ def 
send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, @ray.remote(num_gpus=1, max_calls=1) -def send_recv_test_worker(tp_size: int, pp_size: int, rank: int, - distributed_init_port: str): - os.environ.pop("CUDA_VISIBLE_DEVICES", None) +def send_recv_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): + monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -174,8 +203,12 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int, all_reduce_test_worker, all_gather_test_worker, broadcast_tensor_dict_test_worker ]) -def test_multi_process_tensor_parallel(tp_size, test_target): - multi_process_parallel(tp_size, 1, test_target) +def test_multi_process_tensor_parallel( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + test_target: Callable[..., Any], +): + multi_process_parallel(monkeypatch, tp_size, 1, test_target) @pytest.mark.skipif(torch.cuda.device_count() < 2, @@ -183,8 +216,12 @@ def test_multi_process_tensor_parallel(tp_size, test_target): @pytest.mark.parametrize("pp_size", [2]) @pytest.mark.parametrize( "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker]) -def test_multi_process_pipeline_parallel(pp_size, test_target): - multi_process_parallel(1, pp_size, test_target) +def test_multi_process_pipeline_parallel( + monkeypatch: pytest.MonkeyPatch, + pp_size: int, + test_target: Callable[..., Any], +): + multi_process_parallel(monkeypatch, 1, pp_size, test_target) @pytest.mark.skipif(torch.cuda.device_count() < 4, @@ -197,5 +234,9 @@ def test_multi_process_pipeline_parallel(pp_size, test_target): broadcast_tensor_dict_test_worker ]) def test_multi_process_tensor_parallel_pipeline_parallel( - tp_size, pp_size, test_target): - multi_process_parallel(tp_size, pp_size, test_target) + tp_size: int, + pp_size: int, 
+ test_target: Callable[..., Any], + monkeypatch: pytest.MonkeyPatch, +): + multi_process_parallel(monkeypatch, tp_size, pp_size, test_target) diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 4928690bebb07..bfa7d06c4d075 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import os import random import pytest @@ -23,95 +22,115 @@ for i, v in enumerate(test_sizes): @ray.remote(num_gpus=1, max_calls=1) -def graph_allreduce(tp_size, pp_size, rank, distributed_init_port): - os.environ.pop("CUDA_VISIBLE_DEVICES", None) - device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) - init_test_distributed_environment(tp_size, pp_size, rank, - distributed_init_port) - ensure_model_parallel_initialized(tp_size, pp_size) - group = get_tensor_model_parallel_group().device_group +def graph_allreduce( + monkeypatch: pytest.MonkeyPatch, + tp_size, + pp_size, + rank, + distributed_init_port, +): + with monkeypatch.context() as m: + m.delenv("CUDA_VISIBLE_DEVICES", raising=False) + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + init_test_distributed_environment(tp_size, pp_size, rank, + distributed_init_port) + ensure_model_parallel_initialized(tp_size, pp_size) + group = get_tensor_model_parallel_group().device_group - # A small all_reduce for warmup. - # this is needed because device communicators might be created lazily - # (e.g. NCCL). This will ensure that the communicator is initialized - # before any communication happens, so that this group can be used for - # graph capture immediately. - data = torch.zeros(1) - data = data.to(device=device) - torch.distributed.all_reduce(data, group=group) - torch.cuda.synchronize() - del data + # A small all_reduce for warmup. + # this is needed because device communicators might be created lazily + # (e.g. NCCL). 
This will ensure that the communicator is initialized + # before any communication happens, so that this group can be used for + # graph capture immediately. + data = torch.zeros(1) + data = data.to(device=device) + torch.distributed.all_reduce(data, group=group) + torch.cuda.synchronize() + del data - # we use the first group to communicate once - # and the second group to communicate twice - # and so on - # this is used to demonstrate that each group can - # communicate independently - num_communication = rank // tp_size + 1 + # we use the first group to communicate once + # and the second group to communicate twice + # and so on + # this is used to demonstrate that each group can + # communicate independently + num_communication = rank // tp_size + 1 - for sz in test_sizes: - for dtype in [torch.float32, torch.float16, torch.bfloat16]: - with graph_capture(device=device) as graph_capture_context: - # use integers so result matches NCCL exactly - inp1 = torch.randint(1, - 16, (sz, ), - dtype=dtype, - device=torch.cuda.current_device()) - inp2 = torch.randint(1, - 16, (sz, ), - dtype=dtype, - device=torch.cuda.current_device()) - torch.cuda.synchronize() - graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(graph, - stream=graph_capture_context.stream): - for i in range(num_communication): - out1 = tensor_model_parallel_all_reduce(inp1) - # the input buffer is immediately modified to test - # synchronization - dist.all_reduce(inp1, group=group) - out2 = tensor_model_parallel_all_reduce(inp2) - dist.all_reduce(inp2, group=group) - graph.replay() - torch.testing.assert_close(out1, inp1) - torch.testing.assert_close(out2, inp2) + for sz in test_sizes: + for dtype in [torch.float32, torch.float16, torch.bfloat16]: + with graph_capture(device=device) as graph_capture_context: + # use integers so result matches NCCL exactly + inp1 = torch.randint(1, + 16, (sz, ), + dtype=dtype, + device=torch.cuda.current_device()) + inp2 = torch.randint(1, + 16, (sz, ), + 
dtype=dtype, + device=torch.cuda.current_device()) + torch.cuda.synchronize() + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, + stream=graph_capture_context.stream): + for i in range(num_communication): + out1 = tensor_model_parallel_all_reduce(inp1) + # the input buffer is immediately modified to test + # synchronization + dist.all_reduce(inp1, group=group) + out2 = tensor_model_parallel_all_reduce(inp2) + dist.all_reduce(inp2, group=group) + graph.replay() + torch.testing.assert_close(out1, inp1) + torch.testing.assert_close(out2, inp2) @ray.remote(num_gpus=1, max_calls=1) -def eager_allreduce(tp_size, pp_size, rank, distributed_init_port): - os.environ.pop("CUDA_VISIBLE_DEVICES", None) - device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) - init_test_distributed_environment(tp_size, pp_size, rank, - distributed_init_port) +def eager_allreduce( + monkeypatch: pytest.MonkeyPatch, + tp_size, + pp_size, + rank, + distributed_init_port, +): + with monkeypatch.context() as m: + m.delenv("CUDA_VISIBLE_DEVICES", raising=False) + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + init_test_distributed_environment(tp_size, pp_size, rank, + distributed_init_port) - # we use the first group to communicate once - # and the second group to communicate twice - # and so on - # this is used to demonstrate that each group can - # communicate independently - num_communication = rank // tp_size + 1 - sz = 1024 - fa = get_tp_group().ca_comm - inp = torch.ones(sz, dtype=torch.float32, device=device) - out = inp - for _ in range(num_communication): - out = fa.all_reduce(out, registered=False) - torch.testing.assert_close(out, inp * (tp_size**num_communication)) + # we use the first group to communicate once + # and the second group to communicate twice + # and so on + # this is used to demonstrate that each group can + # communicate independently + num_communication = rank // tp_size + 1 + sz = 1024 + fa = get_tp_group().ca_comm + 
inp = torch.ones(sz, dtype=torch.float32, device=device) + out = inp + for _ in range(num_communication): + out = fa.all_reduce(out, registered=False) + torch.testing.assert_close(out, inp * (tp_size**num_communication)) - inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device) - out = inp - for _ in range(num_communication): - out = fa.all_reduce(out, registered=False) - torch.testing.assert_close(out, inp * (tp_size**num_communication)) + inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device) + out = inp + for _ in range(num_communication): + out = fa.all_reduce(out, registered=False) + torch.testing.assert_close(out, inp * (tp_size**num_communication)) @pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("pipeline_parallel_size", [1, 2]) @pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce]) -def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target): +def test_custom_allreduce( + monkeypatch: pytest.MonkeyPatch, + tp_size, + pipeline_parallel_size, + test_target, +): world_size = tp_size * pipeline_parallel_size if world_size > torch.cuda.device_count(): pytest.skip("Not enough GPUs to run the test.") - multi_process_parallel(tp_size, pipeline_parallel_size, test_target) + multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, + test_target) diff --git a/tests/distributed/test_pipeline_partition.py b/tests/distributed/test_pipeline_partition.py index 18c5be29c5ce1..7bf93f270148b 100644 --- a/tests/distributed/test_pipeline_partition.py +++ b/tests/distributed/test_pipeline_partition.py @@ -7,33 +7,35 @@ import pytest from vllm.distributed.utils import get_pp_indices -def test_custom_layer_partition(): +def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch): - def _verify(partition_str, num_layers, pp_size, goldens): - bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None) - os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str - for pp_rank, golden in enumerate(goldens): - assert 
get_pp_indices(num_layers, pp_rank, pp_size) == golden - if bak is not None: - os.environ["VLLM_PP_LAYER_PARTITION"] = bak + with monkeypatch.context() as m: - # Even partition - _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) - # Balanced partition - _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)]) - # Put reminder somewhere - _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)]) - # Invalid partition strings - with pytest.raises(ValueError): - _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) - with pytest.raises(ValueError): - _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) - # Wrong number of partitions - with pytest.raises(ValueError): - _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) - # Wrong number of layers - with pytest.raises(ValueError): - _verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) + def _verify(partition_str, num_layers, pp_size, goldens): + bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None) + m.setenv("VLLM_PP_LAYER_PARTITION", partition_str) + for pp_rank, golden in enumerate(goldens): + assert get_pp_indices(num_layers, pp_rank, pp_size) == golden + if bak is not None: + m.setenv("VLLM_PP_LAYER_PARTITION", bak) + + # Even partition + _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) + # Balanced partition + _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)]) + # Put reminder somewhere + _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)]) + # Invalid partition strings + with pytest.raises(ValueError): + _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) + with pytest.raises(ValueError): + _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) + # Wrong number of partitions + with pytest.raises(ValueError): + _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) + # Wrong number of layers + with pytest.raises(ValueError): + _verify("5,5,5,5", 21, 4, [(0, 5), (5, 
10), (10, 15), (15, 20)]) @pytest.mark.parametrize( @@ -55,6 +57,10 @@ def test_custom_layer_partition(): (5, 3, 1, (2, 4)), (5, 3, 2, (4, 5)), ]) -def test_uneven_auto_partition(num_hidden_layers: int, pp_size: int, - pp_rank: int, indices: tuple[int, int]): +def test_uneven_auto_partition( + num_hidden_layers: int, + pp_size: int, + pp_rank: int, + indices: tuple[int, int], +): assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size) diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py index 3bc85b05e7d15..19414971f2b46 100644 --- a/tests/distributed/test_pp_cudagraph.py +++ b/tests/distributed/test_pp_cudagraph.py @@ -1,11 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations -import os +from typing import TYPE_CHECKING import pytest from ..utils import compare_two_settings, fork_new_process_for_each_test +if TYPE_CHECKING: + from typing_extensions import LiteralString + @pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [ (2, "JackFram/llama-160m"), @@ -15,18 +19,24 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test "FLASHINFER", ]) @fork_new_process_for_each_test -def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND): - cudagraph_args = [ - # use half precision for speed and memory savings in CI environment - "--dtype", - "float16", - "--pipeline-parallel-size", - str(PP_SIZE), - "--distributed-executor-backend", - "mp", - ] - os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND +def test_pp_cudagraph( + monkeypatch: pytest.MonkeyPatch, + PP_SIZE: int, + MODEL_NAME: str, + ATTN_BACKEND: LiteralString, +): + with monkeypatch.context() as m: + cudagraph_args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "float16", + "--pipeline-parallel-size", + str(PP_SIZE), + "--distributed-executor-backend", + "mp", + ] + m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND) - eager_args = cudagraph_args + ["--enforce-eager"] + 
eager_args = cudagraph_args + ["--enforce-eager"] - compare_two_settings(MODEL_NAME, eager_args, cudagraph_args) + compare_two_settings(MODEL_NAME, eager_args, cudagraph_args) diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py index 3ebc5a44d80c6..77fbb5827da9e 100644 --- a/tests/entrypoints/llm/test_accuracy.py +++ b/tests/entrypoints/llm/test_accuracy.py @@ -49,7 +49,7 @@ TPU_TP_TEST_STR = "" #"tensor_parallel_size=4" @pytest.mark.skipif(not current_platform.is_cuda() and not current_platform.is_tpu(), reason="V1 is currently only supported on CUDA and TPU") -def test_lm_eval_accuracy_v1_engine(monkeypatch): +def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): """Run with the V1 Engine.""" with monkeypatch.context() as m: @@ -67,7 +67,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch): run_test(more_args) -def test_lm_eval_accuracy_v0_engine(monkeypatch): +def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch): """Run with the V0 Engine.""" with monkeypatch.context() as m: diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index 85156d6931c8c..23fd72f4ebbb9 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -53,32 +53,37 @@ def cache_models(): @pytest.mark.skip_global_cleanup @pytest.mark.usefixtures("cache_models") -def test_offline_mode(monkeypatch): +def test_offline_mode(monkeypatch: pytest.MonkeyPatch): # Set HF to offline mode and ensure we can still construct an LLM - try: - monkeypatch.setenv("HF_HUB_OFFLINE", "1") - monkeypatch.setenv("VLLM_NO_USAGE_STATS", "1") + with monkeypatch.context() as m: + try: + m.setenv("HF_HUB_OFFLINE", "1") + m.setenv("VLLM_NO_USAGE_STATS", "1") - def disable_connect(*args, **kwargs): - raise RuntimeError("No http calls allowed") + def disable_connect(*args, **kwargs): + raise RuntimeError("No http calls 
allowed") - monkeypatch.setattr(urllib3.connection.HTTPConnection, "connect", - disable_connect) - monkeypatch.setattr(urllib3.connection.HTTPSConnection, "connect", - disable_connect) + m.setattr( + urllib3.connection.HTTPConnection, + "connect", + disable_connect, + ) + m.setattr( + urllib3.connection.HTTPSConnection, + "connect", + disable_connect, + ) - # Need to re-import huggingface_hub and friends to setup offline mode - _re_import_modules() - # Cached model files should be used in offline mode - for model_config in MODEL_CONFIGS: - LLM(**model_config) - finally: - # Reset the environment after the test - # NB: Assuming tests are run in online mode - monkeypatch.delenv("HF_HUB_OFFLINE") - monkeypatch.delenv("VLLM_NO_USAGE_STATS") - _re_import_modules() - pass + # Need to re-import huggingface_hub + # and friends to setup offline mode + _re_import_modules() + # Cached model files should be used in offline mode + for model_config in MODEL_CONFIGS: + LLM(**model_config) + finally: + # Reset the environment after the test + # NB: Assuming tests are run in online mode + _re_import_modules() def _re_import_modules(): diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index e4c087db3d4f0..d3948e2ed575e 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -70,7 +70,7 @@ def run_test(more_args): @pytest.mark.skipif(not current_platform.is_cuda() and not current_platform.is_tpu(), reason="V1 currently only supported on CUDA and TPU") -def test_lm_eval_accuracy_v1_engine(monkeypatch): +def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): """Run with the V1 Engine.""" with monkeypatch.context() as m: @@ -85,7 +85,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch): @pytest.mark.parametrize("more_args", MORE_ARGS_LIST) -def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args): +def 
test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch, + more_args): """Run with the V0 Engine.""" with monkeypatch.context() as m: diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 570e643e0364d..66db7509cc474 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -5,13 +5,12 @@ from unittest.mock import Mock, patch import pytest import torch -from tests.kernels.utils import override_backend_env_variable from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cuda import CudaPlatform from vllm.platforms.openvino import OpenVinoPlatform from vllm.platforms.rocm import RocmPlatform -from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL +from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL @pytest.fixture(autouse=True) @@ -25,87 +24,111 @@ def clear_cache(): "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"]) @pytest.mark.parametrize("use_v1", [True, False]) @pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"]) -def test_env(name: str, use_v1: bool, device: str, monkeypatch): +def test_env( + name: str, + use_v1: bool, + device: str, + monkeypatch: pytest.MonkeyPatch, +): """Test that the attention selector can be set via environment variable. Note that we do not test FlashAttn because it is the default backend. 
""" - monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0") - override_backend_env_variable(monkeypatch, name) + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + m.setenv(STR_BACKEND_ENV_VAR, name) - if device == "cpu": - with patch("vllm.attention.selector.current_platform", CpuPlatform()): - backend = get_attn_backend(16, torch.float16, torch.float16, 16, - False) - assert backend.get_name() == "TORCH_SDPA" - elif device == "hip": - with patch("vllm.attention.selector.current_platform", RocmPlatform()): - backend = get_attn_backend(16, torch.float16, torch.float16, 16, - False) - EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" - assert backend.get_name() == EXPECTED - elif device == "openvino": - with patch("vllm.attention.selector.current_platform", - OpenVinoPlatform()), patch.dict('sys.modules', - {'openvino': Mock()}): - backend = get_attn_backend(16, torch.float16, torch.float16, 16, - False) - assert backend.get_name() == "OPENVINO" - else: - if name in ["XFORMERS", "FLASHINFER"]: + if device == "cpu": with patch("vllm.attention.selector.current_platform", - CudaPlatform()): + CpuPlatform()): backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) - EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name + assert backend.get_name() == "TORCH_SDPA" + elif device == "hip": + with patch("vllm.attention.selector.current_platform", + RocmPlatform()): + backend = get_attn_backend(16, torch.float16, torch.float16, + 16, False) + EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" assert backend.get_name() == EXPECTED + elif device == "openvino": + with patch("vllm.attention.selector.current_platform", + OpenVinoPlatform()), patch.dict('sys.modules', + {'openvino': Mock()}): + backend = get_attn_backend(16, torch.float16, torch.float16, + 16, False) + assert backend.get_name() == "OPENVINO" + else: + if name in ["XFORMERS", "FLASHINFER"]: + with patch("vllm.attention.selector.current_platform", + 
CudaPlatform()): + backend = get_attn_backend(16, torch.float16, + torch.float16, 16, False) + EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name + assert backend.get_name() == EXPECTED -def test_flash_attn(monkeypatch): +def test_flash_attn(monkeypatch: pytest.MonkeyPatch): """Test FlashAttn validation.""" # TODO: When testing for v1, pipe in `use_v1` as an argument to # get_attn_backend - override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL) - # Unsupported CUDA arch - with patch("torch.cuda.get_device_capability", return_value=(7, 5)): + # Unsupported CUDA arch + monkeypatch.setattr(torch.cuda, "get_device_capability", lambda: + (7, 5)) backend = get_attn_backend(16, torch.float16, None, 16, False) assert backend.get_name() != STR_FLASH_ATTN_VAL - # Unsupported data type - backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL + # Reset the monkeypatch for subsequent tests + monkeypatch.undo() - # Unsupported kv cache data type - backend = get_attn_backend(16, torch.float16, "fp8", 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL + # Unsupported data type + backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL - # Unsupported block size - backend = get_attn_backend(16, torch.float16, None, 8, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL + # Unsupported kv cache data type + backend = get_attn_backend(16, torch.float16, "fp8", 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL - # flash-attn is not installed - with patch.dict('sys.modules', {'vllm_flash_attn': None}): + # Unsupported block size + backend = get_attn_backend(16, torch.float16, None, 8, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # flash-attn is not installed + import sys + original_module = sys.modules.get('vllm_flash_attn') + 
monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None) backend = get_attn_backend(16, torch.float16, None, 16, False) assert backend.get_name() != STR_FLASH_ATTN_VAL - # Unsupported head size - backend = get_attn_backend(17, torch.float16, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL + # Restore the original module if it existed + if original_module is not None: + monkeypatch.setitem(sys.modules, 'vllm_flash_attn', + original_module) + else: + monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False) - # Attention-free models should bypass env and use PlaceholderAttention - backend = get_attn_backend(16, torch.float16, torch.float16, 16, True) - assert backend.get_name() != STR_FLASH_ATTN_VAL + # Unsupported head size + backend = get_attn_backend(17, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # Attention-free models should bypass env and use PlaceholderAttention + backend = get_attn_backend(16, torch.float16, torch.float16, 16, True) + assert backend.get_name() != STR_FLASH_ATTN_VAL @pytest.mark.parametrize("use_v1", [True, False]) -def test_invalid_env(use_v1: bool, monkeypatch): - """Ignore the invalid env variable if it is set.""" - monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0") - override_backend_env_variable(monkeypatch, STR_INVALID_VAL) +def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch): - with patch("vllm.attention.selector.current_platform", CudaPlatform()): + with monkeypatch.context() as m, patch( + "vllm.attention.selector.current_platform", CudaPlatform()): + m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) + + # Test with head size 32 backend = get_attn_backend(32, torch.float16, None, 16, False) EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN" assert backend.get_name() == EXPECTED diff --git a/tests/kernels/test_awq.py b/tests/kernels/test_awq.py index 37ce00c74030a..248b294e546b3 100644 
--- a/tests/kernels/test_awq.py +++ b/tests/kernels/test_awq.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import os - import pytest import torch @@ -11,36 +9,38 @@ from vllm import _custom_ops as ops # noqa: F401 @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"), reason="AWQ is not supported on this GPU type.") -def test_awq_dequantize_opcheck(): - os.environ["VLLM_USE_TRITON_AWQ"] = "0" - qweight = torch.randint(-2000000000, - 2000000000, (8192, 256), - device='cuda', - dtype=torch.int32) - scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16) - zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32) - split_k_iters = 0 - thx = 0 - thy = 0 - opcheck(torch.ops._C.awq_dequantize, - (qweight, scales, zeros, split_k_iters, thx, thy)) +def test_awq_dequantize_opcheck(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_TRITON_AWQ", "0") + qweight = torch.randint(-2000000000, + 2000000000, (8192, 256), + device='cuda', + dtype=torch.int32) + scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16) + zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32) + split_k_iters = 0 + thx = 0 + thy = 0 + opcheck(torch.ops._C.awq_dequantize, + (qweight, scales, zeros, split_k_iters, thx, thy)) @pytest.mark.skip(reason="Not working; needs investigation.") @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"), reason="AWQ is not supported on this GPU type.") -def test_awq_gemm_opcheck(): - os.environ["VLLM_USE_TRITON_AWQ"] = "0" - input = torch.rand((2, 8192), device='cuda', dtype=torch.float16) - qweight = torch.randint(-2000000000, - 2000000000, (8192, 256), - device='cuda', - dtype=torch.int32) - scales = torch.randint(-2000000000, - 2000000000, (64, 256), - device='cuda', - dtype=torch.int32) - qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16) - split_k_iters = 8 - opcheck(torch.ops._C.awq_gemm, - (input, qweight, qzeros, scales, split_k_iters)) +def 
test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_TRITON_AWQ", "0") + input = torch.rand((2, 8192), device='cuda', dtype=torch.float16) + qweight = torch.randint(-2000000000, + 2000000000, (8192, 256), + device='cuda', + dtype=torch.int32) + scales = torch.randint(-2000000000, + 2000000000, (64, 256), + device='cuda', + dtype=torch.int32) + qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16) + split_k_iters = 8 + opcheck(torch.ops._C.awq_gemm, + (input, qweight, qzeros, scales, split_k_iters)) diff --git a/tests/kernels/test_rocm_attention_selector.py b/tests/kernels/test_rocm_attention_selector.py index 7cd6082486605..724f0af283f70 100644 --- a/tests/kernels/test_rocm_attention_selector.py +++ b/tests/kernels/test_rocm_attention_selector.py @@ -1,13 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 -from unittest.mock import patch - import pytest import torch -from tests.kernels.utils import override_backend_env_variable from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.platforms.rocm import RocmPlatform +from vllm.utils import STR_BACKEND_ENV_VAR @pytest.fixture(autouse=True) @@ -17,15 +15,19 @@ def clear_cache(): _cached_get_attn_backend.cache_clear() -def test_selector(monkeypatch): - """Test that the attention selector for ROCm. 
- """ - override_backend_env_variable(monkeypatch, "ROCM_FLASH") +def test_selector(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH") - with patch("vllm.attention.selector.current_platform", RocmPlatform()): + # Set the current platform to ROCm using monkeypatch + monkeypatch.setattr("vllm.attention.selector.current_platform", + RocmPlatform()) + + # Test standard ROCm attention backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) assert (backend.get_name() == "ROCM_FLASH" or backend.get_name() == "ROCM_ATTN_VLLM_V1") + # mla test for deepseek related backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, False, True) diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 010974076ba8f..22b3d7c2be7a5 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -36,12 +36,12 @@ ALL_OPCHECK_TEST_UTILS: tuple[str, ...] = ( class QKVInputs(NamedTuple): ''' - Data structure for representing unpacked attention inputs, + Data structure for representing unpacked attention inputs, query/key/values and their sequence lengths. 
Attributes: - * {query,key,value}: unpacked (batch_size x padded_seq_len x + * {query,key,value}: unpacked (batch_size x padded_seq_len x num_heads x head_size) attention inputs * q_seq_lens: query sequence lengths list * kv_seq_lens: shared key/value sequence lengths list @@ -56,14 +56,14 @@ class QKVInputs(NamedTuple): class QKVO(NamedTuple): ''' - Data structure for representing unpacked attention inputs, + Data structure for representing unpacked attention inputs, alongside unpacked known-correct attention output Attributes: - * qkv: unpacked (batch_size x padded_seq_len x + * qkv: unpacked (batch_size x padded_seq_len x num_heads x head_size) attention inputs - * ideal_output: unpacked (batch_size x padded_seq_len x + * ideal_output: unpacked (batch_size x padded_seq_len x num_heads x head_size) known-correct attention output ''' @@ -77,7 +77,7 @@ class PackedQKVInputs(NamedTuple): Attributes: - * {query,key,value}: packed (number_of_tokens x num_heads + * {query,key,value}: packed (number_of_tokens x num_heads x head_size) attention inputs * q_start_loc_list: list of query start locations within packed tensor * kv_start_loc_list: shared list of key/value start locations within @@ -97,14 +97,14 @@ class PackedQKVInputs(NamedTuple): class PackedQKVO(NamedTuple): ''' - Data structure for representing packed attention inputs, + Data structure for representing packed attention inputs, alongside packed known-correct attention output Attributes: - * packed_qkv: packed (number_of_tokens x num_heads + * packed_qkv: packed (number_of_tokens x num_heads x head_size) attention inputs - * ideal_output: packed (number_of_tokens x num_heads + * ideal_output: packed (number_of_tokens x num_heads x head_size) known-correct attention output ''' @@ -134,7 +134,7 @@ class PhaseTestParameters(NamedTuple): Attributes: - * packed_qkvo: packed (number_of_tokens x num_heads + * packed_qkvo: packed (number_of_tokens x num_heads x head_size) attention inputs & known-correct output * 
kv_mmap: KV cache memory mapping, specific to this test phase & @@ -195,7 +195,7 @@ def make_causal_mask( Create a q_max_seq_len x kv_max_seq_len causal mask Arguments: - + * q_max_seq_len: query max seq len * kv_max_seq_len: key/value max seq len @@ -320,9 +320,9 @@ def make_qkv( * max_kv_seq_len: max key/value seq len * num_heads * head_size - * is_encoder_decoder_attn: if True, query seqlen may differ from - key/value seqlen (as is often the case for cross-attention); - o/w, query/key/value seqlens match at each batch index + * is_encoder_decoder_attn: if True, query seqlen may differ from + key/value seqlen (as is often the case for cross-attention); + o/w, query/key/value seqlens match at each batch index (max_kv_seq_len is unused) * force_kv_seq_lens: if not None, overrides kv sequence lengths * attn_type: encoder, decoder self, or enc/dec cross attention @@ -469,7 +469,7 @@ def pack_qkv(qkv: QKVInputs, device: Union[torch.device, Individually pack each of Q, K and V, each with dimensions batch_size x padded_seq_len x num_heads x head_size, into respective number_of_tokens x num_heads x head_size tensors. - + For Q, number_of_tokens = sum(q_seq_lens). 
For K and V, number_of_tokens = sum(kv_seq_lens) @@ -619,9 +619,9 @@ def make_kv_cache(num_blocks: int, Returns: * kv_cache: 2 x num_blocks x (block_size * num_heads * head_size) - * for backend 'XFORMERS' + * for backend 'XFORMERS' * kv_cache: 2 x num_blocks x block_size x num_heads x head_size - * for backend 'FLASH_ATTN' + * for backend 'FLASH_ATTN' ''' if backend == 'XFORMERS': kv_cache = torch.rand( @@ -662,20 +662,20 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int], Context: * Your goal is to test (1) prefill of N prompts, with prompt-lengths {K_i \\forall i \\in [0,N)}, followed by (2) decoding of a single token - for all N prompts (N tokens total); the resultant sequence lengths + for all N prompts (N tokens total); the resultant sequence lengths after decode would be {K_i + 1 for i \\in [0,N)} - * The test you want to do requires (1) having the prefill slot mapping - for all tokens present during prefill, the number of which is - M = \\sum_i{K_i}, and (2) having the decode slot mapping for all N + * The test you want to do requires (1) having the prefill slot mapping + for all tokens present during prefill, the number of which is + M = \\sum_i{K_i}, and (2) having the decode slot mapping for all N decoded tokens - - This function consumes a single 1D slot mapping, which is the + + This function consumes a single 1D slot mapping, which is the concatenation of N slot mappings each of length K_i + 1 (corresponding to the sequence lengths after decode), with a total length of P = \\sum_i{K_i + 1} = M + N The prefill-phase slot mapping results from excising the (K_i + 1)-th entry - from each of the N subsequences in the slot mapping (i.e. omitting the + from each of the N subsequences in the slot mapping (i.e. omitting the decoded token's mapping.) 
The N excised entries are appended to obtain the decode-phase slot mapping @@ -684,15 +684,15 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int], * slot_mapping_list: Length-P 1D slot mapping (as list) reflecting all N post-decode sequences - * seq_lens: list of N post-decode sequence lengths (K_i + 1 in the + * seq_lens: list of N post-decode sequence lengths (K_i + 1 in the description above) * device: cuda, cpu, etc. Returns: - * prefill_slot_mapping: Length-M 1D slot mapping (as Tensor) + * prefill_slot_mapping: Length-M 1D slot mapping (as Tensor) reflecting all N prefill prompts - * decode_slot_mapping: Length-N 1D slot mapping (as Tensor) reflecting + * decode_slot_mapping: Length-N 1D slot mapping (as Tensor) reflecting all N decoded tokens ''' @@ -725,7 +725,7 @@ def make_block_tables_slot_mapping( Then the minimum KV cache size in blocks is - total_cache_blocks = sum(num_blocks for all seqs) + total_cache_blocks = sum(num_blocks for all seqs) Then, the blocktable mapping counts downward from @@ -734,7 +734,7 @@ def make_block_tables_slot_mapping( to block_base_addr - + The constructed block-tables and slot-mapping are sized to the lengths of the sequences in their entirety (as reflected by seq_lens), @@ -749,7 +749,7 @@ def make_block_tables_slot_mapping( Return: - * block_tables_tensor: block table for sequence + * block_tables_tensor: block table for sequence * slot_mapping_list: slot mapping for sequence * max_block_idx: the highest block address within this block table ''' @@ -807,7 +807,7 @@ def make_test_metadata( encoder_test_params and cross_test_params arguments allow encoder attention and enc/dec cross-attention (respectively) to use distinct metadata values from decoder self-attention (decoder_test_params.) - + if encoder_test_params and cross_test_params are None, the attention metadata will support decoder-only scenario. 
@@ -820,7 +820,7 @@ def make_test_metadata( * attn_backend_name: Backend for sourcing attention kernels * is_prompt: prefill if True, o/w decode * seq_lens: list of token counts for each sequence - * decoder_test_params: decoder self-attention test params; + * decoder_test_params: decoder self-attention test params; this function requires kv_mmap (memory mapping) field * device: CPU or CUDA device diff --git a/tests/kv_transfer/disagg_test.py b/tests/kv_transfer/test_disagg.py similarity index 100% rename from tests/kv_transfer/disagg_test.py rename to tests/kv_transfer/test_disagg.py diff --git a/tests/kv_transfer/module_test.py b/tests/kv_transfer/test_module.py similarity index 100% rename from tests/kv_transfer/module_test.py rename to tests/kv_transfer/test_module.py diff --git a/tests/models/decoder_only/language/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py index faca7a566e79c..51abcb7172cb7 100644 --- a/tests/models/decoder_only/language/test_fp8.py +++ b/tests/models/decoder_only/language/test_fp8.py @@ -12,11 +12,10 @@ import pytest from tests.kernels.utils import override_backend_env_variable from tests.quantization.utils import is_quant_method_supported from vllm.platforms import current_platform +from vllm.utils import STR_BACKEND_ENV_VAR from ...utils import check_logprobs_close -os.environ["TOKENIZERS_PARALLELISM"] = "true" - @pytest.mark.quant_model @pytest.mark.skipif(not is_quant_method_supported("fp8"), @@ -55,45 +54,47 @@ def test_models( backend: str, tensor_parallel_size: int, disable_async_output_proc: bool, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """ Only checks log probs match to cover the discrepancy in numerical sensitive kernels. 
""" - override_backend_env_variable(monkeypatch, backend) + with monkeypatch.context() as m: + m.setenv("TOKENIZERS_PARALLELISM", 'true') + m.setenv(STR_BACKEND_ENV_VAR, backend) - MAX_MODEL_LEN = 1024 - NUM_LOG_PROBS = 8 + MAX_MODEL_LEN = 1024 + NUM_LOG_PROBS = 8 - with vllm_runner( - base_model, - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - kv_cache_dtype="auto", - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - baseline_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) + with vllm_runner( + base_model, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + kv_cache_dtype="auto", + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + baseline_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - with vllm_runner( - test_model, - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - test_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) + with vllm_runner( + test_model, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + kv_cache_dtype=kv_cache_dtype, + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + test_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - check_logprobs_close( - outputs_0_lst=baseline_outputs, - outputs_1_lst=test_outputs, - name_0="fp16_kv_cache", - name_1="fp8_kv_cache", - ) + check_logprobs_close( + outputs_0_lst=baseline_outputs, + outputs_1_lst=test_outputs, + name_0="fp16_kv_cache", + name_1="fp8_kv_cache", + ) @pytest.mark.cpu_model @@ -119,38 +120,41 @@ def 
test_cpu_models( test_model: str, max_tokens: int, disable_async_output_proc: bool, + monkeypatch: pytest.MonkeyPatch, ) -> None: """ Only checks log probs match to cover the discrepancy in numerical sensitive kernels. """ + with monkeypatch.context() as m: + m.setenv("TOKENIZERS_PARALLELISM", 'true') - MAX_MODEL_LEN = 1024 - NUM_LOG_PROBS = 8 + MAX_MODEL_LEN = 1024 + NUM_LOG_PROBS = 8 - with vllm_runner( - base_model, - max_model_len=MAX_MODEL_LEN, - dtype="bfloat16", - kv_cache_dtype="auto", - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - baseline_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) + with vllm_runner( + base_model, + max_model_len=MAX_MODEL_LEN, + dtype="bfloat16", + kv_cache_dtype="auto", + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + baseline_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - with vllm_runner( - test_model, - max_model_len=MAX_MODEL_LEN, - dtype="bfloat16", - kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - test_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) + with vllm_runner( + test_model, + max_model_len=MAX_MODEL_LEN, + dtype="bfloat16", + kv_cache_dtype=kv_cache_dtype, + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + test_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - check_logprobs_close( - outputs_0_lst=baseline_outputs, - outputs_1_lst=test_outputs, - name_0="bf16_kv_cache", - name_1="fp8_kv_cache", - ) + check_logprobs_close( + outputs_0_lst=baseline_outputs, + outputs_1_lst=test_outputs, + name_0="bf16_kv_cache", + name_1="fp8_kv_cache", + ) diff --git a/tests/models/embedding/language/test_gritlm.py b/tests/models/embedding/language/test_gritlm.py index cae3e1a5c6244..d6bf7d2706397 100644 --- 
a/tests/models/embedding/language/test_gritlm.py +++ b/tests/models/embedding/language/test_gritlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations import importlib.util import math @@ -11,6 +12,7 @@ from scipy.spatial.distance import cosine import vllm import vllm.config +from vllm.utils import STR_BACKEND_ENV_VAR from ....utils import RemoteOpenAIServer @@ -29,36 +31,34 @@ def _arr(arr): return array("i", arr) -def test_find_array(monkeypatch): +def test_find_array(monkeypatch: pytest.MonkeyPatch): # GritLM embedding implementation is only supported by XFormers backend. - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") - from vllm.model_executor.models.gritlm import GritLMPooler + from vllm.model_executor.models.gritlm import GritLMPooler - # Create an LLM object to get the model config. - llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) - pooler = GritLMPooler(model_config=llm.llm_engine.model_config) + # Create an LLM object to get the model config. 
+ llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) + pooler = GritLMPooler(model_config=llm.llm_engine.model_config) - arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3 - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3 - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1 - assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1 + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3 + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3 + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1 + assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1 - with pytest.raises(ValueError): - pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1) + with pytest.raises(ValueError): + pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1) @pytest.fixture(scope="module") def server_embedding(): # GritLM embedding implementation is only supported by XFormers backend. 
- with pytest.MonkeyPatch.context() as mp: - mp.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") - - args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)] - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: - yield remote_server + args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)] + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server @pytest.fixture(scope="module") @@ -69,9 +69,12 @@ def server_generate(): @pytest_asyncio.fixture -async def client_embedding(server_embedding: RemoteOpenAIServer): - async with server_embedding.get_async_client() as async_client: - yield async_client +async def client_embedding(monkeypatch: pytest.MonkeyPatch, + server_embedding: RemoteOpenAIServer): + with monkeypatch.context() as m: + m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") + async with server_embedding.get_async_client() as async_client: + yield async_client @pytest_asyncio.fixture @@ -80,14 +83,20 @@ async def client_generate(server_generate: RemoteOpenAIServer): yield async_client -def run_llm_encode(llm: vllm.LLM, queries: list[str], - instruction: str) -> list[float]: +def run_llm_encode( + llm: vllm.LLM, + queries: list[str], + instruction: str, +) -> list[float]: outputs = llm.encode([instruction + q for q in queries], ) return [output.outputs.embedding for output in outputs] -async def run_client_embeddings(client: vllm.LLM, queries: list[str], - instruction: str) -> list[float]: +async def run_client_embeddings( + client: vllm.LLM, + queries: list[str], + instruction: str, +) -> list[float]: outputs = await client.embeddings.create( model=MODEL_NAME, input=[instruction + q for q in queries], @@ -106,7 +115,7 @@ def get_test_data(): README.md in https://github.com/ContextualAI/gritlm """ q_instruction = gritlm_instruction( - "Given a scientific paper title, retrieve the paper's abstract") + "Given a scientific paper title, retrieve the paper's abstract", ) queries = [ "Bitcoin: A Peer-to-Peer Electronic 
Cash System", "Generative Representational Instruction Tuning", @@ -136,31 +145,32 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]): assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001) -def test_gritlm_offline_embedding(monkeypatch): +def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch): # GritLM embedding implementation is only supported by XFormers backend. - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") - queries, q_instruction, documents, d_instruction = get_test_data() + queries, q_instruction, documents, d_instruction = get_test_data() - llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) + llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) - d_rep = run_llm_encode( - llm, - documents, - d_instruction, - ) - q_rep = run_llm_encode( - llm, - queries, - q_instruction, - ) + d_rep = run_llm_encode( + llm, + documents, + d_instruction, + ) + q_rep = run_llm_encode( + llm, + queries, + q_instruction, + ) - validate_embed_output(q_rep, d_rep) + validate_embed_output(q_rep, d_rep) @pytest.mark.asyncio async def test_gritlm_api_server_embedding( - client_embedding: openai.AsyncOpenAI): + client_embedding: openai.AsyncOpenAI, ): queries, q_instruction, documents, d_instruction = get_test_data() d_rep = await run_client_embeddings( diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index d3d07d0d9acfc..465c496f4c0f3 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import os - import pytest from vllm import LLM, SamplingParams @@ -11,76 +9,92 @@ from ..utils import fork_new_process_for_each_test @fork_new_process_for_each_test -def test_plugin(dummy_opt_path, monkeypatch): +def test_plugin( + monkeypatch: pytest.MonkeyPatch, + dummy_opt_path: str, +): # V1 shuts 
down rather than raising an error here. - monkeypatch.setenv("VLLM_USE_V1", "0") - os.environ["VLLM_PLUGINS"] = "" - with pytest.raises(Exception) as excinfo: - LLM(model=dummy_opt_path, load_format="dummy") - error_msg = "has no vLLM implementation and " \ - "the Transformers implementation is not compatible with vLLM" - assert (error_msg in str(excinfo.value)) + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + m.setenv("VLLM_PLUGINS", "") + + with pytest.raises(Exception) as excinfo: + LLM(model=dummy_opt_path, load_format="dummy") + error_msg = "has no vLLM implementation and the Transformers implementation is not compatible with vLLM" # noqa: E501 + assert (error_msg in str(excinfo.value)) @fork_new_process_for_each_test -def test_oot_registration_text_generation(dummy_opt_path): - os.environ["VLLM_PLUGINS"] = "register_dummy_model" - prompts = ["Hello, my name is", "The text does not matter"] - sampling_params = SamplingParams(temperature=0) - llm = LLM(model=dummy_opt_path, load_format="dummy") - first_token = llm.get_tokenizer().decode(0) - outputs = llm.generate(prompts, sampling_params) +def test_oot_registration_text_generation( + monkeypatch: pytest.MonkeyPatch, + dummy_opt_path: str, +): + with monkeypatch.context() as m: + m.setenv("VLLM_PLUGINS", "register_dummy_model") + prompts = ["Hello, my name is", "The text does not matter"] + sampling_params = SamplingParams(temperature=0) + llm = LLM(model=dummy_opt_path, load_format="dummy") + first_token = llm.get_tokenizer().decode(0) + outputs = llm.generate(prompts, sampling_params) - for output in outputs: - generated_text = output.outputs[0].text - # make sure only the first token is generated - rest = generated_text.replace(first_token, "") - assert rest == "" + for output in outputs: + generated_text = output.outputs[0].text + # make sure only the first token is generated + rest = generated_text.replace(first_token, "") + assert rest == "" @fork_new_process_for_each_test -def 
test_oot_registration_embedding(dummy_gemma2_embedding_path): - os.environ["VLLM_PLUGINS"] = "register_dummy_model" - prompts = ["Hello, my name is", "The text does not matter"] - llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy") - outputs = llm.embed(prompts) +def test_oot_registration_embedding( + monkeypatch: pytest.MonkeyPatch, + dummy_gemma2_embedding_path: str, +): + with monkeypatch.context() as m: + m.setenv("VLLM_PLUGINS", "register_dummy_model") + prompts = ["Hello, my name is", "The text does not matter"] + llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy") + outputs = llm.embed(prompts) - for output in outputs: - assert all(v == 0 for v in output.outputs.embedding) + for output in outputs: + assert all(v == 0 for v in output.outputs.embedding) image = ImageAsset("cherry_blossom").pil_image.convert("RGB") @fork_new_process_for_each_test -def test_oot_registration_multimodal(dummy_llava_path, monkeypatch): - os.environ["VLLM_PLUGINS"] = "register_dummy_model" - prompts = [{ - "prompt": "What's in the image?", - "multi_modal_data": { - "image": image - }, - }, { - "prompt": "Describe the image", - "multi_modal_data": { - "image": image - }, - }] +def test_oot_registration_multimodal( + monkeypatch: pytest.MonkeyPatch, + dummy_llava_path: str, +): + with monkeypatch.context() as m: + m.setenv("VLLM_PLUGINS", "register_dummy_model") + prompts = [{ + "prompt": "What's in the image?", + "multi_modal_data": { + "image": image + }, + }, { + "prompt": "Describe the image", + "multi_modal_data": { + "image": image + }, + }] - sampling_params = SamplingParams(temperature=0) - llm = LLM(model=dummy_llava_path, - load_format="dummy", - max_num_seqs=1, - trust_remote_code=True, - gpu_memory_utilization=0.98, - max_model_len=4096, - enforce_eager=True, - limit_mm_per_prompt={"image": 1}) - first_token = llm.get_tokenizer().decode(0) - outputs = llm.generate(prompts, sampling_params) + sampling_params = SamplingParams(temperature=0) + llm 
= LLM(model=dummy_llava_path, + load_format="dummy", + max_num_seqs=1, + trust_remote_code=True, + gpu_memory_utilization=0.98, + max_model_len=4096, + enforce_eager=True, + limit_mm_per_prompt={"image": 1}) + first_token = llm.get_tokenizer().decode(0) + outputs = llm.generate(prompts, sampling_params) - for output in outputs: - generated_text = output.outputs[0].text - # make sure only the first token is generated - rest = generated_text.replace(first_token, "") - assert rest == "" + for output in outputs: + generated_text = output.outputs[0].text + # make sure only the first token is generated + rest = generated_text.replace(first_token, "") + assert rest == "" diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py index aad7fc5303c13..e617bd057f1f4 100644 --- a/tests/mq_llm_engine/test_error_handling.py +++ b/tests/mq_llm_engine/test_error_handling.py @@ -235,25 +235,28 @@ async def test_bad_request(tmp_socket): @pytest.mark.asyncio -async def test_mp_crash_detection(monkeypatch): +async def test_mp_crash_detection(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: - parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.") - parser = make_arg_parser(parser) - args = parser.parse_args([]) + parser = FlexibleArgumentParser( + description="vLLM's remote OpenAI server.") + parser = make_arg_parser(parser) + args = parser.parse_args([]) - # When LLMEngine is loaded, it will crash. - def mock_init(): - raise ValueError + # When LLMEngine is loaded, it will crash. 
+ def mock_init(): + raise ValueError - monkeypatch.setattr(LLMEngine, "__init__", mock_init) + m.setattr(LLMEngine, "__init__", mock_init) - start = time.perf_counter() - async with build_async_engine_client(args): - pass - end = time.perf_counter() + start = time.perf_counter() + async with build_async_engine_client(args): + pass + end = time.perf_counter() - assert end - start < 60, ("Expected vLLM to gracefully shutdown in <60s " - "if there is an error in the startup.") + assert end - start < 60, ( + "Expected vLLM to gracefully shutdown in <60s " + "if there is an error in the startup.") @pytest.mark.asyncio diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index f925e42f46d37..ce716e6474cb4 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -5,7 +5,7 @@ from typing import Optional import pytest -from tests.kernels.utils import override_backend_env_variable +from vllm.utils import STR_BACKEND_ENV_VAR from ..models.utils import check_logprobs_close from ..utils import (completions_with_server_args, get_client_text_generations, @@ -52,7 +52,7 @@ async def test_multi_step( num_logprobs: Optional[int], attention_backend: str, enable_chunked_prefill: bool, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """Test vLLM engine with multi-step scheduling in an OpenAI-protocol client/server environment. 
@@ -82,67 +82,70 @@ async def test_multi_step( pytest.skip("Multi-step with Chunked-Prefill only supports" "PP=1 and FLASH_ATTN backend") - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts + prompts = example_prompts + if len(prompts) < num_prompts: + prompts = prompts * ((num_prompts // len(prompts)) + 1) + prompts = prompts[:num_prompts] + assert len(prompts) == num_prompts - server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"] - ms_server_args = DEFAULT_SERVER_ARGS + \ - ["--num-scheduler-steps", f"{num_scheduler_steps}"] + server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"] + ms_server_args = DEFAULT_SERVER_ARGS + \ + ["--num-scheduler-steps", f"{num_scheduler_steps}"] - if not is_async: - ms_server_args += ["--disable-async-output-proc"] + if not is_async: + ms_server_args += ["--disable-async-output-proc"] - if eager_mode: - ms_server_args.append("--enforce-eager") + if eager_mode: + ms_server_args.append("--enforce-eager") - if enable_chunked_prefill: - ms_server_args.append("--enable-chunked-prefill") + if enable_chunked_prefill: + ms_server_args.append("--enable-chunked-prefill") - distributed_args = [ - "--tensor-parallel-size", - str(tp_size), - "--pipeline-parallel-size", - str(pp_size), - ] + distributed_args = [ + "--tensor-parallel-size", + str(tp_size), + "--pipeline-parallel-size", + str(pp_size), + ] - # Spin up client/server & issue completion API requests. 
- # Default `max_wait_seconds` is 240 but was empirically - # was raised 5x to 1200 *just for this test* due to - # observed timeouts in GHA CI - ref_completions = await completions_with_server_args( - prompts, - model, - server_args + distributed_args, - num_logprobs, - max_wait_seconds=5 * 240) - test_completions = await completions_with_server_args( - prompts, - model, - ms_server_args + distributed_args, - num_logprobs, - max_wait_seconds=5 * 240) + # Spin up client/server & issue completion API requests. + # Default `max_wait_seconds` is 240 but was empirically + # was raised 5x to 1200 *just for this test* due to + # observed timeouts in GHA CI + ref_completions = await completions_with_server_args( + prompts, + model, + server_args + distributed_args, + num_logprobs, + max_wait_seconds=5 * 240) + test_completions = await completions_with_server_args( + prompts, + model, + ms_server_args + distributed_args, + num_logprobs, + max_wait_seconds=5 * 240) - # Assert multi-step scheduling produces identical tokens - # to single-step scheduling. - ref_generations = get_client_text_generations(ref_completions) - test_generations = get_client_text_generations(test_completions) - assert ref_generations == test_generations + # Assert multi-step scheduling produces identical tokens + # to single-step scheduling. + ref_generations = get_client_text_generations(ref_completions) + test_generations = get_client_text_generations(test_completions) + assert ref_generations == test_generations - # Assert multi-step scheduling produces nearly-identical logprobs - # to single-step scheduling. - ref_text_logprobs = get_client_text_logprob_generations(ref_completions) - test_text_logprobs = get_client_text_logprob_generations(test_completions) - check_logprobs_close( - outputs_0_lst=ref_text_logprobs, - outputs_1_lst=test_text_logprobs, - name_0="hf", - name_1="vllm", - ) + # Assert multi-step scheduling produces nearly-identical logprobs + # to single-step scheduling. 
+ ref_text_logprobs = get_client_text_logprob_generations( + ref_completions) + test_text_logprobs = get_client_text_logprob_generations( + test_completions) + check_logprobs_close( + outputs_0_lst=ref_text_logprobs, + outputs_1_lst=test_text_logprobs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize(("tp_size, pp_size"), [ @@ -152,7 +155,7 @@ async def test_multi_step( async def test_multi_step_pp_smoke( tp_size: int, pp_size: int, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """ Smoke test for the vLLM engine with multi-step scheduling in an @@ -174,54 +177,55 @@ async def test_multi_step_pp_smoke( attention_backend = "FLASH_ATTN" max_num_seqs = 3 - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - # Prompt from the ShareGPT dataset - prompts = [ - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - ] - # Use varying max_tokens to introduce scheduling randomness. - max_tokens = [10 * i for i in range(1, len(prompts) + 1)] - assert len(prompts) == len(max_tokens) + # Prompt from the ShareGPT dataset + prompts = [ + "in the jtbd context whats a push?", # codespell:ignore + "in the jtbd context whats a push?", # codespell:ignore + "in the jtbd context whats a push?", # codespell:ignore + "in the jtbd context whats a push?", # codespell:ignore + ] + # Use varying max_tokens to introduce scheduling randomness. 
+ max_tokens = [10 * i for i in range(1, len(prompts) + 1)] + assert len(prompts) == len(max_tokens) - test_args = [ - "--tensor-parallel-size", - str(tp_size), "--pipeline-parallel-size", - str(pp_size), "--max-num-seqs", - str(max_num_seqs) - ] + test_args = [ + "--tensor-parallel-size", + str(tp_size), "--pipeline-parallel-size", + str(pp_size), "--max-num-seqs", + str(max_num_seqs) + ] - server_args = DEFAULT_SERVER_ARGS + test_args - ms_server_args = DEFAULT_SERVER_ARGS + \ - ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \ - test_args + server_args = DEFAULT_SERVER_ARGS + test_args + ms_server_args = DEFAULT_SERVER_ARGS + \ + ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \ + test_args - # Spin up client/server & issue completion API requests. - # Default `max_wait_seconds` is 240 but was empirically - # was raised 3x to 720 *just for this test* due to - # observed timeouts in GHA CI - ref_completions = await completions_with_server_args( - prompts=prompts, - model_name=model, - server_cli_args=server_args, - num_logprobs=None, - max_wait_seconds=5 * 240, - max_tokens=max_tokens) + # Spin up client/server & issue completion API requests. 
+ # Default `max_wait_seconds` is 240 but was empirically + # was raised 3x to 720 *just for this test* due to + # observed timeouts in GHA CI + ref_completions = await completions_with_server_args( + prompts=prompts, + model_name=model, + server_cli_args=server_args, + num_logprobs=None, + max_wait_seconds=5 * 240, + max_tokens=max_tokens) - test_completions = await completions_with_server_args( - prompts=prompts, - model_name=model, - server_cli_args=ms_server_args, - num_logprobs=None, - max_wait_seconds=5 * 240, - max_tokens=max_tokens) + test_completions = await completions_with_server_args( + prompts=prompts, + model_name=model, + server_cli_args=ms_server_args, + num_logprobs=None, + max_wait_seconds=5 * 240, + max_tokens=max_tokens) - # Assert multi-step scheduling produces identical tokens - # to single-step scheduling. - ref_generations = get_client_text_generations(ref_completions) - test_generations = get_client_text_generations(test_completions) + # Assert multi-step scheduling produces identical tokens + # to single-step scheduling. + ref_generations = get_client_text_generations(ref_completions) + test_generations = get_client_text_generations(test_completions) - assert ref_generations == test_generations + assert ref_generations == test_generations diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py index 29d5ffd4c9cb1..a823e484beab6 100644 --- a/tests/multi_step/test_correctness_llm.py +++ b/tests/multi_step/test_correctness_llm.py @@ -7,7 +7,7 @@ from typing import Optional import pytest -from tests.kernels.utils import override_backend_env_variable +from vllm.utils import STR_BACKEND_ENV_VAR from ..models.utils import check_logprobs_close, check_outputs_equal @@ -42,7 +42,7 @@ def test_multi_step_llm( num_prompts: int, num_logprobs: Optional[int], attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """Test vLLM engine with multi-step scheduling via sync LLM Engine. 
@@ -70,48 +70,49 @@ def test_multi_step_llm( num_logprobs: corresponds to the `logprobs` argument to the OpenAI completions endpoint; `None` -> 1 logprob returned. """ - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts + prompts = example_prompts + if len(prompts) < num_prompts: + prompts = prompts * ((num_prompts // len(prompts)) + 1) + prompts = prompts[:num_prompts] + assert len(prompts) == num_prompts - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - enable_chunked_prefill=enable_chunked_prefill, - num_scheduler_steps=num_scheduler_steps, - ) as vllm_model: - vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens) - if num_logprobs is None else - vllm_model.generate_greedy_logprobs( - prompts, max_tokens, num_logprobs)) + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + enable_chunked_prefill=enable_chunked_prefill, + num_scheduler_steps=num_scheduler_steps, + ) as vllm_model: + vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens) + if num_logprobs is None else + vllm_model.generate_greedy_logprobs( + prompts, max_tokens, num_logprobs)) - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = (hf_model.generate_greedy(prompts, max_tokens) - if num_logprobs is None else - hf_model.generate_greedy_logprobs_limit( - prompts, max_tokens, num_logprobs)) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = (hf_model.generate_greedy(prompts, max_tokens) + if num_logprobs is None else + hf_model.generate_greedy_logprobs_limit( + prompts, max_tokens, num_logprobs)) - if 
num_logprobs is None: - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - else: - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + if num_logprobs is None: + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + else: + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize("model", MODELS) @@ -136,7 +137,7 @@ def test_multi_step_llm_w_prompt_logprobs( num_logprobs: Optional[int], num_prompt_logprobs: Optional[int], attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """Test prompt logprobs with multi-step scheduling via sync LLM Engine. @@ -166,47 +167,48 @@ def test_multi_step_llm_w_prompt_logprobs( note that this argument is not supported by the OpenAI completions endpoint. 
""" - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts + prompts = example_prompts + if len(prompts) < num_prompts: + prompts = prompts * ((num_prompts // len(prompts)) + 1) + prompts = prompts[:num_prompts] + assert len(prompts) == num_prompts - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - num_scheduler_steps=num_scheduler_steps, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - prompts, - max_tokens, - num_logprobs, - num_prompt_logprobs=num_prompt_logprobs) + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + num_scheduler_steps=num_scheduler_steps, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs, + num_prompt_logprobs=num_prompt_logprobs) - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - ) as vllm_model: - single_step_vllm_outputs = vllm_model.generate_greedy_logprobs( - prompts, - max_tokens, - num_logprobs, - num_prompt_logprobs=num_prompt_logprobs) + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + ) as vllm_model: + single_step_vllm_outputs = vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs, + num_prompt_logprobs=num_prompt_logprobs) - check_logprobs_close( - outputs_0_lst=single_step_vllm_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_logprobs_close( + 
outputs_0_lst=single_step_vllm_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize("model", MODELS) @@ -230,7 +232,7 @@ def test_multi_step_llm_chunked_prefill_prefix_cache( num_prompts: int, num_logprobs: Optional[int], attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """Test vLLM engine with multi-step+"single-step chunked prefill"+APC. @@ -293,77 +295,78 @@ def test_multi_step_llm_chunked_prefill_prefix_cache( # # The Incorrect scheduling behavior - if it occurs - will cause an exception # in the model runner resulting from `do_sample=False`. - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - assert len(example_prompts) >= 2 - challenge_prompts = copy.deepcopy(example_prompts) - challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient ' - 'inference and serving engine for LLMs.\n' - ) # 24 tok - challenge_prompts[1] = ( - 'Briefly describe the major milestones in the ' - 'development of artificial intelligence from 1950 to 2020.\n' - ) # 30 tok + assert len(example_prompts) >= 2 + challenge_prompts = copy.deepcopy(example_prompts) + challenge_prompts[0] = ( + 'vLLM is a high-throughput and memory-efficient ' + 'inference and serving engine for LLMs.\n') # 24 tok + challenge_prompts[1] = ( + 'Briefly describe the major milestones in the ' + 'development of artificial intelligence from 1950 to 2020.\n' + ) # 30 tok - # If necessary, adjust the length of `challenge_prompts` to match - # `num_prompts` - if len(challenge_prompts) < num_prompts: - challenge_prompts = (challenge_prompts * - ((num_prompts // len(challenge_prompts)) + 1)) - challenge_prompts = challenge_prompts[:num_prompts] - assert len(challenge_prompts) == num_prompts + # If necessary, adjust the length of `challenge_prompts` to match + # `num_prompts` + if len(challenge_prompts) < num_prompts: + 
challenge_prompts = (challenge_prompts * + ((num_prompts // len(challenge_prompts)) + 1)) + challenge_prompts = challenge_prompts[:num_prompts] + assert len(challenge_prompts) == num_prompts - # Single-step scheduler baseline - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - num_scheduler_steps=num_scheduler_steps, - max_model_len=48, - max_num_batched_tokens=48, - max_num_seqs=4, - block_size=16, - ) as vllm_model: - outputs_baseline = (vllm_model.generate_greedy( - challenge_prompts, max_tokens) if num_logprobs is None else - vllm_model.generate_greedy_logprobs( - challenge_prompts, max_tokens, num_logprobs)) + # Single-step scheduler baseline + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + num_scheduler_steps=num_scheduler_steps, + max_model_len=48, + max_num_batched_tokens=48, + max_num_seqs=4, + block_size=16, + ) as vllm_model: + outputs_baseline = ( + vllm_model.generate_greedy(challenge_prompts, max_tokens) if + num_logprobs is None else vllm_model.generate_greedy_logprobs( + challenge_prompts, max_tokens, num_logprobs)) - # multi-step+"single-step chunked prefill"+APC - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - enable_chunked_prefill=True, - enable_prefix_caching=True, - num_scheduler_steps=num_scheduler_steps, - max_model_len=48, - max_num_batched_tokens=48, - max_num_seqs=4, - block_size=16, - ) as vllm_model: - outputs_w_features = (vllm_model.generate_greedy( - challenge_prompts, max_tokens) if num_logprobs is None else - vllm_model.generate_greedy_logprobs( - challenge_prompts, max_tokens, num_logprobs)) + # multi-step+"single-step chunked prefill"+APC + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + 
tensor_parallel_size=tp_size, + enable_chunked_prefill=True, + enable_prefix_caching=True, + num_scheduler_steps=num_scheduler_steps, + max_model_len=48, + max_num_batched_tokens=48, + max_num_seqs=4, + block_size=16, + ) as vllm_model: + outputs_w_features = ( + vllm_model.generate_greedy(challenge_prompts, max_tokens) if + num_logprobs is None else vllm_model.generate_greedy_logprobs( + challenge_prompts, max_tokens, num_logprobs)) - if num_logprobs is None: - # No-logprobs test - check_outputs_equal( - outputs_0_lst=outputs_baseline, - outputs_1_lst=outputs_w_features, - name_0="multi-step", - name_1="multi-step+features", - ) - else: - # Yes-logprobs test - check_logprobs_close( - outputs_0_lst=outputs_baseline, - outputs_1_lst=outputs_w_features, - name_0="multi-step", - name_1="multi-step+features", - ) + if num_logprobs is None: + # No-logprobs test + check_outputs_equal( + outputs_0_lst=outputs_baseline, + outputs_1_lst=outputs_w_features, + name_0="multi-step", + name_1="multi-step+features", + ) + else: + # Yes-logprobs test + check_logprobs_close( + outputs_0_lst=outputs_baseline, + outputs_1_lst=outputs_w_features, + name_0="multi-step", + name_1="multi-step+features", + ) diff --git a/tests/neuron/1_core/test_block_table.py b/tests/neuron/1_core/test_block_table.py index 30dcdd573edf3..033a36b4156b0 100644 --- a/tests/neuron/1_core/test_block_table.py +++ b/tests/neuron/1_core/test_block_table.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -import os import neuronxcc.nki.language as nl import pytest @@ -99,6 +98,7 @@ def ref_block_tables_transform( ) @torch.inference_mode() def test_load_and_transform_block_tables( + monkeypatch: pytest.MonkeyPatch, num_tiles, num_blocks_per_tile, q_head_per_kv_head, @@ -108,46 +108,46 @@ def test_load_and_transform_block_tables( device = xm.xla_device() - compiler_flags = [ + compiler_flags_str = " ".join([ "-O1", "--retry_failed_compilation", - ] - compiler_flags_str = " ".join(compiler_flags) - 
os.environ["NEURON_CC_FLAGS"] = compiler_flags_str + ]) + with monkeypatch.context() as m: + m.setenv("NEURON_CC_FLAGS", compiler_flags_str) - torch.manual_seed(10000) - torch.set_printoptions(sci_mode=False) + torch.manual_seed(10000) + torch.set_printoptions(sci_mode=False) - # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient - B_P_SIZE = 128 - if num_blocks_per_tile < B_P_SIZE: - assert B_P_SIZE % num_blocks_per_tile == 0 - block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile - else: - block_size_tiling_factor = 1 - max_num_blocks = 100000 - block_tables = torch.randint( - 0, - max_num_blocks, - (num_tiles * num_blocks_per_tile, ), - dtype=torch.int32, - ) - nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1]( - block_tables.to(device=device), - num_tiles, - num_blocks_per_tile, - q_head_per_kv_head, - head_id, - block_size_tiling_factor, - ).cpu() - ref_out = ref_block_tables_transform( - block_tables, - num_tiles, - num_blocks_per_tile, - q_head_per_kv_head, - head_id, - block_size_tiling_factor, - ) - assert (nki_out.shape == ref_out.shape - ), f"{nki_out.shape=} != {ref_out.shape=}" - assert torch.all(nki_out == ref_out) + # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient + B_P_SIZE = 128 + if num_blocks_per_tile < B_P_SIZE: + assert B_P_SIZE % num_blocks_per_tile == 0 + block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile + else: + block_size_tiling_factor = 1 + max_num_blocks = 100000 + block_tables = torch.randint( + 0, + max_num_blocks, + (num_tiles * num_blocks_per_tile, ), + dtype=torch.int32, + ) + nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1]( + block_tables.to(device=device), + num_tiles, + num_blocks_per_tile, + q_head_per_kv_head, + head_id, + block_size_tiling_factor, + ).cpu() + ref_out = ref_block_tables_transform( + block_tables, + num_tiles, + num_blocks_per_tile, + q_head_per_kv_head, + head_id, + block_size_tiling_factor, + ) + assert (nki_out.shape == ref_out.shape + ), 
f"{nki_out.shape=} != {ref_out.shape=}" + assert torch.all(nki_out == ref_out) diff --git a/tests/neuron/1_core/test_prefix_prefill.py b/tests/neuron/1_core/test_prefix_prefill.py index 326a1f82e9b30..37d6679f8d55b 100644 --- a/tests/neuron/1_core/test_prefix_prefill.py +++ b/tests/neuron/1_core/test_prefix_prefill.py @@ -320,6 +320,7 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size, ]) @torch.inference_mode() def test_contexted_kv_attention( + monkeypatch: pytest.MonkeyPatch, prefill_batch_size: int, decode_batch_size: int, num_heads: int, @@ -329,7 +330,6 @@ def test_contexted_kv_attention( large_tile_size, mixed_precision: bool, ) -> None: - import os import torch_xla.core.xla_model as xm @@ -340,174 +340,178 @@ def test_contexted_kv_attention( device = xm.xla_device() - compiler_flags = [ + compiler_flags_str = " ".join([ "-O1", "--retry_failed_compilation", - ] - compiler_flags_str = " ".join(compiler_flags) - os.environ["NEURON_CC_FLAGS"] = compiler_flags_str + ]) + with monkeypatch.context() as m: + m.setenv("NEURON_CC_FLAGS", compiler_flags_str) - torch.manual_seed(0) - torch.set_printoptions(sci_mode=False) - torch.set_default_device("cpu") - dtype = torch.float32 + torch.manual_seed(0) + torch.set_printoptions(sci_mode=False) + torch.set_default_device("cpu") + dtype = torch.float32 - min_ctx_len = 32 - max_ctx_len = 1024 - min_query_len = 16 - max_query_len = 512 - num_kv_heads = num_heads // num_queries_per_kv - ( - query, - k_active, - v_active, - k_cache, - v_cache, - block_table, - key, - value, - query_lens, - seq_lens, - ) = sample_inputs( - prefill_batch_size=prefill_batch_size, - decode_batch_size=decode_batch_size, - min_query_len=min_query_len, - max_query_len=max_query_len, - min_ctx_len=min_ctx_len, - max_ctx_len=max_ctx_len, - block_size=block_size, - num_heads=num_heads, - num_kv_heads=num_kv_heads, - head_size=head_size, - dtype=dtype, - ) + min_ctx_len = 32 + max_ctx_len = 1024 + min_query_len = 16 + 
max_query_len = 512 + num_kv_heads = num_heads // num_queries_per_kv + ( + query, + k_active, + v_active, + k_cache, + v_cache, + block_table, + key, + value, + query_lens, + seq_lens, + ) = sample_inputs( + prefill_batch_size=prefill_batch_size, + decode_batch_size=decode_batch_size, + min_query_len=min_query_len, + max_query_len=max_query_len, + min_ctx_len=min_ctx_len, + max_ctx_len=max_ctx_len, + block_size=block_size, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + head_size=head_size, + dtype=dtype, + ) - output_ref = ref_context_attention( - query, - key, - value, - query_lens, - seq_lens, - head_size, - num_queries_per_kv, - return_max_reduce=False, - ) + output_ref = ref_context_attention( + query, + key, + value, + query_lens, + seq_lens, + head_size, + num_queries_per_kv, + return_max_reduce=False, + ) - # build neuron program - B_P_SIZE = 128 - assert (large_tile_size >= B_P_SIZE - ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}" + # build neuron program + B_P_SIZE = 128 + assert (large_tile_size >= B_P_SIZE + ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}" - def ceil_div(a, b): - return (a + b - 1) // b + def ceil_div(a, b): + return (a + b - 1) // b - def pad_to_multiple(a, b): - return ceil_div(a, b) * b + def pad_to_multiple(a, b): + return ceil_div(a, b) * b - def pad_to_next_power_of_2(a): - assert a > 0 - return 2**int(a - 1).bit_length() + def pad_to_next_power_of_2(a): + assert a > 0 + return 2**int(a - 1).bit_length() - # calculate input shapes - max_num_queries = pad_to_next_power_of_2(sum(query_lens)) - context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) - num_active_blocks = ceil_div(context_lens, block_size).sum().item() - num_active_blocks = pad_to_multiple(num_active_blocks, - large_tile_size // block_size) - context_kv_len = num_active_blocks * block_size - assert (context_kv_len % + # calculate input shapes + max_num_queries = pad_to_next_power_of_2(sum(query_lens)) + context_lens = 
torch.tensor(seq_lens) - torch.tensor(query_lens) + num_active_blocks = ceil_div(context_lens, block_size).sum().item() + num_active_blocks = pad_to_multiple(num_active_blocks, + large_tile_size // block_size) + context_kv_len = num_active_blocks * block_size + assert ( + context_kv_len % large_tile_size == 0), f"invalid context_kv_len={context_kv_len}" - # pad QKV tensors - pad_dims = ( - 0, - 0, - 0, - 0, - 0, - max_num_queries - query.shape[0], - ) - query = F.pad(query, pad_dims, "constant", 0) - k = F.pad(k_active, pad_dims, "constant", 0) - v = F.pad(v_active, pad_dims, "constant", 0) - - # permute QKV tensors - # query: (1, n_heads, d, seq_q) - # key: (1, n_kv_heads, d, seq_k) - # value: (1, n_kv_heads, seq_v, d) - query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous() - k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous() - v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous() - k_cache = k_cache.permute(0, 2, 1, 3).contiguous() - v_cache = v_cache.permute(0, 2, 1, 3).contiguous() - - # transform block table - active_block_table = get_active_block_tables( - block_table.cpu(), - torch.tensor(query_lens).cpu(), - torch.tensor(seq_lens).cpu(), - block_size, - num_active_blocks, - ) - - # Build attention masks - prior_mask, active_mask = ( - BlockDiagonalCausalFromBottomRightMask.from_seqlens( - query_lens, seq_lens, block_size=block_size)) - prior_mask_padded = F.pad( - prior_mask, - ( + # pad QKV tensors + pad_dims = ( 0, - context_kv_len - prior_mask.shape[1], 0, - max_num_queries - prior_mask.shape[0], - ), - "constant", - 0, - ).bool() - active_mask_padded = F.pad( - active_mask, - ( 0, - max_num_queries - active_mask.shape[1], 0, - max_num_queries - active_mask.shape[0], - ), - "constant", - 0, - ).bool() - attn_mask = torch.concat([prior_mask_padded, active_mask_padded], dim=1) + 0, + max_num_queries - query.shape[0], + ) + query = F.pad(query, pad_dims, "constant", 0) + k = F.pad(k_active, pad_dims, "constant", 0) + v = F.pad(v_active, pad_dims, 
"constant", 0) - attn_mask = reorder_context_mask(attn_mask, large_tile_size, block_size) + # permute QKV tensors + # query: (1, n_heads, d, seq_q) + # key: (1, n_kv_heads, d, seq_k) + # value: (1, n_kv_heads, seq_v, d) + query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous() + k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous() + v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous() + k_cache = k_cache.permute(0, 2, 1, 3).contiguous() + v_cache = v_cache.permute(0, 2, 1, 3).contiguous() - input_args = ( - query.to(device=device), - k.to(device=device), - v.to(device=device), - k_cache.to(device=device), - v_cache.to(device=device), - active_block_table.to(device=device), - attn_mask.to(device=device), - ) - input_kwargs = dict( - n_kv_head=num_kv_heads, - head_size=head_size, - mixed_precision=mixed_precision, - LARGE_TILE_SZ=large_tile_size, - ) + # transform block table + active_block_table = get_active_block_tables( + block_table.cpu(), + torch.tensor(query_lens).cpu(), + torch.tensor(seq_lens).cpu(), + block_size, + num_active_blocks, + ) - output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs) + # Build attention masks + prior_mask, active_mask = ( + BlockDiagonalCausalFromBottomRightMask.from_seqlens( + query_lens, seq_lens, block_size=block_size)) + prior_mask_padded = F.pad( + prior_mask, + ( + 0, + context_kv_len - prior_mask.shape[1], + 0, + max_num_queries - prior_mask.shape[0], + ), + "constant", + 0, + ).bool() + active_mask_padded = F.pad( + active_mask, + ( + 0, + max_num_queries - active_mask.shape[1], + 0, + max_num_queries - active_mask.shape[0], + ), + "constant", + 0, + ).bool() + attn_mask = torch.concat([prior_mask_padded, active_mask_padded], + dim=1) - num_actual_tokens = sum(query_lens) - # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d) - output_nki = output_nki.cpu().permute(0, 2, 1, 3) - output_nki = output_nki[0, :num_actual_tokens, :, :] - output_ref_padded = F.pad( - output_ref, - (0, 0, 0, 0, 0, 0, 0, 
max_num_queries - output_ref.shape[0]), - "constant", - 0, - ) - output_ref = output_ref_padded.transpose(0, 1)[0, :num_actual_tokens, :, :] + attn_mask = reorder_context_mask(attn_mask, large_tile_size, + block_size) - torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0) + input_args = ( + query.to(device=device), + k.to(device=device), + v.to(device=device), + k_cache.to(device=device), + v_cache.to(device=device), + active_block_table.to(device=device), + attn_mask.to(device=device), + ) + input_kwargs = dict( + n_kv_head=num_kv_heads, + head_size=head_size, + mixed_precision=mixed_precision, + LARGE_TILE_SZ=large_tile_size, + ) + + output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs) + + num_actual_tokens = sum(query_lens) + # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d) + output_nki = output_nki.cpu().permute(0, 2, 1, 3) + output_nki = output_nki[0, :num_actual_tokens, :, :] + output_ref_padded = F.pad( + output_ref, + (0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]), + "constant", + 0, + ) + output_ref = output_ref_padded.transpose( + 0, 1)[0, :num_actual_tokens, :, :] + + torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0) diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index 3be248f5aca45..9d6872e0e0772 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 +import pytest import torch -from tests.kernels.utils import override_backend_env_variable from vllm.attention.selector import get_attn_backend -from vllm.utils import STR_INVALID_VAL +from vllm.utils import STR_BACKEND_ENV_VAR, STR_INVALID_VAL def test_platform_plugins(): @@ -25,8 +25,9 @@ def test_platform_plugins(): f" is loaded. 
The first import:\n{_init_trace}") -def test_oot_attention_backend(monkeypatch): +def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch): # ignore the backend env variable if it is set - override_backend_env_variable(monkeypatch, STR_INVALID_VAL) - backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) - assert backend.get_name() == "Dummy_Backend" + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) + backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) + assert backend.get_name() == "Dummy_Backend" diff --git a/tests/plugins_tests/test_scheduler_plugins.py b/tests/plugins_tests/test_scheduler_plugins.py index 98981a81e909c..7abf5066a4133 100644 --- a/tests/plugins_tests/test_scheduler_plugins.py +++ b/tests/plugins_tests/test_scheduler_plugins.py @@ -22,43 +22,47 @@ class DummyV1Scheduler(V1Scheduler): raise Exception("Exception raised by DummyV1Scheduler") -def test_scheduler_plugins_v0(monkeypatch): - monkeypatch.setenv("VLLM_USE_V1", "0") - with pytest.raises(Exception) as exception_info: +def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + with pytest.raises(Exception) as exception_info: - engine_args = EngineArgs( - model="facebook/opt-125m", - enforce_eager=True, # reduce test time - scheduler_cls=DummyV0Scheduler, - ) + engine_args = EngineArgs( + model="facebook/opt-125m", + enforce_eager=True, # reduce test time + scheduler_cls=DummyV0Scheduler, + ) - engine = LLMEngine.from_engine_args(engine_args=engine_args) + engine = LLMEngine.from_engine_args(engine_args=engine_args) - sampling_params = SamplingParams(max_tokens=1) - engine.add_request("0", "foo", sampling_params) - engine.step() + sampling_params = SamplingParams(max_tokens=1) + engine.add_request("0", "foo", sampling_params) + engine.step() - assert str(exception_info.value) == "Exception raised by DummyV0Scheduler" + assert str( + 
exception_info.value) == "Exception raised by DummyV0Scheduler" -def test_scheduler_plugins_v1(monkeypatch): - monkeypatch.setenv("VLLM_USE_V1", "1") - # Explicitly turn off engine multiprocessing so that the scheduler runs in - # this process - monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") +def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + # Explicitly turn off engine multiprocessing so + # that the scheduler runs in this process + m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") - with pytest.raises(Exception) as exception_info: + with pytest.raises(Exception) as exception_info: - engine_args = EngineArgs( - model="facebook/opt-125m", - enforce_eager=True, # reduce test time - scheduler_cls=DummyV1Scheduler, - ) + engine_args = EngineArgs( + model="facebook/opt-125m", + enforce_eager=True, # reduce test time + scheduler_cls=DummyV1Scheduler, + ) - engine = V1LLMEngine.from_engine_args(engine_args=engine_args) + engine = V1LLMEngine.from_engine_args(engine_args=engine_args) - sampling_params = SamplingParams(max_tokens=1) - engine.add_request("0", "foo", sampling_params) - engine.step() + sampling_params = SamplingParams(max_tokens=1) + engine.add_request("0", "foo", sampling_params) + engine.step() - assert str(exception_info.value) == "Exception raised by DummyV1Scheduler" + assert str( + exception_info.value) == "Exception raised by DummyV1Scheduler" diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 7a4bc7aecc0f4..607b6c43e02e2 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -4,25 +4,29 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`. 
""" +from __future__ import annotations + import pytest from tests.conftest import VllmRunner from tests.core.utils import SchedulerProxy, create_dummy_prompt -from tests.kernels.utils import override_backend_env_variable from vllm import SamplingParams, TokensPrompt from vllm.core.scheduler import Scheduler from vllm.engine.llm_engine import LLMEngine from vllm.platforms import current_platform +from vllm.utils import STR_BACKEND_ENV_VAR from ..models.utils import check_outputs_equal @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): +def use_v0_only(monkeypatch: pytest.MonkeyPatch): """ This module relies on V0 internals, so set VLLM_USE_V1=0. """ - monkeypatch.setenv('VLLM_USE_V1', '0') + with monkeypatch.context() as m: + m.setenv('VLLM_USE_V1', '0') + yield MODELS = [ @@ -56,7 +60,7 @@ def test_mixed_requests( cached_position: int, enable_chunked_prefill: bool, block_size: int, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """ Test the case when some sequences have the prefix cache hit @@ -67,72 +71,77 @@ def test_mixed_requests( pytest.skip("Flashinfer does not support ROCm/HIP.") if backend == "XFORMERS" and current_platform.is_rocm(): pytest.skip("Xformers does not support ROCm/HIP.") - override_backend_env_variable(monkeypatch, backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, backend) - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - cached_prompt = example_prompts[cached_position] - with vllm_runner( - model, - dtype=dtype, - enable_prefix_caching=True, - enable_chunked_prefill=enable_chunked_prefill, - block_size=block_size, - ) as vllm_model: - # Run the first prompt so the cache is populated - vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens) + cached_prompt = 
example_prompts[cached_position] + with vllm_runner( + model, + dtype=dtype, + enable_prefix_caching=True, + enable_chunked_prefill=enable_chunked_prefill, + block_size=block_size, + ) as vllm_model: + # Run the first prompt so the cache is populated + vllm_outputs = vllm_model.generate_greedy([cached_prompt], + max_tokens) - # Run all the promopts - greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) - req_outputs = vllm_model.model.generate(example_prompts, greedy_params) + # Run all the promopts + greedy_params = SamplingParams(temperature=0.0, + max_tokens=max_tokens) + req_outputs = vllm_model.model.generate(example_prompts, + greedy_params) - # Verify number of cached tokens - for i in range(len(req_outputs)): - if i == cached_position: - expected_num_cached_tokens = ( - len(req_outputs[i].prompt_token_ids) // - block_size) * block_size - else: - expected_num_cached_tokens = 0 - assert ( - req_outputs[i].num_cached_tokens == expected_num_cached_tokens) + # Verify number of cached tokens + for i in range(len(req_outputs)): + if i == cached_position: + expected_num_cached_tokens = ( + len(req_outputs[i].prompt_token_ids) // + block_size) * block_size + else: + expected_num_cached_tokens = 0 + assert (req_outputs[i].num_cached_tokens == + expected_num_cached_tokens) - vllm_outputs = [( - output.prompt_token_ids + list(output.outputs[0].token_ids), - output.prompt + output.outputs[0].text, - ) for output in req_outputs] + vllm_outputs = [( + output.prompt_token_ids + list(output.outputs[0].token_ids), + output.prompt + output.outputs[0].text, + ) for output in req_outputs] - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) def test_unstable_prompt_sequence( vllm_runner, backend: str, - 
monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: if backend == "FLASHINFER" and current_platform.is_rocm(): pytest.skip("Flashinfer does not support ROCm/HIP.") if backend == "XFORMERS" and current_platform.is_rocm(): pytest.skip("Xformers does not support ROCm/HIP.") - override_backend_env_variable(monkeypatch, backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, backend) - with vllm_runner( - "Qwen/Qwen2.5-0.5B-Instruct", - enable_chunked_prefill=True, - enable_prefix_caching=True, - max_model_len=4096, - ) as vllm_model: - for prompt in UNSTABLE_PROMPT_SEQUENCE: - vllm_model.generate(TokensPrompt(prompt_token_ids=prompt), - SamplingParams(max_tokens=1)) + with vllm_runner( + "Qwen/Qwen2.5-0.5B-Instruct", + enable_chunked_prefill=True, + enable_prefix_caching=True, + max_model_len=4096, + ) as vllm_model: + for prompt in UNSTABLE_PROMPT_SEQUENCE: + vllm_model.generate(TokensPrompt(prompt_token_ids=prompt), + SamplingParams(max_tokens=1)) @pytest.mark.parametrize("model", MODELS) diff --git a/tests/test_regression.py b/tests/test_regression.py index b54dc6af3e9a6..8c9d4a91c73be 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -56,12 +56,11 @@ def test_gc(): assert allocated < 50 * 1024 * 1024 -def test_model_from_modelscope(monkeypatch): +def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch): # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary - MODELSCOPE_MODEL_NAME = "qwen/Qwen1.5-0.5B-Chat" - monkeypatch.setenv("VLLM_USE_MODELSCOPE", "True") - try: - llm = LLM(model=MODELSCOPE_MODEL_NAME) + with monkeypatch.context() as m: + m.setenv("VLLM_USE_MODELSCOPE", "True") + llm = LLM(model="qwen/Qwen1.5-0.5B-Chat") prompts = [ "Hello, my name is", @@ -73,10 +72,3 @@ def test_model_from_modelscope(monkeypatch): outputs = llm.generate(prompts, sampling_params) assert len(outputs) == 4 - finally: - monkeypatch.delenv("VLLM_USE_MODELSCOPE", raising=False) - - -if __name__ == "__main__": - 
import pytest - pytest.main([__file__]) diff --git a/tests/test_utils.py b/tests/test_utils.py index dcca7d5965e9e..ae4fddd046d45 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +# ruff: noqa import asyncio -import os import socket from collections.abc import AsyncIterator from unittest.mock import patch @@ -112,16 +112,16 @@ def test_deprecate_kwargs_additional_message(): dummy(old_arg=1) -def test_get_open_port(): - os.environ["VLLM_PORT"] = "5678" - # make sure we can get multiple ports, even if the env var is set - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1: - s1.bind(("localhost", get_open_port())) - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2: - s2.bind(("localhost", get_open_port())) - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3: - s3.bind(("localhost", get_open_port())) - os.environ.pop("VLLM_PORT") +def test_get_open_port(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_PORT", "5678") + # make sure we can get multiple ports, even if the env var is set + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1: + s1.bind(("localhost", get_open_port())) + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2: + s2.bind(("localhost", get_open_port())) + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3: + s3.bind(("localhost", get_open_port())) # Tests for FlexibleArgumentParser @@ -366,31 +366,32 @@ def test_bind_kv_cache_non_attention(): assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1] -def test_bind_kv_cache_encoder_decoder(monkeypatch): +def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch): # V1 TESTS: ENCODER_DECODER is not supported on V1 yet. 
- monkeypatch.setenv("VLLM_USE_V1", "0") + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") - from vllm.attention import Attention, AttentionType + from vllm.attention import Attention, AttentionType - # example from bart - ctx = { - 'encoder.layers.0.self_attn.attn': - Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER), - 'decoder.layers.0.encoder_attn.attn': - Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER), - 'decoder.layers.0.self_attn.attn': - Attention(32, 128, 0.1, attn_type=AttentionType.DECODER), - } + # example from bart + ctx = { + 'encoder.layers.0.self_attn.attn': + Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER), + 'decoder.layers.0.encoder_attn.attn': + Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER), + 'decoder.layers.0.self_attn.attn': + Attention(32, 128, 0.1, attn_type=AttentionType.DECODER), + } - kv_cache = [ - torch.zeros((1, )), - ] - encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache + kv_cache = [ + torch.zeros((1, )), + ] + encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache - bind_kv_cache(ctx, [kv_cache]) - assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache - assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0] - assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0] + bind_kv_cache(ctx, [kv_cache]) + assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache + assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0] + assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0] def test_bind_kv_cache_pp(): diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py index e94bbd2877225..f7a59f054b61b 100644 --- a/tests/tpu/test_custom_dispatcher.py +++ b/tests/tpu/test_custom_dispatcher.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -import os +import pytest from vllm.config import CompilationLevel 
@@ -9,16 +9,17 @@ from ..utils import compare_two_settings # --enforce-eager on TPU causes graph compilation # this times out default Health Check in the MQLLMEngine, # so we set the timeout here to 30s -os.environ["VLLM_RPC_TIMEOUT"] = "30000" -def test_custom_dispatcher(): - compare_two_settings( - "google/gemma-2b", - arg1=[ - "--enforce-eager", - f"-O{CompilationLevel.DYNAMO_ONCE}", - ], - arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"], - env1={}, - env2={}) +def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_RPC_TIMEOUT", "30000") + compare_two_settings( + "google/gemma-2b", + arg1=[ + "--enforce-eager", + f"-O{CompilationLevel.DYNAMO_ONCE}", + ], + arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"], + env1={}, + env2={}) diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index 97149884497af..a781b8b563be1 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -1,10 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 +# ruff: noqa +# type: ignore +from __future__ import annotations -import os import threading from collections.abc import Iterable from concurrent import futures -from typing import Callable, Literal +from typing import Callable, Generator, Literal import grpc import pytest @@ -21,12 +23,14 @@ from vllm.tracing import SpanAttributes @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): +def use_v0_only(monkeypatch: pytest.MonkeyPatch): """ Since this module is V0 only, set VLLM_USE_V1=0 for all tests in the module. 
""" - monkeypatch.setenv('VLLM_USE_V1', '0') + with monkeypatch.context() as m: + m.setenv('VLLM_USE_V1', '0') + yield FAKE_TRACE_SERVER_ADDRESS = "localhost:4317" @@ -67,7 +71,7 @@ class FakeTraceService(TraceServiceServicer): @pytest.fixture -def trace_service(): +def trace_service() -> Generator[FakeTraceService, None, None]: """Fixture to set up a fake gRPC trace service""" server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) service = FakeTraceService() @@ -80,136 +84,153 @@ def trace_service(): server.stop(None) -def test_traces(trace_service): - os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" +def test_traces( + monkeypatch: pytest.MonkeyPatch, + trace_service: FakeTraceService, +): + with monkeypatch.context() as m: + m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true") - sampling_params = SamplingParams(temperature=0.01, - top_p=0.1, - max_tokens=256) - model = "facebook/opt-125m" - llm = LLM( - model=model, - otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, - ) - prompts = ["This is a short prompt"] - outputs = llm.generate(prompts, sampling_params=sampling_params) + sampling_params = SamplingParams( + temperature=0.01, + top_p=0.1, + max_tokens=256, + ) + model = "facebook/opt-125m" + llm = LLM( + model=model, + otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, + ) + prompts = ["This is a short prompt"] + outputs = llm.generate(prompts, sampling_params=sampling_params) - timeout = 5 - if not trace_service.evt.wait(timeout): - raise TimeoutError( - f"The fake trace service didn't receive a trace within " - f"the {timeout} seconds timeout") + timeout = 5 + if not trace_service.evt.wait(timeout): + raise TimeoutError( + f"The fake trace service didn't receive a trace within " + f"the {timeout} seconds timeout") - request = trace_service.request - assert len(request.resource_spans) == 1, ( - f"Expected 1 resource span, " - f"but got {len(request.resource_spans)}") - assert len(request.resource_spans[0].scope_spans) == 1, ( - f"Expected 1 
scope span, " - f"but got {len(request.resource_spans[0].scope_spans)}") - assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( - f"Expected 1 span, " - f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") + request = trace_service.request + assert len(request.resource_spans) == 1, ( + f"Expected 1 resource span, " + f"but got {len(request.resource_spans)}") + assert len(request.resource_spans[0].scope_spans) == 1, ( + f"Expected 1 scope span, " + f"but got {len(request.resource_spans[0].scope_spans)}") + assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( + f"Expected 1 span, " + f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") - attributes = decode_attributes( - request.resource_spans[0].scope_spans[0].spans[0].attributes) - assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE - ) == sampling_params.temperature - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n - assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( - outputs[0].prompt_token_ids) - completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) - assert attributes.get( - SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens - metrics = outputs[0].metrics - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue - ttft = metrics.first_token_time - metrics.arrival_time - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft - e2e_time = metrics.finished_time - metrics.arrival_time - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time - assert 
metrics.scheduler_time > 0 - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER - ) == metrics.scheduler_time - # Model forward and model execute should be none, since detailed traces is - # not enabled. - assert metrics.model_forward_time is None - assert metrics.model_execute_time is None + attributes = decode_attributes( + request.resource_spans[0].scope_spans[0].spans[0].attributes) + assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE + ) == sampling_params.temperature + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS + ) == sampling_params.max_tokens + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n + assert attributes.get( + SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( + outputs[0].prompt_token_ids) + completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) + assert attributes.get( + SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens + metrics = outputs[0].metrics + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE + ) == metrics.time_in_queue + ttft = metrics.first_token_time - metrics.arrival_time + assert attributes.get( + SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft + e2e_time = metrics.finished_time - metrics.arrival_time + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time + assert metrics.scheduler_time > 0 + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER + ) == metrics.scheduler_time + # Model forward and model execute should be none, since detailed traces is + # not enabled. 
+ assert metrics.model_forward_time is None + assert metrics.model_execute_time is None -def test_traces_with_detailed_steps(trace_service): - os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" +def test_traces_with_detailed_steps( + monkeypatch: pytest.MonkeyPatch, + trace_service: FakeTraceService, +): + with monkeypatch.context() as m: + m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true") - sampling_params = SamplingParams(temperature=0.01, - top_p=0.1, - max_tokens=256) - model = "facebook/opt-125m" - llm = LLM( - model=model, - otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, - collect_detailed_traces="all", - ) - prompts = ["This is a short prompt"] - outputs = llm.generate(prompts, sampling_params=sampling_params) + sampling_params = SamplingParams( + temperature=0.01, + top_p=0.1, + max_tokens=256, + ) + model = "facebook/opt-125m" + llm = LLM( + model=model, + otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, + collect_detailed_traces="all", + ) + prompts = ["This is a short prompt"] + outputs = llm.generate(prompts, sampling_params=sampling_params) - timeout = 5 - if not trace_service.evt.wait(timeout): - raise TimeoutError( - f"The fake trace service didn't receive a trace within " - f"the {timeout} seconds timeout") + timeout = 5 + if not trace_service.evt.wait(timeout): + raise TimeoutError( + f"The fake trace service didn't receive a trace within " + f"the {timeout} seconds timeout") - request = trace_service.request - assert len(request.resource_spans) == 1, ( - f"Expected 1 resource span, " - f"but got {len(request.resource_spans)}") - assert len(request.resource_spans[0].scope_spans) == 1, ( - f"Expected 1 scope span, " - f"but got {len(request.resource_spans[0].scope_spans)}") - assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( - f"Expected 1 span, " - f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") + request = trace_service.request + assert len(request.resource_spans) == 1, ( + f"Expected 1 resource span, " + 
f"but got {len(request.resource_spans)}") + assert len(request.resource_spans[0].scope_spans) == 1, ( + f"Expected 1 scope span, " + f"but got {len(request.resource_spans[0].scope_spans)}") + assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( + f"Expected 1 span, " + f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") - attributes = decode_attributes( - request.resource_spans[0].scope_spans[0].spans[0].attributes) - assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE - ) == sampling_params.temperature - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n - assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( - outputs[0].prompt_token_ids) - completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) - assert attributes.get( - SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens - metrics = outputs[0].metrics - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue - ttft = metrics.first_token_time - metrics.arrival_time - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft - e2e_time = metrics.finished_time - metrics.arrival_time - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time - assert metrics.scheduler_time > 0 - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER - ) == metrics.scheduler_time - assert metrics.model_forward_time > 0 - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx( - metrics.model_forward_time / 1000) - assert metrics.model_execute_time > 0 - 
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE - ) == metrics.model_execute_time - assert metrics.model_forward_time < 1000 * metrics.model_execute_time + attributes = decode_attributes( + request.resource_spans[0].scope_spans[0].spans[0].attributes) + assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE + ) == sampling_params.temperature + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS + ) == sampling_params.max_tokens + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n + assert attributes.get( + SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( + outputs[0].prompt_token_ids) + completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) + assert attributes.get( + SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens + metrics = outputs[0].metrics + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE + ) == metrics.time_in_queue + ttft = metrics.first_token_time - metrics.arrival_time + assert attributes.get( + SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft + e2e_time = metrics.finished_time - metrics.arrival_time + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time + assert metrics.scheduler_time > 0 + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER + ) == metrics.scheduler_time + assert metrics.model_forward_time > 0 + assert attributes.get( + SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD + ) == pytest.approx(metrics.model_forward_time / 1000) + assert metrics.model_execute_time > 0 + assert attributes.get( + SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE + ) == metrics.model_execute_time + assert metrics.model_forward_time < 1000 * 
metrics.model_execute_time diff --git a/tests/utils.py b/tests/utils.py index fc19c8d031b16..06ba8a2421c16 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -566,6 +566,7 @@ def init_test_distributed_environment( def multi_process_parallel( + monkeypatch: pytest.MonkeyPatch, tp_size: int, pp_size: int, test_target: Any, @@ -582,7 +583,13 @@ def multi_process_parallel( refs = [] for rank in range(tp_size * pp_size): refs.append( - test_target.remote(tp_size, pp_size, rank, distributed_init_port)) + test_target.remote( + monkeypatch, + tp_size, + pp_size, + rank, + distributed_init_port, + ), ) ray.get(refs) ray.shutdown() @@ -700,7 +707,7 @@ def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator: """ Get a pytest mark, which skips the test if the GPU doesn't meet a minimum memory requirement in GB. - + This can be leveraged via `@large_gpu_test` to skip tests in environments without enough resources, or called when filtering tests to run directly. """ diff --git a/tests/v1/e2e/test_ngram_spec_decode.py b/tests/v1/e2e/test_ngram_spec_decode.py index 519a74cab84bc..6cca324514565 100644 --- a/tests/v1/e2e/test_ngram_spec_decode.py +++ b/tests/v1/e2e/test_ngram_spec_decode.py @@ -1,5 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import random +from typing import Any import pytest @@ -50,8 +53,12 @@ def model_name(): return "meta-llama/Meta-Llama-3-8B-Instruct" -def test_ngram_correctness(monkeypatch, test_prompts, sampling_config, - model_name): +def test_ngram_correctness( + monkeypatch: pytest.MonkeyPatch, + test_prompts: list[list[dict[str, Any]]], + sampling_config: SamplingParams, + model_name: str, +): ''' Compare the outputs of a original LLM and a speculative LLM should be the same when using ngram speculative decoding. 
diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 5b9725d59ddc5..0ff804976ada6 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -80,9 +80,11 @@ async def generate(engine: AsyncLLM, [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)]) @pytest.mark.asyncio -async def test_load(monkeypatch, output_kind: RequestOutputKind, - engine_args_and_prompt: tuple[AsyncEngineArgs, - PromptType]): +async def test_load( + monkeypatch: pytest.MonkeyPatch, + output_kind: RequestOutputKind, + engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType], +): # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 # so that in the future when we switch, we don't have to change all the # tests. @@ -126,7 +128,8 @@ async def test_load(monkeypatch, output_kind: RequestOutputKind, [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)]) @pytest.mark.asyncio -async def test_abort(monkeypatch, output_kind: RequestOutputKind, +async def test_abort(monkeypatch: pytest.MonkeyPatch, + output_kind: RequestOutputKind, engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType]): diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 5fdbcf5b99636..2ec4f7e034af8 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -45,7 +45,7 @@ def make_request() -> EngineCoreRequest: @fork_new_process_for_each_test -def test_engine_core(monkeypatch): +def test_engine_core(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") @@ -159,10 +159,10 @@ def test_engine_core(monkeypatch): @fork_new_process_for_each_test -def test_engine_core_advanced_sampling(monkeypatch): +def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch): """ - A basic end-to-end test to verify that the engine functions correctly - when additional sampling parameters, such as top_p, 
min_tokens, and + A basic end-to-end test to verify that the engine functions correctly + when additional sampling parameters, such as top_p, min_tokens, and presence_penalty, are set. """ with monkeypatch.context() as m: @@ -209,7 +209,7 @@ def test_engine_core_advanced_sampling(monkeypatch): @fork_new_process_for_each_test -def test_engine_core_concurrent_batches(monkeypatch): +def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): """ Test that the engine can handle multiple concurrent batches. """ diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index e646ccbd46030..004b4dc82f4d9 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -90,7 +90,8 @@ def echo(self, msg: str, err_msg: Optional[str] = None) -> str: @fork_new_process_for_each_test @pytest.mark.parametrize("multiprocessing_mode", [True, False]) -def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): +def test_engine_core_client(monkeypatch: pytest.MonkeyPatch, + multiprocessing_mode: bool): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") @@ -175,7 +176,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): @pytest.mark.asyncio(loop_scope="function") -async def test_engine_core_client_asyncio(monkeypatch): +async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index e763aa2c86998..3800cb392fbad 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -57,7 +57,7 @@ def _repeat_logprob_config( logprob_prompt_logprob_list: BatchLogprobsSpecType, ) -> BatchLogprobsSpecType: """Ensure each test prompt has a logprob config. - + A logprob config specifies the optional (i.e. 
may-be-`None`) number of sample logprobs and the optional number of prompt logprobs. @@ -80,7 +80,7 @@ def _repeat_logprob_config( (optional num sample logprob, optional num prompt logprob) tuples - + Returns: list of (optional num sample logprob,optional num prompt logprob) @@ -255,14 +255,12 @@ def _run_and_validate( [NONE, SAMPLE, PROMPT, SAMPLE_PROMPT]) @pytest.mark.parametrize("temperature", [0.0, 2.0]) def test_get_logprobs_and_prompt_logprobs( - hf_model, - vllm_model, - batch_logprobs_composition: BatchLogprobsComposition, - temperature: float, - example_prompts, -) -> None: + hf_model, vllm_model, + batch_logprobs_composition: BatchLogprobsComposition, + temperature: float, example_prompts: list[str], + monkeypatch: pytest.MonkeyPatch) -> None: """Test V1 Engine logprobs & prompt logprobs - + Exercise a variety of combinations of `logprobs` and `prompt_logprobs` settings and validate that * The generated logprobs and prompt logprobs are consistent with the @@ -279,7 +277,7 @@ def test_get_logprobs_and_prompt_logprobs( To save time, only test one APC-enabled scenario (sample & prompt logprobs enabled, temperature>0.0). - + Args: hf_model: HuggingFace reference model fixture vllm_model: vLLM model fixture @@ -287,128 +285,140 @@ def test_get_logprobs_and_prompt_logprobs( temperature: "temperature" sampling parameter example_prompts: example prompt fixture """ - do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching - if do_apc and (temperature < 2.0 - or batch_logprobs_composition != SAMPLE_PROMPT): - # Skip some test-cases to save time. - pytest.skip() - test_prompts = example_prompts + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching + if do_apc and (temperature < 2.0 + or batch_logprobs_composition != SAMPLE_PROMPT): + # Skip some test-cases to save time. 
+ pytest.skip() + test_prompts = example_prompts - max_tokens = 5 - hf_outputs = hf_model.generate_greedy( - test_prompts, - max_tokens=max_tokens, - ) - hf_logprobs = hf_model.generate_greedy_logprobs( - test_prompts, - max_tokens=max_tokens, - ) - - # Batch has mixed sample params - # (different logprobs/prompt logprobs combos) - logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition) - - # Ensure that each test prompt has a logprob config for testing - logprob_prompt_logprob_list = _repeat_logprob_config( - test_prompts, logprob_prompt_logprob_list) - # Generate SamplingParams - vllm_sampling_params = [ - SamplingParams(max_tokens=max_tokens, - logprobs=num_lp, - prompt_logprobs=num_plp, - temperature=temperature, - seed=1984) - for num_lp, num_plp in logprob_prompt_logprob_list - ] - for _ in range(2 if do_apc else 1): - _run_and_validate( - vllm_model=vllm_model, - test_prompts=test_prompts, - vllm_sampling_params=vllm_sampling_params, - hf_logprobs=hf_logprobs, - hf_outputs=hf_outputs, - logprob_prompt_logprob_list=logprob_prompt_logprob_list, - temperature=temperature, + max_tokens = 5 + hf_outputs = hf_model.generate_greedy( + test_prompts, max_tokens=max_tokens, - do_apc=do_apc) + ) + hf_logprobs = hf_model.generate_greedy_logprobs( + test_prompts, + max_tokens=max_tokens, + ) + + # Batch has mixed sample params + # (different logprobs/prompt logprobs combos) + logprob_prompt_logprob_list = get_test_batch( + batch_logprobs_composition) + + # Ensure that each test prompt has a logprob config for testing + logprob_prompt_logprob_list = _repeat_logprob_config( + test_prompts, logprob_prompt_logprob_list) + # Generate SamplingParams + vllm_sampling_params = [ + SamplingParams(max_tokens=max_tokens, + logprobs=num_lp, + prompt_logprobs=num_plp, + temperature=temperature, + seed=1984) + for num_lp, num_plp in logprob_prompt_logprob_list + ] + for _ in range(2 if do_apc else 1): + _run_and_validate( + vllm_model=vllm_model, + 
test_prompts=test_prompts, + vllm_sampling_params=vllm_sampling_params, + hf_logprobs=hf_logprobs, + hf_outputs=hf_outputs, + logprob_prompt_logprob_list=logprob_prompt_logprob_list, + temperature=temperature, + max_tokens=max_tokens, + do_apc=do_apc) -def test_max_logprobs(): +def test_max_logprobs(monkeypatch: pytest.MonkeyPatch): """vLLM v1 engine should fail a request with `logprobs > max_logprobs` - Should also fail for `prompt_logprobs > max_logprobs` - APC should not matter as this test checks basic request validation. - - Args: - monkeypatch """ + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") - runner = VllmRunner("facebook/opt-125m", - max_logprobs=1, - enable_prefix_caching=False, - max_model_len=256) - vllm_sampling_params = SamplingParams(logprobs=1) - # should pass - runner.generate(["Hello world"], sampling_params=vllm_sampling_params) + runner = VllmRunner("facebook/opt-125m", + max_logprobs=1, + enable_prefix_caching=False, + max_model_len=256) + vllm_sampling_params = SamplingParams(logprobs=1) + # should pass + runner.generate(["Hello world"], sampling_params=vllm_sampling_params) - bad_sampling_params = SamplingParams(logprobs=2) - with pytest.raises(ValueError): - runner.generate(["Hello world"], sampling_params=bad_sampling_params) + bad_sampling_params = SamplingParams(logprobs=2) + with pytest.raises(ValueError): + runner.generate(["Hello world"], + sampling_params=bad_sampling_params) -def test_none_logprobs(vllm_model, example_prompts): +def test_none_logprobs(vllm_model, example_prompts, + monkeypatch: pytest.MonkeyPatch): """Engine should return `logprobs` and `prompt_logprobs` as `None` - + Args: vllm_model: vLLM model fixture example_prompts: list of example prompts (test fixture) """ - max_tokens = 5 + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + max_tokens = 5 - sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, - logprobs=None, - prompt_logprobs=None, - temperature=0.0) - 
results_logprobs_none = vllm_model.model.generate( - example_prompts, sampling_params=sampling_params_logprobs_none) + sampling_params_logprobs_none = SamplingParams( + max_tokens=max_tokens, + logprobs=None, + prompt_logprobs=None, + temperature=0.0, + ) + results_logprobs_none = vllm_model.model.generate( + example_prompts, + sampling_params=sampling_params_logprobs_none, + ) - for i in range(len(results_logprobs_none)): - # Check sample logprobs are None - assert results_logprobs_none[i].outputs[0].logprobs is None - assert results_logprobs_none[i].outputs[0].cumulative_logprob is None - # Check prompt logprobs are None - assert results_logprobs_none[i].prompt_logprobs is None + for i in range(len(results_logprobs_none)): + # Check sample logprobs are None + assert results_logprobs_none[i].outputs[0].logprobs is None + assert results_logprobs_none[i].outputs[ + 0].cumulative_logprob is None + # Check prompt logprobs are None + assert results_logprobs_none[i].prompt_logprobs is None -def test_zero_logprobs(vllm_model, example_prompts): +def test_zero_logprobs(vllm_model, example_prompts, + monkeypatch: pytest.MonkeyPatch): """Engine should return sampled token and prompt token logprobs - + Args: vllm_model: vLLM model fixture example_prompts: list of example prompts (test fixture) """ - max_tokens = 5 + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + max_tokens = 5 - sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens, - logprobs=0, - prompt_logprobs=0, - temperature=0.0) - results_logprobs_zero = vllm_model.model.generate( - example_prompts, sampling_params=sampling_params_logprobs_zero) + sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens, + logprobs=0, + prompt_logprobs=0, + temperature=0.0) + results_logprobs_zero = vllm_model.model.generate( + example_prompts, sampling_params=sampling_params_logprobs_zero) - for i in range(len(results_logprobs_zero)): - # Check that there is one sample logprob dict for 
each - # sample token - logprobs = results_logprobs_zero[i].outputs[0].logprobs - prompt_logprobs = results_logprobs_zero[i].prompt_logprobs - sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids - prompt_token_ids = results_logprobs_zero[i].prompt_token_ids - assert logprobs is not None - assert len(sampled_token_ids) == len(logprobs) - assert results_logprobs_zero[i].outputs[ - 0].cumulative_logprob is not None - # Check that there is one prompt logprob dict for each - # prompt token - assert prompt_logprobs is not None - assert len(prompt_token_ids) == len(prompt_logprobs) + for i in range(len(results_logprobs_zero)): + # Check that there is one sample logprob dict for each + # sample token + logprobs = results_logprobs_zero[i].outputs[0].logprobs + prompt_logprobs = results_logprobs_zero[i].prompt_logprobs + sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids + prompt_token_ids = results_logprobs_zero[i].prompt_token_ids + assert logprobs is not None + assert len(sampled_token_ids) == len(logprobs) + assert results_logprobs_zero[i].outputs[ + 0].cumulative_logprob is not None + # Check that there is one prompt logprob dict for each + # prompt token + assert prompt_logprobs is not None + assert len(prompt_token_ids) == len(prompt_logprobs) diff --git a/tests/v1/tpu/test_basic.py b/tests/v1/tpu/test_basic.py index 0309f545ea49e..241f49e4faea8 100644 --- a/tests/v1/tpu/test_basic.py +++ b/tests/v1/tpu/test_basic.py @@ -3,11 +3,16 @@ Run `pytest tests/v1/tpu/test_basic.py`. 
""" +from __future__ import annotations + +from typing import TYPE_CHECKING + import pytest from vllm.platforms import current_platform -from ...conftest import VllmRunner +if TYPE_CHECKING: + from tests.conftest import VllmRunner MODELS = [ # "Qwen/Qwen2-7B-Instruct", @@ -28,7 +33,8 @@ TENSOR_PARALLEL_SIZES = [1] @pytest.mark.parametrize("enforce_eager", [True]) @pytest.mark.parametrize("tensor_parallel_size", TENSOR_PARALLEL_SIZES) def test_models( - monkeypatch, + vllm_runner: type[VllmRunner], + monkeypatch: pytest.MonkeyPatch, model: str, max_tokens: int, enforce_eager: bool, @@ -41,7 +47,7 @@ def test_models( with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - with VllmRunner( + with vllm_runner( model, max_model_len=8192, enforce_eager=enforce_eager, @@ -50,5 +56,5 @@ def test_models( tensor_parallel_size=tensor_parallel_size) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - output = vllm_outputs[0][1] - assert "1024" in output + output = vllm_outputs[0][1] + assert "1024" in output From 583a9778e0bc65b031bc3e430d8f13655f727ec7 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Sun, 16 Mar 2025 21:48:11 -0700 Subject: [PATCH 080/169] [Benchmark] Do not save detailed info to json by default (#14879) Signed-off-by: simon-mo --- benchmarks/backend_request_func.py | 5 ++++- benchmarks/benchmark_serving.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 6a7db920b5b63..09c8e23ebb1c3 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -14,7 +14,8 @@ from tqdm.asyncio import tqdm from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast) -from vllm.model_executor.model_loader.weight_utils import get_lock +# NOTE(simon): do not import vLLM here so the benchmark script +# can run without vLLM installed. 
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) @@ -427,6 +428,8 @@ def get_model(pretrained_model_name_or_path: str) -> str: if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': from modelscope import snapshot_download + from vllm.model_executor.model_loader.weight_utils import get_lock + # Use file lock to prevent multiple processes from # downloading the same model weights at the same time. with get_lock(pretrained_model_name_or_path): diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 1dd01ca968678..47627126b6688 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -684,6 +684,15 @@ def main(args: argparse.Namespace): "Invalid metadata format. Please use KEY=VALUE format." ) + if not args.save_detailed: + # Remove fields with too many data points + for field in [ + "input_lens", "output_lens", "ttfts", "itls", + "generated_texts", "errors" + ]: + if field in result_json: + del result_json[field] + # Traffic result_json["request_rate"] = (args.request_rate if args.request_rate < float("inf") else "inf") @@ -828,6 +837,12 @@ if __name__ == "__main__": action="store_true", help="Specify to save benchmark results to a json file", ) + parser.add_argument( + "--save-detailed", + action="store_true", + help="When saving the results, whether to include per request " + "information such as response, error, ttfs, tpots, etc.", + ) parser.add_argument( "--metadata", metavar="KEY=VALUE", From 8d6cf89526ff983b7eb74aad3903138004ae95cd Mon Sep 17 00:00:00 2001 From: Lily Liu Date: Sun, 16 Mar 2025 22:00:20 -0700 Subject: [PATCH 081/169] [V1] [Spec Decode] Support random sampling for spec decode (#13933) Co-authored-by: Woosuk Kwon --- tests/v1/sample/test_rejection_sampler.py | 301 +++++++++++++--- vllm/v1/sample/rejection_sampler.py | 400 +++++++++++++++------- vllm/v1/sample/sampler.py | 8 - vllm/v1/spec_decode/utils.py | 22 ++ vllm/v1/worker/gpu_model_runner.py | 31 +- 5 files changed, 
568 insertions(+), 194 deletions(-) create mode 100644 vllm/v1/spec_decode/utils.py diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py index 190927745f1fe..84139a40b544a 100644 --- a/tests/v1/sample/test_rejection_sampler.py +++ b/tests/v1/sample/test_rejection_sampler.py @@ -1,37 +1,51 @@ # SPDX-License-Identifier: Apache-2.0 +from typing import Any, Optional import pytest import torch +import torch.nn.functional as F from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import INVALID_TOKEN_ID, RejectionSampler +DEVICE = "cpu" + @pytest.fixture def sampler(): return RejectionSampler() -def create_logits_tensor(token_ids: list[int], +def create_logits_tensor(token_ids: list[list[int]], vocab_size: int = 100) -> torch.Tensor: """Helper function to create logits tensor that will produce desired token ids on argmax""" - logits = torch.full((len(token_ids), vocab_size), -100.0).cuda() - for i, token_id in enumerate(token_ids): - logits[i, token_id] = 100.0 + num_total_tokens = sum(len(tokens) for tokens in token_ids) + logits = torch.full((num_total_tokens, vocab_size), -100.0, device=DEVICE) + start_loc = 0 + for tokens in token_ids: + for j, token_id in enumerate(tokens): + logits[start_loc + j, token_id] = 100.0 + start_loc += len(tokens) return logits -def create_sampling_metadata(spec_tokens: list[list[int]]) -> SamplingMetadata: - batch_size = len(spec_tokens) +def create_sampling_metadata( + all_greedy: bool, + generators: Optional[dict[int, Any]] = None) -> SamplingMetadata: + """Create a v1 sampling metadata object with all_greedy set + to the given value. Either all greedy or all random sampling + is used. 
+ """ + generators = generators or {} return SamplingMetadata( temperature=torch.tensor([]), - all_greedy=True, - all_random=False, + all_greedy=all_greedy, + all_random=not all_greedy, top_p=None, top_k=None, - min_p=torch.empty(batch_size, ), - generators={}, + min_p=torch.empty(1, ), + generators=generators, max_num_logprobs=0, no_penalties=False, prompt_token_ids=None, @@ -40,129 +54,310 @@ def create_sampling_metadata(spec_tokens: list[list[int]]) -> SamplingMetadata: repetition_penalties=torch.tensor([]), output_token_ids=[], min_tokens={}, - logit_bias=[None] * batch_size, + logit_bias=[None], allowed_token_ids_mask=None, bad_words_token_ids={}, ) +########################### Tests for Greedy Sampling ################### def test_perfect_match(sampler): """Test when output tokens perfectly match speculated tokens""" spec_tokens = [[1, 2, 3]] - output_tokens = [1, 2, 3, 4] # 4 is the bonus token + output_tokens = [[1, 2, 3, 4]] # 4 is the bonus token - metadata = create_sampling_metadata(spec_tokens) + metadata = create_sampling_metadata(all_greedy=True) logits = create_logits_tensor(output_tokens) + bonus_token_tensor = torch.tensor([output_tokens[0][-1]], + device=logits.device) - output = sampler(spec_tokens, logits, metadata) + output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata) expected = torch.tensor([[1, 2, 3, 4]], dtype=torch.int, device=logits.device) - assert torch.equal(output.sampled_token_ids, expected) + assert torch.equal(output, expected) def test_early_mismatch(sampler): """Test when there's an early mismatch in tokens""" spec_tokens = [[1, 2, 3]] - output_tokens = [1, 5, 3, 4] # Mismatch at position 1 + output_tokens = [[1, 5, 3, 4]] # Mismatch at position 1 - metadata = create_sampling_metadata(spec_tokens) + metadata = create_sampling_metadata(all_greedy=True) logits = create_logits_tensor(output_tokens) + bonus_token_tensor = torch.tensor([output_tokens[0][-1]], + device=logits.device) - output = sampler(spec_tokens, 
logits, metadata) + output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata) expected = torch.tensor([[1, 5, INVALID_TOKEN_ID, INVALID_TOKEN_ID]], dtype=torch.int, device=logits.device) - assert torch.equal(output.sampled_token_ids, expected) + assert torch.equal(output, expected) def test_multiple_sequences(sampler): """Test handling multiple sequences of speculated tokens""" spec_tokens = [[1, 2], [3]] - output_tokens = [1, 2, 5, 3, 4] # Two sequences with bonus tokens 5 and 4 + output_tokens = [[1, 2, 5], [3, + 4]] # Two sequences with bonus tokens 5 and 4 - metadata = create_sampling_metadata(spec_tokens) + metadata = create_sampling_metadata(all_greedy=True) logits = create_logits_tensor(output_tokens) + bonus_token_tensor = torch.tensor( + [output_tokens[0][-1], output_tokens[1][-1]], device=logits.device) - output = sampler(spec_tokens, logits, metadata) + output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata) expected = torch.tensor([[1, 2, 5], [3, 4, INVALID_TOKEN_ID]], dtype=torch.int, device=logits.device) - assert torch.equal(output.sampled_token_ids, expected) + assert torch.equal(output, expected) def test_single_token_sequence(sampler): """Test handling sequences with single token""" spec_tokens = [[1]] - output_tokens = [1, 2] # Single token with bonus token 2 + output_tokens = [[1, 2]] # Single token with bonus token 2 - metadata = create_sampling_metadata(spec_tokens) + metadata = create_sampling_metadata(all_greedy=True) logits = create_logits_tensor(output_tokens) + bonus_token_tensor = torch.tensor([output_tokens[0][-1]], + device=logits.device) - output = sampler(spec_tokens, logits, metadata) + output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata) expected = torch.tensor([[1, 2]], dtype=torch.int, device=logits.device) - assert torch.equal(output.sampled_token_ids, expected) + assert torch.equal(output, expected) def test_empty_sequence(sampler): """Test handling empty sequence of 
speculated tokens""" spec_tokens: list[list[int]] = [[]] - output_tokens = [5] # Just the bonus token + output_tokens = [[5]] # Just the bonus token - metadata = create_sampling_metadata(spec_tokens) + metadata = create_sampling_metadata(all_greedy=True) logits = create_logits_tensor(output_tokens) + bonus_token_tensor = torch.tensor([output_tokens[0][-1]], + device=logits.device) - output = sampler(spec_tokens, logits, metadata) + output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata) expected = torch.tensor([[5]], dtype=torch.int, device=logits.device) - assert torch.equal(output.sampled_token_ids, expected) + assert torch.equal(output, expected) def test_multiple_mismatches(sampler): """Test handling multiple sequences with mismatches""" spec_tokens = [[1, 2, 3], [4, 5, 6]] - output_tokens = [1, 2, 7, 6, 4, 8, 6, 9] # Mismatches in both sequences + output_tokens = [[1, 2, 7, 6], [4, 8, 6, + 9]] # Mismatches in both sequences - metadata = create_sampling_metadata(spec_tokens) + metadata = create_sampling_metadata(all_greedy=True) logits = create_logits_tensor(output_tokens) + bonus_token_tensor = torch.tensor( + [output_tokens[0][-1], output_tokens[1][-1]], device=logits.device) - output = sampler(spec_tokens, logits, metadata) + output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata) expected = torch.tensor([[1, 2, 7, INVALID_TOKEN_ID], [4, 8, INVALID_TOKEN_ID, INVALID_TOKEN_ID]], dtype=torch.int, device=logits.device) - assert torch.equal(output.sampled_token_ids, expected) + assert torch.equal(output, expected) @pytest.mark.parametrize( "spec_tokens,output_tokens,expected", [ - ([[1, 2]], [1, 2, 3], [[1, 2, 3]]), # Perfect match with bonus - ([[1]], [2, 3], [[2, INVALID_TOKEN_ID]]), # First mismatch - ([[1, 2], [3, 4]], [1, 5, 6, 3, 4, 7], [[1, 5, INVALID_TOKEN_ID], - [3, 4, 7]]), # Mixed matches + ([[1, 2]], [[1, 2, 3]], [[1, 2, 3]]), # Perfect match with bonus + ([[1]], [[2, 3]], [[2, INVALID_TOKEN_ID]]), # First mismatch 
+ ([[1, 2], [3, 4]], [[1, 5, 6], [3, 4, 7]], + [[1, 5, INVALID_TOKEN_ID], [3, 4, 7]]), # Mixed matches ]) def test_parametrized_cases(sampler, spec_tokens, output_tokens, expected): """Parametrized test for various matching scenarios""" - metadata = create_sampling_metadata(spec_tokens) + metadata = create_sampling_metadata(all_greedy=True) logits = create_logits_tensor(output_tokens) + bonus_token_tensor = torch.tensor([tokens[-1] for tokens in output_tokens], + device=logits.device) - output = sampler(spec_tokens, logits, metadata) + output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata) expected_tensor = torch.tensor(expected, dtype=torch.int, device=logits.device) - assert torch.equal(output.sampled_token_ids, expected_tensor) + assert torch.equal(output, expected_tensor) -def test_logits_shape_handling(sampler): - """Test handling of different logits tensor shapes""" - spec_tokens = [[1, 2]] - output_tokens = [1, 2, 3] - vocab_size = 1000 +########################### Tests for Random Sampling ################### +@pytest.mark.parametrize("k", [1, 3, 5]) +@pytest.mark.parametrize("vocab_size", [1000]) +@pytest.mark.parametrize("batch_size", [1, 4, 8]) +@pytest.mark.parametrize("frac_seeded", [0.0, 0.5]) +@pytest.mark.parametrize("n_rep", [20]) +def test_deterministic_when_seeded(sampler, k: int, vocab_size: int, + batch_size: int, frac_seeded: float, + n_rep: int): + draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) + target_probs = torch.rand(batch_size * (k + 1), + vocab_size, + dtype=torch.float32) + bonus_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64) + draft_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64) - metadata = create_sampling_metadata(spec_tokens) - logits = create_logits_tensor(output_tokens, vocab_size) + seeded_mask = torch.rand(batch_size, dtype=torch.float32) <= frac_seeded - output = sampler(spec_tokens, 
logits, metadata) - expected = torch.tensor([[1, 2, 3]], dtype=torch.int, device=logits.device) - assert torch.equal(output.sampled_token_ids, expected) - assert logits.shape[-1] == vocab_size + results = [] + for _ in range(n_rep): + seeded_seqs = { + i: torch.Generator(device=DEVICE).manual_seed(i) + for i in range(batch_size) if seeded_mask[i] + } + + sampling_metadata = create_sampling_metadata(all_greedy=False, + generators=seeded_seqs) + rep_result = sampler(draft_token_ids.tolist(), draft_probs, + bonus_token_ids, target_probs, sampling_metadata) + + results.append(rep_result) + + for i in range(batch_size): + if seeded_mask[i]: + for j in range(1, n_rep): + assert torch.equal(results[j][i], results[0][i]) + + +def test_rejection_sampling_approximates_target_distribution(): + """Verify rejection sampling approximates target distribution, + despite sampling from a potentially distinct draft distribution. + + This is done by first creating a random target probability + distribution and a random draft probability distribution. We then + sample token ids from the rejection sampler using these draft + and target distributions. The samples are used to estimate + the output probability distribution, which we expect to approximate + the target distribution. + + A basic distance metric is used to determine similarity between + distributions. + + We expect that as we increase the number of samples, + the distance between the observed distribution and the target + distribution decreases. To measure this, we compare the distance + of the observed distribution against both the target distribution + and a uniform random distribution. We expect the distance between + the observed distribution and the target distribution to improve + much more than the distance improvement between the observed + distribution and the random distribution. 
+ """ + torch.set_default_device(DEVICE) + vocab_size = 10 + k = 2 + num_reference_probs = 100 + + # Prepare draft, target, and reference probability distributions + draft_probs, target_probs = (F.softmax( + torch.rand(vocab_size, dtype=torch.float32), + dim=-1, + ) for _ in range(2)) + reference_probs = F.softmax( + torch.rand(num_reference_probs, vocab_size, dtype=torch.float32), + dim=-1, + ) + + sample_sizes = [10, 100, 1_000, 10_000, 100_000] + distance_wrt_reference: list[float] = [] + distance_wrt_target: list[float] = [] + + for num_samples in sample_sizes: + # Sample using rejection sampling. + rej_sample_probs = estimate_rejection_sampling_pdf( + draft_probs, target_probs, k, vocab_size, num_samples) + rej_sample_probs = rej_sample_probs.to(DEVICE) + + # Average distance from reference probs. + reference_vs_rejsample_dist = torch.dist( + reference_probs, + rej_sample_probs).item() / reference_probs.shape[0] + target_vs_rejsample_dist = torch.dist(target_probs, + rej_sample_probs).item() + + distance_wrt_reference.append(reference_vs_rejsample_dist) + distance_wrt_target.append(target_vs_rejsample_dist) + + relative_change_in_distance_wrt_target = get_ratio_first_to_last( + distance_wrt_target) + relative_change_in_distance_wrt_reference = get_ratio_first_to_last( + distance_wrt_reference) + + print(f"{num_samples=} {target_vs_rejsample_dist=:.05f} " + f"{reference_vs_rejsample_dist=:.05f}") + print(f"{num_samples=} {relative_change_in_distance_wrt_target=:.02f} " + f"{relative_change_in_distance_wrt_reference=:.02f}") + + relative_change_in_distance_wrt_target = get_ratio_first_to_last( + distance_wrt_target) + relative_change_in_distance_wrt_reference = get_ratio_first_to_last( + distance_wrt_reference) + + expected_improvement_multiplier = 20 + assert (relative_change_in_distance_wrt_target + > relative_change_in_distance_wrt_reference * + expected_improvement_multiplier) + + +def get_ratio_first_to_last(elements: list[float]) -> float: + return 
elements[0] / elements[-1] + + +def estimate_rejection_sampling_pdf( + draft_probs: torch.Tensor, + target_probs: torch.Tensor, + k: int, + vocab_size: int, + num_samples: int, +) -> torch.Tensor: + """Estimate the probability distribution of the output tokens + using rejection sampling. + + Args: + draft_probs: Draft probability distribution. + target_probs: Target probability distribution. + num_samples: Number of samples to draw. + + Returns: + Estimated probability distribution of the output tokens. + """ + sampler = RejectionSampler() + # Repeat draft probs num_samples times. + draft_probs = draft_probs.reshape(1, 1, + vocab_size).repeat(num_samples, k, 1) + + # Repeat target probs num_samples * (k + 1) times. + target_probs = target_probs.reshape(1, 1, vocab_size).repeat( + num_samples, k + 1, 1).reshape(num_samples * (k + 1), vocab_size) + + # Randomly sample draft token ids from draft probs. + draft_token_ids = torch.multinomial(draft_probs[:, 0, :], + num_samples=k, + replacement=True).reshape( + num_samples, k) + + # Bonus tokens not used but required. 
+ bonus_token_ids = torch.zeros((1, 1), dtype=torch.int64, + device=DEVICE).repeat(num_samples, 1) + + sampling_metadata = create_sampling_metadata(all_greedy=False) + output_token_ids = sampler(draft_token_ids.tolist(), draft_probs, + bonus_token_ids, target_probs, + sampling_metadata) + output_token_ids = output_token_ids[:, :-1].flatten() + + hist = torch.histogram(output_token_ids.to(dtype=torch.float, + device="cpu"), + bins=vocab_size, + range=(0, vocab_size), + density=True) + + return hist.hist diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index ea7f3353c115f..5601c62e91fc0 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -1,87 +1,89 @@ # SPDX-License-Identifier: Apache-2.0 +from typing import Optional import torch import torch.nn as nn from torch.nn.utils.rnn import pad_sequence -from vllm import envs from vllm.logger import init_logger -from vllm.platforms import current_platform -from vllm.v1.outputs import SamplerOutput from vllm.v1.sample.metadata import SamplingMetadata - -try: - import flashinfer.sampling as fs - is_flashinfer_available = True -except ImportError: - is_flashinfer_available = False +from vllm.v1.spec_decode.utils import random_sample logger = init_logger(__name__) INVALID_TOKEN_ID = -1 class RejectionSampler(nn.Module): + """ + The implementation strictly follows the algorithm described in + https://arxiv.org/abs/2211.17192. + However, we want to clarify the terminology used in the implementation: + accepted tokens: tokens that are accepted based on the relationship + between the "raw" draft and target probabilities. + recovered tokens: tokens that are sampled based on the adjusted probability + distribution, which is derived from both the draft and target + probabilities. + bonus tokens: + If all proposed tokens are accepted, the bonus token is added to the + end of the sequence. The bonus token is only sampled from the target + probabilities. 
We pass in the bonus tokens instead of sampling them + in the rejection sampler to allow for more flexibility in the + sampling process. For example, we can use top_p, top_k sampling for + bonus tokens, while spec decode does not support these sampling + strategies. + output tokens: + Tokens are finally generated with the rejection sampler. + output tokens = accepted tokens + recovered tokens + bonus tokens + """ def __init__(self): super().__init__() - if current_platform.is_cuda(): - if is_flashinfer_available: - if envs.VLLM_USE_FLASHINFER_SAMPLER is not False: - # FIXME(woosuk): Currently, we have errors when using - # FlashInfer for rejection sampling. As a workaround, we - # disable FlashInfer for rejection sampling by default. - logger.info("Currently, FlashInfer rejection sampler is " - "disabled because of a bug. Falling back to " - "the PyTorch-native implementation of " - "rejection sampling.") - self.forward_method = self.forward_native - # NOTE(woosuk): The V0 sampler doesn't use FlashInfer for - # sampling unless VLLM_USE_FLASHINFER_SAMPLER=1 (i.e., by - # default it is unused). For backward compatibility, we set - # `VLLM_USE_FLASHINFER_SAMPLER` as None by default and - # interpret it differently in V0 and V1 samplers: In V0, - # None means False, while in V1, None means True. This is - # why we use the condition - # `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here. - # logger.info("Using FlashInfer for rejection sampling.") - # self.forward_method = self.flashinfer_sample - else: - logger.warning( - "FlashInfer is available, but it is not enabled. " - "Falling back to the PyTorch-native implementation of " - "rejection sampling. For the best performance, " - "please set VLLM_USE_FLASHINFER_SAMPLER=1.") - self.forward_method = self.forward_native - else: - logger.warning( - "FlashInfer is not available. Falling back to the PyTorch-" - "native implementation of rejection sampling. 
For the " - "best performance, please install FlashInfer.") - self.forward_method = self.forward_native - else: - self.forward_method = self.forward_native - - def forward(self, draft_token_ids: list[list[int]], - target_probs: torch.Tensor, - sampling_metadata: SamplingMetadata) -> SamplerOutput: - if not sampling_metadata.all_greedy: - raise NotImplementedError( - "Currently, only greedy sampling is supported by " - "rejection sampler.") - return self.forward_method(draft_token_ids, target_probs, - sampling_metadata) - - def flashinfer_sample( + def forward( self, draft_token_ids: list[list[int]], - target_probs: torch.Tensor, + draft_probs: Optional[torch.Tensor], + bonus_token_ids_tensor: torch.Tensor, # [batch_size, 1] + target_probs: torch.Tensor, # [num_total_tokens, vocab_size] sampling_metadata: SamplingMetadata, - ) -> SamplerOutput: + ) -> torch.Tensor: + ''' + Args: + draft_token_ids (List[List[int]]): + A 2D list of token IDs for each request in the batch. + Each request might have different number of draft tokens. + It may also contain empty lists for requests that have + no draft tokens. + draft_probs (Optional[torch.Tensor]): + Probability distribution for the draft tokens. Shape is + [batch_size, max_spec_len, vocab_size]. Can be None if + probabilities are not provided, which is the case for + ngram spec decode. + bonus_token_ids_tensor (torch.Tensor): + A tensor containing bonus tokens. Shape is [batch_size, 1]. + Bonus tokens are added to the end of the sequence if all + proposed tokens are accepted. We generate the bonus tokens + outside of the rejection sampler with the default sampling + strategy. It allows for more flexibility in the sampling + process such as top_p, top_k sampling. + target_probs (torch.Tensor): + Target model probability distribution. + Shape is [num_total_tokens, vocab_size]. num_total_tokens + is the total number of tokens from all requests. 
Here, + probabilities from different requests are flattened into + a single tensor because this is the shape of the output + logits. + sampling_metadata (SamplingMetadata): + Additional metadata needed for sampling, such as temperature, + top-k/top-p parameters, or other relevant information. + Returns: + output_token_ids (torch.Tensor): + A tensor containing the final output token IDs. + ''' + # NOTE: The following input preparationg can be moved # to the model runner with a persistent manner for better # performance. - sample_lens = [len(x) + 1 for x in draft_token_ids] # Convert draft token IDs to a tensor, split by sample_lens, then pad. draft_token_ids = [ torch.tensor(x, dtype=int, device='cpu') for x in draft_token_ids @@ -90,90 +92,171 @@ class RejectionSampler(nn.Module): batch_first=True, padding_value=INVALID_TOKEN_ID) - if sampling_metadata.all_greedy: - target_token_ids = target_probs.argmax(dim=-1).view(-1) - target_token_ids = target_token_ids.split(sample_lens) - target_token_ids = pad_sequence(target_token_ids, - batch_first=True, - padding_value=INVALID_TOKEN_ID) + # NOTE: CPU <-> GPU synchronization happens here. + draft_token_ids_tensor = draft_token_ids_tensor.to(target_probs.device) + # Create one-hot tensor for draft token ids. + # This is used for ngram where we don't have draft_probs. + if draft_probs is None and not sampling_metadata.all_greedy: vocab_size = target_probs.size(-1) - # NOTE: CPU <-> GPU synchronization happens here. 
- draft_token_ids_tensor = draft_token_ids_tensor.to( - target_probs.device) draft_probs = _create_greedy_token_probs(draft_token_ids_tensor, vocab_size, target_probs.device) - target_probs = _create_greedy_token_probs(target_token_ids, - vocab_size, - target_probs.device) - uniform_samples = torch.zeros(draft_token_ids_tensor.size(0), - draft_token_ids_tensor.size(1) + 1, - device=target_probs.device) - else: - raise NotImplementedError( - "Currently, only greedy sampling is supported by " - "rejection sampler.") + sample_lens = [len(x) + 1 for x in draft_token_ids] + target_probs = _convert_2d_probs(target_probs, sample_lens) - sampled_token_ids, _, _ = fs.chain_speculative_sampling( - draft_probs, - draft_token_ids_tensor, - uniform_samples, - target_probs, - ) - return SamplerOutput(sampled_token_ids=sampled_token_ids, - logprobs_tensors=None) + return self.forward_native(draft_token_ids_tensor, draft_probs, + bonus_token_ids_tensor, target_probs, + sampling_metadata) # TODO: The following method can be optimized for better performance. def forward_native( self, - draft_token_ids: list[list[int]], + draft_token_ids_tensor: torch.Tensor, + # [batch_size, max_spec_len, vocab_size] + draft_probs: Optional[torch.Tensor], + bonus_token_ids_tensor: torch.Tensor, + # [batch_size, max_spec_len + 1, vocab_size] target_probs: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> SamplerOutput: - sample_lens = [len(x) + 1 for x in draft_token_ids] - # Convert draft token IDs to a tensor, split by sample_lens, then pad. - draft_token_ids = [ - torch.tensor(x, dtype=int, device='cpu') for x in draft_token_ids - ] - draft_token_ids_tensor = pad_sequence(draft_token_ids, - batch_first=True, - padding_value=INVALID_TOKEN_ID) - draft_token_ids_tensor = draft_token_ids_tensor.to(target_probs.device) + ) -> torch.Tensor: # Add 1 to include the 'bonus' token. 
if sampling_metadata.all_greedy: - output_token_ids = target_probs.argmax(dim=-1).view(-1) - output_token_ids = output_token_ids.split(sample_lens) - output_token_ids = pad_sequence(output_token_ids, - batch_first=True, - padding_value=INVALID_TOKEN_ID) # Produce a mask that remains 1 (True) until the first # mismatch (cumprod turns 0 after a mismatch). - accept_mask = ( - output_token_ids[:, :-1] == draft_token_ids_tensor).cumprod( - dim=1) - else: - raise NotImplementedError( - "Currently, only greedy sampling is supported by " - "rejection sampler.") - # Identify valid positions (non-padding). - valid_mask = output_token_ids != INVALID_TOKEN_ID - # Generate mask with bonus token. - generate_mask = torch.cat([ - accept_mask, - torch.zeros(accept_mask.size(0), 1, device=accept_mask.device) - ], - dim=1).to(torch.bool) & valid_mask - zeros_mask = (generate_mask == 0) - first_zero_idx = zeros_mask.float().argmax(dim=1) - # Figure out which rows actually contain at least one zero. - rows_with_zero = zeros_mask.any(dim=1) - # Use indexing to set the first zero in each of those rows to 1. - generate_mask[rows_with_zero, first_zero_idx[rows_with_zero]] = 1 + target_token_ids_tensor = target_probs.argmax(dim=-1) + accept_mask = (target_token_ids_tensor[:, :-1] == + draft_token_ids_tensor).cumprod(dim=1) - output_token_ids[~generate_mask] = INVALID_TOKEN_ID - return SamplerOutput(sampled_token_ids=output_token_ids, - logprobs_tensors=None) + # Identify valid positions (non-padding). + valid_mask = target_token_ids_tensor != INVALID_TOKEN_ID + # Generate mask with bonus token. + generate_mask = torch.cat([ + accept_mask, + torch.zeros(accept_mask.size(0), 1, device=accept_mask.device) + ], + dim=1).to(torch.bool) & valid_mask + zeros_mask = (generate_mask == 0) + first_zero_idx = zeros_mask.float().argmax(dim=1) + # Figure out which rows actually contain at least one zero. 
+ rows_with_zero = zeros_mask.any(dim=1) + # Use indexing to set the first zero in each of those rows to 1. + generate_mask[rows_with_zero, first_zero_idx[rows_with_zero]] = 1 + + output_token_ids = target_token_ids_tensor + output_token_ids[~generate_mask] = INVALID_TOKEN_ID + else: + # Reference: https://arxiv.org/pdf/2211.17192 + # 1. Extract the probabilities of the draft tokens. + # [batch_size, max_spec_len] + batch_size = draft_token_ids_tensor.size(0) + max_spec_len = draft_token_ids_tensor.size(1) + invalid_idx = draft_token_ids_tensor == INVALID_TOKEN_ID + draft_token_ids_tensor[invalid_idx] = 0 + assert draft_probs is not None + draft_token_probs = draft_probs.gather( + dim=-1, index=draft_token_ids_tensor.unsqueeze(-1)).squeeze(-1) + target_token_probs = target_probs.gather( + dim=-1, index=draft_token_ids_tensor.unsqueeze(-1)).squeeze(-1) + # Force the probabilities of invalid tokens to inf + # so that they are not accepted. + draft_token_probs[invalid_idx] = float('inf') + + # 2. Generate uniform samples. + # [batch_size, max_spec_len + 1] + uniform_samples = _create_uniform_samples( + sampling_metadata.generators, batch_size, max_spec_len, + target_probs.device) + + # 3. Accept or reject the samples. + # [batch_size, max_spec_len] + # If the draft token probabilities are 0, set them to the smallest + # positive normal value representable by float32. + safe_draft_probs = torch.where(draft_token_probs > 0, + draft_token_probs, + torch.finfo(torch.float32).tiny) + accepted = uniform_samples <= target_token_probs / safe_draft_probs + accept_mask = accepted.cumprod(dim=1) + # Set the token ids to the draft token ids if accepted, otherwise + # set them to INVALID_TOKEN_ID. + accepted_token_ids = (draft_token_ids_tensor * accept_mask + + INVALID_TOKEN_ID * (1 - accept_mask)) + + # 4. Adjust the distribution for the recovered tokens. + # Clamp the bonus probabilities to the smallest positive normal + # value representable by float32. 
+ bonus_prob = torch.clamp(target_probs[:, :-1, :] - draft_probs, + min=torch.finfo(torch.float32).tiny) + normalized_bonus_prob = bonus_prob / bonus_prob.sum(dim=-1, + keepdim=True) + + # 5. Sample recovered token ids. + recovered_token_ids = random_sample( + normalized_bonus_prob, + sampling_metadata.generators).reshape(batch_size, max_spec_len) + + # 6. Get the final output token ids. + # output_token_ids = accepted_token_ids + + # recovered_token_ids + + # bonus_token_id + recovered_bonus_token_ids = torch.cat( + [recovered_token_ids, bonus_token_ids_tensor], dim=1) + # Generate mask with bonus tokens. + generate_mask = torch.cat([ + accept_mask, + torch.zeros(batch_size, 1, device=accept_mask.device) + ], + dim=1).to(torch.bool) + zeros_mask = (generate_mask == 0) + first_zero_idx = zeros_mask.float().argmax(dim=1) + output_token_ids = torch.cat([ + accepted_token_ids, + torch.full((batch_size, 1), + fill_value=INVALID_TOKEN_ID, + device=accept_mask.device) + ], + dim=1) + output_token_ids[torch.arange(batch_size), + first_zero_idx] = recovered_bonus_token_ids[ + torch.arange(batch_size), first_zero_idx] + + return output_token_ids + + def compute_probs(self, logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + sample_lens: list[int]) -> torch.Tensor: + """ + Compute probability distribution from logits based on sampling metadata. + + This function applies temperature scaling to the logits and converts + them to probabilities using softmax. Note that division by + temperature is not performed inplace to preserve the original logits + tensor, which will be used by the original sampler to get bonus tokens. 
+ + Args: + logits: Input logits tensor to be converted to probabilities + sampling_metadata: Metadata containing sampling parameters such + as temperature and whether greedy sampling is used + sample_lens: List of sample lengths used for repeating + temperature values + + Returns: + torch.Tensor: Probability distribution (softmax of scaled logits) + if non-greedy sampling is used, otherwise returns the + original logits + """ + if sampling_metadata.all_greedy: + return logits + assert sampling_metadata.temperature is not None + # We should optimize the following code as + # it will cause CPU -> GPU synchronization. + temperature = torch.repeat_interleave( + sampling_metadata.temperature, + torch.tensor(sample_lens, + device=sampling_metadata.temperature.device)) + temperature = temperature.unsqueeze(dim=1) + logits = logits / temperature + return logits.softmax(dim=-1, dtype=torch.float32) def _create_greedy_token_probs( @@ -199,3 +282,66 @@ def _create_greedy_token_probs( src=valid_mask.unsqueeze(-1).float()) return token_probs + + +def _convert_2d_probs( + probs: torch.Tensor, # [num_total_tokens, vocab_size] + sample_lens: list[int]) -> torch.Tensor: + """ + Converts a 2D tensor of probabilities to a 3D tensor with padding. + [num_total_tokens, vocab_size] -> + [batch_size, max_spec_len + 1, vocab_size] + """ + cumulative_lens = torch.cumsum(torch.tensor(sample_lens, + device=probs.device), + dim=0) + split_indices = cumulative_lens[:-1].tolist() # Exclude last index + + # Split into chunks without loops + chunks = torch.tensor_split(probs, split_indices, dim=0) + + # Pad all sequences to maximum length + padded_probs = pad_sequence(chunks, batch_first=True, padding_value=0.0) + return padded_probs + + +def _create_uniform_samples(seeded_seqs: dict[int, torch.Generator], + batch_size: int, k: int, + device: torch.device) -> torch.Tensor: + """ + Generates a batch of uniform random samples, with optional seeding + for specific sequences. 
+ + This method creates a tensor of shape `(batch_size, k)` filled + with uniform random values in the range [0, 1). If `seeded_seqs` + is provided, the sequences corresponding to specific indices + will be generated using the provided `torch.Generator` for + reproducibility. The other sequences will be generated without + a seed. + + Args: + seeded_seqs : Optional[Dict[int, torch.Generator]] + A dictionary mapping indices in the batch to + `torch.Generator` objects. + batch_size : int + The number of sequences to generate. + k : int + The number of random samples per sequence. + device : torch.device + The device on which to allocate the tensor. + + Returns: + uniform_rand : torch.Tensor + A tensor of shape `(batch_size, k)` containing uniform + random values in the range [0, 1). + """ + + uniform_rand = torch.rand(batch_size, + k, + dtype=torch.float32, + device=device) + # Apply seeded generators only where needed + if seeded_seqs: + for idx, generator in seeded_seqs.items(): + uniform_rand[idx].uniform_(0, 1, generator=generator) + return uniform_rand diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 96f6d807b10ce..d91c057083f31 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -119,14 +119,6 @@ class Sampler(nn.Module): ) return sampled - def compute_probs(self, logits: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - if sampling_metadata.all_greedy: - return logits - # Apply temperature. This is an in-place op changing logits. 
- logits = self.apply_temperature(logits, sampling_metadata.temperature) - return logits.softmax(dim=-1, dtype=torch.float32) - def compute_logprobs(self, logits: torch.Tensor) -> torch.Tensor: return logits.log_softmax(dim=-1, dtype=torch.float32) diff --git a/vllm/v1/spec_decode/utils.py b/vllm/v1/spec_decode/utils.py new file mode 100644 index 0000000000000..5841401367788 --- /dev/null +++ b/vllm/v1/spec_decode/utils.py @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: Apache-2.0 +from vllm.v1.sample.ops.topk_topp_sampler import random_sample # noqa +from vllm.v1.worker.gpu_input_batch import InputBatch + + +def is_spec_decode_supported(req_id: str, input_batch: InputBatch) -> bool: + if req_id in input_batch.top_k_reqs or req_id in input_batch.top_p_reqs: + # Spec decode doesn't support top_p/top_k sampling. + return False + elif req_id in input_batch.min_p_reqs: + # Spec decode doesn't support min_p sampling. + return False + elif (req_id in input_batch.frequency_penalties_reqs + or req_id in input_batch.presence_penalties_reqs + or req_id in input_batch.repetition_penalties_reqs): + # Spec decode doesn't support penalties. + return False + elif req_id in input_batch.num_logprobs: + # Spec decode doesn't support logprobs. 
+ return False + + return True diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4059d5b17b71b..2a98bea562dcb 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -37,6 +37,7 @@ from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors, from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import INVALID_TOKEN_ID, RejectionSampler from vllm.v1.spec_decode.ngram_proposer import NgramProposer +from vllm.v1.spec_decode.utils import is_spec_decode_supported from vllm.v1.utils import bind_kv_cache from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin @@ -1020,15 +1021,26 @@ class GPUModelRunner(LoRAModelRunnerMixin): sampling_metadata=sampling_metadata, ) else: - target_probs = self.model.sampler.compute_probs( - logits, sampling_metadata) draft_token_ids = [ scheduler_output.scheduled_spec_decode_tokens.get(req_id, []) for req_id in self.input_batch.req_ids ] - sampler_output = self.rejection_sampler(draft_token_ids, - target_probs, - sampling_metadata) + sample_lens = [len(tokens) + 1 for tokens in draft_token_ids] + recover_logits_idx = np.cumsum(sample_lens) - 1 + target_probs = self.rejection_sampler.compute_probs( + logits, sampling_metadata, sample_lens) + sampler_output = self.model.sample( + logits=logits[recover_logits_idx, :], + sampling_metadata=sampling_metadata, + ) + bonus_token_ids = sampler_output.sampled_token_ids + output_token_ids = self.rejection_sampler( + draft_token_ids, + None, # draft_probs + bonus_token_ids, + target_probs, + sampling_metadata) + sampler_output.sampled_token_ids = output_token_ids # TODO(woosuk): The following loop can be slow since it iterates over # the requests one by one. Optimize. 
@@ -1075,7 +1087,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): spec_token_ids = None else: spec_token_ids = self.generate_draft_token_ids( - valid_sampled_token_ids) + valid_sampled_token_ids, sampling_metadata) return ModelRunnerOutput( req_ids=self.input_batch.req_ids, @@ -1089,6 +1101,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): def generate_draft_token_ids( self, sampled_token_ids: list[list[int]], + sampling_metadata: SamplingMetadata, ) -> list[list[int]]: # TODO(woosuk): Optimize. draft_token_ids: list[list[int]] = [] @@ -1099,6 +1112,12 @@ class GPUModelRunner(LoRAModelRunnerMixin): draft_token_ids.append([]) continue + # Skip requests that require top-p, top-k, etc. + req_id = self.input_batch.req_ids[i] + if not is_spec_decode_supported(req_id, self.input_batch): + draft_token_ids.append([]) + continue + # Add sampled_token_ids to token_ids_cpu. start_idx = self.input_batch.num_tokens_no_spec[i] end_idx = start_idx + num_sampled_ids From b539222d4e81512e0cfa6cf56927a70c3aaca9d2 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 17 Mar 2025 14:42:06 +0800 Subject: [PATCH 082/169] [V1] Remove input cache client (#14864) Signed-off-by: DarkLight1337 Signed-off-by: Roger Wang Co-authored-by: Roger Wang --- vllm/inputs/preprocess.py | 6 ++ vllm/v1/engine/__init__.py | 2 +- vllm/v1/engine/mm_input_cache.py | 122 +++-------------------------- vllm/v1/engine/processor.py | 80 ++++++------------- vllm/v1/worker/gpu_model_runner.py | 39 ++------- 5 files changed, 48 insertions(+), 201 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index f56cff292b68b..af35e43d825a2 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -379,6 +379,7 @@ class InputPreprocessor: multi_modal_data, mm_processor_kwargs, lora_request=lora_request, + return_mm_hashes=return_mm_hashes, ) prompt_token_ids = self._tokenize_prompt( @@ -401,6 +402,7 @@ class InputPreprocessor: prompt: SingletonPrompt, request_id: str, lora_request: 
Optional[LoRARequest] = None, + return_mm_hashes: bool = False, ) -> SingletonInputs: """Async version of :meth:`_extract_prompt_components`.""" parsed = parse_singleton_prompt(prompt) @@ -431,6 +433,7 @@ class InputPreprocessor: multi_modal_data, mm_processor_kwargs, lora_request=lora_request, + return_mm_hashes=return_mm_hashes, ) return token_inputs( @@ -452,6 +455,7 @@ class InputPreprocessor: multi_modal_data, mm_processor_kwargs, lora_request=lora_request, + return_mm_hashes=return_mm_hashes, ) prompt_token_ids = await self._tokenize_prompt_async( @@ -726,6 +730,7 @@ class InputPreprocessor: prompt, request_id=request_id, lora_request=lora_request, + return_mm_hashes=return_mm_hashes, ) return self._build_decoder_only_llm_inputs( @@ -746,6 +751,7 @@ class InputPreprocessor: prompt, request_id=request_id, lora_request=lora_request, + return_mm_hashes=return_mm_hashes, ) return self._build_decoder_only_llm_inputs( diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index cd29c2d7d57c0..3699779b3a0fe 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -52,7 +52,7 @@ class EngineCoreRequest( # Detokenizer, but set to None when it is added to EngineCoreClient. 
prompt: Optional[str] prompt_token_ids: list[int] - mm_inputs: Optional[list[Optional[MultiModalKwargs]]] + mm_inputs: Optional[list[MultiModalKwargs]] mm_hashes: Optional[list[str]] mm_placeholders: Optional[list[PlaceholderRange]] sampling_params: SamplingParams diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index e2dda73ba4299..61a55d2499bd1 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -1,131 +1,30 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Optional - -from vllm.config import ModelConfig from vllm.envs import VLLM_MM_INPUT_CACHE_GIB -from vllm.logger import init_logger -from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, - MultiModalKwargs, MultiModalRegistry) +from vllm.multimodal import MultiModalKwargs from vllm.multimodal.processing import ProcessingCache -logger = init_logger(__name__) - # The idea of multimodal preprocessing caching is based on having a client and # a server, where the client executes in the frontend process (=P0) and the # server in the core process (=P1). # # -- Client: -# - Apply legacy input_mapper (if one exists) to generate MultiModalKwargs. -# - Perform caching of the generated MultiModalKwargs. -# - This client can be deprecated once all mutimodal models migrate to use -# merged preprocessor with built-in caching functionality. +# - BaseMultiModalProcessor to process MultiModalData into MultiModalKwargs +# with built-in caching functionality, with mm_hash as its identifier. # # -- Server: -# - Perform caching of the received MultiModalKwargs. +# - MMInputCacheServer to perform caching of the received MultiModalKwargs. # -# The caching for both client and server is mirrored/similar, and this allows us +# The caching for both client and server is mirrored, and this allows us # to avoid the serialization of "mm_inputs" (like pixel values) between -# client (=P0) and server (=P1) processes. 
+# client (=P0) and server (=P1) processes if the mm_hash is found in the client +# cache. # Both Client and Server must use the same cache size # (to perform mirrored caching). This cache size is set by the environment # variable VLLM_MM_INPUT_CACHE_GIB. -# TODO(ywang96): Deprecate this class once all multimodal models migrate to use -# merged preprocessor with built-in caching functionality. -class MMInputCacheClient: - - def __init__( - self, - model_config: ModelConfig, - mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, - ): - self.model_config = model_config - self.mm_registry = mm_registry - self.multi_modal_input_mapper = mm_registry.create_input_mapper( - model_config) - self.mm_registry.init_mm_limits_per_prompt(model_config) - - # Init cache - self.use_cache = not model_config.disable_mm_preprocessor_cache - self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB, - MultiModalKwargs) - - # DEBUG: Set to None to disable - self.mm_debug_cache_hit_ratio_steps = None - self.mm_debug_cache_hits = 0 - self.mm_debug_cache_total = 0 - - def cache_hit_ratio(self, steps): - total = self.mm_debug_cache_total - - if total > 0 and total % steps == 0: - logger.debug("MMInputMapper: cache_hit_ratio = %.2f ", - self.mm_debug_cache_hits / total) - - # NOTE: process_inputs only supports image inputs since all multimodal - # models with other modalities have migrated to use merged preprocessor. 
- def process_inputs( - self, - mm_data: MultiModalDataDict, - mm_hashes: Optional[list[str]], - mm_processor_kwargs: Optional[dict[str, Any]], - precomputed_mm_inputs: Optional[list[MultiModalKwargs]], - ) -> list[Optional[MultiModalKwargs]]: - if precomputed_mm_inputs is None: - image_inputs = mm_data["image"] - if not isinstance(image_inputs, list): - image_inputs = [image_inputs] - num_inputs = len(image_inputs) - else: - num_inputs = len(precomputed_mm_inputs) - - # Sanity - if self.use_cache: - assert mm_hashes is not None - assert num_inputs == len(mm_hashes) - - # Process each image input separately, so that later we can schedule - # them in a fine-grained manner. - # Apply caching (if enabled) and reuse precomputed inputs (if provided) - ret_inputs: list[Optional[MultiModalKwargs]] = [] - for input_id in range(num_inputs): - if self.mm_debug_cache_hit_ratio_steps is not None: - self.cache_hit_ratio(self.mm_debug_cache_hit_ratio_steps) - - mm_input = None - if self.use_cache: - assert mm_hashes is not None - mm_hash = mm_hashes[input_id] - mm_input = self.mm_cache.get(mm_hash) - - self.mm_debug_cache_total += 1 - if mm_input is None: - if precomputed_mm_inputs is not None: - # Reuse precomputed input (for merged preprocessor) - mm_input = precomputed_mm_inputs[input_id] - else: - # Apply legacy input_mapper - mm_input = self.multi_modal_input_mapper( - {"image": [image_inputs[input_id]]}, - mm_processor_kwargs=mm_processor_kwargs, - ) - - if self.use_cache: - # Add to cache - assert mm_hash is not None - self.mm_cache[mm_hash] = mm_input - else: - self.mm_debug_cache_hits += 1 - mm_input = None # Avoids sending mm_input to Server - - ret_inputs.append(mm_input) - - return ret_inputs - - class MMInputCacheServer: def __init__(self, model_config): @@ -135,9 +34,9 @@ class MMInputCacheServer: def get_and_update( self, - mm_inputs: list[Optional[MultiModalKwargs]], + mm_inputs: list[MultiModalKwargs], mm_hashes: list[str], - ) -> 
list[Optional[MultiModalKwargs]]: + ) -> list[MultiModalKwargs]: assert len(mm_inputs) == len(mm_hashes) if not self.use_cache: @@ -147,8 +46,7 @@ class MMInputCacheServer: for mm_input, mm_hash in zip(mm_inputs, mm_hashes): assert mm_hash is not None if mm_input is None: - mm_input = self.mm_cache.get(mm_hash) - assert mm_input is not None + mm_input = self.mm_cache[mm_hash] else: self.mm_cache[mm_hash] = mm_input diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 663e1e36f7561..4e9e5506bb587 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -11,15 +11,15 @@ from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, from vllm.inputs.parse import is_encoder_decoder_inputs from vllm.inputs.preprocess import InputPreprocessor from vllm.lora.request import LoRARequest -from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalHasher, - MultiModalKwargs, MultiModalRegistry) +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, + MultiModalRegistry) +from vllm.multimodal.inputs import PlaceholderRange from vllm.multimodal.utils import merge_and_sort_multimodal_metadata from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine.mm_input_cache import MMInputCacheClient from vllm.v1.structured_output.utils import validate_structured_output_request @@ -45,11 +45,6 @@ class Processor: self.input_preprocessor = InputPreprocessor(self.model_config, self.tokenizer, mm_registry) - self.input_processor = input_registry.create_input_processor( - self.model_config) - - # Multi-modal (huggingface) input mapper - self.mm_input_cache_client = MMInputCacheClient(self.model_config) # Multi-modal hasher (for images) self.use_hash = ( @@ -171,7 +166,7 @@ class Processor: # 2. 
For multimodal models with a merged preprocessor, preprocess # multimodal data and expand prompt token ids accordingly. # 3. Apply prompt adapter to prompt token ids if one exists. - preprocessed_inputs = self.input_preprocessor.preprocess( + processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess( prompt, request_id=request_id, lora_request=lora_request, @@ -180,10 +175,6 @@ class Processor: ) eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) - # Process prompt and prompt token ids. - # Only applicable to multimodal models with legacy input processor. - processed_inputs = self.input_processor(preprocessed_inputs) - self._validate_model_inputs(processed_inputs, lora_request) if is_encoder_decoder_inputs(processed_inputs): @@ -212,36 +203,22 @@ class Processor: self.tokenizer.get_lora_tokenizer(lora_request)) # Multimodal related. - # Compute MM hashes (if enabled) - mm_hashes = None - if self.use_hash: - # Use mm_hashes from processed inputs if the model has merged - # input processor. - if decoder_inputs.multi_modal_hashes: - mm_hashes = decoder_inputs.multi_modal_hashes - # Fallback to using MultiModalHasher directly. - else: - mm_hashes = MultiModalHasher.hash_prompt_mm_data(prompt) + sorted_mm_inputs: Optional[list[MultiModalKwargs]] = None + sorted_mm_positions: Optional[list[PlaceholderRange]] = None + sorted_mm_hashes: Optional[list[str]] = None + if (decoder_mm_inputs := decoder_inputs.multi_modal_data): + assert isinstance(decoder_mm_inputs, MultiModalKwargs) - # For merged preprocessor, mm_data is already mm_inputs - precomputed_mm_inputs: Optional[list[MultiModalKwargs]] = None - decoder_mm_data = decoder_inputs.multi_modal_data - if isinstance(decoder_mm_data, MultiModalKwargs): - # The output of merged multi-modal processor (`decoder_mm_data`) + # The output of merged multi-modal processor (`decoder_mm_inputs`) # contains the kwargs for all items from all modalities. 
# This code separates them so that there is one set of kwargs # per item per modality. - precomputed_mm_inputs = [ + individual_mm_inputs = [ MultiModalKwargs.from_items([item]) - for modality in decoder_mm_data.modalities - for item in decoder_mm_data.get_items(modality) + for modality in decoder_mm_inputs.modalities + for item in decoder_mm_inputs.get_items(modality) ] - mm_positions = decoder_inputs.multi_modal_placeholders - - # Last-mile processing of multimodal metadata and inputs. - if mm_positions: - # Merge and flatten multimodal placeholders, hashes and inputs # from dictionaries to lists, and sort them by each item's position # in the input sequence. @@ -251,14 +228,13 @@ class Processor: sorted_mm_positions, sorted_mm_hashes, ) = merge_and_sort_multimodal_metadata( - mm_positions, - mm_hashes, + decoder_inputs.multi_modal_placeholders, + decoder_inputs.multi_modal_hashes if self.use_hash else None, ) # NOTE: Sort multimodal inputs/kwargs ONLY IF there are multiple - # modalities involved AND the model supports merged input processor. - if len(sorted_modalities) > 1 and precomputed_mm_inputs: - + # modalities involved. + if len(sorted_modalities) > 1: modality_order_dict = { modality: order for order, modality in enumerate(sorted_modalities) @@ -266,26 +242,16 @@ class Processor: # Sanity check to make sure each multimodal input has only one # modality key. - for mm_input in precomputed_mm_inputs: + for mm_input in individual_mm_inputs: assert len(mm_input.modalities) == 1 - # Sort MultiModalKwags to match sorted_mm_positions - precomputed_mm_inputs = sorted( - precomputed_mm_inputs, + # Sort MultiModalKwargs to match sorted_mm_positions + sorted_mm_inputs = sorted( + individual_mm_inputs, key=lambda mm_input: modality_order_dict[list( mm_input.modalities)[0]]) - - # Apply mm input cache update and legacy input mapper if one exists. 
- sorted_mm_inputs = self.mm_input_cache_client.process_inputs( - mm_data=decoder_mm_data, - mm_hashes=sorted_mm_hashes, - mm_processor_kwargs=decoder_inputs.mm_processor_kwargs, - precomputed_mm_inputs=precomputed_mm_inputs, - ) - else: - sorted_mm_inputs = None - sorted_mm_hashes = None - sorted_mm_positions = None + else: + sorted_mm_inputs = individual_mm_inputs return EngineCoreRequest( request_id=request_id, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2a98bea562dcb..66015382bfe85 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -29,7 +29,6 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, is_pin_memory_available) from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata from vllm.v1.core.encoder_cache_manager import compute_encoder_budget -from vllm.v1.engine.mm_input_cache import MMInputCacheClient from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheSpec) from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors, @@ -133,14 +132,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope - if self.is_multimodal_model: - # NOTE: Initialized client is only used for processing dummy - # multimodal data into multimodal kwargs for GPU memory profiling. - # Only applicable to multimodal models with legacy input mapper. 
- self.mm_input_mapper_profiling = MMInputCacheClient( - self.model_config) - self.mm_input_mapper_profiling.use_cache = False - encoder_compute_budget, encoder_cache_size = compute_encoder_budget( model_config=model_config, scheduler_config=scheduler_config, @@ -1376,32 +1367,18 @@ class GPUModelRunner(LoRAModelRunnerMixin): mm_registry=self.mm_registry, ) dummy_mm_data = dummy_request_data.multi_modal_data + if not isinstance(dummy_mm_data, MultiModalKwargs): + # TODO: Delete this check once input mapper is fully removed. + raise RuntimeError( + "Legacy input mapper is not supported in V1") - # Dummy data definition in V0 may contain multiple multimodal items + # Dummy data definition may contain multiple multimodal items # (e.g, multiple images) for a single request, therefore here we # always replicate first item by max_num_mm_items times since in V1 # they are scheduled to be processed separately. - - # Case when models have a merged processor, their dummy data is - # already batched `MultiModalKwargs`, therefore we take the first - # `MultiModalKwargsItem` from the desired modality to profile on. - if isinstance(dummy_mm_data, MultiModalKwargs): - dummy_mm_item = dummy_mm_data.get_item( - modality=dummy_data_modality, item_index=0) - dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item]) - - # Case when models have dummy data explicitly defined as - # `MultiModalDataDict`, so they need to be processed through input - # mapper. - # TODO (ywang96): deprecate this path once merged processor is - # supported on all models. 
- else: - mm_kwargs_list = self.mm_input_mapper_profiling.process_inputs( - mm_data=dummy_mm_data, - mm_hashes=None, - mm_processor_kwargs=None, - precomputed_mm_inputs=None) - dummy_mm_kwargs = mm_kwargs_list[0] + dummy_mm_item = dummy_mm_data.get_item( + modality=dummy_data_modality, item_index=0) + dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item]) batched_dummy_mm_inputs = MultiModalKwargs.batch( [dummy_mm_kwargs] * max_num_mm_items) From 9b87a579aaf82338d5304219350932abae9b19ac Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Mon, 17 Mar 2025 16:22:14 +0800 Subject: [PATCH 083/169] [Misc][XPU] Use None as device capacity for XPU (#14932) Signed-off-by: yan ma --- vllm/platforms/xpu.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index d99d4ef3dac06..225e756cd7ce8 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -37,10 +37,11 @@ class XPUPlatform(Platform): return "vllm.attention.backends.ipex_attn.IpexAttnBackend" @staticmethod - def get_device_capability(device_id: int = 0) -> DeviceCapability: - major, minor, *_ = torch.xpu.get_device_capability( - device_id)['version'].split('.') - return DeviceCapability(major=int(major), minor=int(minor)) + def get_device_capability( + device_id: int = 0) -> Optional[DeviceCapability]: + # capacity format differs from cuda's and will cause unexpected + # failure, so use None directly + return None @staticmethod def get_device_name(device_id: int = 0) -> str: From dd3b865854c21c99ebc5d1bd34c12936002174c2 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Mon, 17 Mar 2025 16:29:36 +0800 Subject: [PATCH 084/169] [Doc] Add vLLM Beijing meetup slide (#14938) Signed-off-by: Chen Zhang --- README.md | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/README.md b/README.md index bfab7faf598b6..f61b4218e1824 100644 --- a/README.md +++ b/README.md @@ -13,18 +13,9 @@ Easy, fast, and cheap LLM serving for 
everyone | Documentation | Blog | Paper | Twitter/X | Developer Slack |

---- - -We’re excited to invite you to the first **vLLM China Meetup** on **March 16** in **Beijing**! - -Join us to connect with the **vLLM team** and explore how vLLM is leveraged in **post-training, fine-tuning, and deployment**, including [verl](https://github.com/volcengine/verl), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), and [vllm-ascend](https://github.com/vllm-project/vllm-ascend). - -πŸ‘‰ **[Register Now](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)** to be part of the discussion! - ---- - *Latest News* πŸ”₯ +- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit#slide=id.g33fb1ff286e_0_29). - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0). - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted. - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). 
From 0a74bfce9cb9e51616c50b007e53400244cbc24a Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Mon, 17 Mar 2025 04:37:42 -0400 Subject: [PATCH 085/169] setup.py: drop assumption about local `main` branch (#14692) Signed-off-by: Russell Bryant --- setup.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/setup.py b/setup.py index d18fe53f12de1..d412f34b3e3dc 100755 --- a/setup.py +++ b/setup.py @@ -294,26 +294,28 @@ class repackage_wheel(build_ext): ]).decode("utf-8") upstream_main_commit = json.loads(resp_json)["sha"] - # Check if the local main branch is up-to-date. This is to ensure - # the base commit we found is the most recent commit on the main - # branch. - local_main_commit = subprocess.check_output( - ["git", "rev-parse", "main"]).decode("utf-8").strip() - if local_main_commit != upstream_main_commit: - raise ValueError( - f"Local main branch ({local_main_commit}) is not " - "up-to-date with upstream main branch " - f"({upstream_main_commit}). Please pull the latest " - "changes from upstream main branch first.") + # Check if the upstream_main_commit exists in the local repo + try: + subprocess.check_output( + ["git", "cat-file", "-e", f"{upstream_main_commit}"]) + except subprocess.CalledProcessError: + # If not present, fetch it from the remote repository. + # Note that this does not update any local branches, + # but ensures that this commit ref and its history are + # available in our local repo. + subprocess.check_call([ + "git", "fetch", "https://github.com/vllm-project/vllm", + "main" + ]) # Then get the commit hash of the current branch that is the same as # the upstream main commit. 
current_branch = subprocess.check_output( ["git", "branch", "--show-current"]).decode("utf-8").strip() - base_commit = subprocess.check_output( - ["git", "merge-base", "main", - current_branch]).decode("utf-8").strip() + base_commit = subprocess.check_output([ + "git", "merge-base", f"{upstream_main_commit}", current_branch + ]).decode("utf-8").strip() return base_commit except ValueError as err: raise ValueError(err) from None From cd0cd85102e4b5971dd44109776942df5cdca70f Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Mon, 17 Mar 2025 01:40:41 -0700 Subject: [PATCH 086/169] [MISC] More AMD unused var clean up (#14926) Signed-off-by: Lu Fang --- csrc/rocm/attention.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index 90f0b54d2f006..c500d00ea528e 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -127,7 +127,7 @@ __device__ __forceinline__ T from_float(const float& inp) { template __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) { - union tmpcvt { + [[maybe_unused]] union tmpcvt { uint16_t u; _Float16 f; __hip_bfloat16 b; @@ -160,7 +160,7 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) { template __device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1, const _B16x4& inp2) { - union tmpcvt { + [[maybe_unused]] union tmpcvt { uint16_t u; _Float16 f; __hip_bfloat16 b; @@ -1273,9 +1273,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const int seq_idx = blockIdx.y; const int context_len = context_lens[seq_idx]; const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); - constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; const int warpid = threadIdx.x / WARP_SIZE; - const int laneid = threadIdx.x % WARP_SIZE; + [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE; 
__shared__ float shared_global_exp_sum; // max num partitions supported is warp_size * NPAR_LOOPS From 69698f257e3a329fd68276459e82e37cd5ae43f2 Mon Sep 17 00:00:00 2001 From: kushanam <42385577+kushanam@users.noreply.github.com> Date: Mon, 17 Mar 2025 01:47:58 -0700 Subject: [PATCH 087/169] fix minor miscalled method (#14327) From b4ad56c1bd2fd39028f64919a11a4c5af96bf0c5 Mon Sep 17 00:00:00 2001 From: iefgnoix Date: Mon, 17 Mar 2025 01:48:28 -0700 Subject: [PATCH 088/169] [V1][TPU] Apply the ragged paged attention kernel fix and remove the padding. (#14846) Signed-off-by: Xiongfei Wei --- requirements/tpu.txt | 12 ++++++------ vllm/v1/worker/tpu_model_runner.py | 7 ++----- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/requirements/tpu.txt b/requirements/tpu.txt index 97a39bcd4a6d6..7246fc19bfa97 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -17,9 +17,9 @@ ray[data] --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" 
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index effcac7e7bdef..00869467be341 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -23,8 +23,7 @@ from vllm.multimodal.utils import group_mm_inputs_by_modality from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors from vllm.utils import LayerBlockType, cdiv, is_pin_memory_available -from vllm.v1.attention.backends.pallas import (NUM_KV_PAGES_PER_BLOCK, - PallasAttentionBackend, +from vllm.v1.attention.backends.pallas import (PallasAttentionBackend, PallasMetadata) from vllm.v1.core.encoder_cache_manager import compute_encoder_budget from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, @@ 
-139,10 +138,8 @@ class TPUModelRunner: device="cpu") self.slot_mapping_np = self.slot_mapping_cpu.numpy() - padded_max_num_blocks_per_req = _get_padded_number( - self.max_num_blocks_per_req, NUM_KV_PAGES_PER_BLOCK) self.block_table_cpu = torch.zeros( - (self.max_num_tokens, padded_max_num_blocks_per_req), + (self.max_num_tokens, self.max_num_blocks_per_req), dtype=self.input_batch.block_table.get_cpu_tensor().dtype, device="cpu") From 868a8c5b2c8c042fc869eb30bce29fb8e19d979e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 17 Mar 2025 17:15:20 +0800 Subject: [PATCH 089/169] [Bugfix] Fix Ultravox on V1 (#14929) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/ultravox.py | 42 +++++++++++++++----------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index d368c145d55f9..cb1e143838496 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -5,7 +5,7 @@ import math from collections.abc import Iterable, Mapping, Sequence from functools import cached_property -from typing import Any, List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Any, Literal, Optional, Set, Tuple, TypedDict, Union import torch import torch.utils.checkpoint @@ -36,7 +36,7 @@ from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.ultravox import UltravoxConfig from .interfaces import (MultiModalEmbeddings, SupportsLoRA, - SupportsMultiModal, SupportsPP, SupportsV0Only) + SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings, @@ -50,14 +50,14 @@ _MAX_ENCODER_BATCH_SIZE = 16 class UltravoxAudioFeatureInputs(TypedDict): type: Literal["audio_features"] - data: NestedTensors + data: Union[torch.Tensor, list[torch.Tensor], list[list[torch.Tensor]]] """Shape: `(batch_size, num_chunks, 80, 
M)`""" - lens: NestedTensors + lens: Union[torch.Tensor, list[torch.Tensor]] """ Length of the audio frames. Used for attention mask in WhisperEncoder. Shape: `(batch_size, num_chunks)` """ - token_len: NestedTensors + token_len: Union[torch.Tensor, list[torch.Tensor]] """ Length of the audio tokens. Used for flattening the audio features. Shape: `(batch_size, num_chunks)` @@ -405,8 +405,7 @@ class ModifiedWhisperEncoder(WhisperEncoder): UltravoxMultiModalProcessor, info=UltravoxProcessingInfo, dummy_inputs=UltravoxDummyInputsBuilder) -class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, - SupportsV0Only): +class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], @@ -506,6 +505,12 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, if not isinstance(audio_features, (torch.Tensor, list)): raise ValueError("Incorrect type of audio features. " f"Got type: {type(audio_features)}") + if not isinstance(audio_lens, (torch.Tensor, list)): + raise ValueError("Incorrect type of audio_lens. " + f"Got type: {type(audio_features)}") + if not isinstance(audio_token_len, (torch.Tensor, list)): + raise ValueError("Incorrect type of audio_token_len. 
" + f"Got type: {type(audio_features)}") return UltravoxAudioFeatureInputs(type="audio_features", data=audio_features, @@ -523,7 +528,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, raise AssertionError("This line should be unreachable.") def _process_audio_input( - self, audio_input: UltravoxAudioInputs) -> NestedTensors: + self, + audio_input: UltravoxAudioInputs, + ) -> Union[NestedTensors, tuple[torch.Tensor, ...]]: if audio_input["type"] == "audio_embeds": return audio_input["data"] @@ -531,13 +538,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, # [[B1, 80, M1], [B2, 80, M2]] -> [B1+B2, 80, max(M1, M2)] audio_features = pad_and_concat_to_dim3(audio_input["data"]) - if isinstance(audio_input['lens'], list): - # [B1, B2] -> [B1+B2] - audio_lens = torch.cat(audio_input['lens']) - audio_token_len = torch.cat(audio_input['token_len']) - else: - audio_lens = flatten_bn(audio_input['lens']) - audio_token_len = flatten_bn(audio_input['token_len']) + # [B1, B2] -> [B1+B2] + audio_lens = flatten_bn(audio_input['lens'], concat=True) + audio_token_len = flatten_bn(audio_input['token_len'], concat=True) embeddings = self._audio_features_to_embeddings( audio_features, audio_lens) @@ -554,7 +557,12 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, # Apply mask and flatten flattened_embeddings = embeddings[mask] - return flattened_embeddings + # Return one tensor per input audio + embed_lens = [ + token_len_item.sum().item() + for token_len_item in audio_input['token_len'] + ] + return flattened_embeddings.split(embed_lens) def get_multimodal_embeddings( self, **kwargs: object) -> Optional[MultiModalEmbeddings]: @@ -646,7 +654,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, def pad_and_concat_to_dim3( - features: Union[torch.Tensor, List[torch.Tensor], List[List[torch.Tensor]]] + features: Union[torch.Tensor, list[torch.Tensor], 
list[list[torch.Tensor]]] ) -> torch.Tensor: """ Pad and concatenate a list of tensors. From 6eaf1e5c52d5e72a577ad03d378a28b39f0e849e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 17 Mar 2025 18:00:17 +0800 Subject: [PATCH 090/169] [Misc] Add `--seed` option to offline multi-modal examples (#14934) Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 7 +- examples/offline_inference/audio_language.py | 132 +++-- .../encoder_decoder_multimodal.py | 48 +- examples/offline_inference/vision_language.py | 455 ++++++++++++------ .../vision_language_embedding.py | 31 +- .../vision_language_multi_image.py | 179 ++++--- 6 files changed, 537 insertions(+), 315 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f85572e7c234c..f5be8dca05f1d 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -226,10 +226,13 @@ steps: - python3 offline_inference/basic/chat.py - python3 offline_inference/prefix_caching.py - python3 offline_inference/llm_engine_example.py - - python3 offline_inference/vision_language.py - - python3 offline_inference/vision_language_multi_image.py + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_embedding.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/encoder_decoder.py + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/basic/classify.py - python3 offline_inference/basic/embed.py - python3 offline_inference/basic/score.py diff --git 
a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 293b9fddac89e..02dbdcb64232f 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -7,11 +7,13 @@ For most models, the prompt format should follow corresponding examples on HuggingFace model repository. """ import os +from dataclasses import asdict +from typing import NamedTuple, Optional from huggingface_hub import snapshot_download from transformers import AutoTokenizer -from vllm import LLM, SamplingParams +from vllm import LLM, EngineArgs, SamplingParams from vllm.assets.audio import AudioAsset from vllm.lora.request import LoRARequest from vllm.utils import FlexibleArgumentParser @@ -23,21 +25,31 @@ question_per_audio_count = { 2: "What sport and what nursery rhyme are referenced?" } + +class ModelRequestData(NamedTuple): + engine_args: EngineArgs + prompt: str + stop_token_ids: Optional[list[int]] = None + lora_requests: Optional[list[LoRARequest]] = None + + # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on # lower-end GPUs. # Unless specified, these settings have been tested to work on a single L4. 
# MiniCPM-O -def run_minicpmo(question: str, audio_count: int): +def run_minicpmo(question: str, audio_count: int) -> ModelRequestData: model_name = "openbmb/MiniCPM-o-2_6" tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - llm = LLM(model=model_name, - trust_remote_code=True, - max_model_len=4096, - max_num_seqs=5, - limit_mm_per_prompt={"audio": audio_count}) + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}, + ) stop_tokens = ['<|im_end|>', '<|endoftext|>'] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] @@ -52,11 +64,16 @@ def run_minicpmo(question: str, audio_count: int): tokenize=False, add_generation_prompt=True, chat_template=audio_chat_template) - return llm, prompt, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + stop_token_ids=stop_token_ids, + ) # Phi-4-multimodal-instruct -def run_phi4mm(questions: str, audio_count: int): +def run_phi4mm(question: str, audio_count: int) -> ModelRequestData: """ Phi-4-multimodal-instruct supports both image and audio inputs. Here, we show how to process audio inputs. @@ -67,9 +84,9 @@ def run_phi4mm(questions: str, audio_count: int): speech_lora_path = os.path.join(model_path, "speech-lora") placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)]) - prompts = f"<|user|>{placeholders}{questions}<|end|><|assistant|>" + prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>" - llm = LLM( + engine_args = EngineArgs( model=model_path, trust_remote_code=True, max_model_len=4096, @@ -79,24 +96,24 @@ def run_phi4mm(questions: str, audio_count: int): lora_extra_vocab_size=0, limit_mm_per_prompt={"audio": audio_count}, ) - lora_request = LoRARequest("speech", 1, speech_lora_path) - # To maintain code compatibility in this script, we add LoRA here. 
- llm.llm_engine.add_lora(lora_request=lora_request) - # You can also add LoRA using: - # llm.generate(prompts, lora_request=lora_request,...) - stop_token_ids = None - return llm, prompts, stop_token_ids + return ModelRequestData( + engine_args=engine_args, + prompt=prompts, + lora_requests=[LoRARequest("speech", 1, speech_lora_path)], + ) # Qwen2-Audio -def run_qwen2_audio(question: str, audio_count: int): +def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData: model_name = "Qwen/Qwen2-Audio-7B-Instruct" - llm = LLM(model=model_name, - max_model_len=4096, - max_num_seqs=5, - limit_mm_per_prompt={"audio": audio_count}) + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}, + ) audio_in_prompt = "".join([ f"Audio {idx+1}: " @@ -107,12 +124,15 @@ def run_qwen2_audio(question: str, audio_count: int): "<|im_start|>user\n" f"{audio_in_prompt}{question}<|im_end|>\n" "<|im_start|>assistant\n") - stop_token_ids = None - return llm, prompt, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + ) # Ultravox 0.5-1B -def run_ultravox(question: str, audio_count: int): +def run_ultravox(question: str, audio_count: int) -> ModelRequestData: model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b" tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -124,29 +144,39 @@ def run_ultravox(question: str, audio_count: int): tokenize=False, add_generation_prompt=True) - llm = LLM(model=model_name, - max_model_len=4096, - max_num_seqs=5, - trust_remote_code=True, - limit_mm_per_prompt={"audio": audio_count}) - stop_token_ids = None - return llm, prompt, stop_token_ids + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=5, + trust_remote_code=True, + limit_mm_per_prompt={"audio": audio_count}, + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + ) # Whisper -def run_whisper(question: str, audio_count: 
int): +def run_whisper(question: str, audio_count: int) -> ModelRequestData: assert audio_count == 1, ( "Whisper only support single audio input per prompt") model_name = "openai/whisper-large-v3-turbo" prompt = "<|startoftranscript|>" - llm = LLM(model=model_name, - max_model_len=448, - max_num_seqs=5, - limit_mm_per_prompt={"audio": audio_count}) - stop_token_ids = None - return llm, prompt, stop_token_ids + engine_args = EngineArgs( + model=model_name, + max_model_len=448, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}, + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + ) model_example_map = { @@ -164,14 +194,24 @@ def main(args): raise ValueError(f"Model type {model} is not supported.") audio_count = args.num_audios - llm, prompt, stop_token_ids = model_example_map[model]( - question_per_audio_count[audio_count], audio_count) + req_data = model_example_map[model](question_per_audio_count[audio_count], + audio_count) + + engine_args = asdict(req_data.engine_args) | {"seed": args.seed} + llm = LLM(**engine_args) + + # To maintain code compatibility in this script, we add LoRA here. + # You can also add LoRA using: + # llm.generate(prompts, lora_request=lora_request,...) + if req_data.lora_requests: + for lora_request in req_data.lora_requests: + llm.llm_engine.add_lora(lora_request=lora_request) # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. 
sampling_params = SamplingParams(temperature=0.2, max_tokens=64, - stop_token_ids=stop_token_ids) + stop_token_ids=req_data.stop_token_ids) mm_data = {} if audio_count > 0: @@ -183,7 +223,7 @@ def main(args): } assert args.num_prompts > 0 - inputs = {"prompt": prompt, "multi_modal_data": mm_data} + inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data} if args.num_prompts > 1: # Batch inference inputs = [inputs] * args.num_prompts @@ -214,6 +254,10 @@ if __name__ == "__main__": default=1, choices=[0, 1, 2], help="Number of audio items per prompt.") + parser.add_argument("--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.") args = parser.parse_args() main(args) diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py index f44bc423658ec..6d0c3ac1ee09a 100644 --- a/examples/offline_inference/encoder_decoder_multimodal.py +++ b/examples/offline_inference/encoder_decoder_multimodal.py @@ -4,16 +4,23 @@ This example shows how to use vLLM for running offline inference with the explicit/implicit prompt format on enc-dec LMMs for text generation. 
""" import time +from collections.abc import Sequence +from dataclasses import asdict +from typing import NamedTuple -from vllm import LLM, SamplingParams +from vllm import LLM, EngineArgs, PromptType, SamplingParams from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.utils import FlexibleArgumentParser +class ModelRequestData(NamedTuple): + engine_args: EngineArgs + prompts: Sequence[PromptType] + + def run_florence2(): - # Create a Florence-2 encoder/decoder model instance - llm = LLM( + engine_args = EngineArgs( model="microsoft/Florence-2-large", tokenizer="facebook/bart-large", max_num_seqs=8, @@ -39,12 +46,15 @@ def run_florence2(): "decoder_prompt": "", }, ] - return llm, prompts + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) def run_mllama(): - # Create a Mllama encoder/decoder model instance - llm = LLM( + engine_args = EngineArgs( model="meta-llama/Llama-3.2-11B-Vision-Instruct", max_model_len=4096, max_num_seqs=2, @@ -69,12 +79,15 @@ def run_mllama(): "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501 }, ] - return llm, prompts + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) def run_whisper(): - # Create a Whisper encoder/decoder model instance - llm = LLM( + engine_args = EngineArgs( model="openai/whisper-large-v3-turbo", max_model_len=448, max_num_seqs=16, @@ -99,7 +112,11 @@ def run_whisper(): "decoder_prompt": "<|startoftranscript|>", } ] - return llm, prompts + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) model_example_map = { @@ -114,7 +131,12 @@ def main(args): if model not in model_example_map: raise ValueError(f"Model type {model} is not supported.") - llm, prompts = model_example_map[model]() + req_data = model_example_map[model]() + + engine_args = asdict(req_data.engine_args) | {"seed": args.seed} + llm = LLM(**engine_args) + + prompts = req_data.prompts # Create a sampling 
params object. sampling_params = SamplingParams( @@ -153,6 +175,10 @@ if __name__ == "__main__": default="mllama", choices=model_example_map.keys(), help='Huggingface "model_type".') + parser.add_argument("--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.") args = parser.parse_args() main(args) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 432cda5e24396..58fd5e53bf8dc 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -8,122 +8,164 @@ on HuggingFace model repository. """ import os import random +from dataclasses import asdict +from typing import NamedTuple, Optional from huggingface_hub import snapshot_download from transformers import AutoTokenizer -from vllm import LLM, SamplingParams +from vllm import LLM, EngineArgs, SamplingParams from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset from vllm.lora.request import LoRARequest from vllm.utils import FlexibleArgumentParser + +class ModelRequestData(NamedTuple): + engine_args: EngineArgs + prompts: list[str] + stop_token_ids: Optional[list[int]] = None + lora_requests: Optional[list[LoRARequest]] = None + + # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on # lower-end GPUs. # Unless specified, these settings have been tested to work on a single L4. 
# Aria -def run_aria(questions: list[str], modality: str): +def run_aria(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "rhymes-ai/Aria" # NOTE: Need L40 (or equivalent) to avoid OOM - llm = LLM(model=model_name, - max_model_len=4096, - max_num_seqs=2, - dtype="bfloat16", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=2, + dtype="bfloat16", + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) prompts = [(f"<|im_start|>user\n<|img|>{question}" "<|im_end|>\n<|im_start|>assistant\n") for question in questions] stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) # BLIP-2 -def run_blip2(questions: list[str], modality: str): +def run_blip2(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" # BLIP-2 prompt format is inaccurate on HuggingFace model repository. 
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa prompts = [f"Question: {question} Answer:" for question in questions] - llm = LLM(model="Salesforce/blip2-opt-2.7b", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - stop_token_ids = None - return llm, prompts, stop_token_ids + engine_args = EngineArgs( + model="Salesforce/blip2-opt-2.7b", + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # Chameleon -def run_chameleon(questions: list[str], modality: str): +def run_chameleon(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" prompts = [f"{question}" for question in questions] - llm = LLM(model="facebook/chameleon-7b", - max_model_len=4096, - max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - stop_token_ids = None - return llm, prompts, stop_token_ids + engine_args = EngineArgs( + model="facebook/chameleon-7b", + max_model_len=4096, + max_num_seqs=2, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # Deepseek-VL2 -def run_deepseek_vl2(questions: list[str], modality: str): +def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "deepseek-ai/deepseek-vl2-tiny" - llm = LLM(model=model_name, - max_model_len=4096, - max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, - hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}) + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=2, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}, + ) prompts = [ f"<|User|>: \n{question}\n\n<|Assistant|>:" for question in questions ] - 
stop_token_ids = None - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # Florence2 -def run_florence2(question: str, modality: str): +def run_florence2(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" - llm = LLM(model="microsoft/Florence-2-large", - tokenizer="facebook/bart-large", - max_num_seqs=8, - trust_remote_code=True, - dtype="bfloat16", - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) + engine_args = EngineArgs( + model="microsoft/Florence-2-large", + tokenizer="facebook/bart-large", + max_num_seqs=8, + trust_remote_code=True, + dtype="bfloat16", + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) - prompt = "" - stop_token_ids = None - return llm, prompt, stop_token_ids + prompts = ["" for _ in questions] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # Fuyu -def run_fuyu(questions: list[str], modality: str): +def run_fuyu(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" prompts = [f"{question}\n" for question in questions] - llm = LLM(model="adept/fuyu-8b", - max_model_len=2048, - max_num_seqs=2, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - stop_token_ids = None - return llm, prompts, stop_token_ids + engine_args = EngineArgs( + model="adept/fuyu-8b", + max_model_len=2048, + max_num_seqs=2, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # Gemma 3 -def run_gemma3(questions: list[str], modality: str): +def run_gemma3(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "google/gemma-3-4b-it" - llm = LLM( + engine_args = EngineArgs( model=model_name, max_model_len=2048, max_num_seqs=2, @@ -135,22 +177,27 @@ def run_gemma3(questions: list[str], modality: str): prompts = [("user\n" 
f"{question}\n" "model\n") for question in questions] - stop_token_ids = None - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # GLM-4v -def run_glm4v(questions: list[str], modality: str): +def run_glm4v(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "THUDM/glm-4v-9b" - llm = LLM(model=model_name, - max_model_len=2048, - max_num_seqs=2, - trust_remote_code=True, - enforce_eager=True, - hf_overrides={"architectures": ["GLM4VForCausalLM"]}, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) + engine_args = EngineArgs( + model=model_name, + max_model_len=2048, + max_num_seqs=2, + trust_remote_code=True, + enforce_eager=True, + hf_overrides={"architectures": ["GLM4VForCausalLM"]}, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) prompts = [ f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ @@ -158,16 +205,21 @@ def run_glm4v(questions: list[str], modality: str): ] stop_token_ids = [151329, 151336, 151338] - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) # H2OVL-Mississippi -def run_h2ovl(questions: list[str], modality: str): +def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "h2oai/h2ovl-mississippi-800m" - llm = LLM( + engine_args = EngineArgs( model=model_name, trust_remote_code=True, max_model_len=8192, @@ -187,15 +239,20 @@ def run_h2ovl(questions: list[str], modality: str): # Stop tokens for H2OVL-Mississippi # https://huggingface.co/h2oai/h2ovl-mississippi-800m stop_token_ids = [tokenizer.eos_token_id] - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) # Idefics3-8B-Llama3 -def run_idefics3(questions: list[str], modality: str): +def 
run_idefics3(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "HuggingFaceM4/Idefics3-8B-Llama3" - llm = LLM( + engine_args = EngineArgs( model=model_name, max_model_len=8192, max_num_seqs=2, @@ -212,17 +269,20 @@ def run_idefics3(questions: list[str], modality: str): prompts = [( f"<|begin_of_text|>User:{question}\nAssistant:" ) for question in questions] - stop_token_ids = None - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # InternVL -def run_internvl(questions: list[str], modality: str): +def run_internvl(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" model_name = "OpenGVLab/InternVL2-2B" - llm = LLM( + engine_args = EngineArgs( model=model_name, trust_remote_code=True, max_model_len=4096, @@ -245,53 +305,75 @@ def run_internvl(questions: list[str], modality: str): # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] - return llm, prompts, stop_token_ids + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) # LLaVA-1.5 -def run_llava(questions: list[str], modality: str): +def run_llava(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" prompts = [ f"USER: \n{question}\nASSISTANT:" for question in questions ] - llm = LLM(model="llava-hf/llava-1.5-7b-hf", - max_model_len=4096, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - stop_token_ids = None - return llm, prompts, stop_token_ids + engine_args = EngineArgs( + model="llava-hf/llava-1.5-7b-hf", + max_model_len=4096, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # 
LLaVA-1.6/LLaVA-NeXT -def run_llava_next(questions: list[str], modality: str): +def run_llava_next(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" prompts = [f"[INST] \n{question} [/INST]" for question in questions] - llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", - max_model_len=8192, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - stop_token_ids = None - return llm, prompts, stop_token_ids + engine_args = EngineArgs( + model="llava-hf/llava-v1.6-mistral-7b-hf", + max_model_len=8192, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) # LlaVA-NeXT-Video # Currently only support for video input -def run_llava_next_video(questions: list[str], modality: str): +def run_llava_next_video(questions: list[str], + modality: str) -> ModelRequestData: assert modality == "video" prompts = [ f"USER: