From 6b04039a7240ae1039fea4bd179ec3b452f19107 Mon Sep 17 00:00:00 2001 From: sstamenk Date: Fri, 15 Aug 2025 19:17:31 +0200 Subject: [PATCH 001/361] [BugFix] Skip the Q component for QKVParallelLinear in the case of QKVCrossParallelLinear since its width is 0 (#22369) Signed-off-by: sstamenk --- vllm/model_executor/layers/quantization/utils/w8a8_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index ddb50968904d1..659029fd37f70 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -121,6 +121,9 @@ def requantize_with_max_scale( if unfused_module_in_checkpoint: start = 0 for idx, logical_width in enumerate(logical_widths): + # Skip any component with zero width. + if logical_width == 0: + continue end = start + logical_width weight_dq = per_tensor_dequantize(weight[start:end, :], weight_scale[idx]) From 68af77e51c5ca78ec0fd2496eca80b2257176b6e Mon Sep 17 00:00:00 2001 From: JartX Date: Fri, 15 Aug 2025 19:42:49 +0200 Subject: [PATCH 002/361] [FIXBUG] Correctly Apply Grammar Bitmask in Mixed Batches (#22896) Signed-off-by: JartX --- vllm/v1/worker/gpu_model_runner.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9460d91c58323..3ea39dc519d86 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1337,9 +1337,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): out_indices = [] # Reorder the bitmask to match the order of the requests in the batch. 
- sorted_bitmask = np.zeros_like(grammar_bitmask, - shape=(logits.shape[0], - grammar_bitmask.shape[1])) + sorted_bitmask = np.full(shape=(logits.shape[0], + grammar_bitmask.shape[1]), + fill_value=-1, + dtype=grammar_bitmask.dtype) cumulative_index = 0 seq = sorted(scheduler_output.structured_output_request_ids.items(), key=lambda x: x[1]) From 993d3d122b114cf93bf423fe0b4410ac493d9c45 Mon Sep 17 00:00:00 2001 From: Chenheli Hua Date: Fri, 15 Aug 2025 11:23:06 -0700 Subject: [PATCH 003/361] [Benchmarks] Include image data when ShareGPT4V dataset is used. (#22955) Signed-off-by: Chenheli Hua --- benchmarks/README.md | 49 +++++++++++++++++++++++++++++++++ benchmarks/benchmark_dataset.py | 8 +++++- vllm/benchmarks/datasets.py | 8 +++++- 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index d6442a4fc3872..caff8f0342141 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -22,6 +22,17 @@ become available. ✅ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + + ShareGPT4V (Image) + ✅ + ✅ + + wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json
+
Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:
+ wget http://images.cocodataset.org/zips/train2017.zip + + BurstGPT ✅ @@ -616,3 +627,41 @@ python3 benchmarks/benchmark_prioritization.py \ ``` + +## 👁️ Example - Multi-Modal Benchmark + +
+Show more + +
+ +Benchmark the performance of multi-modal requests in vLLM. + +### Images (ShareGPT4V) + +Start vLLM: + +```bash +python -m vllm.entrypoints.openai.api_server \ + --model Qwen/Qwen2.5-VL-7B-Instruct \ + --dtype bfloat16 \ + --limit-mm-per-prompt '{"image": 1}' \ + --allowed-local-media-path /path/to/sharegpt4v/images +``` + +Send requests with images: + +```bash +python benchmarks/benchmark_serving.py \ + --backend openai-chat \ + --model Qwen/Qwen2.5-VL-7B-Instruct \ + --dataset-name sharegpt \ + --dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \ + --num-prompts 100 \ + --save-result \ + --result-dir ~/vllm_benchmark_results \ + --save-detailed \ + --endpoint /v1/chat/completions +``` + +
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index ea684f18a7421..572292a5aca46 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -430,14 +430,20 @@ class ShareGPTDataset(BenchmarkDataset): skip_min_output_len_check=output_len is not None, ): continue + # TODO: Also support ShareGPT4Video. + if image_path := entry.get("image"): + mm_content = process_image(image_path) + else: + mm_content = None if enable_multimodal_chat: - prompt = self.apply_multimodal_chat_transformation(prompt, None) + prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) samples.append( SampleRequest( prompt=prompt, prompt_len=prompt_len, expected_output_len=new_output_len, lora_request=lora_request, + multi_modal_data=mm_content, ) ) self.maybe_oversample_requests(samples, num_requests) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 4e8ac5162542f..5299dcf54b395 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -454,15 +454,21 @@ class ShareGPTDataset(BenchmarkDataset): skip_min_output_len_check=output_len is not None): continue + # TODO: Also support ShareGPT4Video. 
+ if image_path := entry.get("image"): + mm_content = process_image(image_path) + else: + mm_content = None if enable_multimodal_chat: prompt = self.apply_multimodal_chat_transformation( - prompt, None) + prompt, mm_content) samples.append( SampleRequest( prompt=prompt, prompt_len=prompt_len, expected_output_len=new_output_len, lora_request=lora_request, + multi_modal_data=mm_content, )) self.maybe_oversample_requests(samples, num_requests) return samples From 48b01fd4d442d4b9250cef4fca3ca75d5c5c1f69 Mon Sep 17 00:00:00 2001 From: Shanshan Shen <467638484@qq.com> Date: Sat, 16 Aug 2025 02:29:25 +0800 Subject: [PATCH 004/361] [Structured Output] Make the output of structured output example more complete (#22481) Signed-off-by: shen-shanshan <467638484@qq.com> --- examples/offline_inference/structured_outputs.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/examples/offline_inference/structured_outputs.py b/examples/offline_inference/structured_outputs.py index 8ef121ebe848e..f46064931dbac 100644 --- a/examples/offline_inference/structured_outputs.py +++ b/examples/offline_inference/structured_outputs.py @@ -15,6 +15,8 @@ from pydantic import BaseModel from vllm import LLM, SamplingParams from vllm.sampling_params import GuidedDecodingParams +MAX_TOKENS = 50 + # Guided decoding by Choice (list of possible options) guided_decoding_params_choice = GuidedDecodingParams(choice=["Positive", "Negative"]) sampling_params_choice = SamplingParams(guided_decoding=guided_decoding_params_choice) @@ -23,7 +25,9 @@ prompt_choice = "Classify this sentiment: vLLM is wonderful!" 
# Guided decoding by Regex guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n") sampling_params_regex = SamplingParams( - guided_decoding=guided_decoding_params_regex, stop=["\n"] + guided_decoding=guided_decoding_params_regex, + stop=["\n"], + max_tokens=MAX_TOKENS, ) prompt_regex = ( "Generate an email address for Alan Turing, who works in Enigma." @@ -48,7 +52,10 @@ class CarDescription(BaseModel): json_schema = CarDescription.model_json_schema() guided_decoding_params_json = GuidedDecodingParams(json=json_schema) -sampling_params_json = SamplingParams(guided_decoding=guided_decoding_params_json) +sampling_params_json = SamplingParams( + guided_decoding=guided_decoding_params_json, + max_tokens=MAX_TOKENS, +) prompt_json = ( "Generate a JSON with the brand, model and car_type of" "the most iconic car from the 90's" @@ -64,7 +71,10 @@ condition ::= column "= " number number ::= "1 " | "2 " """ guided_decoding_params_grammar = GuidedDecodingParams(grammar=simplified_sql_grammar) -sampling_params_grammar = SamplingParams(guided_decoding=guided_decoding_params_grammar) +sampling_params_grammar = SamplingParams( + guided_decoding=guided_decoding_params_grammar, + max_tokens=MAX_TOKENS, +) prompt_grammar = ( "Generate an SQL query to show the 'username' and 'email'from the 'users' table." ) From 8ad7285ea28ad3bcc898fa99812120bcda8ea7b4 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Fri, 15 Aug 2025 14:46:00 -0400 Subject: [PATCH 005/361] [Kernels] Clean up FusedMoeMethodBase and modular kernel setup. Remove extra arguments from modular kernel methods. 
(#22035) Signed-off-by: Bill Nell Co-authored-by: Michael Goin --- .buildkite/test-pipeline.yaml | 1 + docs/design/fused_moe_modular_kernel.md | 10 +- examples/offline_inference/data_parallel.py | 23 +- .../moe/modular_kernel_tools/common.py | 532 +++++++++--------- .../moe/modular_kernel_tools/mk_objects.py | 461 ++++++++++++++- .../profile_modular_kernel.py | 4 +- .../kernels/moe/modular_kernel_tools/utils.py | 117 ---- tests/kernels/moe/test_batched_moe.py | 4 +- tests/kernels/moe/test_block_fp8.py | 31 +- tests/kernels/moe/test_block_int8.py | 15 +- .../kernels/moe/test_cutlass_grouped_gemm.py | 17 +- tests/kernels/moe/test_deepep_deepgemm_moe.py | 6 +- tests/kernels/moe/test_deepgemm.py | 6 +- tests/kernels/moe/test_flashinfer_moe.py | 147 +++++ .../moe/test_modular_kernel_combinations.py | 129 +++-- tests/kernels/moe/test_nvfp4_moe.py | 60 +- tests/kernels/moe/test_pplx_cutlass_moe.py | 11 +- tests/kernels/moe/test_pplx_moe.py | 4 +- tests/kernels/moe/utils.py | 75 ++- .../base_device_communicator.py | 7 +- .../layers/fused_moe/__init__.py | 4 +- .../layers/fused_moe/batched_deep_gemm_moe.py | 36 +- .../batched_triton_or_deep_gemm_moe.py | 38 +- .../model_executor/layers/fused_moe/config.py | 11 +- .../layers/fused_moe/cutlass_moe.py | 326 ++++++----- .../layers/fused_moe/deep_gemm_moe.py | 3 +- .../fused_moe/deepep_ht_prepare_finalize.py | 30 +- .../fused_moe/deepep_ll_prepare_finalize.py | 32 +- .../fused_moe/flashinfer_cutlass_moe.py | 59 +- .../flashinfer_cutlass_prepare_finalize.py | 52 +- .../layers/fused_moe/fused_batched_moe.py | 98 ++-- .../layers/fused_moe/fused_moe.py | 7 +- .../fused_moe/gpt_oss_triton_kernels_moe.py | 15 +- vllm/model_executor/layers/fused_moe/layer.py | 93 +-- .../layers/fused_moe/modular_kernel.py | 117 ++-- .../layers/fused_moe/pplx_prepare_finalize.py | 33 +- .../layers/fused_moe/prepare_finalize.py | 43 +- .../layers/fused_moe/triton_deep_gemm_moe.py | 37 +- vllm/model_executor/layers/fused_moe/utils.py | 18 +- 
.../layers/quantization/auto_round.py | 4 +- .../model_executor/layers/quantization/awq.py | 2 +- .../layers/quantization/awq_marlin.py | 18 +- .../layers/quantization/bitsandbytes.py | 12 +- .../compressed_tensors_moe.py | 168 ++++-- .../layers/quantization/experts_int8.py | 17 +- .../model_executor/layers/quantization/fp8.py | 43 +- .../layers/quantization/gguf.py | 15 +- .../layers/quantization/gptq_marlin.py | 14 +- .../layers/quantization/modelopt.py | 99 ++-- .../layers/quantization/moe_wna16.py | 16 +- .../layers/quantization/mxfp4.py | 2 +- .../layers/quantization/quark/quark_moe.py | 39 +- .../model_executor/layers/quantization/rtn.py | 13 +- .../quantization/utils/flashinfer_fp4_moe.py | 129 +---- 54 files changed, 2010 insertions(+), 1293 deletions(-) delete mode 100644 tests/kernels/moe/modular_kernel_tools/utils.py create mode 100644 tests/kernels/moe/test_flashinfer_moe.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 04d7cdc3d8854..87296a08e2071 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -399,6 +399,7 @@ steps: - label: Kernels MoE Test %N mirror_hardwares: [amdexperimental] source_file_dependencies: + - csrc/quantization/cutlass_w8a8/moe/ - csrc/moe/ - tests/kernels/moe - vllm/model_executor/layers/fused_moe/ diff --git a/docs/design/fused_moe_modular_kernel.md b/docs/design/fused_moe_modular_kernel.md index 3ef1232051b07..4b917ab408eec 100644 --- a/docs/design/fused_moe_modular_kernel.md +++ b/docs/design/fused_moe_modular_kernel.md @@ -175,11 +175,19 @@ implementations that input `FusedMoEActivationFormat.Standard` support chunking ### FusedMoEModularKernel Initialization -`FusedMoEMethodBase` class has 2 methods that are collectively responsible in creating the `FusedMoEModularKernel` object. They are, +`FusedMoEMethodBase` class has 3 methods that are collectively responsible in creating the `FusedMoEModularKernel` object. 
They are, +* maybe_make_prepare_finalize, * select_gemm_impl, and * init_prepare_finalize +#### maybe_make_prepare_finalize + +The `maybe_make_prepare_finalize` method is responsible for constructing an instance of `FusedMoEPrepareAndFinalize` when appropriate based on the current all2all backend, e.g. when EP + DP is enabled. The base class method currently constructs all the `FusedMoEPrepareAndFinalize` objects for the EP+DP case. Derived classes can override this method to construct prepare/finalize objects for different scenarios, e.g. `ModelOptNvFp4FusedMoE` can construct a `FlashInferCutlassMoEPrepareAndFinalize` for the EP+TP case. +Please refer to the implementations in, + +* `ModelOptNvFp4FusedMoE` + #### select_gemm_impl The `select_gemm_impl` method is undefined in the base class. It is the responsibility of the derived class to implement a method that constructs a valid/appropriate `FusedMoEPermuteExpertsUnpermute` object. diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index dbf8ed58cc477..dd7559451c4c6 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -70,12 +70,27 @@ def parse_args(): default=64, help=("Maximum number of sequences to be processed in a single iteration."), ) + parser.add_argument( + "--max-model-len", + type=int, + help=("Maximum number of tokens to be processed in a single iteration."), + ) + parser.add_argument( + "--timeout", + type=int, + default=300, + help=("Number of seconds before unresponsive process is killed."), + ) parser.add_argument( "--gpu-memory-utilization", type=float, default=0.8, help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."), ) + parser.add_argument( + "--quantization", + type=str, + ) return parser.parse_args() @@ -90,7 +105,9 @@ def main( enforce_eager, trust_remote_code, max_num_seqs, + max_model_len, gpu_memory_utilization, + quantization, ): os.environ["VLLM_DP_RANK"] =
str(global_dp_rank) os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank) @@ -142,7 +159,9 @@ def main( enable_expert_parallel=True, trust_remote_code=trust_remote_code, max_num_seqs=max_num_seqs, + max_model_len=max_model_len, gpu_memory_utilization=gpu_memory_utilization, + quantization=quantization, ) outputs = llm.generate(prompts, sampling_params) # Print the outputs. @@ -198,14 +217,16 @@ if __name__ == "__main__": args.enforce_eager, args.trust_remote_code, args.max_num_seqs, + args.max_model_len, args.gpu_memory_utilization, + args.quantization, ), ) proc.start() procs.append(proc) exit_code = 0 for proc in procs: - proc.join(timeout=300) + proc.join(timeout=args.timeout) if proc.exitcode is None: print(f"Killing process {proc.pid} that didn't stop within 5 minutes.") proc.kill() diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py index fd99e8dc5c987..a10666b6ec9a7 100644 --- a/tests/kernels/moe/modular_kernel_tools/common.py +++ b/tests/kernels/moe/modular_kernel_tools/common.py @@ -7,41 +7,22 @@ import torch import vllm._custom_ops as ops import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from tests.kernels.moe.utils import make_test_weights, per_token_cast_to_fp8 +from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX, + FLOAT8_E4M3_MAX, + dequantize_nvfp4_to_dtype) from tests.kernels.utils import torch_experts from vllm.config import VllmConfig from vllm.distributed import get_dp_group, get_tensor_model_parallel_world_size -# Fused experts and PrepareFinalize imports -from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( - BatchedDeepGemmExperts) -from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501 - BatchedTritonOrDeepGemmExperts) +from vllm.forward_context import set_forward_context from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, FusedMoEQuantConfig) 
-from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 -from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts -from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( - BatchedTritonExperts, NaiveBatchedExperts) from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk -from vllm.model_executor.layers.fused_moe.layer import (FusedMoEMethodBase, - TritonExperts) -from vllm.model_executor.layers.fused_moe.prepare_finalize import ( - MoEPrepareAndFinalizeNoEP) -from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( - TritonOrDeepGemmExperts) from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx +from .mk_objects import (expert_info, make_fused_experts, + make_prepare_finalize, prepare_finalize_info) from .parallel_utils import ProcessGroupInfo -from .utils import (make_block_quant_fp8_weights, make_non_quant_weights, - make_quant_fp8_weights, per_token_cast_to_fp8) - -if has_pplx(): - from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( - PplxPrepareAndFinalize) -if has_deep_ep(): - from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 - DeepEPHTPrepareAndFinalize) - from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 - DeepEPLLPrepareAndFinalize) def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str: @@ -69,24 +50,31 @@ class Config: torch_trace_dir_path: Optional[str] = None + def __post_init__(self): + if self.quant_config is None: + self.quant_config = FusedMoEQuantConfig() + def describe(self) -> str: s = "" - s += "== Config: \n" - s += f" world_size={self.world_size} \n" - s += f" PF={self.prepare_finalize_type.__name__} \n" - s += f" FE={self.fused_experts_type.__name__} \n" - s += f" topk={self.topks} \n" - s += f" dtype={self.dtype} \n" - s += f" fused_moe_chunk_size={self.fused_moe_chunk_size} \n" - s += " Quant: \n" - s += f" 
fused_moe_chunk_size={self.fused_moe_chunk_size} \n " + s += "== Config:\n" + s += f" world_size={self.world_size}\n" + s += f" PF={self.prepare_finalize_type.__name__}\n" + s += f" FE={self.fused_experts_type.__name__}\n" + s += f" E={self.E}\n" + s += f" Ms={self.Ms}\n" + s += f" N={self.N}\n" + s += f" K={self.K}\n" + s += f" topk={self.topks}\n" + s += f" dtype={self.dtype}\n" + s += f" fused_moe_chunk_size={self.fused_moe_chunk_size}\n" + s += " Quant:\n" if self.quant_config is not None: - s += f" q_dtype={self.quant_dtype} \n" - s += f" q_block_shape={self.quant_block_shape} \n" - s += f" q_per_out_ch_quant={self.is_per_out_ch_quant} \n" - s += f" q_per_act_token={self.is_per_act_token_quant} \n" + s += f" q_dtype={self.quant_dtype}\n" + s += f" q_block_shape={self.quant_block_shape}\n" + s += f" q_per_out_ch_quant={self.is_per_out_ch_quant}\n" + s += f" q_per_act_token={self.is_per_act_token_quant}\n" else: - s += " quant=None \n" + s += " quant=None\n" return s @property @@ -95,34 +83,28 @@ class Config: return self.Ms @property - def quant_dtype(self) -> Optional[torch.dtype]: - if self.quant_config is None: - return None + def quant_dtype(self) -> Union[torch.dtype, str, None]: + assert self.quant_config is not None return self.quant_config.quant_dtype @property def is_per_act_token_quant(self) -> bool: - if self.quant_config is None: - return False + assert self.quant_config is not None return self.quant_config.per_act_token_quant @property def is_per_tensor_act_quant(self) -> bool: - if self.quant_config is None: - return False return (not self.is_per_act_token_quant and self.quant_block_shape is None) @property def is_per_out_ch_quant(self) -> bool: - if self.quant_config is None: - return False + assert self.quant_config is not None return self.quant_config.per_out_ch_quant @property def quant_block_shape(self) -> Optional[list[int]]: - if self.quant_config is None: - return None + assert self.quant_config is not None return 
self.quant_config.block_shape @property @@ -130,36 +112,30 @@ class Config: assert isinstance(self.topks, int) return self.topks - @property - def topk_ids_dtype(self) -> Optional[torch.dtype]: - topk_ids_dtype = None - if self.prepare_finalize_type == PplxPrepareAndFinalize: - topk_ids_dtype = torch.uint32 - elif self.prepare_finalize_type in [ - DeepEPHTPrepareAndFinalize, DeepEPLLPrepareAndFinalize - ]: - topk_ids_dtype = torch.int64 - return topk_ids_dtype - @property def num_local_experts(self) -> int: return self.E // self.world_size def make_env_data(self) -> tuple[VllmConfig, dict[Any, Any]]: """ - make env data for vllm launch. + make env data for vllm launch. """ vllm_config = VllmConfig() vllm_config.parallel_config.data_parallel_size = self.world_size vllm_config.parallel_config.enable_expert_parallel = True env_dict = { - "VLLM_ALL2ALL_BACKEND": self.all2all_backend(), "VLLM_USE_DEEP_GEMM": str(int(self.needs_deep_gemm())), } + + backend = self.all2all_backend() + if backend is not None: + env_dict.update({"VLLM_ALL2ALL_BACKEND": backend}) + if self.fused_moe_chunk_size is not None: env_dict.update( {"VLLM_FUSED_MOE_CHUNK_SIZE": str(self.fused_moe_chunk_size)}) + return vllm_config, env_dict def is_fp8_block_quantized(self): @@ -167,85 +143,59 @@ class Config: and self.quant_block_shape is not None) def is_batched_prepare_finalize(self): - return self.prepare_finalize_type in [ - PplxPrepareAndFinalize, DeepEPLLPrepareAndFinalize - ] + info = prepare_finalize_info(self.prepare_finalize_type) + return (mk.FusedMoEActivationFormat.BatchedExperts == + info.activation_format) def is_batched_fused_experts(self): - return self.fused_experts_type in [ - CutlassExpertsFp8, BatchedDeepGemmExperts, BatchedTritonExperts, - NaiveBatchedExperts, BatchedTritonOrDeepGemmExperts - ] + info = expert_info(self.fused_experts_type) + return (mk.FusedMoEActivationFormat.BatchedExperts == + info.activation_format) def is_standard_fused_experts(self): - return 
self.fused_experts_type in [ - CutlassExpertsFp8, DeepGemmExperts, TritonOrDeepGemmExperts, - TritonExperts - ] + info = expert_info(self.fused_experts_type) + return mk.FusedMoEActivationFormat.Standard == info.activation_format - def is_fe_16bit_supported(self): - return self.fused_experts_type in [ - BatchedTritonExperts, BatchedTritonOrDeepGemmExperts, - NaiveBatchedExperts, TritonExperts - ] + def fe_supported_types(self): + info = expert_info(self.fused_experts_type) + return info.supported_dtypes - def is_fe_fp8_supported(self): - return self.fused_experts_type in [ - BatchedDeepGemmExperts, - BatchedTritonExperts, - BatchedTritonOrDeepGemmExperts, - CutlassExpertsFp8, - DeepGemmExperts, - TritonExperts, - TritonOrDeepGemmExperts, - NaiveBatchedExperts, - ] + def pf_supported_types(self): + info = prepare_finalize_info(self.prepare_finalize_type) + return info.supported_dtypes - def is_fe_block_fp8_supported(self): - return self.fused_experts_type in [ - BatchedDeepGemmExperts, - BatchedTritonOrDeepGemmExperts, - DeepGemmExperts, - TritonExperts, - TritonOrDeepGemmExperts, - BatchedTritonExperts, - NaiveBatchedExperts, - ] + def is_block_quant_supported(self): + info = expert_info(self.fused_experts_type) + return info.blocked_quantization_support def is_fe_supports_chunking(self): - return self.fused_experts_type in [ - CutlassExpertsFp8, DeepGemmExperts, TritonOrDeepGemmExperts, - TritonExperts - ] + info = expert_info(self.fused_experts_type) + return info.supports_chunking + + def supports_expert_map(self): + info = expert_info(self.fused_experts_type) + return info.supports_expert_map + + def supports_apply_weight_on_input(self): + info = prepare_finalize_info(self.prepare_finalize_type) + return info.supports_apply_weight_on_input def needs_deep_gemm(self): - return self.fused_experts_type in [ - BatchedDeepGemmExperts, - DeepGemmExperts, - ] + info = expert_info(self.fused_experts_type) + return info.needs_deep_gemm def needs_pplx(self): - return 
self.prepare_finalize_type in [PplxPrepareAndFinalize] + info = prepare_finalize_info(self.prepare_finalize_type) + return info.backend == "pplx" def needs_deep_ep(self): - return self.prepare_finalize_type in [ - DeepEPHTPrepareAndFinalize, DeepEPLLPrepareAndFinalize - ] + info = prepare_finalize_info(self.prepare_finalize_type) + return (info.backend == "deepep_high_throughput" + or info.backend == "deepep_low_latency") def all2all_backend(self): - if self.needs_pplx(): - return "pplx" - if self.prepare_finalize_type == DeepEPHTPrepareAndFinalize: - return "deepep_high_throughput" - if self.prepare_finalize_type == DeepEPLLPrepareAndFinalize: - return "deepep_low_latency" - return "naive" - - def needs_all2all(self): - return self.prepare_finalize_type in [ - PplxPrepareAndFinalize, DeepEPHTPrepareAndFinalize, - DeepEPLLPrepareAndFinalize - ] + info = prepare_finalize_info(self.prepare_finalize_type) + return info.backend def is_valid(self): # Check prepare-finalize and fused-experts compatibility @@ -267,28 +217,28 @@ class Config: # invalid quant config return False - # check bf16 / fp16 support - is_16bit = (self.dtype.itemsize == 2 and self.quant_dtype is None) - if is_16bit and not self.is_fe_16bit_supported(): - return False + # check type support + if self.quant_dtype is None: + if (self.dtype not in self.pf_supported_types() + or self.dtype not in self.fe_supported_types()): + return False + else: + if (self.quant_dtype not in self.pf_supported_types() + or self.quant_dtype not in self.fe_supported_types()): + return False - # Check fp8 support - is_fp8 = self.quant_dtype == torch.float8_e4m3fn - if is_fp8 and not self.is_fe_fp8_supported(): - return False - - # Check fp8 block quanization support + # Check block quanization support is_block_quatized = self.quant_block_shape is not None - if is_block_quatized and not is_fp8: + if is_block_quatized and self.quant_dtype is None: return False - if is_block_quatized and not self.is_fe_block_fp8_supported(): + 
if is_block_quatized and not self.is_block_quant_supported(): return False # deep_gemm only works with block-quantized if self.needs_deep_gemm() and not is_block_quatized: return False - # Check dependencies + # Check dependencies (turn into asserts?) if self.needs_deep_ep() and not has_deep_ep(): return False if self.needs_deep_gemm() and not has_deep_gemm(): @@ -305,6 +255,8 @@ class WeightTensors: w2: torch.Tensor w1_scale: Optional[torch.Tensor] w2_scale: Optional[torch.Tensor] + w1_gs: Optional[torch.Tensor] = None + w2_gs: Optional[torch.Tensor] = None def describe(self): s = "" @@ -313,13 +265,20 @@ class WeightTensors: s += f' - {_describe_tensor(self.w2, "w2")} \n' s += f' - {_describe_tensor(self.w1_scale, "w1_scale")} \n' s += f' - {_describe_tensor(self.w2_scale, "w2_scale")} \n' + s += f' - {_describe_tensor(self.w1_gs, "w1_gs")} \n' + s += f' - {_describe_tensor(self.w2_gs, "w2_gs")} \n' return s + def is_quantized(self) -> bool: + # or w1_scale is not None? + return (self.w1.dtype == torch.float8_e4m3fn + or self.w1.dtype == torch.uint8 or self.w1.dtype == torch.int8) + def to_current_device(self): self.w1 = self.w1.to(device=torch.cuda.current_device()) self.w2 = self.w2.to(device=torch.cuda.current_device()) - is_quantized = self.w1.dtype == torch.float8_e4m3fn - if is_quantized: + + if self.is_quantized(): assert self.w1_scale is not None assert self.w2_scale is not None self.w1_scale = self.w1_scale.to( @@ -327,56 +286,51 @@ class WeightTensors: self.w2_scale = self.w2_scale.to( device=torch.cuda.current_device()) + if self.w1_gs is not None: + assert self.w2_gs is not None + self.w1_gs = self.w1_gs.to(device=torch.cuda.current_device()) + self.w2_gs = self.w2_gs.to(device=torch.cuda.current_device()) + def slice_weights(self, rank: int, num_local_experts: int) -> "WeightTensors": s = rank * num_local_experts e = s + num_local_experts w1 = self.w1[s:e, :, :] w2 = self.w2[s:e, :, :] - is_quantized = self.w1.dtype == torch.float8_e4m3fn + w1_scale, 
w2_scale = (None, None) - if is_quantized: + if self.is_quantized(): assert self.w1_scale is not None assert self.w2_scale is not None w1_scale = self.w1_scale[s:e, :, :] w2_scale = self.w2_scale[s:e, :, :] - return WeightTensors(w1, w2, w1_scale, w2_scale) + + w1_gs = self.w1_gs + w2_gs = self.w2_gs + if w1_gs is not None: + assert w2_gs is not None + w1_gs = w1_gs[s:e] + w2_gs = w2_gs[s:e] + + return WeightTensors(w1, w2, w1_scale, w2_scale, w1_gs, w2_gs) @staticmethod def make(config: Config) -> "WeightTensors": - - if config.quant_dtype is None: - # just make normal dtype weights - w1, w2 = make_non_quant_weights(e=config.E, - n=config.N, - k=config.K, - dtype=config.dtype) - return WeightTensors(w1=w1, w2=w2, w1_scale=None, w2_scale=None) - - assert config.quant_dtype == torch.float8_e4m3fn - if not config.is_fp8_block_quantized(): - w1, w2, w1_scale, w2_scale = make_quant_fp8_weights( - e=config.E, - n=config.N, - k=config.K, - per_out_channel_quant=config.is_per_out_ch_quant, - ) - return WeightTensors(w1=w1, - w2=w2, - w1_scale=w1_scale, - w2_scale=w2_scale) - - assert config.quant_block_shape is not None - w1, w2, w1_scale, w2_scale = make_block_quant_fp8_weights( + (_, w1, w1_scale, w1_gs), (_, w2, w2_scale, w2_gs) = make_test_weights( e=config.E, n=config.N, k=config.K, - block_size=config.quant_block_shape, + in_dtype=config.dtype, + quant_dtype=config.quant_dtype, + block_shape=config.quant_block_shape, + per_act_token_quant=config.is_per_out_ch_quant, ) return WeightTensors(w1=w1, w2=w2, w1_scale=w1_scale, - w2_scale=w2_scale) + w2_scale=w2_scale, + w1_gs=w1_gs, + w2_gs=w2_gs) @dataclass @@ -449,7 +403,6 @@ class RankTensors: dtype=dtype) topk_weights, topk_ids, _ = fused_topk(hidden_states, score, topk, False) - topk_ids = topk_ids.to(config.topk_ids_dtype) # distribute topk_ids evenly for mi in range(m): @@ -457,7 +410,7 @@ class RankTensors: topk_ids = topk_ids.to(device=torch.cuda.current_device()) expert_map = None - if config.world_size > 1: + 
if config.world_size > 1 and config.supports_expert_map(): expert_map = torch.full((global_num_experts, ), fill_value=-1, dtype=torch.int32) @@ -480,92 +433,100 @@ class RankTensors: def reference_moe_impl(config: Config, weights: WeightTensors, rank_tensors: RankTensors) -> torch.Tensor: - return torch_experts(a=rank_tensors.hidden_states, - w1=weights.w1, - w2=weights.w2, + if config.quant_dtype == "nvfp4": + quant_blocksize = 16 + dtype = config.dtype + + w1_q = weights.w1 + w1_blockscale = weights.w1_scale + w1_gs = weights.w1_gs + + w2_q = weights.w2 + w2_blockscale = weights.w2_scale + w2_gs = weights.w2_gs + + a_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax( + rank_tensors.hidden_states.flatten(), dim=-1)).to(torch.float32) + + assert w1_gs is not None + assert w2_gs is not None + assert w1_blockscale is not None + assert w2_blockscale is not None + + assert w1_blockscale.shape[1] % 128 == 0 + assert w1_blockscale.shape[2] % 4 == 0 + assert w2_blockscale.shape[1] % 128 == 0 + assert w2_blockscale.shape[2] % 4 == 0 + + a_fp4, a_scale_interleaved = ops.scaled_fp4_quant( + rank_tensors.hidden_states, a_global_scale) + + a = dequantize_nvfp4_to_dtype(a_fp4, + a_scale_interleaved, + a_global_scale, + dtype=dtype, + device=a_fp4.device, + block_size=quant_blocksize) + + e = w1_q.shape[0] + n = w1_q.shape[1] // 2 + k = w2_q.shape[1] + + w1 = torch.zeros((e, 2 * n, k), device="cuda", dtype=dtype) + w2 = torch.zeros((e, k, n), device="cuda", dtype=dtype) + + for idx in range(0, e): + w1[idx] = dequantize_nvfp4_to_dtype(w1_q[idx], + w1_blockscale[idx], + w1_gs[idx], + dtype=dtype, + device=w1_q.device, + block_size=quant_blocksize) + w2[idx] = dequantize_nvfp4_to_dtype(w2_q[idx], + w2_blockscale[idx], + w2_gs[idx], + dtype=dtype, + device=w2_q.device, + block_size=quant_blocksize) + a_scale = None + w1_scale = None + w2_scale = None + quant_dtype = None + per_act_token_quant = False + block_shape = None + else: + a = rank_tensors.hidden_states + 
a_scale = rank_tensors.hidden_states_scale + w1 = weights.w1 + w1_scale = weights.w1_scale + w2 = weights.w2 + w2_scale = weights.w2_scale + quant_dtype = config.quant_dtype + per_act_token_quant = config.is_per_act_token_quant + block_shape = config.quant_block_shape + + return torch_experts(a=a, + w1=w1, + w2=w2, topk_weight=rank_tensors.topk_weights, topk_ids=rank_tensors.topk_ids, global_num_experts=config.E, expert_map=None, - w1_scale=weights.w1_scale, - w2_scale=weights.w2_scale, - a1_scale=rank_tensors.hidden_states_scale, - quant_dtype=config.quant_dtype, - per_act_token_quant=config.is_per_act_token_quant, - block_shape=config.quant_block_shape, - apply_router_weights_on_input=config.topk == 1) + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_scale, + quant_dtype=quant_dtype, + per_act_token_quant=per_act_token_quant, + block_shape=block_shape, + apply_router_weights_on_input=config.topk == 1 + and config.supports_apply_weight_on_input()) -def make_fused_experts( - config: Config, moe: FusedMoEConfig, - num_dispatchers: int) -> mk.FusedMoEPermuteExpertsUnpermute: - - use_fp8 = config.quant_dtype == torch.float8_e4m3fn - batch_kwargs = { - "max_num_tokens": moe.max_num_tokens, - "num_dispatchers": num_dispatchers, - } - quant_kwargs = { - "use_fp8_w8a8": use_fp8, - "use_int8_w8a8": False, - "use_int8_w8a16": False, - "use_int4_w4a16": False, - "block_shape": config.quant_block_shape, - "per_act_token_quant": config.is_per_act_token_quant, - } - deepgemm_kwargs = {"allow_deep_gemm": has_deep_gemm()} - - if config.fused_experts_type == BatchedDeepGemmExperts: - kwargs = batch_kwargs | { - "block_shape": config.quant_block_shape, - "per_act_token_quant": config.is_per_act_token_quant, - } - print(f"Making BatchedDeepGemmExperts {kwargs} ...") - experts = BatchedDeepGemmExperts(**kwargs) - elif config.fused_experts_type == BatchedTritonExperts: - kwargs = batch_kwargs | quant_kwargs - print(f"Making BatchedTritonExperts {kwargs} ...") - experts = 
BatchedTritonExperts(**kwargs) - elif config.fused_experts_type == BatchedTritonOrDeepGemmExperts: - kwargs = batch_kwargs | quant_kwargs | deepgemm_kwargs - print(f"Making BatchedTritonOrDeepGemmExperts {kwargs} ...") - experts = BatchedTritonOrDeepGemmExperts(**kwargs) - elif config.fused_experts_type == DeepGemmExperts: - print("Making DeepGemmExperts () ...") - experts = DeepGemmExperts() - elif config.fused_experts_type == TritonExperts: - kwargs = quant_kwargs - print(f"Making TritonExperts {kwargs} ...") - experts = TritonExperts(**kwargs) - elif config.fused_experts_type == TritonOrDeepGemmExperts: - kwargs = quant_kwargs | deepgemm_kwargs - print(f"Making TritonOrDeepGemmExperts {kwargs} ...") - experts = TritonOrDeepGemmExperts(**kwargs) - elif config.fused_experts_type == NaiveBatchedExperts: - kwargs = batch_kwargs | quant_kwargs - print(f"Making NaiveBatchedExperts {kwargs} ...") - experts = NaiveBatchedExperts(**kwargs) - elif config.fused_experts_type == CutlassExpertsFp8: - use_batched_format = config.is_batched_prepare_finalize() - num_experts = (moe.num_local_experts - if use_batched_format else moe.num_experts) - kwargs = { - "max_experts_per_worker": num_experts, - "out_dtype": moe.in_dtype, - "per_act_token_quant": config.is_per_act_token_quant, - "per_out_ch_quant": config.is_per_out_ch_quant, - "block_shape": config.quant_block_shape, - "num_dispatchers": num_dispatchers, - "use_batched_format": use_batched_format - } - print(f"Making CutlassExpertsFp8 {kwargs} ...") - experts = CutlassExpertsFp8(**kwargs) - - return experts - - -def make_modular_kernel(config: Config, - vllm_config: VllmConfig) -> mk.FusedMoEModularKernel: +def make_modular_kernel( + config: Config, + vllm_config: VllmConfig, + weights: WeightTensors, +) -> mk.FusedMoEModularKernel: def next_power_of_2(x): import math @@ -579,6 +540,7 @@ def make_modular_kernel(config: Config, dp_size_=get_dp_group().world_size, vllm_parallel_config=vllm_config.parallel_config, ) + moe = 
FusedMoEConfig( num_experts=config.E, experts_per_token=config.topk, @@ -591,15 +553,16 @@ def make_modular_kernel(config: Config, ) # make modular kernel - prepare_finalize = None - if config.needs_all2all(): - prepare_finalize = FusedMoEMethodBase.maybe_make_prepare_finalize(moe) - assert prepare_finalize is not None - else: - prepare_finalize = MoEPrepareAndFinalizeNoEP() + prepare_finalize = make_prepare_finalize(config.prepare_finalize_type, + config.all2all_backend(), moe) - fused_experts = make_fused_experts(config, moe, - prepare_finalize.num_dispatchers()) + fused_experts = make_fused_experts( + config.fused_experts_type, + moe, + prepare_finalize.num_dispatchers(), + weights.w1_gs, + weights.w2_gs, + ) modular_kernel = mk.FusedMoEModularKernel( prepare_finalize=prepare_finalize, fused_experts=fused_experts) @@ -620,22 +583,45 @@ def run_modular_kernel( # weights for rank rank_weights = weights.slice_weights(pgi.rank, config.num_local_experts) - mk = make_modular_kernel(config, vllm_config) + mk = make_modular_kernel(config, vllm_config, weights) mk_kwargs = { - "hidden_states": rank_tensors.hidden_states.clone( + "hidden_states": + rank_tensors.hidden_states.clone( ), # impls might update the tensor in place - "w1": rank_weights.w1, - "w2": rank_weights.w2, - "topk_weights": rank_tensors.topk_weights, - "topk_ids": rank_tensors.topk_ids, - "expert_map": rank_tensors.expert_map, - "w1_scale": rank_weights.w1_scale, - "w2_scale": rank_weights.w2_scale, - "a1_scale": rank_tensors.hidden_states_scale, - "global_num_experts": config.E, - "apply_router_weight_on_input": config.topk == 1, + "w1": + rank_weights.w1, + "w2": + rank_weights.w2, + "topk_weights": + rank_tensors.topk_weights, + "topk_ids": + rank_tensors.topk_ids.to(mk.prepare_finalize.topk_indices_dtype()), + "expert_map": + rank_tensors.expert_map, + "w1_scale": + rank_weights.w1_scale, + "w2_scale": + rank_weights.w2_scale, + "a1_scale": + rank_tensors.hidden_states_scale, + "global_num_experts": 
+ config.E, + "apply_router_weight_on_input": + config.topk == 1 and config.supports_apply_weight_on_input(), } - out = mk.forward(**mk_kwargs) + + num_tokens = rank_tensors.hidden_states.shape[0] + num_tokens_across_dp = torch.tensor([num_tokens] * config.world_size, + device="cuda", + dtype=torch.int) + + with set_forward_context( + None, + vllm_config, + num_tokens=num_tokens, + num_tokens_across_dp=num_tokens_across_dp, + ): + out = mk.forward(**mk_kwargs) return out diff --git a/tests/kernels/moe/modular_kernel_tools/mk_objects.py b/tests/kernels/moe/modular_kernel_tools/mk_objects.py index 73214066f7ea6..aecffae36ae5e 100644 --- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py +++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py @@ -1,58 +1,316 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass +from typing import Optional, Union import torch # Fused experts and PrepareFinalize imports +import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( BatchedDeepGemmExperts) from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501 BatchedTritonOrDeepGemmExperts) -from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig -from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 +from vllm.model_executor.layers.fused_moe.config import (FusedMoEConfig, + FusedMoEQuantConfig) from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( BatchedTritonExperts, NaiveBatchedExperts) -from vllm.model_executor.layers.fused_moe.layer import TritonExperts +from vllm.model_executor.layers.fused_moe.layer import (FusedMoEMethodBase, + TritonExperts) from vllm.model_executor.layers.fused_moe.prepare_finalize import ( 
MoEPrepareAndFinalizeNoEP) from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( TritonOrDeepGemmExperts) -from vllm.utils import has_deep_ep, has_pplx +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + cutlass_fp4_supported) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + cutlass_fp8_supported) +from vllm.platforms import current_platform +from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx +from vllm.utils.deep_gemm import is_deep_gemm_supported +from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe -if has_deep_ep(): + +@dataclass +class PrepareFinalizeInfo: + activation_format: mk.FusedMoEActivationFormat + supported_dtypes: list[Union[torch.dtype, str]] + blocked_quantization_support: bool + backend: Optional[str] + supports_apply_weight_on_input: bool = True + + +@dataclass +class ExpertInfo: + activation_format: mk.FusedMoEActivationFormat + supported_dtypes: list[Union[torch.dtype, str]] + blocked_quantization_support: bool + supports_chunking: bool + supports_expert_map: bool + needs_matching_quant: bool = False + needs_deep_gemm: bool = False + + +PREPARE_FINALIZE_INFO: dict[mk.FusedMoEPrepareAndFinalize, + PrepareFinalizeInfo] = {} +EXPERT_INFO: dict[mk.FusedMoEPermuteExpertsUnpermute, ExpertInfo] = {} +MK_ALL_PREPARE_FINALIZE_TYPES: list[mk.FusedMoEPrepareAndFinalize] = [] +MK_MULTI_GPU_PREPARE_FINALIZE_TYPES: list[mk.FusedMoEPrepareAndFinalize] = [] +MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES: list[mk.FusedMoEPrepareAndFinalize] = [] +MK_FUSED_EXPERT_TYPES: list[mk.FusedMoEPermuteExpertsUnpermute] = [] + +standard_format = mk.FusedMoEActivationFormat.Standard +batched_format = mk.FusedMoEActivationFormat.BatchedExperts +common_float_types: list[Union[torch.dtype, str]] = [ + torch.float8_e4m3fn, torch.bfloat16, torch.float16, torch.float32 +] +common_float_and_int_types = common_float_types + [torch.int8] +nv_fp4_types = ["nvfp4"] +fp8_types = [torch.float8_e4m3fn] + + 
+def register_prepare_and_finalize( + kind, + activation_format: mk.FusedMoEActivationFormat, + supported_dtypes: list[Union[torch.dtype, str]], + blocked_quantization_support: bool, + backend: Optional[str], + force_multigpu: bool = False, + supports_apply_weight_on_input: bool = True, +): + global PREPARE_FINALIZE_INFO + global MK_ALL_PREPARE_FINALIZE_TYPES + global MK_MULTI_GPU_PREPARE_FINALIZE_TYPES + global MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES + assert kind not in PREPARE_FINALIZE_INFO + + PREPARE_FINALIZE_INFO[kind] = PrepareFinalizeInfo( + activation_format, + supported_dtypes, + blocked_quantization_support, + backend, + supports_apply_weight_on_input, + ) + MK_ALL_PREPARE_FINALIZE_TYPES.append(kind) + if backend is not None or force_multigpu: + MK_MULTI_GPU_PREPARE_FINALIZE_TYPES.append(kind) + else: + MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES.append(kind) + + +def register_experts( + kind, + activation_format: mk.FusedMoEActivationFormat, + supported_dtypes: list[Union[torch.dtype, str]], + blocked_quantization_support: bool, + supports_chunking: bool, + supports_expert_map: bool, + needs_matching_quant: bool = False, + needs_deep_gemm: bool = False, +): + global EXPERT_INFO + global MK_FUSED_EXPERT_TYPES + assert kind not in EXPERT_INFO + + EXPERT_INFO[kind] = ExpertInfo( + activation_format, + supported_dtypes, + blocked_quantization_support, + supports_chunking, + supports_expert_map, + needs_matching_quant, + needs_deep_gemm, + ) + + MK_FUSED_EXPERT_TYPES.append(kind) + + +def prepare_finalize_info(kind) -> PrepareFinalizeInfo: + info = PREPARE_FINALIZE_INFO.get(kind) + assert info is not None + return info + + +def expert_info(kind) -> ExpertInfo: + info = EXPERT_INFO.get(kind) + assert info is not None + return info + + +register_prepare_and_finalize( + MoEPrepareAndFinalizeNoEP, + standard_format, + common_float_types, + blocked_quantization_support=True, + backend=None, +) + +register_experts( + BatchedTritonExperts, + batched_format, + 
common_float_types, + blocked_quantization_support=True, + supports_chunking=False, + supports_expert_map=False, + needs_matching_quant=True, +) + +register_experts( + TritonExperts, + standard_format, + common_float_and_int_types, + blocked_quantization_support=True, + supports_chunking=True, + supports_expert_map=True, + needs_matching_quant=True, +) + +register_experts( + NaiveBatchedExperts, + batched_format, + common_float_and_int_types, + blocked_quantization_support=True, + supports_chunking=False, + supports_expert_map=True, +) + +# Disable on blackwell for now +if has_deep_ep() and not current_platform.has_device_capability(100): from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 DeepEPHTPrepareAndFinalize) from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 DeepEPLLPrepareAndFinalize) + register_prepare_and_finalize( + DeepEPHTPrepareAndFinalize, + standard_format, + common_float_types, + blocked_quantization_support=True, + backend="deepep_high_throughput", + ) + + register_prepare_and_finalize( + DeepEPLLPrepareAndFinalize, + batched_format, + common_float_types, + blocked_quantization_support=True, + backend="deepep_low_latency", + ) + if has_pplx(): from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( PplxPrepareAndFinalize) + register_prepare_and_finalize( + PplxPrepareAndFinalize, + batched_format, + common_float_and_int_types, + blocked_quantization_support=True, + backend="pplx", + ) -MK_MULTI_GPU_PREPARE_FINALIZE_TYPES = [] -if has_pplx(): - MK_MULTI_GPU_PREPARE_FINALIZE_TYPES += [PplxPrepareAndFinalize] -if has_deep_ep(): - MK_MULTI_GPU_PREPARE_FINALIZE_TYPES += [ - DeepEPHTPrepareAndFinalize, DeepEPLLPrepareAndFinalize - ] +if (has_flashinfer_cutlass_fused_moe() + and current_platform.has_device_capability(100)): + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 + FlashInferExperts) + from 
vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 + FlashInferCutlassMoEPrepareAndFinalize) -MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES = [MoEPrepareAndFinalizeNoEP] + register_prepare_and_finalize( + FlashInferCutlassMoEPrepareAndFinalize, + standard_format, + nv_fp4_types, + blocked_quantization_support=True, + backend=None, + force_multigpu=True, + supports_apply_weight_on_input=False, + ) -MK_ALL_PREPARE_FINALIZE_TYPES = (MK_MULTI_GPU_PREPARE_FINALIZE_TYPES + - MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES) + register_experts( + FlashInferExperts, + standard_format, + nv_fp4_types, + blocked_quantization_support=True, + supports_chunking=True, + # Note: this is a hack to get it to run for now + supports_expert_map=True, + ) +else: + FlashInferCutlassMoEPrepareAndFinalize = None -MK_FUSED_EXPERT_TYPES = [ - BatchedDeepGemmExperts, - BatchedTritonExperts, - NaiveBatchedExperts, - BatchedTritonOrDeepGemmExperts, - CutlassExpertsFp8, - DeepGemmExperts, - TritonOrDeepGemmExperts, - TritonExperts, -] +if has_deep_gemm() and is_deep_gemm_supported(): + register_experts( + BatchedDeepGemmExperts, + batched_format, + fp8_types, + blocked_quantization_support=True, + supports_chunking=False, + supports_expert_map=False, + needs_matching_quant=False, + needs_deep_gemm=True, + ) + register_experts( + DeepGemmExperts, + standard_format, + fp8_types, + blocked_quantization_support=True, + supports_chunking=True, + supports_expert_map=True, + needs_matching_quant=False, + needs_deep_gemm=True, + ), + register_experts( + BatchedTritonOrDeepGemmExperts, + batched_format, + common_float_and_int_types, + blocked_quantization_support=True, + supports_chunking=False, + supports_expert_map=False, + needs_matching_quant=True, + needs_deep_gemm=True, + ) + register_experts( + TritonOrDeepGemmExperts, + standard_format, + common_float_and_int_types, + blocked_quantization_support=True, + supports_chunking=True, + supports_expert_map=True, + 
needs_matching_quant=True, + needs_deep_gemm=True, + ) + +if cutlass_fp8_supported(): + from vllm.model_executor.layers.fused_moe import (CutlassBatchedExpertsFp8, + CutlassExpertsFp8) + register_experts( + CutlassExpertsFp8, + standard_format, + fp8_types, + blocked_quantization_support=False, + supports_chunking=True, + supports_expert_map=False, + ) + register_experts( + CutlassBatchedExpertsFp8, + batched_format, + fp8_types, + blocked_quantization_support=False, + supports_chunking=False, + supports_expert_map=False, + ) + +if cutlass_fp4_supported(): + from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + CutlassExpertsFp4) + register_experts( + CutlassExpertsFp4, + standard_format, + nv_fp4_types, + blocked_quantization_support=True, + supports_chunking=True, + supports_expert_map=False, + ) MK_QUANT_CONFIGS = [ None, @@ -85,3 +343,156 @@ MK_QUANT_CONFIGS = [ # block-quantized weights and per-token activations # block-quantized weights and per-tensor activations ] + +if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe(): + MK_QUANT_CONFIGS += [ + FusedMoEQuantConfig(quant_dtype="nvfp4", + per_out_ch_quant=False, + per_act_token_quant=False, + block_shape=None), + ] + + +def _make_gscale(num_experts: int) -> torch.Tensor: + return torch.ones((num_experts, ), + device=torch.cuda.current_device(), + dtype=torch.float32) + + +def make_prepare_finalize( + prepare_finalize_type: mk.FusedMoEPrepareAndFinalize, + backend: Optional[str], + moe: FusedMoEConfig, +) -> mk.FusedMoEPrepareAndFinalize: + if backend != "naive" and backend is not None: + prepare_finalize = FusedMoEMethodBase._maybe_make_prepare_finalize(moe) + assert prepare_finalize is not None + return prepare_finalize + elif prepare_finalize_type == FlashInferCutlassMoEPrepareAndFinalize: + return FlashInferCutlassMoEPrepareAndFinalize( + use_dp=moe.moe_parallel_config.dp_size > 1, + a1_gscale=_make_gscale(moe.num_local_experts), + ) + else: + return MoEPrepareAndFinalizeNoEP() + + 
+def _slice(rank: int, num_local_experts: int, t: torch.Tensor) -> torch.Tensor: + s = rank * num_local_experts + e = s + num_local_experts + return t[s:e] + + +def make_fused_experts( + fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute, + moe: FusedMoEConfig, + num_dispatchers: int, + w1_gs: Optional[torch.Tensor], + w2_gs: Optional[torch.Tensor], +) -> mk.FusedMoEPermuteExpertsUnpermute: + + use_fp8 = moe.quant_dtype == torch.float8_e4m3fn + batch_kwargs = { + "max_num_tokens": moe.max_num_tokens, + "num_dispatchers": num_dispatchers, + } + quant_kwargs = { + "use_fp8_w8a8": use_fp8, + "use_int8_w8a8": False, + "use_int8_w8a16": False, + "use_int4_w4a16": False, + "block_shape": moe.block_shape, + "per_act_token_quant": moe.per_act_token_quant, + } + deepgemm_kwargs = {"allow_deep_gemm": has_deep_gemm()} + + if fused_experts_type == BatchedDeepGemmExperts: + kwargs = batch_kwargs | { + "block_shape": moe.block_shape, + "per_act_token_quant": moe.per_act_token_quant, + } + print(f"Making BatchedDeepGemmExperts {kwargs} ...") + experts = BatchedDeepGemmExperts(**kwargs) + elif fused_experts_type == BatchedTritonExperts: + kwargs = batch_kwargs | quant_kwargs + print(f"Making BatchedTritonExperts {kwargs} ...") + experts = BatchedTritonExperts(**kwargs) + elif fused_experts_type == BatchedTritonOrDeepGemmExperts: + kwargs = batch_kwargs | quant_kwargs | deepgemm_kwargs + print(f"Making BatchedTritonOrDeepGemmExperts {kwargs} ...") + experts = BatchedTritonOrDeepGemmExperts(**kwargs) + elif fused_experts_type == DeepGemmExperts: + print("Making DeepGemmExperts () ...") + experts = DeepGemmExperts() + elif fused_experts_type == TritonExperts: + kwargs = quant_kwargs + print(f"Making TritonExperts {kwargs} ...") + experts = TritonExperts(**kwargs) + elif fused_experts_type == TritonOrDeepGemmExperts: + kwargs = quant_kwargs | deepgemm_kwargs + print(f"Making TritonOrDeepGemmExperts {kwargs} ...") + experts = TritonOrDeepGemmExperts(**kwargs) + elif 
fused_experts_type == NaiveBatchedExperts: + kwargs = batch_kwargs | quant_kwargs + print(f"Making NaiveBatchedExperts {kwargs} ...") + experts = NaiveBatchedExperts(**kwargs) + elif fused_experts_type == CutlassExpertsFp8: + kwargs = { + "out_dtype": moe.in_dtype, + "per_act_token_quant": moe.per_act_token_quant, + "per_out_ch_quant": moe.per_out_ch_quant, + "block_shape": moe.block_shape, + } + print(f"Making CutlassExpertsFp8 {kwargs} ...") + experts = CutlassExpertsFp8(**kwargs) + elif fused_experts_type == CutlassBatchedExpertsFp8: + kwargs = { + "max_experts_per_worker": moe.num_local_experts, + "num_dispatchers": num_dispatchers, + "out_dtype": moe.in_dtype, + "per_act_token_quant": moe.per_act_token_quant, + "per_out_ch_quant": moe.per_out_ch_quant, + "block_shape": moe.block_shape, + } + print(f"Making CutlassBatchedExpertsFp8 {kwargs} ...") + experts = CutlassBatchedExpertsFp8(**kwargs) + elif fused_experts_type == CutlassExpertsFp4: + assert w1_gs is not None and w2_gs is not None + num_experts = moe.num_local_experts + rank = moe.moe_parallel_config.dp_rank + kwargs = { + "g1_alphas": _slice(rank, num_experts, (1 / w1_gs)), + "g2_alphas": _slice(rank, num_experts, (1 / w2_gs)), + "a1_gscale": _make_gscale(num_experts), + "a2_gscale": _make_gscale(num_experts), + "max_experts_per_worker": num_experts, + "out_dtype": moe.in_dtype, + "per_act_token_quant": moe.per_act_token_quant, + "per_out_ch_quant": moe.per_out_ch_quant, + "block_shape": moe.block_shape, + "num_dispatchers": num_dispatchers, + } + print(f"Making CutlassExpertsFp4 {kwargs} ...") + experts = CutlassExpertsFp4(**kwargs) + elif fused_experts_type == FlashInferExperts: + assert w1_gs is not None and w2_gs is not None + num_experts = moe.num_local_experts + rank = moe.moe_parallel_config.dp_rank + kwargs = { + "g1_alphas": _slice(rank, num_experts, (1 / w1_gs)), + "g2_alphas": _slice(rank, num_experts, (1 / w2_gs)), + "a1_gscale": _make_gscale(num_experts), + "a2_gscale": 
_make_gscale(num_experts), + "out_dtype": moe.in_dtype, + "quant_dtype": "nvfp4", + "ep_rank": moe.ep_rank, + "ep_size": moe.ep_size, + "tp_rank": moe.tp_rank, + "tp_size": moe.tp_size, + } + print(f"Making FlashInferExperts {kwargs} ...") + experts = FlashInferExperts(**kwargs) + else: + raise RuntimeError(f"Unknown fused experts type: {fused_experts_type}") + + return experts diff --git a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py index dd16ffb2eabec..0da6ee3543521 100644 --- a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py +++ b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py @@ -52,7 +52,7 @@ def profile_modular_kernel( rank_weights = weights.slice_weights(pgi.rank, config.num_local_experts) # make modular kernel - mk = make_modular_kernel(config, vllm_config) + mk = make_modular_kernel(config, vllm_config, weights) mk_kwargs = { "hidden_states": rank_tensors.hidden_states, @@ -83,7 +83,7 @@ def rank_worker( # sanity check from vllm import envs if config.fused_moe_chunk_size is not None: - assert (config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE) + assert config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE # get weights to this device weights.to_current_device() diff --git a/tests/kernels/moe/modular_kernel_tools/utils.py b/tests/kernels/moe/modular_kernel_tools/utils.py deleted file mode 100644 index 866f52882beee..0000000000000 --- a/tests/kernels/moe/modular_kernel_tools/utils.py +++ /dev/null @@ -1,117 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -import vllm._custom_ops as ops -from vllm.utils.deep_gemm import per_block_cast_to_fp8 - - -def per_token_cast_to_fp8( - x: torch.Tensor, block_size: int) -> tuple[torch.Tensor, torch.Tensor]: - assert x.dim() == 2 - m, n = x.shape - pad_size = (block_size - (n % block_size)) % 
block_size - x = torch.nn.functional.pad(x, - (0, pad_size), value=0) if pad_size > 0 else x - x_view = x.view(m, -1, block_size) - x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) - fp8_data = (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn) - return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1) - - -def make_non_quant_weights( - e: int, - n: int, - k: int, - dtype: torch.dtype, -) -> tuple[torch.Tensor, torch.Tensor]: - """ - Return weights w1, w2 - """ - device = torch.cuda.current_device() - w1 = torch.randn((e, 2 * n, k), device=device, dtype=dtype) / 15 - w2 = torch.randn((e, k, n), device=device, dtype=dtype) / 15 - return w1, w2 - - -def make_block_quant_fp8_weights( - e: int, - n: int, - k: int, - block_size: list[int], -) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Return weights w1, w2, w1_scale, w2_scale - """ - dtype = torch.bfloat16 - device = torch.cuda.current_device() - - fp8_info = torch.finfo(torch.float8_e4m3fn) - fp8_max, fp8_min = fp8_info.max, fp8_info.min - - w1_bf16, w2_bf16 = make_non_quant_weights(e, n, k, dtype) - w1_bf16 = w1_bf16.clamp(min=fp8_min, max=fp8_max).to(dtype=dtype) - w2_bf16 = w2_bf16.clamp(min=fp8_min, max=fp8_max).to(dtype=dtype) - - block_n, block_k = block_size[0], block_size[1] - n_tiles_w1 = ((2 * n) + block_n - 1) // block_n - k_tiles_w1 = (k + block_k - 1) // block_k - n_tiles_w2 = (k + block_n - 1) // block_n - k_tiles_w2 = (n + block_k - 1) // block_k - - w1 = torch.empty_like(w1_bf16, dtype=torch.float8_e4m3fn, device=device) - w2 = torch.empty_like(w2_bf16, dtype=torch.float8_e4m3fn, device=device) - - w1_s = torch.empty((e, n_tiles_w1, k_tiles_w1), - device=device, - dtype=torch.float32) - w2_s = torch.empty((e, n_tiles_w2, k_tiles_w2), - device=device, - dtype=torch.float32) - - assert w1_s.shape == (e, (2 * n + (block_n - 1)) // block_n, - (k + (block_k - 1)) // block_k) - assert (w2.shape[-2] + block_n - 1) // block_n == 
w2_s.shape[-2] - - for i in range(e): - w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i], - block_size=[block_k, block_n]) - w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i], - block_size=[block_k, block_n]) - - return w1, w2, w1_s, w2_s - - -def make_quant_fp8_weights( - e: int, - n: int, - k: int, - per_out_channel_quant: bool, -) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Return w1, w2, w1_scale, w2_scale - """ - q_dtype = torch.float8_e4m3fn - - w1, w2 = make_non_quant_weights(e, n, k, dtype=torch.bfloat16) - - # w1 -> w1_q, w2 -> w2_q - w1_q = torch.empty((e, 2 * n, k), device="cuda", dtype=q_dtype) - w2_q = torch.empty((e, k, n), device="cuda", dtype=q_dtype) - - n_b_scales = 2 * n if per_out_channel_quant else 1 - k_b_scales = k if per_out_channel_quant else 1 - w1_scale = torch.empty((e, n_b_scales, 1), - device="cuda", - dtype=torch.float32) - w2_scale = torch.empty((e, k_b_scales, 1), - device="cuda", - dtype=torch.float32) - - for expert in range(e): - w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant( - w1[expert], use_per_token_if_dynamic=per_out_channel_quant) - w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant( - w2[expert], use_per_token_if_dynamic=per_out_channel_quant) - return w1_q, w2_q, w1_scale, w2_scale diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py index edf3e61892430..00b2d780e66f5 100644 --- a/tests/kernels/moe/test_batched_moe.py +++ b/tests/kernels/moe/test_batched_moe.py @@ -133,7 +133,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int, per_act_token_quant=per_act_token_quant, ) - B, B_q, B_scale, _, _, _ = make_test_weights( + (B, B_q, B_scale, _), _ = make_test_weights( num_experts, N // 2, K, @@ -243,7 +243,7 @@ def test_fused_moe_batched_experts( act_dtype = dtype quant_dtype = None - w1_16, w1, w1_s, w2_16, w2, w2_s = make_test_weights( + (w1_16, w1, w1_s, _), (w2_16, w2, w2_s, _) = make_test_weights( e, n, k, diff --git 
a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py index 75b2e9f791789..9e4eaf221f245 100644 --- a/tests/kernels/moe/test_block_fp8.py +++ b/tests/kernels/moe/test_block_fp8.py @@ -161,18 +161,20 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed, a = torch.randn((M, K), dtype=dtype) / 10 score = torch.randn((M, E), dtype=dtype) - _, w1, w1_s, _, w2, w2_s = make_test_weights(E, - N, - K, - dtype, - torch.float8_e4m3fn, - per_act_token_quant=False, - block_shape=block_size) + (_, w1, w1_s, _), (_, w2, w2_s, + _) = make_test_weights(E, + N, + K, + dtype, + torch.float8_e4m3fn, + per_act_token_quant=False, + block_shape=block_size) m_fused_moe = modular_triton_fused_moe(use_fp8_w8a8=True, use_int8_w8a8=False, use_int8_w8a16=False, use_int4_w4a16=False, + use_mxfp4_w4a4=False, per_act_token_quant=False, block_shape=block_size) @@ -247,13 +249,14 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed, a = torch.randn((M, K), dtype=dtype) / 10 score = torch.randn((M, E), dtype=dtype) - _, w1, w1_s, _, w2, w2_s = make_test_weights(E, - N, - K, - dtype, - torch.float8_e4m3fn, - per_act_token_quant=False, - block_shape=block_size) + (_, w1, w1_s, _), (_, w2, w2_s, + _) = make_test_weights(E, + N, + K, + dtype, + torch.float8_e4m3fn, + per_act_token_quant=False, + block_shape=block_size) # Note: for now use_compile will error out if the problem size is # large enough to trigger chunking. 
I'm leaving the flag and diff --git a/tests/kernels/moe/test_block_int8.py b/tests/kernels/moe/test_block_int8.py index 8e680c722935b..5e4a93963f8e8 100644 --- a/tests/kernels/moe/test_block_int8.py +++ b/tests/kernels/moe/test_block_int8.py @@ -118,13 +118,14 @@ def test_w8a8_block_int8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): a = torch.randn((M, K), dtype=dtype) / 10 score = torch.randn((M, E), dtype=dtype) - _, w1, w1_s, _, w2, w2_s = make_test_weights(E, - N, - K, - dtype, - torch.int8, - per_act_token_quant=False, - block_shape=block_size) + (_, w1, w1_s, _), (_, w2, w2_s, + _) = make_test_weights(E, + N, + K, + dtype, + torch.int8, + per_act_token_quant=False, + block_shape=block_size) # Set the context to avoid lots of warning spam. with set_current_vllm_config(vllm_config): diff --git a/tests/kernels/moe/test_cutlass_grouped_gemm.py b/tests/kernels/moe/test_cutlass_grouped_gemm.py index 1aee1ed8c3762..3b1618dacac7b 100644 --- a/tests/kernels/moe/test_cutlass_grouped_gemm.py +++ b/tests/kernels/moe/test_cutlass_grouped_gemm.py @@ -9,6 +9,7 @@ import random import pytest import torch +from tests.kernels.moe.utils import per_token_cast_to_fp8 from tests.kernels.utils import baseline_scaled_mm from vllm import _custom_ops as ops from vllm.platforms import current_platform @@ -16,20 +17,6 @@ from vllm.utils import cdiv from vllm.utils.deep_gemm import per_block_cast_to_fp8 -def per_token_cast_to_fp8( - x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - assert x.dim() == 2 - m, n = x.shape - pad_size = (128 - (n % 128)) % 128 - x = torch.nn.functional.pad(x, - (0, pad_size), value=0) if pad_size > 0 else x - x_view = x.view(m, -1, 128) - x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) - fp8_data = (x_view * - (448.0 / x_amax.unsqueeze(2))).to(dtype=torch.float8_e4m3fn) - return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1) - - @pytest.mark.parametrize("num_groups, expected_m_per_group, k, n", [ (4, 8192, 
7168, 4096), (4, 8192, 2048, 7168), @@ -76,7 +63,7 @@ def test_cutlass_grouped_gemm( device=device, dtype=torch.float)) for i in range(num_groups): - y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i]) + y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i], [128, 128]) for i in range(num_groups): a = x_fp8[0][ep_offset[i]:ep_offset[i + 1]] diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 9b064db973ddf..6f95581a5e60d 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -70,8 +70,10 @@ def make_block_quant_fp8_weights( """ Return weights w1q, w2q, w1_scale, w2_scale """ - w1, w1q, w1_scale, w2, w2q, w2_scale = make_test_weights( - e, n, k, torch.bfloat16, torch.float8_e4m3fn, block_size) + (_, w1q, w1_scale, _), (_, w2q, w2_scale, + _) = make_test_weights(e, n, k, torch.bfloat16, + torch.float8_e4m3fn, + block_size) return w1q, w2q, w1_scale, w2_scale diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py index b2b78662c9ded..4472f34a6291a 100644 --- a/tests/kernels/moe/test_deepgemm.py +++ b/tests/kernels/moe/test_deepgemm.py @@ -132,9 +132,9 @@ def run_single_case(m, n, k, topk, num_experts, block_size): # Note: W1 has shape (E, 2N, K), so N = 512 # can trigger the deepgemm path. 
MNKs = [ - (1024, 512, 128), - (1024, 512, 512), - (2048, 512, 512), + (1024, 768, 128), + (1024, 768, 512), + (2048, 768, 512), (512, 1024, 1024), (512, 2048, 2048), (4096, 4096, 1024), diff --git a/tests/kernels/moe/test_flashinfer_moe.py b/tests/kernels/moe/test_flashinfer_moe.py new file mode 100644 index 0000000000000..1c14df2b914aa --- /dev/null +++ b/tests/kernels/moe/test_flashinfer_moe.py @@ -0,0 +1,147 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch + +from tests.kernels.moe.utils import make_test_weights +from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX, + FLOAT8_E4M3_MAX, + dequantize_nvfp4_to_dtype) +from tests.kernels.utils import torch_moe +from vllm import _custom_ops as ops +from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( + FlashInferExperts, is_valid_flashinfer_cutlass_fused_moe) +from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk +from vllm.model_executor.layers.fused_moe.modular_kernel import ( + FusedMoEModularKernel) +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP) +from vllm.platforms import current_platform +from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe + +if not has_flashinfer_cutlass_fused_moe( +) or not current_platform.has_device_capability(100): + pytest.skip("Requires flashinfer_cutlass_fused_moe and nvfp4 support", + allow_module_level=True) + +MNK_FACTORS = [ + (2, 1024, 1024), + (2, 1024, 1536), + (2, 3072, 1024), + (2, 3072, 1536), + (64, 1024, 1024), + (64, 1024, 1536), + (64, 3072, 1024), + (64, 2048, 1536), + (224, 1024, 1024), + (224, 1024, 1536), +] + + +@pytest.mark.parametrize("m,n,k", MNK_FACTORS) +@pytest.mark.parametrize("e", [40, 64, 256]) +#@pytest.mark.parametrize("e", [128, 256]) +@pytest.mark.parametrize("topk", [1, 6, 
8]) +@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16]) +@torch.inference_mode() +def test_flashinfer_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int, + dtype: torch.dtype): + current_platform.seed_everything(7) + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1))): + + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + + quant_blocksize = 16 + + (_, w1_q, w1_blockscale, + w1_gs), (_, w2_q, w2_blockscale, w2_gs) = make_test_weights( + e, + n, + k, + in_dtype=dtype, + quant_dtype="nvfp4", + block_shape=None, # use quant_blocksize? + per_act_token_quant=False, + ) + + score = torch.randn((m, e), device="cuda", dtype=dtype) + topk_weights, topk_ids, _ = fused_topk(a, + score, + topk, + renormalize=False) + + a1_gs = torch.ones((e, ), device="cuda", dtype=torch.float32) + a2_gs = torch.ones((e, ), device="cuda", dtype=torch.float32) + + assert is_valid_flashinfer_cutlass_fused_moe(a, w1_q, w2_q) + + assert w1_gs is not None + assert w2_gs is not None + assert w1_blockscale is not None + assert w2_blockscale is not None + + flashinfer_experts = FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), + FlashInferExperts( + a1_gscale=a1_gs, + g1_alphas=(1 / w1_gs), + a2_gscale=a2_gs, + g2_alphas=(1 / w2_gs), + out_dtype=dtype, + quant_dtype="nvfp4", + )) + + flashinfer_output = flashinfer_experts( + hidden_states=a, + w1=w1_q, + w1_scale=w1_blockscale, + w2=w2_q, + w2_scale=w2_blockscale, + a1_scale=a1_gs, + a2_scale=a2_gs, + topk_weights=topk_weights, + topk_ids=topk_ids, + ) + + # Reference check: + a_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / + torch.amax(a.flatten(), dim=-1)).to(torch.float32) + a_fp4, a_scale_interleaved = ops.scaled_fp4_quant(a, a_global_scale) + _, m_k = a_fp4.shape + a_in_dtype = dequantize_nvfp4_to_dtype(a_fp4, + a_scale_interleaved, + a_global_scale, + dtype=a.dtype, + device=a.device, + block_size=quant_blocksize) + + w1_d = torch.empty((e, 2 * n, k), 
device="cuda", dtype=dtype) + w2_d = torch.empty((e, k, n), device="cuda", dtype=dtype) + + for idx in range(0, e): + w1_d[idx] = dequantize_nvfp4_to_dtype(w1_q[idx], + w1_blockscale[idx], + w1_gs[idx], + dtype=dtype, + device=w1_q.device, + block_size=quant_blocksize) + w2_d[idx] = dequantize_nvfp4_to_dtype(w2_q[idx], + w2_blockscale[idx], + w2_gs[idx], + dtype=dtype, + device=w2_q.device, + block_size=quant_blocksize) + + torch_output = torch_moe(a_in_dtype, w1_d, w2_d, score, topk) + + torch.testing.assert_close(torch_output, + flashinfer_output, + atol=1e-1, + rtol=1e-1) + + +if __name__ == "__main__": + test_flashinfer_fp4_moe_no_graph((2, 1024, 1024), 40, 1, torch.half) diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index 6f2869c3a61d7..d45982384eb3b 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy +import textwrap +import traceback from itertools import product from typing import Optional @@ -10,41 +12,51 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.config import VllmConfig, current_platform, set_current_vllm_config -from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501 - BatchedTritonOrDeepGemmExperts) from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig -from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 -from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( - BatchedTritonExperts) -from vllm.model_executor.layers.fused_moe.layer import TritonExperts -from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( - TritonOrDeepGemmExperts) from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx +from vllm.utils.flashinfer import 
has_flashinfer_cutlass_fused_moe from .modular_kernel_tools.common import (Config, RankTensors, WeightTensors, reference_moe_impl, run_modular_kernel) from .modular_kernel_tools.mk_objects import ( MK_FUSED_EXPERT_TYPES, MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, - MK_QUANT_CONFIGS, MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES) + MK_QUANT_CONFIGS, MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES, expert_info) from .modular_kernel_tools.parallel_utils import (ProcessGroupInfo, parallel_launch_with_config) -# TODO (varun): These requirements are very strict and could be relaxed. -has_all_packages = (has_deep_ep() and has_deep_gemm() and has_pplx()) +has_any_multi_gpu_package = (has_deep_ep() or has_deep_gemm() or has_pplx() + or has_flashinfer_cutlass_fused_moe()) -meets_package_requirements = pytest.mark.skipif( - not has_all_packages, - reason="Requires deep_ep & deep_gemm & pplx packages", +meets_multi_gpu_requirements = pytest.mark.skipif( + not has_any_multi_gpu_package, + reason="Requires deep_ep or deep_gemm or pplx or flashinfer packages", ) +def format_result(verbose, msg, ex=None): + if ex is not None: + x = str(ex) + newx = x.strip(" \n\t")[:16] + if len(newx) < len(x): + newx = newx + " ..." 
+ + prefix = "E\t" + print(f"{textwrap.indent(traceback.format_exc(), prefix)}") + print(f"FAILED {msg} - {newx}\n") + elif verbose: + print(f"PASSED {msg}") + else: + print(".", end="") + + def rank_worker( pgi: ProcessGroupInfo, vllm_config: VllmConfig, cpu_group, config: Config, weights: WeightTensors, + verbose: bool, ): current_platform.seed_everything(pgi.rank) @@ -61,39 +73,64 @@ def rank_worker( TOPKs = config.topks assert isinstance(TOPKs, list) + exceptions = [] + count = 0 + for m, topk in product(Ms, TOPKs): - print(f"Running m={m}, topk={topk} ...") - # override m and topk - cfgx = copy.deepcopy(config) - cfgx.Ms = m - cfgx.topks = topk + try: + print(f"Running[{pgi.rank}]: m={m}, topk={topk} ...") + count = count + 1 + # override m and topk + cfgx = copy.deepcopy(config) + cfgx.Ms = m + cfgx.topks = topk - # inputs for rank - rank_tensors = RankTensors.make(cfgx, pgi) + # inputs for rank + rank_tensors = RankTensors.make(cfgx, pgi) - # modular kernel out - mk_out = run_modular_kernel(pgi, vllm_config, cfgx, weights, - rank_tensors) + # modular kernel out + mk_out = run_modular_kernel(pgi, vllm_config, cfgx, weights, + rank_tensors) - with set_current_vllm_config(vllm_config): - ref_out = reference_moe_impl(cfgx, weights, rank_tensors) + with set_current_vllm_config(vllm_config): + ref_out = reference_moe_impl(cfgx, weights, rank_tensors) - torch.testing.assert_close(ref_out, mk_out, atol=3e-2, rtol=3e-2) + if config.quant_dtype == "nvfp4": + atol = 1e-1 + rtol = 1e-1 + else: + atol = 3e-2 + rtol = 3e-2 + + torch.testing.assert_close(ref_out, mk_out, atol=atol, rtol=rtol) + format_result(verbose, config.describe()) + except Exception as ex: + format_result(verbose, config.describe(), ex) + exceptions.append(ex) + + if len(exceptions) > 0: + raise RuntimeError( + f"{len(exceptions)} of {count} tests failed in child process, " + f"rank={pgi.rank}.") + else: + print(f"{count} of {count} tests passed in child process, " + f"rank={pgi.rank}.") -def 
run(config: Config): +def run(config: Config, verbose: bool): assert config.is_valid() - print(f"Testing config \n{config.describe()} ...") weights: WeightTensors = WeightTensors.make(config) vllm_config, env_dict = config.make_env_data() parallel_launch_with_config(config.world_size, rank_worker, vllm_config, - env_dict, config, weights) + env_dict, config, weights, verbose) Ms = [32, 64] -Ks = [7168] # hidden sizes +# hidden sizes, making this too large will cause fp4 tests to fail. +# Also needs to be a multiple of 1024 for deep_gemm. +Ks = [2048] Ns = [2048] TOPKs = [4, 1] Es = [32] @@ -103,19 +140,16 @@ FUSED_MOE_CHUNK_SIZEs = [None, 16] def is_nyi_config(config: Config) -> bool: # We know these configs to be legitimate. but still fail. + info = expert_info(config.fused_experts_type) - if (config.fused_experts_type in [ - BatchedTritonExperts, BatchedTritonOrDeepGemmExperts, - TritonExperts, TritonOrDeepGemmExperts - ]): + if info.needs_matching_quant: # The triton kernels expect both per-act-token-quant and # per-out-ch-quant or neither. unsupported_quant_config = ((config.is_per_act_token_quant + config.is_per_out_ch_quant) == 1) return unsupported_quant_config - # cutlass kernels dont support expert_maps yet. 
- return config.fused_experts_type == CutlassExpertsFp8 + return not info.supports_expert_map @pytest.mark.parametrize("k", Ks) @@ -128,13 +162,13 @@ def is_nyi_config(config: Config) -> bool: product(MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES)) @pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs) @pytest.mark.parametrize("world_size", [2]) -@meets_package_requirements +@meets_multi_gpu_requirements def test_modular_kernel_combinations_multigpu( k: int, n: int, e: int, dtype: torch.dtype, - quant_config: FusedMoEQuantConfig, + quant_config: Optional[FusedMoEQuantConfig], combination: tuple[mk.FusedMoEPrepareAndFinalize, mk.FusedMoEPermuteExpertsUnpermute], - fused_moe_chunk_size: Optional[int], world_size: int): + fused_moe_chunk_size: Optional[int], world_size: int, pytestconfig): config = Config( Ms=Ms, @@ -149,14 +183,15 @@ def test_modular_kernel_combinations_multigpu( fused_moe_chunk_size=fused_moe_chunk_size, world_size=world_size, ) + if not config.is_valid(): pytest.skip(f"Tests config {config} is not valid. Skipping ...") if is_nyi_config(config): pytest.skip(f"Tests config {config} is nyi. 
Skipping ...") - print(f"{config.describe()}") - run(config) + verbosity = pytestconfig.getoption('verbose') + run(config, verbosity > 0) @pytest.mark.parametrize("k", Ks) @@ -169,13 +204,12 @@ def test_modular_kernel_combinations_multigpu( product(MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES)) @pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs) @pytest.mark.parametrize("world_size", [1]) -@meets_package_requirements def test_modular_kernel_combinations_singlegpu( k: int, n: int, e: int, dtype: torch.dtype, - quant_config: FusedMoEQuantConfig, + quant_config: Optional[FusedMoEQuantConfig], combination: tuple[mk.FusedMoEPrepareAndFinalize, mk.FusedMoEPermuteExpertsUnpermute], - fused_moe_chunk_size: Optional[int], world_size: int): + fused_moe_chunk_size: Optional[int], world_size: int, pytestconfig): config = Config( Ms=Ms, K=k, @@ -196,7 +230,8 @@ def test_modular_kernel_combinations_singlegpu( if is_nyi_config(config): pytest.skip(f"Tests config {config} is nyi. 
Skipping ...") - run(config) + verbosity = pytestconfig.getoption('verbose') + run(config, verbosity > 0) if __name__ == '__main__': @@ -211,4 +246,4 @@ if __name__ == '__main__': args = parser.parse_args() config = make_config(args) - run(config) + run(config, True) diff --git a/tests/kernels/moe/test_nvfp4_moe.py b/tests/kernels/moe/test_nvfp4_moe.py index 3ff385360299b..30388ef9375d4 100644 --- a/tests/kernels/moe/test_nvfp4_moe.py +++ b/tests/kernels/moe/test_nvfp4_moe.py @@ -3,6 +3,7 @@ import pytest import torch +from tests.kernels.moe.utils import make_test_weights from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, dequantize_nvfp4_to_dtype) @@ -43,41 +44,20 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int, VllmConfig(parallel_config=ParallelConfig( pipeline_parallel_size=1))): - a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 - w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 quant_blocksize = 16 - round_up = lambda x, y: (x + y - 1) // y * y - sf_w1_2n = round_up(2 * n, 128) - sf_w1_k = round_up(k // quant_blocksize, 4) - w1_blockscale = torch.empty((e, sf_w1_2n, sf_w1_k), - device="cuda", - dtype=torch.float8_e4m3fn) - w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 - sf_w2_k = round_up(k, 128) - sf_w2_n = round_up(n // quant_blocksize, 4) - w2_blockscale = torch.empty((e, sf_w2_k, sf_w2_n), - device="cuda", - dtype=torch.float8_e4m3fn) + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 - w1_q = torch.empty((e, 2 * n, k // 2), - device="cuda", - dtype=torch.uint8) - w2_q = torch.empty((e, k, n // 2), device="cuda", dtype=torch.uint8) - w1_gs = torch.empty((e, ), device="cuda", dtype=torch.float32) - w2_gs = torch.empty((e, ), device="cuda", dtype=torch.float32) - - for expert in range(e): - w1_amax = torch.abs(w1).max().to(torch.float32) - w2_amax = torch.abs(w2).max().to(torch.float32) - w1_gs[expert] = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / 
w1_amax - w2_gs[expert] = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w2_amax - - w1_q[expert], w1_blockscale[expert] = ops.scaled_fp4_quant( - w1[expert], w1_gs[expert]) - - w2_q[expert], w2_blockscale[expert] = ops.scaled_fp4_quant( - w2[expert], w2_gs[expert]) + (_, w1_q, w1_blockscale, + w1_gs), (_, w2_q, w2_blockscale, w2_gs) = make_test_weights( + e, + n, + k, + in_dtype=dtype, + quant_dtype="nvfp4", + block_shape=None, # use quant_blocksize? + per_act_token_quant=False, + ) score = torch.randn((m, e), device="cuda", dtype=dtype) topk_weights, topk_ids, _ = fused_topk(a, @@ -88,6 +68,11 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int, a1_gs = torch.ones((e, ), device="cuda", dtype=torch.float32) a2_gs = torch.ones((e, ), device="cuda", dtype=torch.float32) + assert w1_gs is not None + assert w2_gs is not None + assert w1_blockscale is not None + assert w2_blockscale is not None + cutlass_output = cutlass_moe_fp4( a=a, a1_gscale=a1_gs, @@ -104,14 +89,13 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int, n=n, k=k, e=e, - device=a.device, ) # Reference check: a_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(a.flatten(), dim=-1)).to(torch.float32) a_fp4, a_scale_interleaved = ops.scaled_fp4_quant(a, a_global_scale) - _, m_k = a_fp4.shape + a_in_dtype = dequantize_nvfp4_to_dtype(a_fp4, a_scale_interleaved, a_global_scale, @@ -126,14 +110,14 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int, w1_d[idx] = dequantize_nvfp4_to_dtype(w1_q[idx], w1_blockscale[idx], w1_gs[idx], - dtype=w1.dtype, - device=w1.device, + dtype=dtype, + device=w1_q.device, block_size=quant_blocksize) w2_d[idx] = dequantize_nvfp4_to_dtype(w2_q[idx], w2_blockscale[idx], w2_gs[idx], - dtype=w2.dtype, - device=w2.device, + dtype=dtype, + device=w2_q.device, block_size=quant_blocksize) torch_output = torch_moe(a_in_dtype, w1_d, w2_d, score, topk) diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py 
b/tests/kernels/moe/test_pplx_cutlass_moe.py index e4f4a393dfd56..f98937ee6c527 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -9,7 +9,8 @@ import torch from tests.kernels.utils import torch_experts from vllm import _custom_ops as ops from vllm.config import VllmConfig, set_current_vllm_config -from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 +from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + CutlassBatchedExpertsFp8) from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEModularKernel) @@ -123,12 +124,8 @@ def pplx_cutlass_moe( num_local_experts=num_local_experts, num_dispatchers=num_dispatchers) - experts = CutlassExpertsFp8(num_local_experts, - out_dtype, - per_act_token, - per_out_ch, - num_dispatchers=num_dispatchers, - use_batched_format=True) + experts = CutlassBatchedExpertsFp8(num_local_experts, num_dispatchers, + out_dtype, per_act_token, per_out_ch) fused_cutlass_experts = FusedMoEModularKernel( prepare_finalize, diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index fbef6706beaf0..c2064de97358f 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -770,7 +770,7 @@ def test_pplx_moe_slow( a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10 score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16) - _, w1, w1_s, _, w2, w2_s = make_test_weights( + (_, w1, w1_s, _), (_, w2, w2_s, _) = make_test_weights( e, n, k, @@ -836,7 +836,7 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool, args = dict() if make_weights: - _, w1, w1_s, _, w2, w2_s = make_test_weights( + (_, w1, w1_s, _), (_, w2, w2_s, _) = make_test_weights( e, n, k, diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index c33134981acc0..82960bd57345d 100644 --- 
a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -1,11 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional +from typing import Optional, Union import torch import vllm._custom_ops as ops from tests.kernels.quant_utils import per_block_cast_to_int8 +from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX, + FLOAT8_E4M3_MAX) from vllm.model_executor.layers.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts) @@ -169,28 +171,41 @@ def make_quantized_test_activations( def moe_quantize_weights( w: torch.Tensor, w_s: Optional[torch.Tensor], - quant_dtype: Optional[torch.dtype], + quant_dtype: Union[torch.dtype, str, None], per_token_quant: bool, block_shape: Optional[list[int]], -) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - assert (quant_dtype == torch.float8_e4m3fn - or quant_dtype == torch.int8), "only fp8/int8 supported" +) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + assert (quant_dtype == torch.float8_e4m3fn or quant_dtype == torch.int8 + or quant_dtype == "nvfp4"), "only fp8/int8/nvfp4 supported" + + w_gs = None if block_shape is not None: assert not per_token_quant if quant_dtype == torch.int8: w, w_s = per_block_cast_to_int8(w, block_shape) - else: + elif quant_dtype == torch.float8_e4m3fn: w, w_s = per_block_cast_to_fp8(w, block_shape) + elif quant_dtype == "nvfp4": + raise RuntimeError("blocked quantization not supported for nvfp4") + else: + raise RuntimeError(f"Unsupported quant type {quant_dtype}") else: if quant_dtype == torch.int8: w, w_s = ops.scaled_int8_quant( w, w_s, use_per_token_if_dynamic=per_token_quant) - else: + elif quant_dtype == torch.float8_e4m3fn: w, w_s = ops.scaled_fp8_quant( w, w_s, use_per_token_if_dynamic=per_token_quant) + elif quant_dtype == "nvfp4": + assert not 
per_token_quant + w_amax = torch.abs(w).max().to(torch.float32) + w_gs = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w_amax + w, w_s = ops.scaled_fp4_quant(w, w_gs) + else: + raise RuntimeError(f"Unsupported quant type {quant_dtype}") - return w, w_s + return w, w_s, w_gs def make_test_weight( @@ -198,21 +213,26 @@ def make_test_weight( rows: int, cols: int, in_dtype: torch.dtype = torch.bfloat16, - quant_dtype: Optional[torch.dtype] = None, + quant_dtype: Union[torch.dtype, str, None] = None, block_shape: Optional[list[int]] = None, per_act_token_quant: bool = False, -) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: +) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], + Optional[torch.Tensor]]: w_16 = torch.randn((e, rows, cols), device="cuda", dtype=in_dtype) / 15 + w_gs = None if quant_dtype is not None: w_l = [None] * e w_s_l = [None] * e + w_gs_l = [None] * e for idx in range(e): - w_l[idx], w_s_l[idx] = moe_quantize_weights( + w_l[idx], w_s_l[idx], w_gs_l[idx] = moe_quantize_weights( w_16[idx], None, quant_dtype, per_act_token_quant, block_shape) w = torch.stack(w_l) w_s = torch.stack(w_s_l) + if e > 0 and w_gs_l[0] is not None: + w_gs = torch.stack(w_gs_l) if w_s.ndim == 2: assert w_s.shape[-1] == 1 w_s = w_s.view(-1, 1, 1) @@ -225,8 +245,9 @@ def make_test_weight( else: w = w_16 w_s = None + w_gs = None - return w_16, w, w_s + return w_16, w, w_s, w_gs def make_test_weights( @@ -234,14 +255,30 @@ def make_test_weights( n: int, k: int, in_dtype: torch.dtype = torch.bfloat16, - quant_dtype: Optional[torch.dtype] = None, + quant_dtype: Union[torch.dtype, str, None] = None, block_shape: Optional[list[int]] = None, per_act_token_quant: bool = False, -) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], torch.Tensor, - torch.Tensor, Optional[torch.Tensor]]: +) -> tuple[tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], + Optional[torch.Tensor]], + tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], + 
Optional[torch.Tensor]]]: return ( - *make_test_weight(e, 2 * n, k, in_dtype, quant_dtype, block_shape, - per_act_token_quant), - *make_test_weight(e, k, n, in_dtype, quant_dtype, block_shape, - per_act_token_quant), + make_test_weight(e, 2 * n, k, in_dtype, quant_dtype, block_shape, + per_act_token_quant), + make_test_weight(e, k, n, in_dtype, quant_dtype, block_shape, + per_act_token_quant), ) + + +def per_token_cast_to_fp8( + x: torch.Tensor, + block_size: int = 128) -> tuple[torch.Tensor, torch.Tensor]: + assert x.dim() == 2 + m, n = x.shape + pad_size = (block_size - (n % block_size)) % block_size + x = torch.nn.functional.pad(x, + (0, pad_size), value=0) if pad_size > 0 else x + x_view = x.view(m, -1, block_size) + x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) + fp8_data = (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn) + return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1) diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py index 127a340fc6c6d..9e5aa4e4c2a89 100644 --- a/vllm/distributed/device_communicators/base_device_communicator.py +++ b/vllm/distributed/device_communicators/base_device_communicator.py @@ -105,7 +105,8 @@ class DeviceCommunicatorBase: # we initialize the all2all manager used in expert parallel. use_ep = config.parallel_config.data_parallel_size > 1 - self.use_all2all = "ep" in unique_name and use_ep + self.is_ep_communicator = "ep" in unique_name + self.use_all2all = self.is_ep_communicator and use_ep self.all2all_manager: Optional[All2AllManagerBase] = None def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: @@ -246,7 +247,7 @@ class DeviceCommunicatorBase: """ Prepare the communication buffer for the model. 
""" - if not self.use_all2all: + if not self.is_ep_communicator: return moe_modules = [ @@ -254,7 +255,7 @@ class DeviceCommunicatorBase: if module.__class__.__name__ == "FusedMoE" ] for module in moe_modules: - module.quant_method.init_prepare_finalize(module.moe_config) + module.quant_method.init_prepare_finalize() def dispatch( self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 3d40879b4ccbf..3007643d7a288 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -49,7 +49,8 @@ if HAS_TRITON: from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501 BatchedTritonOrDeepGemmExperts) from vllm.model_executor.layers.fused_moe.cutlass_moe import ( - CutlassExpertsFp8, cutlass_moe_fp4, cutlass_moe_fp8) + CutlassBatchedExpertsFp8, CutlassExpertsFp8, cutlass_moe_fp4, + cutlass_moe_fp8) from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( DeepGemmExperts) from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( @@ -69,6 +70,7 @@ if HAS_TRITON: "cutlass_moe_fp8", "cutlass_moe_fp4", "CutlassExpertsFp8", + "CutlassBatchedExpertsFp8", "TritonExperts", "BatchedTritonExperts", "DeepGemmExperts", diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index c48a0137c3060..d9cfe96f7a033 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Optional import torch @@ -254,18 +254,28 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): output = (num_experts, max_num_tokens * num_dispatchers, K) return 
(workspace13, workspace2, output, a.dtype) - def apply(self, output: torch.Tensor, hidden_states: torch.Tensor, - w1: torch.Tensor, w2: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.Tensor, activation: str, global_num_experts: int, - expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, - workspace2: torch.Tensor, - expert_tokens_meta: Optional[mk.ExpertTokensMetadata], - apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]]): + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + apply_router_weight_on_input: bool, + ): assert expert_tokens_meta is not None expert_num_tokens = expert_tokens_meta.expert_num_tokens diff --git a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py index fc30e84e6656d..89d7412ee2236 100644 --- a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Optional import torch @@ -132,18 +132,28 @@ class 
BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): a, aq, M, N, K, topk, global_num_experts, local_num_experts, expert_tokens_metadata) - def apply(self, output: torch.Tensor, hidden_states: torch.Tensor, - w1: torch.Tensor, w2: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.Tensor, activation: str, global_num_experts: int, - expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, - workspace2: torch.Tensor, - expert_tokens_meta: Optional[mk.ExpertTokensMetadata], - apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]]): + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + apply_router_weight_on_input: bool, + ): experts = (self.batched_deep_gemm_experts if self.allow_deep_gemm else self.batched_triton_experts) assert experts is not None @@ -151,4 +161,4 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): activation, global_num_experts, expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1q_scale, a2_scale, workspace13, workspace2, expert_tokens_meta, - apply_router_weight_on_input, extra_expert_args) + apply_router_weight_on_input) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 
31ea826f1f97a..7c1a7b636a9c2 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -45,7 +45,6 @@ def get_quant_config_weight_quant( return _get_quant_config_quantization_args(quant_config, "weights") -# TODO (bnell): use scalar_type instead of bools? def get_config_quant_dtype( use_fp8_w8a8: bool, use_int8_w8a8: bool, @@ -65,7 +64,8 @@ def get_config_quant_dtype( @dataclass class FusedMoEQuantConfig: # The post quantization activation type. - quant_dtype: Optional[torch.dtype] = None + # TODO (bnell): use scalar_type instead of Union. + quant_dtype: Union[torch.dtype, str, None] = None per_act_token_quant: bool = False per_out_ch_quant: bool = False block_shape: Optional[list[int]] = None @@ -141,6 +141,7 @@ class FusedMoEQuantConfig: use_int8_w8a8, use_int8_w8a16, use_int4_w4a16, + use_mxfp4_w4a4, ] ]) <= 1, "Quantization flags are mutually exclusive." @@ -334,7 +335,7 @@ class FusedMoEConfig: assert self.max_num_tokens > 0 @property - def quant_dtype(self) -> Optional[torch.dtype]: + def quant_dtype(self) -> Union[torch.dtype, str, None]: if self.quant_config is not None: return self.quant_config.quant_dtype else: @@ -429,7 +430,7 @@ class FusedMoEConfig: block_shape = None per_act_token_quant = False per_out_ch_quant = False - quant_dtype: Optional[torch.dtype] = None + quant_dtype: Union[torch.dtype, str, None] = None input_quant = get_quant_config_input_quant(quant_config) weight_quant = get_quant_config_weight_quant(quant_config) @@ -453,7 +454,7 @@ class FusedMoEConfig: ModelOptNvFp4Config) if quant_dtype is None and isinstance(quant_config, ModelOptNvFp4Config): - quant_dtype = torch.uint8 + quant_dtype = "nvfp4" if weight_quant is not None: per_out_ch_quant = ( diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 2585a2953c9db..0a02b558d09e5 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ 
b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ CUTLASS based Fused MoE kernels.""" -from typing import Any, Callable, Optional +from typing import Callable, Optional import torch @@ -12,11 +12,10 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.prepare_finalize import ( MoEPrepareAndFinalizeNoEP) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( - TopKWeightAndReduceDelegate) + TopKWeightAndReduceDelegate, TopKWeightAndReduceNoOP) from vllm.model_executor.layers.fused_moe.utils import (_fp8_perm, _fp8_quantize, - _resize_cache, - extract_required_args) + _resize_cache) from vllm.scalar_type import scalar_types logger = init_logger(__name__) @@ -213,19 +212,14 @@ def run_cutlass_moe_fp8( output.copy_(c3[c_map].view(M * topk, K), non_blocking=True) -# TODO (bnell): split class batched vs. non-batched? 
-# maybe remove need for passing aq to workspace_shapes -class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute): +class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): def __init__( self, - max_experts_per_worker: int, out_dtype: Optional[torch.dtype], per_act_token_quant: bool, per_out_ch_quant: bool, block_shape: Optional[list[int]] = None, - num_dispatchers: Optional[int] = None, - use_batched_format: bool = False, ): super().__init__( FusedMoEQuantConfig( @@ -234,33 +228,84 @@ class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute): per_out_ch_quant=per_out_ch_quant, block_shape=block_shape, )) - assert max_experts_per_worker > 0 - assert not use_batched_format or num_dispatchers is not None - self.max_experts_per_worker = max_experts_per_worker - self.num_dispatchers = num_dispatchers self.out_dtype = out_dtype - self.use_batched_format = use_batched_format + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + # Let PrepareAndFinalize::finalize() decide the impl. 
+ return TopKWeightAndReduceDelegate() + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + apply_router_weight_on_input: bool, + ): + assert w1_zp is None, "w1_zp is not supported in CUTLASS MoE" + assert w2_zp is None, "w2_zp is not supported in CUTLASS MoE" + + expert_num_tokens = None + if expert_tokens_meta is not None: + expert_num_tokens = expert_tokens_meta.expert_num_tokens + + activation_callable = lambda o, i: self.activation(activation, o, i) + + use_batched_format = self.activation_formats[ + 0] == mk.FusedMoEActivationFormat.BatchedExperts + + in_dtype = hidden_states.dtype + run_cutlass_moe_fp8( + output, hidden_states, w1, w2, topk_ids, activation_callable, + global_num_experts, expert_map, w1_scale, w2_scale, a1q_scale, + a2_scale, workspace13, workspace2, expert_num_tokens, + self.out_dtype if self.out_dtype is not None else in_dtype, + self.per_act_token_quant, self.per_out_ch_quant, + use_batched_format) + + +class CutlassExpertsFp8(CutlassExpertsFp8Base): + + def __init__( + self, + out_dtype: Optional[torch.dtype], + per_act_token_quant: bool, + per_out_ch_quant: bool, + block_shape: Optional[list[int]] = None, + ): + super().__init__( + out_dtype, + per_act_token_quant, + per_out_ch_quant, + block_shape, + ) @property def activation_formats( self ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: - if self.use_batched_format: - return (mk.FusedMoEActivationFormat.BatchedExperts, - 
mk.FusedMoEActivationFormat.BatchedExperts) - else: - return (mk.FusedMoEActivationFormat.Standard, - mk.FusedMoEActivationFormat.Standard) + return (mk.FusedMoEActivationFormat.Standard, + mk.FusedMoEActivationFormat.Standard) def supports_chunking(self) -> bool: - return not self.use_batched_format + return True def supports_expert_map(self) -> bool: - return not self.use_batched_format - - def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: - # Let PrepareAndFinalize::finalize() decide the impl. - return TopKWeightAndReduceDelegate() + return True def workspace_shapes( self, @@ -274,54 +319,69 @@ class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute): local_num_experts: int, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: - workspace1: tuple[int, ...] = () - workspace2: tuple[int, ...] = () - output: tuple[int, ...] = () - if self.use_batched_format: - padded_M = aq.size(1) - num_dp = self.num_dispatchers - assert num_dp is not None - workspace1 = (self.max_experts_per_worker, padded_M * num_dp, - max(N, K)) - workspace2 = (self.max_experts_per_worker, padded_M * num_dp, - (N // 2)) - output = (self.max_experts_per_worker, padded_M, K) - else: - workspace1 = (M * topk, max(N, K)) - workspace2 = (M * topk, N // 2) - output = (M * topk, K) + workspace1 = (M * topk, max(N, K)) + workspace2 = (M * topk, N // 2) + output = (M * topk, K) return (workspace1, workspace2, output, self.out_dtype if self.out_dtype is not None else a.dtype) - def apply(self, output: torch.Tensor, hidden_states: torch.Tensor, - w1: torch.Tensor, w2: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.Tensor, activation: str, global_num_experts: int, - expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: 
Optional[torch.Tensor], workspace13: torch.Tensor, - workspace2: torch.Tensor, - expert_tokens_meta: Optional[mk.ExpertTokensMetadata], - apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]]): - assert w1_zp is None, "w1_zp is not supported in CUTLASS MoE" - assert w2_zp is None, "w2_zp is not supported in CUTLASS MoE" - expert_num_tokens = None - if expert_tokens_meta is not None: - expert_num_tokens = expert_tokens_meta.expert_num_tokens +class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base): - activation_callable = lambda o, i: self.activation(activation, o, i) + def __init__( + self, + max_experts_per_worker: int, + num_dispatchers: int, + out_dtype: Optional[torch.dtype], + per_act_token_quant: bool, + per_out_ch_quant: bool, + block_shape: Optional[list[int]] = None, + ): + super().__init__( + out_dtype, + per_act_token_quant, + per_out_ch_quant, + block_shape, + ) + assert max_experts_per_worker > 0 + self.max_experts_per_worker = max_experts_per_worker + self.num_dispatchers = num_dispatchers - in_dtype = hidden_states.dtype - run_cutlass_moe_fp8( - output, hidden_states, w1, w2, topk_ids, activation_callable, - global_num_experts, expert_map, w1_scale, w2_scale, a1q_scale, - a2_scale, workspace13, workspace2, expert_num_tokens, - self.out_dtype if self.out_dtype is not None else in_dtype, - self.per_act_token_quant, self.per_out_ch_quant, - self.use_batched_format) + @property + def activation_formats( + self + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return (mk.FusedMoEActivationFormat.BatchedExperts, + mk.FusedMoEActivationFormat.BatchedExperts) + + def supports_chunking(self) -> bool: + return False + + def supports_expert_map(self) -> bool: + return False + + # TODO(bnell): maybe remove need for passing aq to workspace_shapes + def workspace_shapes( + self, + a: torch.Tensor, + aq: torch.Tensor, + M: int, + N: int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + 
expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: + padded_M = aq.size(1) + num_dp = self.num_dispatchers + assert num_dp is not None + workspace1 = (self.max_experts_per_worker, padded_M * num_dp, + max(N, K)) + workspace2 = (self.max_experts_per_worker, padded_M * num_dp, (N // 2)) + output = (self.max_experts_per_worker, padded_M, K) + return (workspace1, workspace2, output, + self.out_dtype if self.out_dtype is not None else a.dtype) def cutlass_moe_fp8( @@ -387,11 +447,9 @@ def cutlass_moe_fp8( fn = mk.FusedMoEModularKernel( MoEPrepareAndFinalizeNoEP(), CutlassExpertsFp8( - max_experts_per_worker=num_experts, out_dtype=a.dtype, per_act_token_quant=per_act_token, per_out_ch_quant=per_out_ch, - use_batched_format=False, ), ) @@ -476,8 +534,9 @@ def run_cutlass_moe_fp4( e_w1, nx2_w1, half_k_w1 = w1_fp4.shape e_w2, k_w2, half_n_w2 = w2_fp4.shape - assert (e_w1 == e_w2 and e_w1 == e), ("Number of experts must match", - " between weights.") + assert (e_w1 == e_w2 + and e_w1 == e), ("Number of experts must match", + f" between weights. {e_w1}, {e_w2}, {e}") assert (k_a == half_k_w1 * 2 and k == k_w2), ("Hidden size mismatch between a, w1 and w2") assert (nx2_w1 == n * 2 and half_n_w2 * 2 == n), ("mismatch in " @@ -554,6 +613,10 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute): def __init__( self, + g1_alphas: torch.Tensor, + g2_alphas: torch.Tensor, + a1_gscale: torch.Tensor, + a2_gscale: torch.Tensor, max_experts_per_worker: int, out_dtype: torch.dtype, per_act_token_quant: bool, @@ -562,8 +625,12 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute): use_batched_format: bool = False, ): super().__init__( + # NVFP4 requires two levels of quantization, which involves + # computing some scaling factors dynamically. This makes it + # incompatible with the typical prepare -> MoE -> finalize + # pipeline. Move the quantization logic into the MoE body. 
FusedMoEQuantConfig( - quant_dtype=torch.uint8, + quant_dtype=None, # skip quantization in prepare/finalize per_act_token_quant=per_act_token_quant, per_out_ch_quant=per_out_ch_quant, block_shape=block_shape, @@ -572,6 +639,12 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute): self.out_dtype = out_dtype self.use_batched_format = use_batched_format + # TODO(bnell): put this stuff into quant config? + self.g1_alphas = g1_alphas + self.g2_alphas = g2_alphas + self.a1_gscale = a1_gscale + self.a2_gscale = a2_gscale + @property def activation_formats( self @@ -590,8 +663,7 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute): return True def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: - # Let PrepareAndFinalize::finalize() decide the impl. - return TopKWeightAndReduceDelegate() + return TopKWeightAndReduceNoOP() def workspace_shapes( self, @@ -620,34 +692,42 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute): return (workspace1, workspace2, output, self.out_dtype if self.out_dtype is not None else a.dtype) - def apply(self, output: torch.Tensor, hidden_states: torch.Tensor, - w1: torch.Tensor, w2: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.Tensor, activation: str, global_num_experts: int, - expert_map: Optional[torch.Tensor], w1_scale: torch.Tensor, - w2_scale: torch.Tensor, w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: torch.Tensor, workspace13: Optional[torch.Tensor], - workspace2: Optional[torch.Tensor], - expert_tokens_meta: Optional[mk.ExpertTokensMetadata], - apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]]): - required_keys = [ - "g1_alphas", "g2_alphas", "a1_gscale", "a2_gscale", "m", "n", "k", - "e", "device" - ] - (g1_alphas, g2_alphas, a1_gscale, a2_gscale, m, n, k, e, - device) = extract_required_args(extra_expert_args, required_keys) + def apply( + self, + output: torch.Tensor, + 
hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: torch.Tensor, + workspace13: Optional[torch.Tensor], + workspace2: Optional[torch.Tensor], + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + apply_router_weight_on_input: bool, + ): + e, m, n, k, _ = mk._moe_problem_size(hidden_states, w1, w2, topk_ids) + n = w2.shape[2] * 2 + run_cutlass_moe_fp4( output=output, a=hidden_states, - a1_gscale=a1_gscale, + a1_gscale=self.a1_gscale, w1_fp4=w1, w1_blockscale=w1_scale, - w1_alphas=g1_alphas, - a2_gscale=a2_gscale, + w1_alphas=self.g1_alphas, + a2_gscale=self.a2_gscale, w2_fp4=w2, w2_blockscale=w2_scale, - w2_alphas=g2_alphas, + w2_alphas=self.g2_alphas, topk_weights=topk_weights, topk_ids=topk_ids, workspace13=workspace13, @@ -656,7 +736,7 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute): n=n, k=k, e=e, - device=device, + device=hidden_states.device, apply_router_weight_on_input=apply_router_weight_on_input, ) @@ -677,7 +757,6 @@ def cutlass_moe_fp4( n: int, k: int, e: int, - device: torch.device, expert_map: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False) -> torch.Tensor: assert expert_map is None, ("Expert Parallelism / expert_map " @@ -686,6 +765,10 @@ def cutlass_moe_fp4( fn = mk.FusedMoEModularKernel( MoEPrepareAndFinalizeNoEP(), CutlassExpertsFp4( + g1_alphas, + g2_alphas, + a1_gscale, + a2_gscale, max_experts_per_worker=e, out_dtype=a.dtype, per_act_token_quant=False, @@ -693,29 +776,7 @@ def cutlass_moe_fp4( use_batched_format=False, ), ) - extra_expert_args = { - 'g1_alphas': g1_alphas, - 'g2_alphas': g2_alphas, - 'a1_gscale': a1_gscale, - 'a2_gscale': a2_gscale, - 'm': m, - 'n': n, - 'k': k, - 
'e': e, - 'device': device, - } - # NVFP4 requires two levels of quantization, which involves computing some - # scaling factors dynamically. This makes it incompatible with the typical - # prepare -> MoE -> finalize pipeline. Move the quantization logic into the - # MoE body. - extra_prepare_args = { - 'skip_quant': True, - } - # Similar reason as above. - extra_finalize_args = { - 'skip_weight_reduce': True, - } return fn( hidden_states=a, w1=w1_fp4, @@ -731,9 +792,6 @@ def cutlass_moe_fp4( a1_scale=None, a2_scale=None, apply_router_weight_on_input=apply_router_weight_on_input, - extra_expert_args=extra_expert_args, - extra_prepare_args=extra_prepare_args, - extra_finalize_args=extra_finalize_args, ) @@ -824,16 +882,6 @@ def run_cutlass_block_scaled_fused_experts( k = w1_q.size(1) n = w2_q.size(1) - expert_offsets = torch.empty((num_experts + 1, ), - dtype=torch.int32, - device="cuda") - problem_sizes1 = torch.empty((num_experts, 3), - dtype=torch.int32, - device="cuda") - problem_sizes2 = torch.empty((num_experts, 3), - dtype=torch.int32, - device="cuda") - topk = topk_ids.size(1) a_q, a1_scale = _fp8_quantize(a, @@ -842,6 +890,16 @@ def run_cutlass_block_scaled_fused_experts( block_shape=[128, 128]) device = a_q.device + expert_offsets = torch.empty((num_experts + 1, ), + dtype=torch.int32, + device=device) + problem_sizes1 = torch.empty((num_experts, 3), + dtype=torch.int32, + device=device) + problem_sizes2 = torch.empty((num_experts, 3), + dtype=torch.int32, + device=device) + a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device) c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device) diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index 9b8175f42a9d2..7b8467a5a0cf0 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools -from typing import Any, Optional +from typing import Optional import torch from tqdm import tqdm @@ -230,7 +230,6 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]], ): assert self.block_shape is not None assert a1q_scale is not None diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index f6b62254e7b4c..437e569d3130d 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Optional import deep_ep import torch @@ -127,12 +127,16 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): expert_topk_weights) def prepare( - self, a1: torch.Tensor, a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, - topk_ids: torch.Tensor, num_experts: int, - expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - extra_prepare_args: Optional[dict[str, Any]] ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor], Optional[torch.Tensor]]: @@ -187,11 +191,15 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): return (expert_x, expert_x_scale, 
expert_tokens_meta, expert_topk_ids, expert_topk_weights) - def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - apply_router_weight_on_input: bool, - weight_and_reduce_impl: mk.TopKWeightAndReduce, - extra_finalize_args: Optional[dict[str, Any]]) -> None: + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> None: assert self.handle is not None diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py index cfc2bdcf02408..93ac11fb4bfbf 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional, Union +from typing import Optional, Union import deep_ep import torch @@ -77,7 +77,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): a1_scale: Optional[torch.Tensor], a2_scale: Optional[torch.Tensor], a1_dtype: torch.dtype, - quant_dtype: Optional[torch.dtype], + quant_dtype: Union[torch.dtype, str, None], per_act_token_quant: bool, block_shape: Optional[list[int]], ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: @@ -111,12 +111,16 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): return x, x_scales def prepare( - self, a1: torch.Tensor, a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, - topk_ids: torch.Tensor, num_experts: int, - expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: 
Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - extra_prepare_args: Optional[dict[str, Any]] ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor], Optional[torch.Tensor]]: @@ -162,11 +166,15 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): return (expert_x, expert_x_scale, expert_tokens_meta, None, None) - def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - apply_router_weight_on_input: bool, - weight_and_reduce_impl: mk.TopKWeightAndReduce, - extra_finalize_args: Optional[dict[str, Any]]) -> None: + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> None: assert isinstance( weight_and_reduce_impl, TopKWeightAndReduceDelegate ), ("Weight application and reduction happens in the combine kernel.") diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index 4e3e15a35ada2..3fbe2a0bc69bb 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Optional, Union import torch @@ -8,8 +8,7 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( 
- TopKWeightAndReduceDelegate) -from vllm.model_executor.layers.fused_moe.utils import extract_required_args + TopKWeightAndReduceNoOP) from vllm.utils.flashinfer import (flashinfer_cutlass_fused_moe, has_flashinfer_cutlass_fused_moe) @@ -20,7 +19,7 @@ def is_valid_flashinfer_cutlass_fused_moe(hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor) -> bool: """ - Check if the given problem size is supported by the FlashInfer CUTLASS MoE + Check if the given problem size is supported by the FlashInfer CUTLASS MoE kernel. """ if not has_flashinfer_cutlass_fused_moe(): @@ -43,31 +42,34 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): def __init__( self, - use_nvfp4_w4a4: bool = False, - use_fp8_w8a8: bool = False, - use_dp: bool = False, + g1_alphas: torch.Tensor, + g2_alphas: torch.Tensor, + a1_gscale: torch.Tensor, + a2_gscale: torch.Tensor, + out_dtype: torch.dtype, + quant_dtype: Union[torch.dtype, str, None], ep_rank: int = 0, ep_size: int = 1, tp_rank: int = 0, tp_size: int = 1, - num_dispatchers: Optional[int] = None, - use_batched_format: bool = False, ): super().__init__( FusedMoEQuantConfig( - quant_dtype=torch.uint8, + quant_dtype=quant_dtype, per_act_token_quant=False, block_shape=None, )) - self.use_nvfp4_w4a4 = use_nvfp4_w4a4 - self.use_fp8_w8a8 = use_fp8_w8a8 + assert quant_dtype == "nvfp4", ("Only nvfp4 quantization is " + "currently supported.") self.ep_rank = ep_rank self.ep_size = ep_size self.tp_rank = tp_rank self.tp_size = tp_size - self.use_dp = use_dp - assert not use_batched_format or num_dispatchers is not None - self.num_dispatchers = num_dispatchers + self.g1_alphas = g1_alphas + self.g2_alphas = g2_alphas + self.a1_gscale = a1_gscale + self.a2_gscale = a2_gscale + self.out_dtype = out_dtype @property def activation_formats( @@ -84,8 +86,7 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): return True def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: - # Let 
PrepareAndFinalize::finalize() decide the impl. - return TopKWeightAndReduceDelegate() + return TopKWeightAndReduceNoOP() def workspace_shapes( self, @@ -117,8 +118,6 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): - Note: in order for activation chunking to work, the first dimension of each tuple must be the number of tokens. """ - assert self.use_nvfp4_w4a4 is True, ("Only nvfp4 quantization is " - "currently supported.") aq_m, aq_n = aq.shape workspace2 = () output_shape = (aq_m, aq_n * 2) @@ -149,21 +148,9 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): workspace2: Optional[torch.Tensor], expert_tokens_meta: Optional[mk.ExpertTokensMetadata], apply_router_weight_on_input: Optional[bool], - extra_expert_args: Optional[dict[str, Any]], ): - assert extra_expert_args is not None, \ - "extra_expert_args must be provided" - required_keys = [ - 'g1_alphas', 'g2_alphas', 'a1_gscale', 'a2_gscale', 'out_dtype' - ] - - g1_alphas, g2_alphas, a1_gscale, a2_gscale, out_dtype = ( - extract_required_args(extra_expert_args, required_keys)) - # Flashinfer CUTLASS kernel takes scalar global scales, # min because inv_scale. 
- assert self.use_nvfp4_w4a4 is True, ("Only nvfp4 quantization is " - "currently supported.") # Ensure w1_scale and w2_scale are not None before calling view assert w1_scale is not None and w2_scale is not None, ( @@ -171,12 +158,12 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): "be None for FlashInferExperts") quant_scales = [ - a1_gscale, + self.a1_gscale, w1_scale.view(torch.int32), - g1_alphas, - a2_gscale, + self.g1_alphas, + self.a2_gscale, w2_scale.view(torch.int32), - g2_alphas, + self.g2_alphas, ] _ = flashinfer_cutlass_fused_moe( input=hidden_states, @@ -185,7 +172,7 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): # FlashInfer API requires weight to be long for nvfp4 fc1_expert_weights=w1.view(torch.long), fc2_expert_weights=w2.view(torch.long), - output_dtype=out_dtype, + output_dtype=self.out_dtype, quant_scales=quant_scales, input_sf=a1q_scale, tp_size=self.tp_size, diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py index 36aca8cf74b6d..061b02172c446 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Optional import torch @@ -9,7 +9,7 @@ from vllm.distributed import get_dp_group from vllm.forward_context import get_forward_context from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.utils import ( - extract_required_args, moe_kernel_quantize_input) + moe_kernel_quantize_input) from vllm.utils.flashinfer import nvfp4_block_scale_interleave @@ -21,16 +21,15 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): def __init__( 
self, - quant_dtype: Optional[torch.dtype] = None, - per_channel_quant: bool = False, - block_shape: Optional[list[int]] = None, + use_dp: bool, + a1_gscale: Optional[torch.Tensor], num_dispatchers: int = 1, ): super().__init__() - self.per_channel_quant = per_channel_quant - self.block_shape = block_shape - self.quant_dtype = quant_dtype self.num_dispatchers_ = num_dispatchers + self.use_dp = use_dp + self.a1_gscale = a1_gscale + self.local_tokens = None @property def activation_format(self) -> mk.FusedMoEActivationFormat: @@ -55,10 +54,11 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): num_experts: int, expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, + # TODO(bnell): use quant_config + scales instead of ctor args quant_config: FusedMoEQuantConfig, - extra_prepare_args: Optional[dict[str, Any]] - ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], - Optional[torch.Tensor], Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor], + Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor], + Optional[torch.Tensor]]: if apply_router_weight_on_input: topk = topk_ids.size(1) @@ -67,22 +67,22 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): "apply_router_weight_on_input is only implemented for topk=1" a1.mul_(topk_weights.to(a1.dtype)) - (a1_gscale, use_dp, local_tokens) = extract_required_args( - extra_prepare_args, ['a1_gscale', 'use_dp', 'local_tokens']) - a1q, a1q_scale = moe_kernel_quantize_input( a1, - a1_gscale, + self.a1_gscale, quant_config.quant_dtype, - self.per_channel_quant, - self.block_shape, - is_fp4_scale_swizzled=not use_dp, # Swizzling after communication + quant_config.per_act_token_quant, + quant_config.block_shape, + # Swizzling after communication + is_fp4_scale_swizzled=not self.use_dp, ) - if use_dp: + if self.use_dp: topk_weights, topk_ids, a1q, a1q_scale = \ - get_dp_group().all_gatherv([topk_weights, topk_ids, a1q, 
a1q_scale], # noqa: E501 - dim=0, - sizes=get_local_sizes()) + get_dp_group().all_gatherv( + [topk_weights, topk_ids, a1q, a1q_scale], + dim=0, + sizes=get_local_sizes(), + ) a1_m, a1_n = a1q.shape a1q_scale = nvfp4_block_scale_interleave(a1q_scale) @@ -91,13 +91,9 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, apply_router_weight_on_input: bool, - weight_and_reduce_impl: mk.TopKWeightAndReduce, - extra_finalize_args: Optional[dict[str, Any]]) -> None: + weight_and_reduce_impl: mk.TopKWeightAndReduce) -> None: - (use_dp, - local_tokens) = extract_required_args(extra_finalize_args, - ['use_dp', 'local_tokens']) - if use_dp: + if self.use_dp: fused_expert_output = get_dp_group().reduce_scatterv( fused_expert_output, dim=0, sizes=get_local_sizes()) output.copy_(fused_expert_output) diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py index 9a5c85e120cc1..b46f4be4b912e 100644 --- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Fused batched MoE kernel.""" -from typing import Any, Optional +from typing import Optional import torch @@ -496,12 +496,16 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): return self.num_dispatchers_ def prepare( - self, a1: torch.Tensor, a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, - topk_ids: torch.Tensor, num_experts: int, - expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: 
torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - extra_prepare_args: Optional[dict[str, Any]] ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor], Optional[torch.Tensor]]: @@ -590,11 +594,15 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): return b_a1, b_a1_scale, expert_tokens_meta, None, None - def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - apply_router_weight_on_input: bool, - weight_and_reduce_impl: mk.TopKWeightAndReduce, - extra_finalize_args: Optional[dict[str, Any]]) -> None: + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> None: if isinstance(weight_and_reduce_impl, TopKWeightAndReduceDelegate): weight_and_reduce_impl = TopKWeightAndReduceNaiveBatched(self.rank) weight_and_reduce_impl.apply( @@ -688,18 +696,28 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute): else: return t.to(f32) * group_broadcast(scale, t.shape) - def apply(self, output: torch.Tensor, hidden_states: torch.Tensor, - w1: torch.Tensor, w2: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.Tensor, activation: str, global_num_experts: int, - expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, - workspace2: torch.Tensor, - expert_tokens_meta: Optional[mk.ExpertTokensMetadata], - apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]]): + def apply( + self, + output: torch.Tensor, + 
hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + apply_router_weight_on_input: bool, + ): assert hidden_states.dim() == 3 assert expert_tokens_meta is not None expert_num_tokens = expert_tokens_meta.expert_num_tokens @@ -894,18 +912,28 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): output = (num_experts, max_num_tokens * num_dp, K) return (workspace13, workspace2, output, a.dtype) - def apply(self, output: torch.Tensor, hidden_states: torch.Tensor, - w1: torch.Tensor, w2: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.Tensor, activation: str, global_num_experts: int, - expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, - workspace2: torch.Tensor, - expert_tokens_meta: Optional[mk.ExpertTokensMetadata], - apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]]): + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: 
Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + apply_router_weight_on_input: bool, + ): # Check constraints. if self.use_int4_w4a16: assert hidden_states.size(-1) // 2 == w1.size(2), ( diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 1c497fa5521b9..e58a9e568d4a4 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1394,9 +1394,9 @@ def fused_experts(hidden_states: torch.Tensor, # E8M0 scale, which means we requantize the weight and input to the specific # scale. Fallen back to cutlass or triton for some cases would cause # accuracy issue. - should_use_deep_gemm = is_blackwell_deep_gemm_e8m0_used( - ) or _valid_deep_gemm(hidden_states, w1, w2) - if (allow_deep_gemm and use_fp8_w8a8 and should_use_deep_gemm): + if (allow_deep_gemm and use_fp8_w8a8 + and (is_blackwell_deep_gemm_e8m0_used() + or _valid_deep_gemm(hidden_states, w1, w2))): assert apply_router_weight_on_input is False assert is_act_and_mul, ( "DeepGemm only supports is_act_and_mul=True for now.") @@ -1905,7 +1905,6 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]], ): # Check constraints. 
if self.use_int4_w4a16: diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 6b5284dc6c96c..312befe2c1d71 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Optional import torch @@ -8,7 +8,6 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate) -from vllm.model_executor.layers.fused_moe.utils import extract_required_args from vllm.utils import has_triton_kernels logger = init_logger(__name__) @@ -160,12 +159,16 @@ class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): num_dispatchers: int, w1_precision: "PrecisionConfig", w2_precision: "PrecisionConfig", + w1_bias: Optional[torch.Tensor], + w2_bias: Optional[torch.Tensor], ): super().__init__(quant_config) self.max_num_tokens = max_num_tokens self.num_dispatchers = num_dispatchers self.w1_precision = w1_precision self.w2_precision = w2_precision + self.w1_bias = w1_bias + self.w2_bias = w2_bias @property def activation_formats( @@ -219,11 +222,7 @@ class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): workspace2: torch.Tensor, expert_tokens_meta: Optional[mk.ExpertTokensMetadata], apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]], ): - w1_bias, w2_bias = (extract_required_args(extra_expert_args, - ["w1_bias", "w2_bias"])) - return triton_kernel_fused_experts( output, hidden_states, @@ -240,8 +239,8 @@ class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): expert_map=expert_map, 
w1_scale=w1_scale, w2_scale=w2_scale, - w1_bias=w1_bias, - w2_bias=w2_bias, + w1_bias=self.w1_bias, + w2_bias=self.w2_bias, w1_precision=self.w1_precision, w2_precision=self.w2_precision, a1_scale=a1q_scale, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 36e75825853e6..c3c6e47827504 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -37,7 +37,6 @@ from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum from vllm.utils import (direct_register_custom_op, has_deep_ep, has_pplx, round_up) -from vllm.utils.flashinfer import has_flashinfer if current_platform.is_cuda_alike(): from .fused_batched_moe import BatchedTritonExperts @@ -49,9 +48,6 @@ if current_platform.is_cuda_alike(): from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize from .deepep_ll_prepare_finalize import (DEEPEP_QUANT_BLOCK_SHAPE, DeepEPLLPrepareAndFinalize) - if has_flashinfer(): - from .flashinfer_cutlass_prepare_finalize import ( - FlashInferCutlassMoEPrepareAndFinalize) else: fused_experts = None # type: ignore FusedMoEPermuteExpertsUnpermute = None # type: ignore @@ -80,7 +76,12 @@ class FusedMoeWeightScaleSupported(Enum): class FusedMoEMethodBase(QuantizeMethodBase): - moe: FusedMoEConfig + # TODO(bnell): also pass quant_config? 
+ def __init__(self, moe: FusedMoEConfig): + super().__init__() + self.moe = moe + self.fused_experts: Optional[Callable] = None + self.topk_indices_dtype = None @abstractmethod def create_weights(self, layer: torch.nn.Module, num_experts: int, @@ -99,16 +100,16 @@ class FusedMoEMethodBase(QuantizeMethodBase): return False @staticmethod - def maybe_make_prepare_finalize( - moe: FusedMoEConfig) -> Optional[FusedMoEPrepareAndFinalize]: + def _maybe_make_prepare_finalize( + moe: FusedMoEConfig, ) -> Optional[FusedMoEPrepareAndFinalize]: all2all_manager = get_ep_group().device_communicator.all2all_manager assert all2all_manager is not None prepare_finalize: Optional[FusedMoEPrepareAndFinalize] = None - if moe.use_flashinfer_cutlass_kernels: - prepare_finalize = FlashInferCutlassMoEPrepareAndFinalize( - quant_dtype=moe.quant_dtype, ) + assert not moe.use_flashinfer_cutlass_kernels, \ + "Must be created in modelopt.py" + if moe.use_pplx_kernels: hidden_dim_bytes, hidden_scale_bytes = pplx_hidden_dim_scale_bytes( moe.max_num_tokens, @@ -188,14 +189,25 @@ class FusedMoEMethodBase(QuantizeMethodBase): return prepare_finalize - def init_prepare_finalize(self, moe: FusedMoEConfig): - self.moe = moe - prepare_finalize = FusedMoEMethodBase.maybe_make_prepare_finalize( - self.moe) + def maybe_make_prepare_finalize( + self, + moe: FusedMoEConfig, + ) -> Optional[FusedMoEPrepareAndFinalize]: + if moe.moe_parallel_config.use_all2all_kernels: + return FusedMoEMethodBase._maybe_make_prepare_finalize(moe) + else: + return None + + def init_prepare_finalize(self): + assert self.moe is not None + prepare_finalize = self.maybe_make_prepare_finalize(self.moe) - self.topk_indices_dtype = None if prepare_finalize is not None: - logger.debug("%s", prepare_finalize.__class__.__name__) + logger.debug("%s for %s(%s)", prepare_finalize.__class__.__name__, + self, id(self)) + assert self.topk_indices_dtype is None + assert self.fused_experts is None, \ + f"Attempt to override experts for 
{id(self)}!" self.topk_indices_dtype = prepare_finalize.topk_indices_dtype() experts = self.select_gemm_impl(prepare_finalize, self.moe) self.fused_experts = FusedMoEModularKernel( @@ -214,12 +226,6 @@ class FusedMoEMethodBase(QuantizeMethodBase): f"{self.__class__.__name__} must select appropriate gemm " "implementation based on the prepare_finalize") - def maybe_swap_experts_impl( - self, - moe_parallel_config: FusedMoEParallelConfig, - ): - pass - @abstractmethod def apply( self, @@ -251,10 +257,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): """MoE method without quantization.""" def __init__(self, moe: FusedMoEConfig): - super().__init__() - self.fused_experts = fused_experts # type: ignore - self.topk_indices_dtype = None - self.moe = moe + super().__init__(moe) self.has_bias = self.moe.has_bias self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled() if self.rocm_aiter_moe_enabled: @@ -266,6 +269,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): def select_gemm_impl( self, prepare_finalize: FusedMoEPrepareAndFinalize, + # TODO(bnell): Remove. Every layer should have an moe config object. 
moe: FusedMoEConfig, ) -> FusedMoEPermuteExpertsUnpermute: if (prepare_finalize.activation_format == @@ -474,9 +478,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): expert_map=expert_map, activation=activation, apply_router_weight_on_input=apply_router_weight_on_input) - else: - # add w1_bias/w2_bias to kwargs if they exist - kwargs = dict( + elif self.fused_experts is not None: + if self.has_bias: + raise ValueError( + "FusedMoEModularKernel does not support bias.") + return self.fused_experts( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, @@ -488,17 +494,22 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): global_num_experts=global_num_experts, expert_map=expert_map, ) - if isinstance(self.fused_experts, - FusedMoEModularKernel) and self.has_bias: - raise ValueError( - "FusedMoEModularKernel does not support bias.") - if self.has_bias: - kwargs.update({ - "w1_bias": getattr(layer, "w13_bias", None), - "w2_bias": getattr(layer, "w2_bias", None), - }) - - return self.fused_experts(**kwargs) + else: + assert fused_experts is not None + return fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + w1_bias=layer.w13_bias if self.has_bias else None, + w2_bias=layer.w2_bias if self.has_bias else None, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map, + ) def forward_cpu( self, @@ -868,8 +879,6 @@ class FusedMoE(CustomOp): moe_quant_params["intermediate_size_full"] = intermediate_size self.quant_method.create_weights(layer=self, **moe_quant_params) - if isinstance(self.quant_method, FusedMoEMethodBase): - self.quant_method.maybe_swap_experts_impl(self.moe_parallel_config) # Chunked all2all staging tensor self.batched_hidden_states: Optional[torch.Tensor] = None diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py 
b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 6262904e4dca1..2ea6383d5ae90 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from enum import Enum from math import prod -from typing import Any, Optional, final +from typing import Optional, final import torch @@ -150,15 +150,23 @@ class FusedMoEPrepareAndFinalize(ABC): @abstractmethod def prepare( - self, a1: torch.Tensor, a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, - topk_ids: torch.Tensor, num_experts: int, - expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - extra_prepare_args: Optional[dict[str, Any]] - ) -> tuple[torch.Tensor, Optional[torch.Tensor], - Optional[ExpertTokensMetadata], Optional[torch.Tensor], - Optional[torch.Tensor]]: + ) -> tuple[ + torch.Tensor, + Optional[torch.Tensor], + Optional[ExpertTokensMetadata], + Optional[torch.Tensor], + Optional[torch.Tensor], + ]: """ Perform any quantization (and/or) dispatching needed for this kernel. 
@@ -186,11 +194,15 @@ class FusedMoEPrepareAndFinalize(ABC): raise NotImplementedError @abstractmethod - def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - apply_router_weight_on_input: bool, - weight_and_reduce_impl: TopKWeightAndReduce, - extra_finalize_args: Optional[dict[str, Any]]) -> None: + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: TopKWeightAndReduce, + ) -> None: """ Perform any combine plus apply weights and perform a reduction on the fused experts output. @@ -368,7 +380,6 @@ class FusedMoEPermuteExpertsUnpermute(ABC): workspace2: torch.Tensor, expert_tokens_meta: Optional[ExpertTokensMetadata], apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]], ): """ This function computes the intermediate result of a Mixture of Experts @@ -454,18 +465,27 @@ class FusedMoEModularKernel(torch.nn.Module): f"{fused_experts.activation_formats[0]}") def _do_fused_experts( - self, fused_out: Optional[torch.Tensor], a1: torch.Tensor, - a1q: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, global_num_experts: int, local_num_experts: int, - expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], w2_scale: Optional[torch.Tensor], - w1_zp: Optional[torch.Tensor], w2_zp: Optional[torch.Tensor], - a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], - expert_tokens_meta: Optional[ExpertTokensMetadata], - apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]]) -> torch.Tensor: + self, + fused_out: Optional[torch.Tensor], + a1: torch.Tensor, + a1q: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + 
global_num_experts: int, + local_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + expert_tokens_meta: Optional[ExpertTokensMetadata], + apply_router_weight_on_input: bool, + ) -> torch.Tensor: _, M, N, K, top_k = _moe_problem_size(a1q, w1, w2, topk_ids) @@ -509,7 +529,7 @@ class FusedMoEModularKernel(torch.nn.Module): workspace2=workspace2, expert_tokens_meta=expert_tokens_meta, apply_router_weight_on_input=apply_router_weight_on_input, - extra_expert_args=extra_expert_args) + ) return fused_out @@ -533,7 +553,6 @@ class FusedMoEModularKernel(torch.nn.Module): a2_scale: Optional[torch.Tensor], expert_tokens_meta: Optional[ExpertTokensMetadata], apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]], ) -> torch.Tensor: _, M, N, K, top_k = _moe_problem_size(a1q, w1, w2, topk_ids) @@ -541,6 +560,9 @@ class FusedMoEModularKernel(torch.nn.Module): CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE num_chunks = cdiv(M, CHUNK_SIZE) + # TODO(bnell): get rid of one level here, update slice functions + # to nops on num_chunks==1 + if not self.fused_experts.supports_chunking() or num_chunks == 1: return self._do_fused_experts( fused_out=None, @@ -562,7 +584,7 @@ class FusedMoEModularKernel(torch.nn.Module): a2_scale=a2_scale, expert_tokens_meta=expert_tokens_meta, apply_router_weight_on_input=apply_router_weight_on_input, - extra_expert_args=extra_expert_args) + ) # Chunking required case assert num_chunks > 1 @@ -618,15 +640,6 @@ class FusedMoEModularKernel(torch.nn.Module): expert_num_tokens=c_expert_num_tokens, expert_num_tokens_cpu=c_expert_num_tokens_cpu) - m = None - if extra_expert_args is not None and 'm' in extra_expert_args: - m = extra_expert_args.get('m') - - if extra_expert_args is not None: - chunked_extra_expert_args = 
extra_expert_args - else: - chunked_extra_expert_args = {} - for chunk_idx in range(num_chunks): c_a1q, c_a1q_scale, c_a2_scale, c_topk_ids, c_topk_weights = ( slice_input_tensors(chunk_idx)) @@ -637,11 +650,6 @@ class FusedMoEModularKernel(torch.nn.Module): expert_tokens_meta, c_topk_ids, local_num_experts, expert_map) - s = chunk_idx * CHUNK_SIZE - e = min(s + CHUNK_SIZE, M) - - if m is not None: - chunked_extra_expert_args['m'] = e - s self._do_fused_experts( fused_out=slice_output_tensor(chunk_idx), a1=a1, @@ -662,7 +670,7 @@ class FusedMoEModularKernel(torch.nn.Module): a2_scale=c_a2_scale, expert_tokens_meta=c_expert_tokens_meta, apply_router_weight_on_input=apply_router_weight_on_input, - extra_expert_args=chunked_extra_expert_args) + ) return fused_out @@ -684,9 +692,6 @@ class FusedMoEModularKernel(torch.nn.Module): a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, - extra_expert_args: Optional[dict] = None, - extra_prepare_args: Optional[dict] = None, - extra_finalize_args: Optional[dict] = None, ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets @@ -719,12 +724,6 @@ class FusedMoEModularKernel(torch.nn.Module): - apply_router_weight_on_input (bool): When true, the topk weights are applied directly on the inputs. This is only applicable when topk is 1. - - extra_expert_args (Optional[dict]): Extra keyword arguments to pass to - fused_experts.apply. - - extra_prepare_args (Optional[dict]): Extra keyword arguments to pass - to prepare. - - extra_finalize_args (Optional[dict]): Extra keyword arguments to pass - to finalize. Returns: - torch.Tensor: The output tensor after applying the MoE layer. @@ -748,7 +747,6 @@ class FusedMoEModularKernel(torch.nn.Module): expert_map, apply_router_weight_on_input, self.fused_experts.quant_config, - extra_prepare_args, ) # Maybe prepare gathered topk_ids and topk_weights from other EP ranks. 
@@ -786,12 +784,15 @@ class FusedMoEModularKernel(torch.nn.Module): a2_scale=a2_scale, expert_tokens_meta=expert_tokens_meta, apply_router_weight_on_input=apply_router_weight_on_input, - extra_expert_args=extra_expert_args) + ) self.prepare_finalize.finalize( - output, fused_out, topk_weights, topk_ids, + output, + fused_out, + topk_weights, + topk_ids, apply_router_weight_on_input, self.fused_experts.finalize_weight_and_reduce_impl(), - extra_finalize_args) + ) return output diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index 46931f2dd7c78..401f37922b7bb 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Optional, Union import pplx_kernels as pplx import torch @@ -21,7 +21,7 @@ def pplx_hidden_dim_scale_bytes( max_num_tokens: int, hidden_dim: int, in_dtype: torch.dtype, - quant_dtype: Optional[torch.dtype], + quant_dtype: Union[torch.dtype, str, None], per_act_token_quant: bool, block_shape: Optional[list[int]], ): @@ -32,6 +32,7 @@ def pplx_hidden_dim_scale_bytes( # ceil_div(hidden_dim, block_size) * sizeof(float32) # For per-token: set to 4 * sizeof(float32) (x4 for alignment) if quant_dtype is not None: + assert isinstance(quant_dtype, torch.dtype) assert quant_dtype.itemsize == 1 hidden_dim_bytes = hidden_dim * quant_dtype.itemsize elem_size = torch.float32.itemsize @@ -89,12 +90,16 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): return self.num_dispatchers_ def prepare( - self, a1: torch.Tensor, a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, - topk_ids: torch.Tensor, num_experts: int, - expert_map: Optional[torch.Tensor], 
apply_router_weight_on_input: bool, + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - extra_prepare_args: Optional[dict[str, Any]] ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor], Optional[torch.Tensor]]: @@ -213,11 +218,15 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): return expert_x, expert_x_scale, expert_tokens_meta, None, None - def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - apply_router_weight_on_input: bool, - weight_and_reduce_impl: mk.TopKWeightAndReduce, - extra_finalize_args: Optional[dict[str, Any]]) -> None: + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> None: assert isinstance( weight_and_reduce_impl, TopKWeightAndReduceDelegate ), ("Weight application and reduction happens in the combine kernel.") diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py index 696c7cdba9a7b..567a0a88fec0a 100644 --- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Optional import torch @@ -38,7 +38,6 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - 
extra_prepare_args: Optional[dict[str, Any]], ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor], Optional[torch.Tensor]]: @@ -50,32 +49,26 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): "apply_router_weight_on_input is only implemented for topk=1" a1.mul_(topk_weights.to(a1.dtype)) - if (extra_prepare_args is not None - and extra_prepare_args.get("skip_quant", True)): - # Skip quantization if explicitly requested - return a1, None, None, None, None - a1q, a1q_scale = moe_kernel_quantize_input( a1, a1_scale, quant_config.quant_dtype, quant_config.per_act_token_quant, quant_config.block_shape) return a1q, a1q_scale, None, None, None - def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - apply_router_weight_on_input: bool, - weight_and_reduce_impl: mk.TopKWeightAndReduce, - extra_finalize_args: Optional[dict[str, Any]]) -> None: - if (extra_finalize_args is not None - and extra_finalize_args.get("skip_weight_reduce", True)): - assert output.shape == fused_expert_output.shape - output.copy_(fused_expert_output) - else: - if isinstance(weight_and_reduce_impl, TopKWeightAndReduceDelegate): - weight_and_reduce_impl = TopKWeightAndReduceContiguous() - weight_and_reduce_impl.apply( - output=output, - fused_expert_output=fused_expert_output, - topk_weights=topk_weights, - topk_ids=topk_ids, - apply_router_weight_on_input=apply_router_weight_on_input) + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> None: + if isinstance(weight_and_reduce_impl, TopKWeightAndReduceDelegate): + weight_and_reduce_impl = TopKWeightAndReduceContiguous() + weight_and_reduce_impl.apply( + output=output, + fused_expert_output=fused_expert_output, + 
topk_weights=topk_weights, + topk_ids=topk_ids, + apply_router_weight_on_input=apply_router_weight_on_input) diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py index 9d0ff2e06190e..486ca881df48c 100644 --- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Optional import torch @@ -119,18 +119,28 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): local_num_experts, expert_tokens_meta) - def apply(self, output: torch.Tensor, hidden_states: torch.Tensor, - w1: torch.Tensor, w2: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.Tensor, activation: str, global_num_experts: int, - expert_map: Optional[torch.Tensor], - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], w1_zp: Optional[torch.Tensor], - w2_zp: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], workspace13: torch.Tensor, - workspace2: torch.Tensor, - expert_tokens_meta: Optional[mk.ExpertTokensMetadata], - apply_router_weight_on_input: bool, - extra_expert_args: Optional[dict[str, Any]]): + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: Optional[mk.ExpertTokensMetadata], + 
apply_router_weight_on_input: bool, + ): use_deep_gemm = (self.allow_deep_gemm and (_valid_deep_gemm(hidden_states, w1, w2) or is_blackwell_deep_gemm_e8m0_used())) @@ -158,5 +168,4 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): workspace2, expert_tokens_meta, apply_router_weight_on_input, - extra_expert_args, ) diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py index 966471b5c59b4..4c3e700ad3990 100644 --- a/vllm/model_executor/layers/fused_moe/utils.py +++ b/vllm/model_executor/layers/fused_moe/utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from math import prod -from typing import Any, Optional, Union +from typing import Optional, Union import torch @@ -189,7 +189,7 @@ def moe_kernel_quantize_input( return _fp8_quantize(A, A_scale, per_act_token_quant, block_shape) elif quant_dtype == torch.int8: return _int8_quantize(A, A_scale, per_act_token_quant, block_shape) - elif quant_dtype == torch.uint8: # nvfp4 + elif quant_dtype == "nvfp4": return _fp4_quantize(A, A_scale, is_sf_swizzled_layout=is_fp4_scale_swizzled) @@ -252,17 +252,3 @@ def _validate_scale_shape( assert block_shape is not None expected = (a.shape[0], cdiv(a.shape[1], block_shape[1])) assert a_scale.shape == expected, f"{a_scale.shape} == {expected}" - - -def extract_required_args( - extra_args: Optional[dict[str, Any]], - required_keys: list[str], -) -> tuple[Any, ...]: - if extra_args is None: - raise ValueError("`extra_args` must be provided.") - - missing_keys = [k for k in required_keys if k not in extra_args] - if missing_keys: - raise ValueError(f"Missing keys in `extra_args`: {missing_keys}") - - return tuple(extra_args[k] for k in required_keys) diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py index a9e967e608e96..fb285413ba9ef 100644 --- 
a/vllm/model_executor/layers/quantization/auto_round.py +++ b/vllm/model_executor/layers/quantization/auto_round.py @@ -241,7 +241,7 @@ class AutoRoundConfig(QuantizationConfig): if isinstance(layer, FusedMoE): if use_marlin: - return AWQMoEMethod(quant_args_marlin) + return AWQMoEMethod(quant_args_marlin, layer.moe) from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) @@ -339,7 +339,7 @@ class AutoRoundConfig(QuantizationConfig): } return MoeWNA16Config.from_config(config).get_quant_method( layer, prefix) - return GPTQMarlinMoEMethod(quant_args_marlin) + return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe) if isinstance(layer, (LinearBase, ParallelLMHead)): if use_marlin: diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index fe42e26a17061..af602eb9aca38 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -113,7 +113,7 @@ class AWQConfig(QuantizationConfig): } awq_marlin_config = AWQMarlinConfig.from_config( marlin_compatible_config_dict) - return AWQMoEMethod(awq_marlin_config) + return AWQMoEMethod(awq_marlin_config, layer.moe_config) return None diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index ed7ffb21e85aa..287d66b06d6e9 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -10,7 +10,7 @@ import vllm.model_executor.layers.fused_moe # noqa from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, + FusedMoE, FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported, UnquantizedFusedMoEMethod) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod, @@ -151,7 +151,7 @@ class 
AWQMarlinConfig(QuantizationConfig): "Falling back to Moe WNA16 kernels.") return MoeWNA16Config.from_config( self.full_config).get_quant_method(layer, prefix) - return AWQMoEMethod(self) + return AWQMoEMethod(self, layer.moe_config) return None @classmethod @@ -328,7 +328,12 @@ class AWQMarlinLinearMethod(LinearMethodBase): class AWQMoEMethod(FusedMoEMethodBase): - def __init__(self, quant_config: AWQMarlinConfig): + def __init__( + self, + quant_config: AWQMarlinConfig, + moe: FusedMoEConfig, + ): + super().__init__(moe) self.quant_config = quant_config if self.quant_config.weight_bits != 4: raise ValueError("AWQMoEMethod only supports 4bit now.") @@ -500,6 +505,8 @@ class AWQMoEMethod(FusedMoEMethodBase): logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError( "EPLB not supported for `AWQMoEMethod` yet.") @@ -516,7 +523,8 @@ class AWQMoEMethod(FusedMoEMethodBase): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) return torch.ops.vllm.fused_marlin_moe( x, @@ -535,4 +543,4 @@ class AWQMoEMethod(FusedMoEMethodBase): expert_map=expert_map, w1_zeros=layer.w13_qzeros, w2_zeros=layer.w2_qzeros, - workspace=layer.workspace) \ No newline at end of file + workspace=layer.workspace) diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 0204ff46852f4..b7897a43793c7 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -7,6 +7,7 @@ import torch from packaging import version from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, + FusedMoEConfig, 
FusedMoEMethodBase) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod, @@ -132,7 +133,7 @@ class BitsAndBytesConfig(QuantizationConfig): return UnquantizedLinearMethod() return BitsAndBytesLinearMethod(self) elif isinstance(layer, FusedMoE): - return BitsAndBytesMoEMethod(self) + return BitsAndBytesMoEMethod(self, layer.moe_config) return None @@ -411,7 +412,12 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): quant_config: The BitsAndBytes quantization config. """ - def __init__(self, quant_config: BitsAndBytesConfig): + def __init__( + self, + quant_config: BitsAndBytesConfig, + moe: FusedMoEConfig, + ): + super().__init__(moe) try: import bitsandbytes if version.parse( @@ -422,7 +428,6 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): raise ImportError("Please install bitsandbytes>=0.46.1 via " "`pip install bitsandbytes>=0.46.1` to use " "bitsandbytes quantizer.") from err - self.topk_indices_dtype = None self.quant_config = quant_config def create_weights( @@ -470,6 +475,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe import fused_experts + assert self.fused_experts is None if enable_eplb: raise NotImplementedError( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 839942beaf406..42c43cbc03e57 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -11,20 +11,21 @@ from compressed_tensors.quantization import (ActivationOrdering, QuantizationStrategy) import vllm.envs as envs +import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops from vllm.logger import init_logger from 
vllm.model_executor.layers.fused_moe import ( FusedMoE, FusedMoEActivationFormat, FusedMoEConfig, FusedMoEMethodBase, FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize, FusedMoeWeightScaleSupported) -from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa - FlashInferCutlassMoEPrepareAndFinalize) +from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( + is_valid_flashinfer_cutlass_fused_moe) from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import ( # noqa WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP) from vllm.model_executor.layers.quantization.utils import replace_parameter from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( - build_flashinfer_fp4_cutlass_moe_kernel, - flashinfer_fp4_cutlass_moe_forward, reorder_w1w3_to_w3w1) + build_flashinfer_fp4_cutlass_moe_prepare_finalize, reorder_w1w3_to_w3w1, + select_nvfp4_gemm_impl) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( check_moe_marlin_supports_layer, marlin_make_workspace_new, marlin_moe_permute_scales) @@ -58,6 +59,9 @@ __all__ = [ class CompressedTensorsMoEMethod(FusedMoEMethodBase): + def __init__(self, moe: FusedMoEConfig): + super().__init__(moe) + @staticmethod def get_moe_method( quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 @@ -81,18 +85,22 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): "WNA16MoE is not supported with actorder=group/dynamic." 
) logger.info_once("Using CompressedTensorsWNA16MoEMethod") - return CompressedTensorsWNA16MoEMethod(quant_config) + return CompressedTensorsWNA16MoEMethod(quant_config, + layer.moe_config) else: logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod") - return CompressedTensorsWNA16MarlinMoEMethod(quant_config) + return CompressedTensorsWNA16MarlinMoEMethod( + quant_config, layer.moe_config) elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant): - return CompressedTensorsW4A4MoeMethod() + return CompressedTensorsW4A4MoeMethod(layer.moe_config, layer) elif (quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant) or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant) or quant_config._is_fp8_w8a8(weight_quant, input_quant)): - return CompressedTensorsW8A8Fp8MoEMethod(quant_config) + return CompressedTensorsW8A8Fp8MoEMethod(quant_config, + layer.moe_config) elif quant_config._is_dynamic_token_w8a8(weight_quant, input_quant): - return CompressedTensorsW8A8Int8MoEMethod(quant_config) + return CompressedTensorsW8A8Int8MoEMethod(quant_config, + layer.moe_config) else: raise RuntimeError( f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}") @@ -100,15 +108,16 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): - def __init__(self): + def __init__(self, moe: FusedMoEConfig, layer: torch.nn.Module): from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501 detect_nvfp4_moe_support) + super().__init__(moe) _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__) self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported self.allow_flashinfer = _nvfp4.allow_flashinfer self.use_marlin = _nvfp4.use_marlin self.group_size = 16 - self.fused_experts = None # type: ignore[assignment] + self.layer = layer def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -265,19 +274,36 @@ 
class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): layer.w2_input_scale_quant = torch.nn.Parameter( (layer.w2_input_global_scale), requires_grad=False) - def maybe_swap_experts_impl(self, moe_parallel_config): + def maybe_make_prepare_finalize( + self, + moe: FusedMoEConfig, + ) -> Optional[mk.FusedMoEPrepareAndFinalize]: if not self.allow_flashinfer: - return - self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel( - moe_parallel_config) + return super().maybe_make_prepare_finalize(moe) - def select_gemm_impl(self, prepare_finalize, moe): + prepare_finalize = build_flashinfer_fp4_cutlass_moe_prepare_finalize( + moe, + a1_gscale=self.layer.w13_input_scale_quant, + ) + logger.debug_once("%s", prepare_finalize.__class__.__name__) + return prepare_finalize + + def select_gemm_impl( + self, + prepare_finalize: mk.FusedMoEPrepareAndFinalize, + moe: FusedMoEConfig, + ) -> mk.FusedMoEPermuteExpertsUnpermute: """Return the appropriate GEMM experts implementation.""" - assert moe is not None and prepare_finalize is not None - from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( # noqa: E501 - select_nvfp4_gemm_impl) - - return select_nvfp4_gemm_impl(self.allow_flashinfer, moe, logger) + experts = select_nvfp4_gemm_impl( + moe, + g1_alphas=self.layer.g1_alphas, + g2_alphas=self.layer.g2_alphas, + a1_gscale=self.layer.w13_input_scale_quant, + a2_gscale=self.layer.w2_input_scale_quant, + allow_flashinfer=self.allow_flashinfer, + ) + logger.debug_once("Using %s", experts.__class__.__name__) + return experts def apply( self, @@ -301,6 +327,8 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError("EPLB not supported for " "`CompressedTensorsW4A4MoeMethod` yet.") @@ -317,6 +345,7 @@ class 
CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): custom_routing_function=custom_routing_function, scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, ) if self.use_marlin: @@ -340,15 +369,22 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): # FlashInfer fused experts path if self.fused_experts is not None: - return flashinfer_fp4_cutlass_moe_forward( - self.fused_experts, - layer, - x, - topk_weights, - topk_ids, + assert is_valid_flashinfer_cutlass_fused_moe( + x, layer.w13_weight, layer.w2_weight), ( + "Flashinfer CUTLASS Fused MoE not applicable!") + + return self.fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=False, # TODO(shuw): fix later, now output is high prec activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, + w1_scale=layer.w13_blockscale_swizzled, + w2_scale=layer.w2_blockscale_swizzled, apply_router_weight_on_input=apply_router_weight_on_input, ) @@ -376,7 +412,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): n=layer.w2_weight.shape[2] * 2, k=x.shape[1], e=layer.w13_weight.shape[0], - device=x.device, apply_router_weight_on_input=apply_router_weight_on_input).to( x.dtype) @@ -384,15 +419,16 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): def __init__( - self, - quant_config: "CompressedTensorsConfig" # type: ignore # noqa E501 + self, + quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + moe: FusedMoEConfig, ): + super().__init__(moe) self.quant_config = quant_config self.weight_quant = self.quant_config.target_scheme_map["Linear"].get( "weights") self.input_quant = self.quant_config.target_scheme_map["Linear"].get( "input_activations") - self.topk_indices_dtype = None per_tensor = (self.weight_quant.strategy 
== QuantizationStrategy.TENSOR and self.input_quant.strategy @@ -429,7 +465,6 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): self.weight_quant, self.input_quant) self.use_cutlass = (quant_config._is_fp8_w8a8_sm90( self.weight_quant, self.input_quant) or self.is_fp8_w8a8_sm100) - self.fused_experts = None # type: ignore[assignment] self.disable_expert_map = False def create_weights(self, layer: torch.nn.Module, num_experts: int, @@ -614,25 +649,31 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): ) -> FusedMoEPermuteExpertsUnpermute: # cutlass path if self.use_cutlass: - from vllm.model_executor.layers.fused_moe import CutlassExpertsFp8 + from vllm.model_executor.layers.fused_moe import ( + CutlassBatchedExpertsFp8, CutlassExpertsFp8) - use_batched_format = (prepare_finalize.activation_format == - FusedMoEActivationFormat.BatchedExperts) + experts: FusedMoEPermuteExpertsUnpermute num_dispatchers = prepare_finalize.num_dispatchers() - num_experts = (moe.num_local_experts - if use_batched_format else moe.num_experts) - logger.debug("CutlassExpertsFp8(%s)", self.__class__.__name__) - - experts = CutlassExpertsFp8( - num_experts, - moe.in_dtype, - self.input_quant.strategy == QuantizationStrategy.TOKEN, - self.weight_quant.strategy == QuantizationStrategy.CHANNEL, - num_dispatchers=num_dispatchers, - use_batched_format=use_batched_format, - ) + if (prepare_finalize.activation_format == + FusedMoEActivationFormat.BatchedExperts): + logger.debug("CutlassBatchedExpertsFp8(%s)", + self.__class__.__name__) + experts = CutlassBatchedExpertsFp8( + moe.num_local_experts, + num_dispatchers, + moe.in_dtype, + self.input_quant.strategy == QuantizationStrategy.TOKEN, + self.weight_quant.strategy == QuantizationStrategy.CHANNEL, + ) + else: + logger.debug("CutlassExpertsFp8(%s)", self.__class__.__name__) + experts = CutlassExpertsFp8( + moe.in_dtype, + self.input_quant.strategy == QuantizationStrategy.TOKEN, + self.weight_quant.strategy 
== QuantizationStrategy.CHANNEL, + ) self.disable_expert_map = (num_dispatchers > 1 or not experts.supports_expert_map()) @@ -834,9 +875,11 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): def __init__( - self, - quant_config: "CompressedTensorsConfig" # type: ignore # noqa E501 + self, + quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + moe: FusedMoEConfig, ): + super().__init__(moe) self.quant_config = quant_config self.weight_quant = self.quant_config.target_scheme_map["Linear"].get( "weights") @@ -934,6 +977,8 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError( "EPLB not supported for " @@ -951,7 +996,8 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) return fused_experts( hidden_states=x, @@ -975,9 +1021,11 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): def __init__( - self, - quant_config: "CompressedTensorsConfig" # type: ignore # noqa E501 + self, + quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + moe: FusedMoEConfig, ): + super().__init__(moe) self.quant_config = quant_config # TODO: @dsikka: refactor this to use schemes as other kernels # are supported + check if the layer is being ignored. 
@@ -1233,6 +1281,8 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError( "EPLB not supported for " @@ -1251,7 +1301,8 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) return torch.ops.vllm.fused_marlin_moe( x, @@ -1279,9 +1330,11 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): def __init__( - self, - quant_config: "CompressedTensorsConfig" # type: ignore # noqa E501 + self, + quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + moe: FusedMoEConfig, ): + super().__init__(moe) self.quant_config = quant_config # TODO: @dsikka: refactor this to use schemes as other kernels # are supported + check if the layer is being ignored. 
@@ -1459,6 +1512,8 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError("EPLB not supported for " "`CompressedTensorsWNA16MoEMethod` yet.") @@ -1475,7 +1530,8 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) return fused_experts( x, diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 47eca80609e0e..3e43caa4cbf72 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -6,7 +6,8 @@ from typing import Any, Callable, Optional import torch from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group -from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase +from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, + FusedMoEMethodBase) from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) from vllm.model_executor.layers.quantization import QuantizationMethods @@ -46,13 +47,18 @@ class ExpertsInt8Config(QuantizationConfig): if isinstance(layer, LinearBase): return UnquantizedLinearMethod() elif isinstance(layer, FusedMoE): - return ExpertsInt8MoEMethod(self) + return ExpertsInt8MoEMethod(self, layer.moe_config) return None class ExpertsInt8MoEMethod(FusedMoEMethodBase): - def __init__(self, quant_config: ExpertsInt8Config): + def __init__( + self, + quant_config: ExpertsInt8Config, + moe: FusedMoEConfig, + ): + super().__init__(moe) 
self.quant_config = quant_config def create_weights(self, layer: torch.nn.Module, num_experts: int, @@ -122,6 +128,8 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase): logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError( "EPLB not supported for `ExpertsInt8MoEMethod` yet.") @@ -138,7 +146,8 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) return fused_experts( x, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index dbd5234286952..a497449132510 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import functools from typing import TYPE_CHECKING, Any, Callable, Optional import torch @@ -142,7 +141,7 @@ class Fp8Config(QuantizationConfig): return UnquantizedLinearMethod() return Fp8LinearMethod(self) elif isinstance(layer, FusedMoE): - return Fp8MoEMethod(self) + return Fp8MoEMethod(self, layer.moe_config) elif isinstance(layer, Attention): return Fp8KVCacheMethod(self) return None @@ -479,9 +478,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): quant_config: The quantization config. 
""" - def __init__(self, quant_config: Fp8Config): - - from vllm.model_executor.layers.fused_moe import fused_experts + def __init__(self, quant_config: Fp8Config, moe: FusedMoEConfig): + super().__init__(moe) self.quant_config = quant_config self.block_quant = self.quant_config.weight_block_size is not None @@ -529,15 +527,6 @@ class Fp8MoEMethod(FusedMoEMethodBase): "CutlassBlockScaledGroupedGemm not supported on the current " "platform.") - self.topk_indices_dtype = None - self.fused_experts = functools.partial( # type: ignore - fused_experts, - use_fp8_w8a8=True, - block_shape=self.quant_config.weight_block_size, - allow_deep_gemm=self.allow_deep_gemm, - allow_cutlass_block_scaled_grouped_gemm=( - self.allow_cutlass_block_scaled_grouped_gemm)) - def create_weights(self, layer: Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): @@ -1033,7 +1022,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): num_expert_group=num_expert_group, topk_group=topk_group, apply_router_weight_on_input=apply_router_weight_on_input) - else: + elif self.fused_experts is not None: return self.fused_experts( hidden_states=x, w1=layer.w13_weight, @@ -1052,6 +1041,30 @@ class Fp8MoEMethod(FusedMoEMethodBase): a1_scale=layer.w13_input_scale, a2_scale=layer.w2_input_scale, ) + else: + from vllm.model_executor.layers.fused_moe import fused_experts + return fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + global_num_experts=global_num_experts, + apply_router_weight_on_input=apply_router_weight_on_input, + expert_map=expert_map, + w1_scale=(layer.w13_weight_scale_inv + if self.block_quant else layer.w13_weight_scale), + w2_scale=(layer.w2_weight_scale_inv + if self.block_quant else layer.w2_weight_scale), + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + use_fp8_w8a8=True, + 
block_shape=self.quant_config.weight_block_size, + allow_deep_gemm=self.allow_deep_gemm, + allow_cutlass_block_scaled_grouped_gemm=( + self.allow_cutlass_block_scaled_grouped_gemm)) class Fp8KVCacheMethod(BaseKVCacheMethod): diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 86da04c39989b..49d28927d6e74 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -11,6 +11,7 @@ from torch.nn.parameter import Parameter, UninitializedParameter from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, + FusedMoEConfig, FusedMoEMethodBase) from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase from vllm.model_executor.layers.quantization import QuantizationMethods @@ -58,7 +59,7 @@ class GGUFConfig(QuantizationConfig): elif isinstance(layer, VocabParallelEmbedding): return GGUFEmbeddingMethod(self) elif isinstance(layer, FusedMoE): - return GGUFMoEMethod(self) + return GGUFMoEMethod(self, layer.moe_config) return None @@ -445,7 +446,12 @@ class GGUFMoEMethod(FusedMoEMethodBase): quant_config: The GGUF quantization config. 
""" - def __init__(self, quant_config: GGUFConfig): + def __init__( + self, + quant_config: GGUFConfig, + moe: FusedMoEConfig, + ): + super().__init__(moe) self.quant_config = quant_config def create_weights(self, layer: torch.nn.Module, num_experts: int, @@ -525,6 +531,8 @@ class GGUFMoEMethod(FusedMoEMethodBase): logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ): + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError( "EPLB not supported for `GGUFMoEMethod` yet.") @@ -545,7 +553,8 @@ class GGUFMoEMethod(FusedMoEMethodBase): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) return fused_moe_gguf(x, layer.w13_qweight, layer.w2_qweight, topk_weights, topk_ids, layer.w13_qweight_type.weight_type, diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 3299221e3af37..bd14ab9ef6c69 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -10,7 +10,7 @@ import vllm.model_executor.layers.fused_moe # noqa from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, + FusedMoE, FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported, UnquantizedFusedMoEMethod) from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) @@ -375,7 +375,12 @@ class GPTQMarlinLinearMethod(LinearMethodBase): class GPTQMarlinMoEMethod(FusedMoEMethodBase): """MoE Marlin method with quantization.""" - def __init__(self, quant_config: GPTQMarlinConfig) -> None: + def __init__( + self, + quant_config: 
GPTQMarlinConfig, + moe: FusedMoEConfig, + ) -> None: + super().__init__(moe) self.quant_config = quant_config if self.quant_config.quant_type.size_bits == 4: self.quant_type = scalar_types.uint4b8 @@ -646,6 +651,8 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError( "EPLB not supported for `GPTQMarlinMoEMethod` yet.") @@ -662,7 +669,8 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) return torch.ops.vllm.fused_marlin_moe( x, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 22fbbab00e919..e0f462b36976f 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -12,7 +12,9 @@ import vllm.envs as envs import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig +from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig +from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( + is_valid_flashinfer_cutlass_fused_moe) from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, @@ -22,8 +24,8 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from 
vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( - build_flashinfer_fp4_cutlass_moe_kernel, - flashinfer_fp4_cutlass_moe_forward, reorder_w1w3_to_w3w1) + build_flashinfer_fp4_cutlass_moe_prepare_finalize, reorder_w1w3_to_w3w1, + select_nvfp4_gemm_impl) from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( apply_flashinfer_per_tensor_scale_fp8, register_moe_scaling_factors, rotate_flashinfer_fp8_moe_weights, swap_w13_to_w31) @@ -177,7 +179,7 @@ class ModelOptFp8Config(QuantizationConfig): elif isinstance(layer, Attention): return ModelOptFp8KVCacheMethod(self) elif isinstance(layer, FusedMoE): - return ModelOptFp8MoEMethod(self) + return ModelOptFp8MoEMethod(self, layer.moe_config) return None @@ -273,7 +275,12 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): quant_config: The ModelOpt quantization config. """ - def __init__(self, quant_config: ModelOptFp8Config) -> None: + def __init__( + self, + quant_config: ModelOptFp8Config, + moe: FusedMoEConfig, + ) -> None: + super().__init__(moe) self.quant_config = quant_config from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( cutlass_fp8_supported) @@ -454,6 +461,8 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError( "EPLB not supported for `ModelOptFp8MoEMethod` yet.") @@ -484,6 +493,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): custom_routing_function=custom_routing_function, scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, ) from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_experts) @@ -699,7 +709,7 @@ class ModelOptNvFp4Config(QuantizationConfig): elif isinstance(layer, 
Attention): return ModelOptFp8KVCacheMethod(self) elif isinstance(layer, FusedMoE): - return ModelOptNvFp4FusedMoE(self) + return ModelOptNvFp4FusedMoE(self, layer.moe_config, layer) return None @@ -923,10 +933,17 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): quant_config: NVFP4 Quant Config """ - def __init__(self, quant_config: ModelOptNvFp4Config) -> None: - self.quant_config = quant_config + def __init__( + self, + quant_config: ModelOptNvFp4Config, + moe: FusedMoEConfig, + layer: torch.nn.Module, + ) -> None: from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501 detect_nvfp4_moe_support) + super().__init__(moe) + self.quant_config = quant_config + self.layer = layer _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__) self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported self.allow_flashinfer = _nvfp4.allow_flashinfer @@ -952,27 +969,35 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): self.fused_experts: Optional[ mk.FusedMoEModularKernel] = None # type: ignore[assignment] - def maybe_swap_experts_impl( + def maybe_make_prepare_finalize( self, - moe_parallel_config: FusedMoEParallelConfig, - ): + moe: FusedMoEConfig, + ) -> Optional[mk.FusedMoEPrepareAndFinalize]: if not self.allow_flashinfer: - return - self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel( - moe_parallel_config) + return super().maybe_make_prepare_finalize(moe) - # This method update self.fused_experts - # only prepare_finalize is not None call select_gemm_impl - # so when native cutlass fp4, fused_expert is in fuse_moe.py fused_expert - # when it's not called(TP case), we still have 2 kernels to use. 
- def select_gemm_impl(self, prepare_finalize, - moe) -> mk.FusedMoEPermuteExpertsUnpermute: + prepare_finalize = build_flashinfer_fp4_cutlass_moe_prepare_finalize( + moe, + a1_gscale=self.layer.w13_input_scale_quant, + ) + logger.debug_once("%s", prepare_finalize.__class__.__name__) + return prepare_finalize - assert moe is not None and prepare_finalize is not None - from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( # noqa: E501 - select_nvfp4_gemm_impl) - - return select_nvfp4_gemm_impl(self.allow_flashinfer, moe, logger) + def select_gemm_impl( + self, + prepare_finalize: mk.FusedMoEPrepareAndFinalize, + moe: FusedMoEConfig, + ) -> mk.FusedMoEPermuteExpertsUnpermute: + experts = select_nvfp4_gemm_impl( + moe, + g1_alphas=self.layer.g1_alphas, + g2_alphas=self.layer.g2_alphas, + a1_gscale=self.layer.w13_input_scale_quant, + a2_gscale=self.layer.w2_input_scale_quant, + allow_flashinfer=self.allow_flashinfer, + ) + logger.debug_once("Using %s", experts.__class__.__name__) + return experts def uses_weight_scale_2_pattern(self) -> bool: """ @@ -1362,7 +1387,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) if self.use_marlin: return torch.ops.vllm.fused_marlin_moe( @@ -1404,21 +1430,28 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): n=layer.w2_weight.shape[2] * 2, k=x.shape[1], e=layer.w13_weight.shape[0], - device=x.device, expert_map=expert_map, apply_router_weight_on_input=apply_router_weight_on_input) else: assert self.allow_flashinfer and \ self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS - out = flashinfer_fp4_cutlass_moe_forward( - self.fused_experts, - layer, - x, - topk_weights, - topk_ids, + + assert is_valid_flashinfer_cutlass_fused_moe( + x, layer.w13_weight, 
layer.w2_weight), ( + "Flashinfer CUTLASS Fused MoE not applicable!") + + out = self.fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=False, # TODO(shuw): fix later, now output is high prec activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, + w1_scale=layer.w13_blockscale_swizzled, + w2_scale=layer.w2_blockscale_swizzled, apply_router_weight_on_input=apply_router_weight_on_input, ) diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index c5055a02fa3d5..364d1ac314d2d 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -7,7 +7,7 @@ import torch from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) + FusedMoE, FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) from vllm.model_executor.layers.quantization import QuantizationMethods @@ -160,7 +160,7 @@ class MoeWNA16Config(QuantizationConfig): else: raise ValueError("moe_wna16 only support gptq and awq.") elif isinstance(layer, FusedMoE): - return MoeWNA16Method(self) + return MoeWNA16Method(self, layer.moe_config) return None @@ -175,7 +175,12 @@ class MoeWNA16Method(FusedMoEMethodBase): quant_config: The MOE WNA16 (W8A16/W4A16) quantization config. 
""" - def __init__(self, quant_config: MoeWNA16Config): + def __init__( + self, + quant_config: MoeWNA16Config, + moe: FusedMoEConfig, + ): + super().__init__(moe) self.quant_config = quant_config def create_weights(self, layer: torch.nn.Module, num_experts: int, @@ -302,6 +307,8 @@ class MoeWNA16Method(FusedMoEMethodBase): logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError( "EPLB not supported for `MoeWNA16Method` yet.") @@ -318,7 +325,8 @@ class MoeWNA16Method(FusedMoEMethodBase): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) weight_bits = self.quant_config.weight_bits has_zp = self.quant_config.has_zp diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index dbe6c603c0625..3c5d83037cde6 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -82,7 +82,7 @@ class Mxfp4Config(QuantizationConfig): class Mxfp4MoEMethod(FusedMoEMethodBase): def __init__(self, moe: FusedMoEConfig): - super().__init__() + super().__init__(moe) self.topk_indices_dtype = None self.moe = moe self.use_marlin = self._should_use_marlin() diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 6f69210d0861c..58f56c6381b31 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -7,7 +7,8 @@ import torch from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe import (FusedMoE, 
FusedMoEMethodBase, +from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, + FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( OCP_MX_BLOCK_SIZE) @@ -25,6 +26,9 @@ __all__ = [ class QuarkMoEMethod(FusedMoEMethodBase): + def __init__(self, moe: FusedMoEConfig): + super().__init__(moe) + @staticmethod def get_moe_method( quant_config: "QuarkConfig", # type: ignore # noqa E501 # noqa F821 @@ -42,17 +46,24 @@ class QuarkMoEMethod(FusedMoEMethodBase): input_config = layer_quant_config.get("input_tensors") if quant_config._is_fp8_w8a8(weight_config, input_config): - return QuarkW8A8Fp8MoEMethod(weight_config, input_config) + return QuarkW8A8Fp8MoEMethod(weight_config, input_config, + module.moe_config) elif quant_config._is_mx_fp4(weight_config, input_config): - return QuarkW4A4MXFp4MoEMethod(weight_config, input_config) + return QuarkW4A4MXFp4MoEMethod(weight_config, input_config, + module.moe_config) else: raise RuntimeError("Unsupported FusedMoe scheme") class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): - def __init__(self, weight_config: dict[str, Any], input_config: dict[str, - Any]): + def __init__( + self, + weight_config: dict[str, Any], + input_config: dict[str, Any], + moe: FusedMoEConfig, + ): + super().__init__(moe) self.weight_quant = weight_config self.input_quant = input_config @@ -215,6 +226,8 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError( "EPLB not supported for `QuarkW8A8Fp8MoEMethod` yet.") @@ -231,7 +244,8 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + 
e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) return fused_experts( x, @@ -253,8 +267,13 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): class QuarkW4A4MXFp4MoEMethod(QuarkMoEMethod): - def __init__(self, weight_config: dict[str, Any], input_config: dict[str, - Any]): + def __init__( + self, + weight_config: dict[str, Any], + input_config: dict[str, Any], + moe: FusedMoEConfig, + ): + super().__init__(moe) self.weight_quant = weight_config self.input_quant = input_config @@ -369,6 +388,7 @@ class QuarkW4A4MXFp4MoEMethod(QuarkMoEMethod): logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None if enable_eplb: raise NotImplementedError( @@ -386,7 +406,8 @@ class QuarkW4A4MXFp4MoEMethod(QuarkMoEMethod): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) out = fused_experts( x, diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py index cceaf9857c40f..8bdb50e07b137 100644 --- a/vllm/model_executor/layers/quantization/rtn.py +++ b/vllm/model_executor/layers/quantization/rtn.py @@ -10,7 +10,8 @@ import torch.nn.functional as F from torch.nn.parameter import Parameter from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase +from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, + FusedMoEMethodBase) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization import QuantizationMethods @@ -76,7 +77,7 @@ class RTNConfig(QuantizationConfig): if isinstance(layer, LinearBase): return RTNLinearMethod(self) elif 
isinstance(layer, FusedMoE): - return RTNMoEMethod(self) + return RTNMoEMethod(self, layer.moe_config) return None @@ -210,7 +211,8 @@ class RTNLinearMethod(LinearMethodBase): class RTNMoEMethod(FusedMoEMethodBase): - def __init__(self, quant_config: RTNConfig): + def __init__(self, quant_config: RTNConfig, moe: FusedMoEConfig): + super().__init__(moe) self.quant_config = quant_config def create_weights(self, layer: torch.nn.Module, num_experts: int, @@ -289,6 +291,8 @@ class RTNMoEMethod(FusedMoEMethodBase): logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: + assert self.fused_experts is None + if enable_eplb: raise NotImplementedError( "EPLB not supported for `RTNMoEMethod` yet.") @@ -305,7 +309,8 @@ class RTNMoEMethod(FusedMoEMethodBase): num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) weight_bits = self.quant_config.weight_bits group_size = self.quant_config.group_size diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index 8ef91eeed406f..f5d7c57fe2a87 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -3,33 +3,30 @@ """Utility helpers for NVFP4 + FlashInfer fused-MoE path""" from __future__ import annotations -from typing import Optional - import torch import vllm.envs as envs import vllm.model_executor.layers.fused_moe.modular_kernel as mk -from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig +from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig from 
vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( - FlashInferExperts, is_valid_flashinfer_cutlass_fused_moe) + FlashInferExperts) from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 FlashInferCutlassMoEPrepareAndFinalize) from vllm.platforms import current_platform - -logger = init_logger(__name__) +from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe __all__ = [ "is_flashinfer_fp4_cutlass_moe_available", "reorder_w1w3_to_w3w1", - "build_flashinfer_fp4_cutlass_moe_kernel", - "flashinfer_fp4_cutlass_moe_forward", + "build_flashinfer_fp4_cutlass_moe_prepare_finalize", ] def is_flashinfer_fp4_cutlass_moe_available() -> bool: """Return ``True`` when FlashInfer CUTLASS NV-FP4 kernels can be used.""" - return (envs.VLLM_USE_FLASHINFER_MOE_FP4 and current_platform.is_cuda() + return (envs.VLLM_USE_FLASHINFER_MOE_FP4 + and has_flashinfer_cutlass_fused_moe() + and current_platform.is_cuda() and current_platform.is_device_capability(100)) @@ -49,105 +46,33 @@ def reorder_w1w3_to_w3w1(weight: torch.Tensor, dim=dim).contiguous()) -def build_flashinfer_fp4_cutlass_moe_kernel( - moe_parallel_config: FusedMoEParallelConfig, ) -> mk.FusedMoEModularKernel: - """Create *and return* a FlashInfer CUTLASS fused-MoE modular kernel""" - experts = FlashInferExperts( - use_nvfp4_w4a4=True, - use_dp=moe_parallel_config.dp_size > 1, - ep_rank=moe_parallel_config.ep_rank, - ep_size=moe_parallel_config.ep_size, - tp_rank=moe_parallel_config.tp_rank, - tp_size=moe_parallel_config.tp_size, - ) - logger.debug_once("FlashInferExperts (util)") - return mk.FusedMoEModularKernel( - FlashInferCutlassMoEPrepareAndFinalize(quant_dtype=torch.uint8), - experts, - ) - - -def flashinfer_fp4_cutlass_moe_forward( - fused_experts: mk.FusedMoEModularKernel, - layer: torch.nn.Module, - x: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - activation: str, - global_num_experts: int, - expert_map: 
Optional[torch.Tensor], - apply_router_weight_on_input: bool, -) -> torch.Tensor: - """Common forward wrapper for FlashInfer NV-FP4 fused-MoE""" - - assert is_valid_flashinfer_cutlass_fused_moe( - x, layer.w13_weight, - layer.w2_weight), ("FlashInfer CUTLASS fused-MoE not applicable!") - - a1_gscale = layer.w13_input_scale_quant - a2_gscale = layer.w2_input_scale_quant - - extra_expert_args = { - "g1_alphas": layer.g1_alphas, - "g2_alphas": layer.g2_alphas, - # Avoid confusion with a1_scale and a2_scale - # where are batch size related. - "a1_gscale": a1_gscale, - "a2_gscale": a2_gscale, - "out_dtype": x.dtype, - } - extra_prepare_args = { - "use_dp": layer.dp_size > 1, - "local_tokens": x.shape[0], - "a1_gscale": a1_gscale, - } - extra_finalize_args = { - "use_dp": layer.dp_size > 1, - "local_tokens": x.shape[0], - } - - return fused_experts( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=False, # TODO(shuw): fix later, now output is high prec - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - w1_scale=layer.w13_blockscale_swizzled, - w2_scale=layer.w2_blockscale_swizzled, - apply_router_weight_on_input=apply_router_weight_on_input, - extra_expert_args=extra_expert_args, - extra_prepare_args=extra_prepare_args, - extra_finalize_args=extra_finalize_args, - ) +def build_flashinfer_fp4_cutlass_moe_prepare_finalize( + moe: FusedMoEConfig, + a1_gscale: torch.Tensor, +) -> mk.FusedMoEPrepareAndFinalize: + """Create a FlashInfer CUTLASS fused-MoE prepare finalize kernel""" + use_dp = moe.moe_parallel_config.dp_size > 1 + return FlashInferCutlassMoEPrepareAndFinalize(use_dp, a1_gscale=a1_gscale) def select_nvfp4_gemm_impl( - allow_flashinfer: bool, - moe, # FusedMoEConfig - logger): + moe: FusedMoEConfig, + g1_alphas: torch.Tensor, + g2_alphas: torch.Tensor, + a1_gscale: torch.Tensor, + a2_gscale: torch.Tensor, + allow_flashinfer: bool, +) -> 
mk.FusedMoEPermuteExpertsUnpermute: """Return a GEMM *experts* implementation for NV-FP4 fused-MoE layers""" - # lazy import - from vllm.distributed import get_ep_group - - all2all_manager = get_ep_group().device_communicator.all2all_manager - assert all2all_manager is not None - if allow_flashinfer: - flashinfer_backend = envs.VLLM_FLASHINFER_MOE_BACKEND - if flashinfer_backend != "throughput": - raise ValueError( - f"Only throughput backend is supported for FlashInferExperts, " - f"but got {flashinfer_backend}.") - logger.debug_once( - "Initializing FlashInferExperts with throughput backend.") return FlashInferExperts( - use_nvfp4_w4a4=True, - use_dp=moe.moe_parallel_config.dp_size > 1, + g1_alphas=g1_alphas, + g2_alphas=g2_alphas, + a1_gscale=a1_gscale, + a2_gscale=a2_gscale, + out_dtype=moe.in_dtype, + quant_dtype="nvfp4", ep_rank=moe.moe_parallel_config.ep_rank, ep_size=moe.moe_parallel_config.ep_size, tp_rank=moe.moe_parallel_config.tp_rank, From 6cd69f51bf9312b2e7f85d4831c1a101c7e9a6e5 Mon Sep 17 00:00:00 2001 From: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com> Date: Fri, 15 Aug 2025 14:47:56 -0400 Subject: [PATCH 006/361] [Model] Granite-4 support loading quantized checkpoint (#22925) Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com> --- vllm/model_executor/models/granitemoehybrid.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 5704496b9a5d4..f451e65338b78 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -471,7 +471,10 @@ class GraniteMoeHybridModel(nn.Module): # Mapping different experts' layout: # from HF (input_linear, output_linear, router) # to vLLM (experts_w13({e}.w1, {e}.w2), experts_w3({e}.w3), gate) - if n.endswith('.block_sparse_moe.input_linear.weight'): + # The renaming and parameter loading logic is the same for 
weight + # and weight_scale tensors so we can reuse them without issues. + if (n.endswith('.block_sparse_moe.input_linear.weight') or + n.endswith('.block_sparse_moe.input_linear.weight_scale')): for e in range(p.size(0)): w1_name = n.replace( '.block_sparse_moe.input_linear.weight', @@ -490,7 +493,8 @@ class GraniteMoeHybridModel(nn.Module): w3_name, shard_id='w3', expert_id=e) - elif n.endswith('.block_sparse_moe.output_linear.weight'): + elif (n.endswith('.block_sparse_moe.output_linear.weight') or + n.endswith('.block_sparse_moe.output_linear.weight_scale')): for e in range(p.size(0)): w2_name = n.replace( '.block_sparse_moe.output_linear.weight', From df5afa82e5c41be7d87ddd1968e13891d22003a7 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 15 Aug 2025 14:51:50 -0400 Subject: [PATCH 007/361] [Log] Debug Once for Randomizing dummy data for DP Rank (#22860) Signed-off-by: yewentao256 --- vllm/v1/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3ea39dc519d86..bef67486d5183 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2192,7 +2192,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): high=self.model_config.get_vocab_size(), dtype=input_ids.dtype) - logger.debug("Randomizing dummy data for DP Rank") + logger.debug_once("Randomizing dummy data for DP Rank") input_ids.copy_(rand_input_ids()[:input_ids.size(0)], non_blocking=True) yield From 6e670778cdd87c282c42002a2304cb0a4a165904 Mon Sep 17 00:00:00 2001 From: Zebing Lin Date: Fri, 15 Aug 2025 15:12:12 -0400 Subject: [PATCH 008/361] [Core] direct indexing on self.block_table_np in compute_slot_mapping (#22940) Signed-off-by: linzebing --- vllm/v1/worker/block_table.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/worker/block_table.py 
b/vllm/v1/worker/block_table.py index bf38e88f0c2a1..5662fc350e198 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -91,8 +91,7 @@ class BlockTable: # block_size. block_table_indices = (req_indices * self.max_num_blocks_per_req + positions // self.block_size) - block_table_cpu = self.get_cpu_tensor() - block_numbers = block_table_cpu.flatten()[block_table_indices].numpy() + block_numbers = self.block_table_np.ravel()[block_table_indices] block_offsets = positions % self.block_size np.add(block_numbers * self.block_size, block_offsets, From 79899b63f6d4517e002cc53eb1dcd47cd9e371ea Mon Sep 17 00:00:00 2001 From: nvjullin Date: Sat, 16 Aug 2025 04:08:37 +0800 Subject: [PATCH 009/361] [Bugfix] Added more env vars to hash (#22449) Signed-off-by: Julien Lin --- vllm/envs.py | 46 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 82084d1fc5ae1..861e4c6a1bbe5 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1199,14 +1199,6 @@ def compute_hash() -> str: affect the choice of different kernels or attention backends should also be included in the factors list. """ - factors: list[Any] = [] - - # summarize environment variables - def factorize(name: str): - if __getattr__(name): - factors.append(__getattr__(name)) - else: - factors.append("None") # The values of envs may affects the computation graph. # TODO(DefTruth): hash all environment variables? 
@@ -1221,11 +1213,45 @@ def compute_hash() -> str: "VLLM_DP_SIZE", "VLLM_USE_STANDALONE_COMPILE", "VLLM_FUSED_MOE_CHUNK_SIZE", + "VLLM_FLASHINFER_MOE_BACKEND", + "VLLM_V1_USE_PREFILL_DECODE_ATTENTION", + "VLLM_USE_AITER_UNIFIED_ATTENTION", + "VLLM_ATTENTION_BACKEND", + "VLLM_USE_FLASHINFER_SAMPLER", + "VLLM_FLASHINFER_FORCE_TENSOR_CORES", + "VLLM_DISABLED_KERNELS", + "VLLM_USE_DEEP_GEMM", "VLLM_USE_TRTLLM_FP4_GEMM", + "VLLM_USE_FLASHINFER_MOE_FP8", + "VLLM_USE_FLASHINFER_MOE_FP4", + "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", + "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", + "VLLM_USE_CUDNN_PREFILL", + "VLLM_USE_TRTLLM_ATTENTION", + "VLLM_ROCM_USE_AITER", + "VLLM_ROCM_USE_AITER_PAGED_ATTN", + "VLLM_ROCM_USE_AITER_LINEAR", + "VLLM_ROCM_USE_AITER_MOE", + "VLLM_ROCM_USE_AITER_RMSNORM", + "VLLM_ROCM_USE_AITER_MLA", + "VLLM_ROCM_USE_AITER_MHA", + "VLLM_ROCM_USE_SKINNY_GEMM", + "VLLM_ROCM_FP8_PADDING", + "VLLM_ROCM_MOE_PADDING", + "VLLM_ROCM_CUSTOM_PAGED_ATTN", + "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", + "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16", + "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", ] for key in environment_variables_to_hash: - if key in environment_variables: - factorize(key) + # if this goes out of sync with environment_variables, + # it's not a user error, it's a bug + assert key in environment_variables, \ + "Please update environment_variables_to_hash in envs.py" + + factors = [ + environment_variables[key]() for key in environment_variables_to_hash + ] hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() From a344a1a7da3de182018e2a39ac9739ec6433e5c5 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 15 Aug 2025 16:54:20 -0400 Subject: [PATCH 010/361] Use regex in convert-results-json-to-markdown.py (#22989) Signed-off-by: Michael Goin --- .../scripts/convert-results-json-to-markdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py 
b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 496ee6083abde..77047636bb951 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -4,7 +4,6 @@ import argparse import json import os -import re import shlex from importlib import util from pathlib import Path @@ -12,6 +11,7 @@ from typing import Any import pandas as pd import psutil +import regex as re from tabulate import tabulate # latency results and the keys that will be printed into markdown From 8a87cd27d94f03068b9cbc85058636fc16222e24 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 15 Aug 2025 16:56:31 -0400 Subject: [PATCH 011/361] [CI] Speed up Whisper tests by reusing server (#22859) Signed-off-by: mgoin --- .../openai/test_transcription_validation.py | 316 ++++++++---------- .../openai/test_translation_validation.py | 232 +++++++------ 2 files changed, 260 insertions(+), 288 deletions(-) diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index e103bd206b54c..93239f41a4aeb 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -4,19 +4,20 @@ # imports for guided decoding tests import io import json -from unittest.mock import patch import librosa import numpy as np import openai import pytest +import pytest_asyncio import soundfile as sf -from openai._base_client import AsyncAPIClient from vllm.assets.audio import AudioAsset from ...utils import RemoteOpenAIServer +MODEL_NAME = "openai/whisper-large-v3-turbo" +SERVER_ARGS = ["--enforce-eager"] MISTRAL_FORMAT_ARGS = [ "--tokenizer_mode", "mistral", "--config_format", "mistral", "--load_format", "mistral" @@ -37,6 +38,18 @@ def winning_call(): yield f +@pytest.fixture(scope="module") +def server(): + with RemoteOpenAIServer(MODEL_NAME, 
SERVER_ARGS) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", @@ -60,54 +73,11 @@ async def test_basic_audio(mary_had_lamb, model_name): assert "Mary had a little lamb," in out -@pytest.mark.asyncio -async def test_bad_requests(mary_had_lamb): - model_name = "openai/whisper-small" - server_args = ["--enforce-eager"] - with RemoteOpenAIServer(model_name, server_args) as remote_server: - client = remote_server.get_async_client() - - # invalid language - with pytest.raises(openai.BadRequestError): - await client.audio.transcriptions.create(model=model_name, - file=mary_had_lamb, - language="hh", - temperature=0.0) - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3-turbo"]) -async def test_long_audio_request(mary_had_lamb, model_name): - server_args = ["--enforce-eager"] - - mary_had_lamb.seek(0) - audio, sr = librosa.load(mary_had_lamb) - # Add small silence after each audio for repeatability in the split process - audio = np.pad(audio, (0, 1600)) - repeated_audio = np.tile(audio, 10) - # Repeated audio to buffer - buffer = io.BytesIO() - sf.write(buffer, repeated_audio, sr, format='WAV') - buffer.seek(0) - with RemoteOpenAIServer(model_name, server_args) as remote_server: - client = remote_server.get_async_client() - transcription = await client.audio.transcriptions.create( - model=model_name, - file=buffer, - language="en", - response_format="text", - temperature=0.0) - out = json.loads(transcription)['text'] - counts = out.count("Mary had a little lamb") - assert counts == 10, counts - - @pytest.mark.asyncio async def test_non_asr_model(winning_call): # text to text model model_name = "JackFram/llama-68m" - server_args = ["--enforce-eager"] - with RemoteOpenAIServer(model_name, server_args) as remote_server: + with 
RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server: client = remote_server.get_async_client() res = await client.audio.transcriptions.create(model=model_name, file=winning_call, @@ -120,157 +90,149 @@ async def test_non_asr_model(winning_call): @pytest.mark.asyncio -async def test_completion_endpoints(): +async def test_bad_requests(mary_had_lamb, client): + # invalid language + with pytest.raises(openai.BadRequestError): + await client.audio.transcriptions.create(model=MODEL_NAME, + file=mary_had_lamb, + language="hh", + temperature=0.0) + + +@pytest.mark.asyncio +async def test_long_audio_request(mary_had_lamb, client): + mary_had_lamb.seek(0) + audio, sr = librosa.load(mary_had_lamb) + # Add small silence after each audio for repeatability in the split process + audio = np.pad(audio, (0, 1600)) + repeated_audio = np.tile(audio, 10) + # Repeated audio to buffer + buffer = io.BytesIO() + sf.write(buffer, repeated_audio, sr, format='WAV') + buffer.seek(0) + transcription = await client.audio.transcriptions.create( + model=MODEL_NAME, + file=buffer, + language="en", + response_format="text", + temperature=0.0) + out = json.loads(transcription)['text'] + counts = out.count("Mary had a little lamb") + assert counts == 10, counts + + +@pytest.mark.asyncio +async def test_completion_endpoints(client): # text to text model - model_name = "openai/whisper-small" - server_args = ["--enforce-eager"] - with RemoteOpenAIServer(model_name, server_args) as remote_server: - client = remote_server.get_async_client() - res = await client.chat.completions.create( - model=model_name, - messages=[{ - "role": "system", - "content": "You are a helpful assistant." - }]) - err = res.error - assert err["code"] == 400 - assert err[ - "message"] == "The model does not support Chat Completions API" + res = await client.chat.completions.create( + model=MODEL_NAME, + messages=[{ + "role": "system", + "content": "You are a helpful assistant." 
+ }]) + err = res.error + assert err["code"] == 400 + assert err["message"] == "The model does not support Chat Completions API" - res = await client.completions.create(model=model_name, prompt="Hello") - err = res.error - assert err["code"] == 400 - assert err["message"] == "The model does not support Completions API" + res = await client.completions.create(model=MODEL_NAME, prompt="Hello") + err = res.error + assert err["code"] == 400 + assert err["message"] == "The model does not support Completions API" @pytest.mark.asyncio -async def test_streaming_response(winning_call): - model_name = "openai/whisper-small" - server_args = ["--enforce-eager"] +async def test_streaming_response(winning_call, client): transcription = "" - with RemoteOpenAIServer(model_name, server_args) as remote_server: - client = remote_server.get_async_client() - res_no_stream = await client.audio.transcriptions.create( - model=model_name, - file=winning_call, - response_format="json", - language="en", - temperature=0.0) - # Unfortunately this only works when the openai client is patched - # to use streaming mode, not exposed in the transcription api. 
- original_post = AsyncAPIClient.post + res_no_stream = await client.audio.transcriptions.create( + model=MODEL_NAME, + file=winning_call, + response_format="json", + language="en", + temperature=0.0) + res = await client.audio.transcriptions.create(model=MODEL_NAME, + file=winning_call, + language="en", + temperature=0.0, + stream=True, + timeout=30) + # Reconstruct from chunks and validate + async for chunk in res: + text = chunk.choices[0]['delta']['content'] + transcription += text - async def post_with_stream(*args, **kwargs): - kwargs['stream'] = True - return await original_post(*args, **kwargs) - - with patch.object(AsyncAPIClient, "post", new=post_with_stream): - client = remote_server.get_async_client() - res = await client.audio.transcriptions.create( - model=model_name, - file=winning_call, - language="en", - temperature=0.0, - extra_body=dict(stream=True), - timeout=30) - # Reconstruct from chunks and validate - async for chunk in res: - # just a chunk - text = chunk.choices[0]['delta']['content'] - transcription += text - - assert transcription == res_no_stream.text + assert transcription == res_no_stream.text @pytest.mark.asyncio -async def test_stream_options(winning_call): - model_name = "openai/whisper-small" - server_args = ["--enforce-eager"] - with RemoteOpenAIServer(model_name, server_args) as remote_server: - original_post = AsyncAPIClient.post - - async def post_with_stream(*args, **kwargs): - kwargs['stream'] = True - return await original_post(*args, **kwargs) - - with patch.object(AsyncAPIClient, "post", new=post_with_stream): - client = remote_server.get_async_client() - res = await client.audio.transcriptions.create( - model=model_name, - file=winning_call, - language="en", - temperature=0.0, - extra_body=dict(stream=True, - stream_include_usage=True, - stream_continuous_usage_stats=True), - timeout=30) - final = False - continuous = True - async for chunk in res: - if not len(chunk.choices): - # final usage sent - final = True - else: 
- continuous = continuous and hasattr(chunk, 'usage') - assert final and continuous +async def test_stream_options(winning_call, client): + res = await client.audio.transcriptions.create( + model=MODEL_NAME, + file=winning_call, + language="en", + temperature=0.0, + stream=True, + extra_body=dict(stream_include_usage=True, + stream_continuous_usage_stats=True), + timeout=30) + final = False + continuous = True + async for chunk in res: + if not len(chunk.choices): + # final usage sent + final = True + else: + continuous = continuous and hasattr(chunk, 'usage') + assert final and continuous @pytest.mark.asyncio -async def test_sampling_params(mary_had_lamb): +async def test_sampling_params(mary_had_lamb, client): """ Compare sampling with params and greedy sampling to assert results are different when extreme sampling parameters values are picked. """ - model_name = "openai/whisper-small" - server_args = ["--enforce-eager"] - with RemoteOpenAIServer(model_name, server_args) as remote_server: - client = remote_server.get_async_client() - transcription = await client.audio.transcriptions.create( - model=model_name, - file=mary_had_lamb, - language="en", - temperature=0.8, - extra_body=dict(seed=42, - repetition_penalty=1.9, - top_k=12, - top_p=0.4, - min_p=0.5, - frequency_penalty=1.8, - presence_penalty=2.0)) + transcription = await client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + temperature=0.8, + extra_body=dict(seed=42, + repetition_penalty=1.9, + top_k=12, + top_p=0.4, + min_p=0.5, + frequency_penalty=1.8, + presence_penalty=2.0)) - greedy_transcription = await client.audio.transcriptions.create( - model=model_name, - file=mary_had_lamb, - language="en", - temperature=0.0, - extra_body=dict(seed=42)) + greedy_transcription = await client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + temperature=0.0, + extra_body=dict(seed=42)) - assert greedy_transcription.text != 
transcription.text + assert greedy_transcription.text != transcription.text @pytest.mark.asyncio -async def test_audio_prompt(mary_had_lamb): - model_name = "openai/whisper-large-v3-turbo" - server_args = ["--enforce-eager"] +async def test_audio_prompt(mary_had_lamb, client): prompt = "This is a speech, recorded in a phonograph." - with RemoteOpenAIServer(model_name, server_args) as remote_server: - #Prompts should not omit the part of original prompt while transcribing. - prefix = "The first words I spoke in the original phonograph" - client = remote_server.get_async_client() - transcription = await client.audio.transcriptions.create( - model=model_name, - file=mary_had_lamb, - language="en", - response_format="text", - temperature=0.0) - out = json.loads(transcription)['text'] - assert prefix in out - transcription_wprompt = await client.audio.transcriptions.create( - model=model_name, - file=mary_had_lamb, - language="en", - response_format="text", - prompt=prompt, - temperature=0.0) - out_prompt = json.loads(transcription_wprompt)['text'] - assert prefix in out_prompt + #Prompts should not omit the part of original prompt while transcribing. 
+ prefix = "The first words I spoke in the original phonograph" + transcription = await client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + response_format="text", + temperature=0.0) + out = json.loads(transcription)['text'] + assert prefix in out + transcription_wprompt = await client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + response_format="text", + prompt=prompt, + temperature=0.0) + out_prompt = json.loads(transcription_wprompt)['text'] + assert prefix in out_prompt diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py index bfa9bdef1c001..f4f5c66f2deeb 100644 --- a/tests/entrypoints/openai/test_translation_validation.py +++ b/tests/entrypoints/openai/test_translation_validation.py @@ -4,18 +4,21 @@ import io # imports for guided decoding tests import json -from unittest.mock import patch +import httpx import librosa import numpy as np import pytest +import pytest_asyncio import soundfile as sf -from openai._base_client import AsyncAPIClient from vllm.assets.audio import AudioAsset from ...utils import RemoteOpenAIServer +MODEL_NAME = "openai/whisper-small" +SERVER_ARGS = ["--enforce-eager"] + @pytest.fixture def foscolo(): @@ -25,50 +28,23 @@ def foscolo(): yield f -# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation! 
-@pytest.mark.asyncio -async def test_basic_audio(foscolo): - model_name = "openai/whisper-small" - server_args = ["--enforce-eager"] - with RemoteOpenAIServer(model_name, server_args) as remote_server: - client = remote_server.get_async_client() - translation = await client.audio.translations.create( - model=model_name, - file=foscolo, - response_format="text", - # TODO remove once language detection is implemented - extra_body=dict(language="it"), - temperature=0.0) - out = json.loads(translation)['text'].strip().lower() - assert "greek sea" in out +@pytest.fixture(scope="module") +def server(): + with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server: + yield remote_server -@pytest.mark.asyncio -async def test_audio_prompt(foscolo): - model_name = "openai/whisper-small" - server_args = ["--enforce-eager"] - # Condition whisper on starting text - prompt = "Nor have I ever" - with RemoteOpenAIServer(model_name, server_args) as remote_server: - client = remote_server.get_async_client() - transcription = await client.audio.translations.create( - model=model_name, - file=foscolo, - prompt=prompt, - extra_body=dict(language="it"), - response_format="text", - temperature=0.0) - out = json.loads(transcription)['text'] - assert "Nor will I ever touch the sacred" not in out - assert prompt not in out +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client @pytest.mark.asyncio async def test_non_asr_model(foscolo): # text to text model model_name = "JackFram/llama-68m" - server_args = ["--enforce-eager"] - with RemoteOpenAIServer(model_name, server_args) as remote_server: + with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server: client = remote_server.get_async_client() res = await client.audio.translations.create(model=model_name, file=foscolo, @@ -78,81 +54,117 @@ async def test_non_asr_model(foscolo): assert err["message"] == "The model does not support Translations API" +# 
NOTE: (NickLucche) the large-v3-turbo model was not trained on translation! @pytest.mark.asyncio -async def test_streaming_response(foscolo): - model_name = "openai/whisper-small" - server_args = ["--enforce-eager"] +async def test_basic_audio(foscolo, client): + translation = await client.audio.translations.create( + model=MODEL_NAME, + file=foscolo, + response_format="text", + # TODO remove once language detection is implemented + extra_body=dict(language="it"), + temperature=0.0) + out = json.loads(translation)['text'].strip().lower() + assert "greek sea" in out + + +@pytest.mark.asyncio +async def test_audio_prompt(foscolo, client): + # Condition whisper on starting text + prompt = "Nor have I ever" + transcription = await client.audio.translations.create( + model=MODEL_NAME, + file=foscolo, + prompt=prompt, + extra_body=dict(language="it"), + response_format="text", + temperature=0.0) + out = json.loads(transcription)['text'] + assert "Nor will I ever touch the sacred" not in out + assert prompt not in out + + +@pytest.mark.asyncio +async def test_streaming_response(foscolo, client, server): translation = "" - with RemoteOpenAIServer(model_name, server_args) as remote_server: - client = remote_server.get_async_client() - res_no_stream = await client.audio.translations.create( - model=model_name, - file=foscolo, - response_format="json", - extra_body=dict(language="it"), - temperature=0.0) - # Unfortunately this only works when the openai client is patched - # to use streaming mode, not exposed in the translation api. 
- original_post = AsyncAPIClient.post + res_no_stream = await client.audio.translations.create( + model=MODEL_NAME, + file=foscolo, + response_format="json", + extra_body=dict(language="it"), + temperature=0.0) + # Stream via HTTPX since OpenAI translation client doesn't expose streaming + url = server.url_for("v1/audio/translations") + headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"} + data = { + "model": MODEL_NAME, + "language": "it", + "stream": True, + "temperature": 0.0, + } + foscolo.seek(0) + async with httpx.AsyncClient() as http_client: + files = {"file": foscolo} + async with http_client.stream("POST", + url, + headers=headers, + data=data, + files=files) as response: + async for line in response.aiter_lines(): + if not line: + continue + if line.startswith("data: "): + line = line[len("data: "):] + if line.strip() == "[DONE]": + break + chunk = json.loads(line) + text = chunk["choices"][0].get("delta", {}).get("content") + translation += text or "" - async def post_with_stream(*args, **kwargs): - kwargs['stream'] = True - return await original_post(*args, **kwargs) - - with patch.object(AsyncAPIClient, "post", new=post_with_stream): - client = remote_server.get_async_client() - res = await client.audio.translations.create(model=model_name, - file=foscolo, - temperature=0.0, - extra_body=dict( - stream=True, - language="it")) - # Reconstruct from chunks and validate - async for chunk in res: - # just a chunk - text = chunk.choices[0]['delta']['content'] - translation += text - - assert translation == res_no_stream.text + assert translation == res_no_stream.text @pytest.mark.asyncio -async def test_stream_options(foscolo): - model_name = "openai/whisper-small" - server_args = ["--enforce-eager"] - with RemoteOpenAIServer(model_name, server_args) as remote_server: - original_post = AsyncAPIClient.post - - async def post_with_stream(*args, **kwargs): - kwargs['stream'] = True - return await original_post(*args, **kwargs) - - with 
patch.object(AsyncAPIClient, "post", new=post_with_stream): - client = remote_server.get_async_client() - res = await client.audio.translations.create( - model=model_name, - file=foscolo, - temperature=0.0, - extra_body=dict(language="it", - stream=True, - stream_include_usage=True, - stream_continuous_usage_stats=True)) - final = False - continuous = True - async for chunk in res: - if not len(chunk.choices): +async def test_stream_options(foscolo, client, server): + url = server.url_for("v1/audio/translations") + headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"} + data = { + "model": MODEL_NAME, + "language": "it", + "stream": True, + "stream_include_usage": True, + "stream_continuous_usage_stats": True, + "temperature": 0.0, + } + foscolo.seek(0) + final = False + continuous = True + async with httpx.AsyncClient() as http_client: + files = {"file": foscolo} + async with http_client.stream("POST", + url, + headers=headers, + data=data, + files=files) as response: + async for line in response.aiter_lines(): + if not line: + continue + if line.startswith("data: "): + line = line[len("data: "):] + if line.strip() == "[DONE]": + break + chunk = json.loads(line) + choices = chunk.get("choices", []) + if not choices: # final usage sent final = True else: - continuous = continuous and hasattr(chunk, 'usage') - assert final and continuous + continuous = continuous and ("usage" in chunk) + assert final and continuous @pytest.mark.asyncio -async def test_long_audio_request(foscolo): - model_name = "openai/whisper-small" - server_args = ["--enforce-eager"] - +async def test_long_audio_request(foscolo, client): foscolo.seek(0) audio, sr = librosa.load(foscolo) repeated_audio = np.tile(audio, 2) @@ -160,13 +172,11 @@ async def test_long_audio_request(foscolo): buffer = io.BytesIO() sf.write(buffer, repeated_audio, sr, format='WAV') buffer.seek(0) - with RemoteOpenAIServer(model_name, server_args) as remote_server: - client = remote_server.get_async_client() - 
translation = await client.audio.translations.create( - model=model_name, - file=buffer, - extra_body=dict(language="it"), - response_format="text", - temperature=0.0) - out = json.loads(translation)['text'].strip().lower() - assert out.count("greek sea") == 2 + translation = await client.audio.translations.create( + model=MODEL_NAME, + file=buffer, + extra_body=dict(language="it"), + response_format="text", + temperature=0.0) + out = json.loads(translation)['text'].strip().lower() + assert out.count("greek sea") == 2 From 7f89ed248fef01098c7ce4bebb197b462eb15bc3 Mon Sep 17 00:00:00 2001 From: shixianc <49539556+shixianc@users.noreply.github.com> Date: Fri, 15 Aug 2025 14:02:12 -0700 Subject: [PATCH 012/361] [Fix] enable swap_ab for pplx problem size computation (#22991) Signed-off-by: Shixian Cui Co-authored-by: Shixian Cui --- .../quantization/cutlass_w8a8/moe/moe_data.cu | 45 +++++++++++++------ 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu index 857cca1e82df7..100f485084444 100644 --- a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu +++ b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu @@ -161,6 +161,7 @@ void get_cutlass_moe_mm_data_caller( topk_ids.size(1)); } +template __global__ void compute_pplx_data(int32_t* expert_offsets, int32_t* problem_sizes1, int32_t* problem_sizes2, @@ -168,14 +169,23 @@ __global__ void compute_pplx_data(int32_t* expert_offsets, const int padded_m, const int n, const int k) { int expert_idx = threadIdx.x; - expert_offsets[expert_idx] = expert_idx * padded_m; - problem_sizes1[expert_idx * 3] = expert_num_tokens[expert_idx]; - problem_sizes1[expert_idx * 3 + 1] = 2 * n; - problem_sizes1[expert_idx * 3 + 2] = k; - problem_sizes2[expert_idx * 3] = expert_num_tokens[expert_idx]; - problem_sizes2[expert_idx * 3 + 1] = k; - problem_sizes2[expert_idx * 3 + 2] = n; + + if constexpr (!SWAP_AB) { + problem_sizes1[expert_idx * 3] = 
expert_num_tokens[expert_idx]; + problem_sizes1[expert_idx * 3 + 1] = 2 * n; + problem_sizes1[expert_idx * 3 + 2] = k; + problem_sizes2[expert_idx * 3] = expert_num_tokens[expert_idx]; + problem_sizes2[expert_idx * 3 + 1] = k; + problem_sizes2[expert_idx * 3 + 2] = n; + } else { + problem_sizes1[expert_idx * 3] = 2 * n; + problem_sizes1[expert_idx * 3 + 1] = expert_num_tokens[expert_idx]; + problem_sizes1[expert_idx * 3 + 2] = k; + problem_sizes2[expert_idx * 3] = k; + problem_sizes2[expert_idx * 3 + 1] = expert_num_tokens[expert_idx]; + problem_sizes2[expert_idx * 3 + 2] = n; + } } void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets, @@ -187,10 +197,19 @@ void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets, const int64_t n, const int64_t k) { auto stream = at::cuda::getCurrentCUDAStream(expert_offsets.device().index()); - compute_pplx_data<<<1, num_local_experts, 0, stream>>>( - static_cast(expert_offsets.data_ptr()), - static_cast(problem_sizes1.data_ptr()), - static_cast(problem_sizes2.data_ptr()), - static_cast(expert_num_tokens.data_ptr()), padded_m, n, - k); + if (num_local_experts * padded_m > SWAP_AB_THRESHOLD) { + compute_pplx_data<<<1, num_local_experts, 0, stream>>>( + static_cast(expert_offsets.data_ptr()), + static_cast(problem_sizes1.data_ptr()), + static_cast(problem_sizes2.data_ptr()), + static_cast(expert_num_tokens.data_ptr()), padded_m, n, + k); + } else { + compute_pplx_data<<<1, num_local_experts, 0, stream>>>( + static_cast(expert_offsets.data_ptr()), + static_cast(problem_sizes1.data_ptr()), + static_cast(problem_sizes2.data_ptr()), + static_cast(expert_num_tokens.data_ptr()), padded_m, n, + k); + } } \ No newline at end of file From 00d6cba0cf430f090e22e93331255cb66d560ff2 Mon Sep 17 00:00:00 2001 From: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com> Date: Fri, 15 Aug 2025 14:09:23 -0700 Subject: [PATCH 013/361] Add PrefixRepetitionRandomDataset to `vllm bench serve` datasets (#20638) 
Signed-off-by: Seiji Eicher --- vllm/benchmarks/datasets.py | 133 +++++++++++++++++++++++++++++++++++- 1 file changed, 131 insertions(+), 2 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 5299dcf54b395..72d7ce49b8e14 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -26,6 +26,7 @@ from typing import Any, Callable, Optional, Union import numpy as np from PIL import Image from transformers import PreTrainedTokenizerBase +from typing_extensions import deprecated from vllm.lora.request import LoRARequest from vllm.lora.utils import get_adapter_absolute_path @@ -486,7 +487,10 @@ def add_dataset_parser(parser: FlexibleArgumentParser): "--dataset-name", type=str, default="random", - choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"], + choices=[ + "sharegpt", "burstgpt", "sonnet", "random", "hf", "custom", + "prefix_repetition" + ], help="Name of the dataset to benchmark on.", ) parser.add_argument( @@ -603,6 +607,37 @@ def add_dataset_parser(parser: FlexibleArgumentParser): "from the sampled HF dataset.", ) + prefix_repetition_group = parser.add_argument_group( + "prefix repetition dataset options") + prefix_repetition_group.add_argument( + "--prefix-repetition-prefix-len", + type=int, + default=256, + help="Number of prefix tokens per request, used only for prefix " + "repetition dataset.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-suffix-len", + type=int, + default=256, + help="Number of suffix tokens per request, used only for prefix " + "repetition dataset. Total input length is prefix_len + suffix_len.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-num-prefixes", + type=int, + default=10, + help="Number of prefixes to generate, used only for prefix repetition " + "dataset. 
Prompts per prefix is num_requests // num_prefixes.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-output-len", + type=int, + default=128, + help="Number of output tokens per request, used only for prefix " + "repetition dataset.", + ) + def get_samples(args, tokenizer) -> list[SampleRequest]: if args.dataset_name == "custom": @@ -721,6 +756,17 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: output_len=args.random_output_len, range_ratio=args.random_range_ratio, ), + "prefix_repetition": + lambda: PrefixRepetitionRandomDataset( + random_seed=args.seed, dataset_path=args.dataset_path + ).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + prefix_len=args.prefix_repetition_prefix_len, + suffix_len=args.prefix_repetition_suffix_len, + num_prefixes=args.prefix_repetition_num_prefixes, + output_len=args.prefix_repetition_output_len, + ), } try: @@ -828,7 +874,9 @@ class CustomDataset(BenchmarkDataset): # Sonnet Dataset Implementation # ----------------------------------------------------------------------------- - +@deprecated( + "SonnetDataset is deprecated and will be removed in a future version.", +) class SonnetDataset(BenchmarkDataset): """ Simplified implementation of the Sonnet dataset. Loads poem lines from a @@ -1537,3 +1585,84 @@ class MLPerfDataset(HuggingFaceDataset): self.maybe_oversample_requests(sampled_requests, num_requests) return sampled_requests + + +# ----------------------------------------------------------------------------- +# Prefix Repetition Dataset Implementation +# ----------------------------------------------------------------------------- + + +class PrefixRepetitionRandomDataset(BenchmarkDataset): + # Default values copied from benchmark_serving.py for the repeated prefix + # dataset. 
+ DEFAULT_PREFIX_LEN = 256 + DEFAULT_SUFFIX_LEN = 256 + DEFAULT_NUM_PREFIXES = 10 + DEFAULT_OUTPUT_LEN = 128 + + def __init__( + self, + **kwargs, + ) -> None: + super().__init__(**kwargs) + random.seed(self.random_seed) + np.random.seed(self.random_seed) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + prefix_len: int = DEFAULT_PREFIX_LEN, + suffix_len: int = DEFAULT_SUFFIX_LEN, + num_prefixes: int = DEFAULT_NUM_PREFIXES, + output_len: int = DEFAULT_OUTPUT_LEN, + **kwargs, + ) -> list[SampleRequest]: + vocab_size = tokenizer.vocab_size + prompts_per_prefix = num_requests // num_prefixes + if prompts_per_prefix == 0: + raise ValueError( + f"num_requests ({num_requests}) must be greater than or equal " + f"to num_prefixes ({num_prefixes})" + ) + + def _generate_exact_length_tokens(target_length: int) -> list[int]: + """Generate tokens that decode and re-encode to exactly + target_length.""" + # Generate random tokens + tokens = np.random.randint( + 0, vocab_size, size=target_length).tolist() + text = tokenizer.decode(tokens) + re_encoded = tokenizer.encode(text, add_special_tokens=False) + + if len(re_encoded) == target_length: + return re_encoded + elif len(re_encoded) < target_length: + # Recursively generate additional consistent tokens + needed = target_length - len(re_encoded) + extra_tokens = _generate_exact_length_tokens(needed) + return re_encoded + extra_tokens + else: + # Truncate to target length + return re_encoded[:target_length] + + requests = [] + for _ in range(num_prefixes): + prefix_tokens = _generate_exact_length_tokens(prefix_len) + + for _ in range(prompts_per_prefix): + suffix_tokens = _generate_exact_length_tokens(suffix_len) + + combined_tokens = prefix_tokens + suffix_tokens + prompt = tokenizer.decode(combined_tokens) + prompt_len = len(combined_tokens) + requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + ) + ) + + random.shuffle(requests) + 
return requests From 1723ef1aae749929c1cbddd964ab3ffd96452a70 Mon Sep 17 00:00:00 2001 From: eigen <52445717+yyihuang@users.noreply.github.com> Date: Fri, 15 Aug 2025 17:38:10 -0400 Subject: [PATCH 014/361] minor: zero workspace buffer init for flashinfer trtllm-gen attn (#22603) --- tests/kernels/attention/test_flashinfer_trtllm_attention.py | 4 ++-- vllm/attention/backends/flashinfer.py | 2 +- vllm/v1/attention/backends/flashinfer.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py index 53e225ea3ea6c..4b84e6a00eceb 100644 --- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py +++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py @@ -113,7 +113,7 @@ def test_flashinfer_trtllm_decode_with_baseline( kv_indices = torch.tensor(kv_indices, dtype=torch.int32) kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) - workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) + workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.int8) wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper( workspace_buffer, kv_layout, @@ -247,7 +247,7 @@ def test_flashinfer_trtllm_prefill_with_baseline( kv_indices = torch.tensor(kv_indices, dtype=torch.int32) kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) - workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) + workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.int8) wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( workspace_buffer, kv_layout) wrapper.plan(q_indptr, diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 208cacec38eb5..a85ec24632834 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -203,7 +203,7 @@ class FlashInferState(AttentionState): def _get_workspace_buffer(self): if 
self._workspace_buffer is None: - self._workspace_buffer = torch.empty( + self._workspace_buffer = torch.zeros( FLASHINFER_WORKSPACE_BUFFER_SIZE, dtype=torch.uint8, device=self.runner.device) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 02decb171fc05..eac3f33e15096 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -252,7 +252,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): def _get_workspace_buffer(self): if self._workspace_buffer is None: - self._workspace_buffer = torch.empty( + self._workspace_buffer = torch.zeros( FLASHINFER_WORKSPACE_BUFFER_SIZE, dtype=torch.uint8, device=self.device) From 177e55e3bd3dbb54089d9062b763a413c8718dff Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 15 Aug 2025 17:41:07 -0400 Subject: [PATCH 015/361] [Attention] FA3 Attention Sinks Perf Boost (#22478) Signed-off-by: Lucas Wilkinson --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index d24d8e8e5e795..4e2a0e4533e60 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 93cf5a08f421a3efd0c4a7e005ef8f742b578ce0 + GIT_TAG 2d3b7508f67ad976f781e2042ace676419dd78dd GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From f5d412bafbd9d4700ff57cb6a2d5220cf2b7637e Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Sat, 16 Aug 2025 00:55:26 +0200 Subject: [PATCH 016/361] [BugFix] Fix regression caused by mamba state dtype PR (#22998) Signed-off-by: Thomas Parnell --- vllm/model_executor/models/phi4flash.py | 8 
++++++-- vllm/model_executor/models/plamo2.py | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/phi4flash.py b/vllm/model_executor/models/phi4flash.py index 493a4192d35ad..fcdfcb7bc1603 100644 --- a/vllm/model_executor/models/phi4flash.py +++ b/vllm/model_executor/models/phi4flash.py @@ -650,8 +650,12 @@ class Phi4FlashForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only): num_mamba_layers = self.config.num_hidden_layers \ // 2 // self.config.mb_per_layer + 1 self.mamba_cache = MambaCacheManager( - self.vllm_config, self.lm_head.weight.dtype, num_mamba_layers, - *self._get_mamba_cache_shape()) + self.vllm_config, + num_mamba_layers, + *self._get_mamba_cache_shape(), + self.lm_head.weight.dtype, + self.lm_head.weight.dtype, + ) mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) attn_metadata = get_forward_context().attn_metadata diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 8b1df66f02805..e5034b536266a 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -767,8 +767,12 @@ class Plamo2ForCausalLM(Plamo2PreTrainedModel, HasInnerState, SupportsPP, self.vllm_config.parallel_config, LayerBlockType.mamba) self.mamba_cache = MambaCacheManager( - self.vllm_config, self.lm_head.weight.dtype, num_mamba_layers, - *self._get_mamba_cache_shape()) + self.vllm_config, + num_mamba_layers, + *self._get_mamba_cache_shape(), + self.lm_head.weight.dtype, + self.lm_head.weight.dtype, + ) mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) From 76144adf765af39a0702a542b1b99bf3a2ad4e8f Mon Sep 17 00:00:00 2001 From: Eli Uriegas <1700823+seemethere@users.noreply.github.com> Date: Fri, 15 Aug 2025 16:16:23 -0700 Subject: [PATCH 017/361] ci: Add CUDA + arm64 release builds (#21201) Signed-off-by: Eli Uriegas --- .buildkite/release-pipeline.yaml | 16 ++++++++++++++++ docker/Dockerfile | 17 ++--------------- 
2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 6314afd652340..85d3e56387421 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,4 +1,20 @@ steps: + # aarch64 + CUDA builds + - label: "Build arm64 wheel - CUDA 12.8" + id: build-wheel-arm64-cuda-12-8 + agents: + queue: arm64_cpu_queue_postmerge + commands: + # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: + # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + - "bash .buildkite/scripts/upload-wheels.sh" + env: + DOCKER_BUILDKIT: "1" + + # x86 + CUDA builds - label: "Build wheel - CUDA 12.8" id: build-wheel-cuda-12-8 agents: diff --git a/docker/Dockerfile b/docker/Dockerfile index 66a6e6fd6f67d..74938917781ac 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -139,21 +139,6 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ WORKDIR /workspace # install build and runtime dependencies - -# arm64 (GH200) build follows the practice of "use existing pytorch" build, -# we need to install torch and torchvision from the nightly builds first, -# pytorch will not appear as a vLLM dependency in all of the following steps -# after this step -RUN --mount=type=cache,target=/root/.cache/uv \ - if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - uv pip install --system \ - --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. 
-f1,2 | tr -d '.') \ - "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \ - uv pip install --system \ - --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ - --pre pytorch_triton==3.3.0+gitab727c40; \ - fi - COPY requirements/common.txt requirements/common.txt COPY requirements/cuda.txt requirements/cuda.txt RUN --mount=type=cache,target=/root/.cache/uv \ @@ -234,6 +219,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && sccache --show-stats; \ fi +ARG vllm_target_device="cuda" +ENV VLLM_TARGET_DEVICE=${vllm_target_device} ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/uv \ From 1fc375dc053424c3b8656802d35d5251e75fc857 Mon Sep 17 00:00:00 2001 From: rishitdholakia13 <123388671+rishitdholakia13@users.noreply.github.com> Date: Fri, 15 Aug 2025 17:25:05 -0600 Subject: [PATCH 018/361] [Structured Outputs] [Bug] Fix misalignment in apply_grammar_bitmask causing unintended masking and NaN logits (#22963) Signed-off-by: rishitdholakia13 --- vllm/v1/worker/gpu_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index bef67486d5183..4c919b392fbd9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1355,10 +1355,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): cumulative_index += 1 + num_spec_tokens grammar_bitmask = sorted_bitmask - # If the grammar bitmask and the logits have the same shape + # If the length of out indices and the logits have the same shape # we don't need to pass indices to the kernel, # since the bitmask is already aligned with the logits. 
- skip_out_indices = grammar_bitmask.shape[0] == logits.shape[0] + skip_out_indices = len(out_indices) == logits.shape[0] # Serialization of np.ndarray is much more efficient than a tensor, # so we receive it in that format. From b9dc9d260762cdd98946ac6bae88e298ed28b055 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 15 Aug 2025 16:38:42 -0700 Subject: [PATCH 019/361] [BugFix] Handle case where async utility call is cancelled (#22996) Signed-off-by: Nick Hill Co-authored-by: Yinghai Lu --- tests/v1/engine/test_engine_core_client.py | 24 +++++++++++++++++++++- vllm/v1/engine/core_client.py | 21 +++++++++++++------ 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index c82285639aee4..37eb869fe69a3 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -121,8 +121,13 @@ async def loop_until_fully_done_async(client: EngineCoreClient, outputs: dict): # Dummy utility function to monkey-patch into engine core. -def echo(self, msg: str, err_msg: Optional[str] = None) -> str: +def echo(self, + msg: str, + err_msg: Optional[str] = None, + sleep: Optional[float] = None) -> str: print(f"echo util function called: {msg}, {err_msg}") + if sleep is not None: + time.sleep(sleep) if err_msg is not None: raise ValueError(err_msg) return msg @@ -289,6 +294,23 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch): await core_client.call_utility_async("echo", None, "help!") assert str(e_info.value) == "Call to echo method failed: help!" + + # Test that cancelling the utility call doesn't destabilize the + # engine. + util_task = asyncio.create_task( + core_client.call_utility_async("echo", "testarg2", None, + 0.5)) # sleep for 0.5 sec + await asyncio.sleep(0.05) + cancelled = util_task.cancel() + assert cancelled + + # Ensure client is still functional. 
The engine runs utility + # methods in a single thread so this request won't be processed + # until the cancelled sleeping one is complete. + result = await asyncio.wait_for(core_client.call_utility_async( + "echo", "testarg3"), + timeout=1.0) + assert result == "testarg3" finally: client.shutdown() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 29ee0a9dfb1e2..079dd9a7d38d1 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -574,13 +574,22 @@ class MPClient(EngineCoreClient): def _process_utility_output(output: UtilityOutput, utility_results: dict[int, AnyFuture]): - """Set the result from a utility method in the waiting future""" + """Set the result from a utility method in the waiting future.""" future = utility_results.pop(output.call_id) - if output.failure_message is not None: - future.set_exception(Exception(output.failure_message)) - else: - assert output.result is not None - future.set_result(output.result.result) + failure_message = output.failure_message + try: + if failure_message is not None: + future.set_exception(Exception(failure_message)) + else: + assert output.result is not None + future.set_result(output.result.result) + except asyncio.InvalidStateError: + # This can happen if the future is cancelled due to the + # original calling task being cancelled. 
+ if failure_message is not None: + logger.error( + "Cancelled call to utility method failed " + "with error: %s", failure_message) class SyncMPClient(MPClient): From c280066f9dad0288a768a6234bea08171c4b88b9 Mon Sep 17 00:00:00 2001 From: Or Ozeri Date: Sat, 16 Aug 2025 02:52:52 +0300 Subject: [PATCH 020/361] [v1] Move block_hashes from KVCacheManager to Request.block_hashes (#19728) Signed-off-by: Or Ozeri --- tests/v1/core/test_async_scheduler.py | 22 +- tests/v1/core/test_kv_cache_utils.py | 50 ++-- tests/v1/core/test_prefix_caching.py | 225 ++++++++++-------- tests/v1/core/test_scheduler.py | 29 ++- .../core/test_single_type_kv_cache_manager.py | 2 - tests/v1/core/utils.py | 17 +- .../kv_connector/unit/test_nixl_connector.py | 2 + .../unit/test_remote_decode_lifecycle.py | 10 +- .../unit/test_remote_prefill_lifecycle.py | 17 +- tests/v1/kv_connector/unit/utils.py | 31 ++- vllm/utils/__init__.py | 18 ++ vllm/v1/core/block_pool.py | 75 ++---- vllm/v1/core/kv_cache_coordinator.py | 33 +-- vllm/v1/core/kv_cache_manager.py | 51 +--- vllm/v1/core/kv_cache_utils.py | 78 +++--- vllm/v1/core/sched/scheduler.py | 2 - vllm/v1/core/single_type_kv_cache_manager.py | 10 +- vllm/v1/engine/core.py | 22 +- vllm/v1/request.py | 22 +- 19 files changed, 381 insertions(+), 335 deletions(-) diff --git a/tests/v1/core/test_async_scheduler.py b/tests/v1/core/test_async_scheduler.py index 3ccefbd81cab5..3a9492269f9c9 100644 --- a/tests/v1/core/test_async_scheduler.py +++ b/tests/v1/core/test_async_scheduler.py @@ -7,6 +7,7 @@ import pytest from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import RequestStatus +from vllm.v1.utils import ConstantList from .utils import create_requests, create_scheduler @@ -140,7 +141,8 @@ def test_prefix_caching_for_prefill_dedup(): requests = create_requests(num_requests=5, num_tokens=num_prompt_tokens, max_tokens=3, - same_prompt=True) + same_prompt=True, + block_size=BLOCK_SIZE) 
requests_copy = requests.copy() # Two requests with the same prompt. @@ -188,7 +190,8 @@ def test_prefix_caching_for_multi_turn(): block_size=BLOCK_SIZE) requests = create_requests(num_requests=5, num_tokens=num_prompt_tokens, - max_tokens=num_output_tokens) + max_tokens=num_output_tokens, + block_size=BLOCK_SIZE) for req in requests: scheduler.add_request(req) @@ -208,14 +211,19 @@ def test_prefix_caching_for_multi_turn(): # Create next-turn requests whose prompts are the full output of the # previous turn. - next_turn_requests = create_requests( - num_requests=5, - num_tokens=num_prompt_tokens + num_output_tokens, - max_tokens=num_output_tokens, - ) + next_turn_requests = create_requests(num_requests=5, + num_tokens=num_prompt_tokens + + num_output_tokens, + max_tokens=num_output_tokens, + block_size=BLOCK_SIZE) for i, req in enumerate(next_turn_requests): req.prompt_token_ids = (requests[i].prompt_token_ids + list(requests[i].output_token_ids)) + req._all_token_ids = req.prompt_token_ids.copy() + req.all_token_ids = ConstantList(req._all_token_ids) + req.block_hashes = [] + req.block_hashes = req.get_hash_new_full_blocks() + # Schedule the next-turn requests. 
for req in next_turn_requests: scheduler.add_request(req) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 182ea2b2345c4..e0b91e6dd7ee4 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib -from typing import Optional +from typing import Callable, Optional import pytest import torch @@ -19,7 +19,7 @@ from vllm.v1.core.kv_cache_utils import ( FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics, estimate_max_model_len, generate_block_hash_extra_keys, get_kv_cache_config, get_max_concurrency_for_kv_cache_config, - hash_block_tokens, hash_request_tokens, init_none_hash, + get_request_block_hasher, hash_block_tokens, init_none_hash, is_kv_cache_type_uniform, unify_kv_cache_configs) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheTensor, @@ -33,6 +33,8 @@ from vllm.v1.request import Request def make_request( request_id: str, prompt_token_ids: list[int], + block_size: int = 3, + hash_fn: Callable = hash, mm_positions: Optional[list[PlaceholderRange]] = None, mm_hashes: Optional[list[str]] = None, cache_salt: Optional[str] = None, @@ -49,18 +51,17 @@ def make_request( mm_item = MultiModalKwargsItem.from_elems([mm_elem]) mm_kwargs = [mm_item] * len(mm_positions) - return Request( - request_id=request_id, - prompt_token_ids=prompt_token_ids, - multi_modal_kwargs=mm_kwargs, - multi_modal_hashes=mm_hashes, - multi_modal_placeholders=mm_positions, - sampling_params=SamplingParams(max_tokens=17), - pooling_params=None, - eos_token_id=100, - lora_request=None, - cache_salt=cache_salt, - ) + return Request(request_id=request_id, + prompt_token_ids=prompt_token_ids, + multi_modal_kwargs=mm_kwargs, + multi_modal_hashes=mm_hashes, + multi_modal_placeholders=mm_positions, + 
sampling_params=SamplingParams(max_tokens=17), + pooling_params=None, + eos_token_id=100, + lora_request=None, + cache_salt=cache_salt, + block_hasher=get_request_block_hasher(block_size, hash_fn)) def new_kv_cache_spec(block_size=16, @@ -428,12 +429,14 @@ def test_hash_block_tokens(hash_fn): @pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash]) -def test_hash_request_tokens(hash_fn): +def test_request_block_hasher(hash_fn): import vllm.v1.core.kv_cache_utils init_none_hash(hash_fn) request = make_request( request_id="0", prompt_token_ids=[_ for _ in range(6)], + block_size=3, + hash_fn=hash_fn, mm_positions=[ PlaceholderRange(offset=0, length=3), PlaceholderRange(offset=3, length=3), @@ -441,9 +444,7 @@ def test_hash_request_tokens(hash_fn): mm_hashes=["hash1", "hash2"], ) - block_size = 3 - block_hashes = hash_request_tokens(hash_fn, block_size, request) - + block_hashes = request.block_hashes assert len(block_hashes) == 2 assert isinstance(block_hashes[0], vllm.v1.core.kv_cache_utils.BlockHash) assert isinstance(block_hashes[1], vllm.v1.core.kv_cache_utils.BlockHash) @@ -464,6 +465,8 @@ def test_hash_tokens_different_mm_input(hash_fn): request1 = make_request( request_id="0", prompt_token_ids=[_ for _ in range(6)], + block_size=3, + hash_fn=hash_fn, mm_positions=[ PlaceholderRange(offset=0, length=3), PlaceholderRange(offset=3, length=3), @@ -479,9 +482,8 @@ def test_hash_tokens_different_mm_input(hash_fn): ], mm_hashes=["hash3", "hash2"], ) - block_size = 3 - block_hashes1 = hash_request_tokens(hash_fn, block_size, request1) - block_hashes2 = hash_request_tokens(hash_fn, block_size, request2) + block_hashes1 = request1.block_hashes + block_hashes2 = request2.block_hashes assert block_hashes1[0] != block_hashes2[0] assert block_hashes1[1] != block_hashes2[1] @@ -493,12 +495,13 @@ def test_hash_request_tokens_no_mm_inputs(hash_fn): request = make_request( request_id="0", prompt_token_ids=[_ for _ in range(6)], + block_size=3, + hash_fn=hash_fn, 
mm_positions=None, mm_hashes=None, ) - block_size = 3 - block_hashes = hash_request_tokens(hash_fn, block_size, request) + block_hashes = request.block_hashes assert len(block_hashes) == 2 assert block_hashes[0].token_ids == (0, 1, 2) @@ -858,6 +861,7 @@ def test_allocate_with_lookahead(): request = make_request( request_id="0", prompt_token_ids=[], + block_size=block_size, mm_positions=None, mm_hashes=None, ) diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 87acdef220133..28cfca6767b1e 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -3,7 +3,7 @@ """Compare the with and without prefix caching.""" import copy -from typing import Optional +from typing import Callable, Optional import pytest import torch @@ -17,8 +17,9 @@ from vllm.utils import sha256, sha256_cbor_64bit from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.kv_cache_manager import KVCacheManager, Request from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId, - KVCacheBlock, hash_block_tokens, - init_none_hash) + KVCacheBlock, + get_request_block_hasher, + hash_block_tokens, init_none_hash) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, SlidingWindowSpec) @@ -26,6 +27,8 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, def make_request( request_id: str, prompt_token_ids: list[int], + block_size: int, + hash_fn: Callable, mm_positions: Optional[list[PlaceholderRange]] = None, mm_hashes: Optional[list[str]] = None, prompt_logprobs: Optional[int] = None, @@ -43,19 +46,18 @@ def make_request( mm_item = MultiModalKwargsItem.from_elems([mm_elem]) mm_kwargs = [mm_item] * len(mm_positions) - return Request( - request_id=request_id, - prompt_token_ids=prompt_token_ids, - multi_modal_kwargs=mm_kwargs, - multi_modal_hashes=mm_hashes, - multi_modal_placeholders=mm_positions, - sampling_params=SamplingParams(max_tokens=17, - 
prompt_logprobs=prompt_logprobs), - pooling_params=None, - eos_token_id=100, - lora_request=None, - cache_salt=cache_salt, - ) + return Request(request_id=request_id, + prompt_token_ids=prompt_token_ids, + multi_modal_kwargs=mm_kwargs, + multi_modal_hashes=mm_hashes, + multi_modal_placeholders=mm_positions, + sampling_params=SamplingParams( + max_tokens=17, prompt_logprobs=prompt_logprobs), + pooling_params=None, + eos_token_id=100, + lora_request=None, + cache_salt=cache_salt, + block_hasher=get_request_block_hasher(block_size, hash_fn)) def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig: @@ -105,11 +107,11 @@ def make_kv_cache_config_hybrid_model(block_size: int, @pytest.mark.parametrize("hash_algo", ["sha256", "sha256_cbor_64bit", "hash"]) def test_prefill(hash_algo): + block_size = 16 manager = KVCacheManager( - make_kv_cache_config(16, 11), + make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, - caching_hash_algo=hash_algo, ) # choose the hash function according to the parameter @@ -123,9 +125,9 @@ def test_prefill(hash_algo): # Incomplete 1 block (7 tokens) unique_token_ids = [3] * 7 all_token_ids = common_token_ids + unique_token_ids - req0 = make_request("0", all_token_ids) + req0 = make_request("0", all_token_ids, block_size, hash_fn) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) - assert len(manager.req_to_block_hashes[req0.request_id]) == 3 + assert len(req0.block_hashes) == 3 assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 blocks = manager.allocate_slots(req0, 55, @@ -152,9 +154,10 @@ def test_prefill(hash_algo): # Cache hit in the common prefix when the original block is still in use. 
# Incomplete 1 block (5 tokens) unique_token_ids = [3] * 5 - req1 = make_request("1", common_token_ids + unique_token_ids) + req1 = make_request("1", common_token_ids + unique_token_ids, block_size, + hash_fn) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) - assert len(manager.req_to_block_hashes[req1.request_id]) == 3 + assert len(req1.block_hashes) == 3 assert computed_blocks.get_block_ids() == ([1, 2, 3], ) assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 @@ -187,9 +190,10 @@ def test_prefill(hash_algo): # Cache hit in the common prefix when the original block is already free. # Incomplete 1 block (6 tokens) unique_token_ids = [3] * 6 - req2 = make_request("2", common_token_ids + unique_token_ids) + req2 = make_request("2", common_token_ids + unique_token_ids, block_size, + hash_fn) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) - assert len(manager.req_to_block_hashes[req2.request_id]) == 3 + assert len(req2.block_hashes) == 3 assert computed_blocks.get_block_ids() == ([1, 2, 3], ) assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 @@ -208,7 +212,7 @@ def test_prefill(hash_algo): manager.free(req2) # Cache miss and eviction. 
- req3 = make_request("3", [99] * (16 * 10)) + req3 = make_request("3", [99] * (16 * 10), block_size, hash_fn) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -242,9 +246,9 @@ def test_prefill_hybrid_model(): # Incomplete 1 block (7 tokens) unique_token_ids = [3] * 7 all_token_ids = common_token_ids + unique_token_ids - req0 = make_request("0", all_token_ids) + req0 = make_request("0", all_token_ids, block_size, hash_fn) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) - assert len(manager.req_to_block_hashes[req0.request_id]) == 3 + assert len(req0.block_hashes) == 3 assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 blocks = manager.allocate_slots(req0, 55, @@ -274,9 +278,10 @@ def test_prefill_hybrid_model(): # Cache hit in the common prefix # Incomplete 1 block (5 tokens) unique_token_ids = [3] * 5 - req1 = make_request("1", common_token_ids + unique_token_ids) + req1 = make_request("1", common_token_ids + unique_token_ids, block_size, + hash_fn) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) - assert len(manager.req_to_block_hashes[req1.request_id]) == 3 + assert len(req1.block_hashes) == 3 assert computed_blocks.get_block_ids() == ([1, 2, 3], [0, 6, 7], [0, 10, 11]) assert num_computed_tokens == 3 * 16 @@ -290,7 +295,7 @@ def test_prefill_hybrid_model(): if block != manager.block_pool.null_block: assert block.ref_cnt == 2 - block_hashes = manager.req_to_block_hashes[req1.request_id] + block_hashes = req1.block_hashes manager.free(req0) manager.free(req1) @@ -300,12 +305,13 @@ def test_prefill_hybrid_model(): def test_partial_request_hit(request_id: str, hash_to_evict: list[BlockHashWithGroupId], expect_hit_length: int): - req = make_request(request_id, common_token_ids + unique_token_ids) + req = make_request(request_id, common_token_ids + unique_token_ids, + block_size, hash) for 
hash_with_group_id in hash_to_evict: manager.block_pool.cached_block_hash_to_block.pop( hash_with_group_id) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req) - assert len(manager.req_to_block_hashes[req.request_id]) == 3 + assert len(req.block_hashes) == 3 assert num_computed_tokens == expect_hit_length * block_size for block_per_group in computed_blocks.blocks: assert len(block_per_group) == num_computed_tokens // block_size @@ -364,8 +370,9 @@ def test_prefill_plp(): 2. Schedule non-plp request and validate blocks 3. Schedule plp request; no hit should occur; validate blocks ''' + block_size = 16 manager = KVCacheManager( - make_kv_cache_config(16, 11), + make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, ) @@ -380,9 +387,13 @@ def test_prefill_plp(): # Incomplete 1 block (7 tokens) unique_token_ids = [3] * 7 all_token_ids = common_token_ids + unique_token_ids - req0 = make_request("0", all_token_ids, prompt_logprobs=5) + req0 = make_request("0", + all_token_ids, + block_size, + hash_fn, + prompt_logprobs=5) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) - assert len(manager.req_to_block_hashes[req0.request_id]) == 0 + assert len(req0.block_hashes) == 3 assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 blocks = manager.allocate_slots(req0, 55, @@ -411,9 +422,10 @@ def test_prefill_plp(): # Cache hit in the common prefix when the original block is still in use. 
# Incomplete 1 block (5 tokens) unique_token_ids = [3] * 5 - req1 = make_request("1", common_token_ids + unique_token_ids) + req1 = make_request("1", common_token_ids + unique_token_ids, block_size, + hash_fn) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) - assert len(manager.req_to_block_hashes[req1.request_id]) == 3 + assert len(req1.block_hashes) == 3 assert computed_blocks.get_block_ids() == ([1, 2, 3], ) assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 @@ -447,9 +459,11 @@ def test_prefill_plp(): unique_token_ids = [3] * 6 req2 = make_request("2", common_token_ids + unique_token_ids, + block_size, + hash_fn, prompt_logprobs=5) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) - assert len(manager.req_to_block_hashes[req2.request_id]) == 0 + assert len(req2.block_hashes) == 3 assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 blocks = manager.allocate_slots(req2, 55, @@ -469,8 +483,9 @@ def test_prefill_plp(): def test_decode(): + block_size = 16 manager = KVCacheManager( - make_kv_cache_config(16, 11), + make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, ) @@ -481,7 +496,8 @@ def test_decode(): # Fully cache miss # Incomplete 1 block (7 tokens) unique_token_ids = [3] * 7 - req0 = make_request("0", common_token_ids + unique_token_ids) + req0 = make_request("0", common_token_ids + unique_token_ids, block_size, + hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -518,14 +534,15 @@ def test_decode(): def test_evict(): + block_size = 16 manager = KVCacheManager( - make_kv_cache_config(16, 11), + make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, ) last_token_id = 5 * 16 + 7 - req0 = make_request("0", list(range(last_token_id))) + req0 = make_request("0", list(range(last_token_id)), block_size, hash) computed_blocks, 
num_computed_tokens = manager.get_computed_blocks(req0) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -536,7 +553,8 @@ def test_evict(): # 3 blocks. req1 = make_request("1", list(range(last_token_id, - last_token_id + 3 * 16))) + last_token_id + 3 * 16)), block_size, + hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -558,7 +576,7 @@ def test_evict(): ] == [10, 6, 5, 4, 3, 2, 1, 9, 8, 7] # Touch the first 2 blocks. - req2 = make_request("2", list(range(2 * 16 + 3))) + req2 = make_request("2", list(range(2 * 16 + 3)), block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert computed_blocks.get_block_ids() == ([1, 2], ) assert num_computed_tokens == 2 * 16 @@ -583,7 +601,7 @@ def test_hash_block_correct_reuse(): # Allocate 1 block and cache it. num_tokens = block_size * 1 - req = make_request("0", list(range(num_tokens))) + req = make_request("0", list(range(num_tokens)), block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -597,7 +615,7 @@ def test_hash_block_correct_reuse(): # Allocate a new block that's not full, make sure hash info on the # block is cleared. - req = make_request("1", list(range(num_tokens - 1))) + req = make_request("1", list(range(num_tokens - 1)), block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -624,7 +642,7 @@ def test_computed_blocks_not_evicted(): # Allocate a block and cache it. 
num_tokens = block_size * 1 - req0 = make_request("0", list(range(num_tokens))) + req0 = make_request("0", list(range(num_tokens)), block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -635,7 +653,8 @@ def test_computed_blocks_not_evicted(): assert blocks.blocks[0][0].block_id == 1 # Allocate another block. - req1 = make_request("1", list(range(num_tokens, num_tokens * 2))) + req1 = make_request("1", list(range(num_tokens, num_tokens * 2)), + block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -651,7 +670,7 @@ def test_computed_blocks_not_evicted(): # Now if we have a cache hit on the first block, we should evict the second # cached block rather than the first one. - req2 = make_request("2", list(range(num_tokens * 2))) + req2 = make_request("2", list(range(num_tokens * 2)), block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert len(computed_blocks.blocks[0]) == 1 assert computed_blocks.blocks[0][0].block_id == 1 @@ -675,7 +694,8 @@ def test_basic_prefix_caching_disabled(): enable_caching=False, ) - req1 = make_request("1", list(range(10))) # 2 blocks and some more + req1 = make_request("1", list(range(10)), block_size, + hash) # 2 blocks and some more computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert not computed_blocks.blocks[0] @@ -689,7 +709,8 @@ def test_basic_prefix_caching_disabled(): manager.free(req1) # No caching. 
- req2 = make_request("2", list(range(16))) # shared prefix + req2 = make_request("2", list(range(16)), block_size, + hash) # shared prefix computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -699,7 +720,7 @@ def test_basic_prefix_caching_disabled(): assert len(blocks.blocks[0]) == 4 # New requests should not have any blocks. - req3 = make_request("3", list(range(4))) + req3 = make_request("3", list(range(4)), block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -727,20 +748,17 @@ def test_cache_blocks(hash_fn): # Block 1: [4, 5, 6, 7] # Block 2: [8, 9, 10, 11] # Block 3: [12, 13] - req = make_request("0", list(range(14))) + req = make_request("0", list(range(14)), block_size, hash_fn) # Test that blocks are cached correctly for 2 full blocks from the start. blocks = [KVCacheBlock(block_id=i) for i in range(2)] - block_hashes: list[BlockHash] = [] block_pool.cache_full_blocks( request=req, blocks=blocks, - block_hashes=block_hashes, num_cached_blocks=0, num_full_blocks=2, block_size=block_size, - hash_fn=hash_fn, kv_cache_group_id=0, ) @@ -752,11 +770,9 @@ def test_cache_blocks(hash_fn): block_pool.cache_full_blocks( request=req, blocks=blocks, - block_hashes=block_hashes, num_cached_blocks=2, num_full_blocks=3, block_size=block_size, - hash_fn=hash_fn, kv_cache_group_id=0, ) assert len(block_pool.cached_block_hash_to_block) == 3 @@ -775,23 +791,20 @@ def test_cache_blocks_multi_group(): # Block 1/5: [4, 5, 6, 7] # Block 2/6: [8, 9, 10, 11] # Block 3/7: [12, 13] - req = make_request("0", list(range(14))) + req = make_request("0", list(range(14)), block_size, hash) # Cache the blocks for group 0. 
blocks = [KVCacheBlock(block_id=i) for i in range(2)] - block_hashes: list[BlockHash] = [] block_pool.cache_full_blocks( request=req, blocks=blocks, - block_hashes=block_hashes, num_cached_blocks=0, num_full_blocks=2, block_size=block_size, - hash_fn=hash, kv_cache_group_id=0, ) assert len(block_pool.cached_block_hash_to_block) == 2 - assert len(block_hashes) == 2 + assert len(req.block_hashes) == 3 assert all([block.block_hash is not None for block in blocks]) # Cache the blocks for group 1. @@ -799,38 +812,36 @@ def test_cache_blocks_multi_group(): block_pool.cache_full_blocks( request=req, blocks=blocks, - block_hashes=block_hashes, num_cached_blocks=0, num_full_blocks=3, block_size=block_size, - hash_fn=hash, kv_cache_group_id=1, ) assert len(block_pool.cached_block_hash_to_block) == 5 - assert len(block_hashes) == 3 + assert len(req.block_hashes) == 3 assert all([block.block_hash is not None for block in blocks]) # Block hash 0: hit for group 0 and 1 # Block hash 1: hit for group 0 and 1 # Block hash 2: hit for group 1 - assert block_pool.get_cached_block(block_hashes[0], + assert block_pool.get_cached_block(req.block_hashes[0], kv_cache_group_ids=[0]) is not None - assert block_pool.get_cached_block(block_hashes[1], + assert block_pool.get_cached_block(req.block_hashes[1], kv_cache_group_ids=[0]) is not None - assert block_pool.get_cached_block(block_hashes[2], + assert block_pool.get_cached_block(req.block_hashes[2], kv_cache_group_ids=[0]) is None - assert block_pool.get_cached_block(block_hashes[0], + assert block_pool.get_cached_block(req.block_hashes[0], kv_cache_group_ids=[1]) is not None - assert block_pool.get_cached_block(block_hashes[1], + assert block_pool.get_cached_block(req.block_hashes[1], kv_cache_group_ids=[1]) is not None - assert block_pool.get_cached_block(block_hashes[2], + assert block_pool.get_cached_block(req.block_hashes[2], kv_cache_group_ids=[1]) is not None - assert block_pool.get_cached_block(block_hashes[0], + assert 
block_pool.get_cached_block(req.block_hashes[0], kv_cache_group_ids=[0, 1]) is not None - assert block_pool.get_cached_block(block_hashes[1], + assert block_pool.get_cached_block(req.block_hashes[1], kv_cache_group_ids=[0, 1]) is not None - assert block_pool.get_cached_block(block_hashes[2], + assert block_pool.get_cached_block(req.block_hashes[2], kv_cache_group_ids=[0, 1]) is None @@ -838,8 +849,9 @@ def test_mm_prefix_caching(): """ This tests that the multi-modal prefix caching is correct. """ + block_size = 16 manager = KVCacheManager( - make_kv_cache_config(16, 11), + make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, ) @@ -865,6 +877,8 @@ def test_mm_prefix_caching(): mm_hashes = common_mm_hashes + ["ccc"] req0 = make_request("0", all_token_ids, + block_size, + hash, mm_positions=mm_positions, mm_hashes=mm_hashes) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) @@ -872,7 +886,7 @@ def test_mm_prefix_caching(): # Completed block should have hashes with extra keys. assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 - block_hashes = manager.req_to_block_hashes[req0.request_id] + block_hashes = req0.block_hashes assert len(block_hashes) == 3 assert block_hashes[0].extra_keys == ("aaa", ) assert block_hashes[1].extra_keys == ("aaa", "bbb") @@ -905,6 +919,8 @@ def test_mm_prefix_caching(): mm_hashes = common_mm_hashes + ["ccc"] req1 = make_request("1", all_token_ids, + block_size, + hash, mm_positions=mm_positions, mm_hashes=mm_hashes) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) @@ -927,13 +943,13 @@ def test_cache_key_salting(): # 3 complete blocks and an incomplete block with 11 tokens. 
common_token_ids = [i for i in range(3) for _ in range(block_size)] token_ids = common_token_ids + [3] * 11 - req0 = make_request("0", token_ids, cache_salt="salt1") + req0 = make_request("0", token_ids, block_size, hash, cache_salt="salt1") computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) # Completed block should have hashes with extra keys. assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 - block_hashes = manager.req_to_block_hashes[req0.request_id] + block_hashes = req0.block_hashes assert len(block_hashes) == 3 assert block_hashes[0].extra_keys == ("salt1", ) assert block_hashes[1].extra_keys is None @@ -959,7 +975,7 @@ def test_cache_key_salting(): # Test cache hit with a new request that has the same salt. token_ids = common_token_ids + [4] * 11 - req1 = make_request("1", token_ids, cache_salt="salt1") + req1 = make_request("1", token_ids, block_size, hash, cache_salt="salt1") computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) # Should match only a prefix of 3 blocks. assert len(computed_blocks.blocks[0]) == 3 @@ -967,11 +983,11 @@ def test_cache_key_salting(): # Test cache miss with same content but different salt. token_ids = common_token_ids + [4] * 11 - req2 = make_request("2", token_ids, cache_salt="salt2") + req2 = make_request("2", token_ids, block_size, hash, cache_salt="salt2") computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert len(computed_blocks.blocks[0]) == 0 assert num_computed_tokens == 0 - block_hashes = manager.req_to_block_hashes[req2.request_id] + block_hashes = req2.block_hashes assert len(block_hashes) == 3 assert block_hashes[0].extra_keys == ("salt2", ) @@ -992,7 +1008,7 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks(): # Complete 3 blocks (48 tokens) # | Common-0 | Common-1 | Common-2 | ... 
| common_token_ids = [i for i in range(3) for _ in range(16)] - req0 = make_request("0", common_token_ids) + req0 = make_request("0", common_token_ids, block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -1003,7 +1019,7 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks(): req0.request_id] # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... | - req1 = make_request("1", common_token_ids * 2) + req1 = make_request("1", common_token_ids * 2, block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert computed_blocks.blocks[0] == block_part0 assert num_computed_tokens == 3 * 16 @@ -1020,19 +1036,19 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks(): # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) | # | Req1-5(F)| Req2-0 | Req2-1 | ... | - req2 = make_request("2", [7] * block_size * 2) + req2 = make_request("2", [7] * block_size * 2, block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 manager.allocate_slots(req2, block_size * 2, - len(computed_blocks.blocks[0]) * 16, + len(computed_blocks.blocks[0]) * block_size, computed_blocks) # Req3 is Req2 + 3 new blocks, so the first 6 blocks are computed, # but it cannot be allocated due to insufficient free blocks (2). # In this case, the ref_cnt of the computed blocks should not be changed. 
assert manager.block_pool.free_block_queue.num_free_blocks == 5 - req3 = make_request("3", common_token_ids * 3) + req3 = make_request("3", common_token_ids * 3, block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3) assert computed_blocks.blocks[0] == block_part1 assert num_computed_tokens == 6 * 16 @@ -1047,8 +1063,9 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks(): def test_reset_prefix_cache(): + block_size = 16 manager = KVCacheManager( - make_kv_cache_config(16, 11), + make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, ) @@ -1056,15 +1073,15 @@ def test_reset_prefix_cache(): full_block_token_ids = [i for i in range(3) for _ in range(16)] unique_token_ids = [3] * 7 all_token_ids = full_block_token_ids + unique_token_ids - req0 = make_request("0", all_token_ids) + req0 = make_request("0", all_token_ids, block_size, hash) blocks = manager.allocate_slots(req0, 55) assert blocks.get_block_ids() == ([1, 2, 3, 4], ) unique_token_ids = [4] * 7 all_token_ids = full_block_token_ids + unique_token_ids - req1 = make_request("1", all_token_ids) + req1 = make_request("1", all_token_ids, block_size, hash) computed_blocks, _ = manager.get_computed_blocks(req1) - assert len(manager.req_to_block_hashes[req1.request_id]) == 3 + assert len(req1.block_hashes) == 3 assert len(computed_blocks.blocks[0]) == 3 blocks = manager.allocate_slots(req1, 7, len(computed_blocks.blocks[0]) * 16, @@ -1086,8 +1103,9 @@ def test_reset_prefix_cache(): def test_prefix_cache_stats_disabled(): """Test that prefix_cache_stats is None when log_stats is False.""" + block_size = 16 manager = KVCacheManager( - make_kv_cache_config(16, 11), + make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, log_stats=False, # Disable logging stats @@ -1095,7 +1113,7 @@ def test_prefix_cache_stats_disabled(): assert manager.prefix_cache_stats is None # Call all functions that check whether log_stats is disabled. 
- req = make_request("0", list(range(16))) + req = make_request("0", list(range(16)), block_size, hash) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req) assert not computed_blocks.blocks[0] assert num_computed_tokens == 0 @@ -1192,7 +1210,7 @@ def test_kv_cache_events(blocks_to_cache: int): ) num_tokens = block_size * blocks_to_cache - req0 = make_request("0", list(range(num_tokens))) + req0 = make_request("0", list(range(num_tokens)), block_size, hash) _ = manager.allocate_slots(req0, num_tokens) events = manager.take_events() @@ -1208,7 +1226,7 @@ def test_kv_cache_events(blocks_to_cache: int): # Should see block_to_cache number of removed block events and a new block # stored event manager.free(req0) - req1 = make_request("1", list(range(num_tokens))) + req1 = make_request("1", list(range(num_tokens)), block_size, hash) _ = manager.allocate_slots(req1, num_tokens) events = manager.take_events() @@ -1242,7 +1260,7 @@ def test_eagle_enabled_removes_last_block(): # Request with 3 full blocks (48 tokens) token_ids = [0] * (3 * block_size) - req = make_request("divisible_request", token_ids) + req = make_request("divisible_request", token_ids, block_size, hash) # Prime the cache computed_blocks, _ = manager.get_computed_blocks(req) @@ -1252,7 +1270,7 @@ def test_eagle_enabled_removes_last_block(): manager.free(req) # New request with same tokens + Eagle enabled - req_eagle = make_request("eagle_divisible", token_ids) + req_eagle = make_request("eagle_divisible", token_ids, block_size, hash) computed_blocks, num_tokens = manager.get_computed_blocks(req_eagle) # Should retain 1 block: @@ -1273,7 +1291,7 @@ def test_eagle_with_partial_blocks(): ) # 2 full blocks + 5 tokens (non-divisible length) token_ids = [0] * (2 * block_size + 5) - req = make_request("partial_block_test", token_ids) + req = make_request("partial_block_test", token_ids, block_size, hash) # Prime the cache computed_blocks, _ = manager.get_computed_blocks(req) @@ -1283,7 +1301,7 
@@ def test_eagle_with_partial_blocks(): manager.free(req) # New request with Eagle enabled - req_eagle = make_request("partial_eagle", token_ids) + req_eagle = make_request("partial_eagle", token_ids, block_size, hash) computed_blocks, num_tokens = manager.get_computed_blocks(req_eagle) # Original match: 2 full blocks → Eagle removes 1 → 1 remaining assert len(computed_blocks.blocks[0]) == 1 @@ -1314,7 +1332,7 @@ def test_eagle_with_sliding_window(): # 2 full blocks + 5 tokens (non-divisible length) token_ids = [0] * (2 * block_size + 5) - req = make_request("partial_block_test", token_ids) + req = make_request("partial_block_test", token_ids, block_size, hash) # Prime the cache computed_blocks, _ = manager.get_computed_blocks(req) @@ -1322,12 +1340,12 @@ def test_eagle_with_sliding_window(): len(computed_blocks.blocks[0]) * 16, computed_blocks) # record the block hash of the first block in the request for later use - block_hash_first_block = manager.req_to_block_hashes[req.request_id][0] + block_hash_first_block = req.block_hashes[0] assert block_hash_first_block is not None manager.free(req) # New request with Eagle enabled - req_eagle = make_request("partial_eagle", token_ids) + req_eagle = make_request("partial_eagle", token_ids, block_size, hash) computed_blocks, num_tokens = manager.get_computed_blocks(req_eagle) # Original match: 2 full blocks → Eagle removes 1 → 1 remaining assert len(computed_blocks.blocks[0]) == 1 @@ -1340,7 +1358,8 @@ def test_eagle_with_sliding_window(): BlockHashWithGroupId(block_hash_first_block, 0)) # New request - req_after_evict = make_request("partial_eagle_after_evict", token_ids) + req_after_evict = make_request("partial_eagle_after_evict", token_ids, + block_size, hash) computed_blocks, num_tokens = manager.get_computed_blocks(req_after_evict) # Cache miss. The only hit prefix is [NULL_BLOCK, BLOCK_2] if eagle is # not considered. 
But after dropping the last matched block due to eagle, diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 1c7dd0ca90b7e..ac70c90d92add 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -589,7 +589,7 @@ def test_preempt_during_execution(): block_size=16, num_blocks=11, enable_prefix_caching=False) - requests = create_requests(num_requests=2, num_tokens=80) + requests = create_requests(num_requests=2, num_tokens=80, block_size=16) # Schedule the first request. scheduler.add_request(requests[0]) @@ -762,7 +762,7 @@ def _assert_right_scheduler_output( def _assert_right_kv_cache_manager( scheduler: Scheduler, - req_ids: list[str], + requests: list[Request], num_tokens: int, block_size: int, num_requests: int, @@ -772,12 +772,12 @@ def _assert_right_kv_cache_manager( # Make sure the request stats are right. EXPECTED_TOTAL_BLOCKS = num_tokens // block_size - for req_id in req_ids: + for req in requests: blocks = (scheduler.kv_cache_manager.coordinator. - single_type_managers[0].req_to_blocks[req_id]) - hashes = scheduler.kv_cache_manager.req_to_block_hashes[req_id] + single_type_managers[0].req_to_blocks[req.request_id]) + hashes = req.block_hashes assert (scheduler.kv_cache_manager.coordinator.single_type_managers[0]. - num_cached_block[req_id] == EXPECTED_TOTAL_BLOCKS) + num_cached_block[req.request_id] == EXPECTED_TOTAL_BLOCKS) assert len(blocks) == EXPECTED_TOTAL_BLOCKS assert len(hashes) == EXPECTED_TOTAL_BLOCKS @@ -840,7 +840,8 @@ def test_kv_connector_basic(): MAX_TOKENS = 3 requests = create_requests(num_requests=NUM_REQUESTS, num_tokens=NUM_TOKENS, - max_tokens=MAX_TOKENS) + max_tokens=MAX_TOKENS, + block_size=BLOCK_SIZE) req_ids = [] req_to_index = {} for i, request in enumerate(requests): @@ -868,7 +869,7 @@ def test_kv_connector_basic(): ) # Ensure KVCacheManager is correct. 
- _assert_right_kv_cache_manager(scheduler, req_ids, NUM_TOKENS, BLOCK_SIZE, + _assert_right_kv_cache_manager(scheduler, requests, NUM_TOKENS, BLOCK_SIZE, NUM_REQUESTS, NUM_TOTAL_BLOCKS) # Continue Generation until done. @@ -886,7 +887,8 @@ def test_kv_connector_basic(): NUM_TOKENS = NUM_TOKENS_PREFIX * 2 requests = create_requests(num_requests=NUM_REQUESTS, num_tokens=NUM_TOKENS, - max_tokens=MAX_TOKENS) + max_tokens=MAX_TOKENS, + block_size=BLOCK_SIZE) req_ids = [] req_to_index = {} for i, request in enumerate(requests): @@ -915,7 +917,7 @@ def test_kv_connector_basic(): NUM_MATCHED_NEW_TOKENS)) # Ensure KVCacheManager is correct. - _assert_right_kv_cache_manager(scheduler, req_ids, NUM_TOKENS, BLOCK_SIZE, + _assert_right_kv_cache_manager(scheduler, requests, NUM_TOKENS, BLOCK_SIZE, NUM_REQUESTS, NUM_TOTAL_BLOCKS) # Continue Generation until done. @@ -953,7 +955,8 @@ def test_kv_connector_unable_to_allocate(): MAX_TOKENS = 2 requests = create_requests(num_requests=NUM_REQUESTS, num_tokens=NUM_TOKENS, - max_tokens=MAX_TOKENS) + max_tokens=MAX_TOKENS, + block_size=BLOCK_SIZE) req_ids = [] req_to_index = {} for i, request in enumerate(requests): @@ -1034,7 +1037,8 @@ def test_kv_connector_handles_preemption(): MAX_TOKENS = BLOCK_SIZE * 2 requests = create_requests(num_requests=NUM_REQUESTS, num_tokens=NUM_TOKENS, - max_tokens=MAX_TOKENS) + max_tokens=MAX_TOKENS, + block_size=BLOCK_SIZE) req_ids = [] req_to_index = {} for i, request in enumerate(requests): @@ -1162,7 +1166,6 @@ def assert_scheduler_empty(scheduler: Scheduler): # KVCache Manager. assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0]. req_to_blocks) == 0 - assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0 assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0]. 
num_cached_block) == 0 num_free_blocks = ( diff --git a/tests/v1/core/test_single_type_kv_cache_manager.py b/tests/v1/core/test_single_type_kv_cache_manager.py index b67c05bd7ac10..7dcebba491fab 100644 --- a/tests/v1/core/test_single_type_kv_cache_manager.py +++ b/tests/v1/core/test_single_type_kv_cache_manager.py @@ -17,7 +17,6 @@ from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec, def get_sliding_window_manager(sliding_window_spec, block_pool): return SlidingWindowManager(sliding_window_spec, block_pool, - caching_hash_fn=lambda x: x, kv_cache_group_id=0) @@ -25,7 +24,6 @@ def get_chunked_local_attention_manager(chunked_local_attention_spec, block_pool): return ChunkedLocalAttentionManager(chunked_local_attention_spec, block_pool, - caching_hash_fn=lambda x: x, kv_cache_group_id=0) diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 484afe61fc3fb..52093d3d381ae 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -10,6 +10,8 @@ from vllm.multimodal.inputs import (MultiModalBatchedField, MultiModalFieldElem, MultiModalKwargsItem, PlaceholderRange) from vllm.sampling_params import SamplingParams +from vllm.v1.core.kv_cache_utils import (get_request_block_hasher, + init_none_hash) from vllm.v1.core.sched.async_scheduler import AsyncScheduler from vllm.v1.core.sched.scheduler import Scheduler from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, @@ -114,6 +116,9 @@ def create_scheduler( ) +_none_hash_initialized = False + + def create_requests( num_requests: int, num_tokens: int = 10, @@ -122,7 +127,14 @@ def create_requests( stop_token_ids: Optional[list[int]] = None, prompt_logprobs: Optional[int] = None, same_prompt: bool = False, + block_size: int = 16, ) -> list[Request]: + global _none_hash_initialized + if not _none_hash_initialized: + init_none_hash(hash) + _none_hash_initialized = True + + block_hasher = get_request_block_hasher(block_size, hash) sampling_params = 
SamplingParams(ignore_eos=False, max_tokens=max_tokens, stop_token_ids=stop_token_ids, @@ -139,9 +151,11 @@ def create_requests( ) mm_item = MultiModalKwargsItem.from_elems([mm_elem]) mm_kwargs = [mm_item] * len(mm_position) + mm_hashes = ["hash"] * len(mm_position) else: mm_position = None mm_kwargs = None + mm_hashes = None prompt_token_ids = ([0] * num_tokens if same_prompt else [i] * num_tokens) request = Request( @@ -151,8 +165,9 @@ def create_requests( pooling_params=None, multi_modal_kwargs=mm_kwargs, multi_modal_placeholders=mm_position, - multi_modal_hashes=None, + multi_modal_hashes=mm_hashes, eos_token_id=EOS_TOKEN_ID, + block_hasher=block_hasher, ) requests.append(request) return requests diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index b185936ab025f..e6859ea738277 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -147,6 +147,7 @@ def test_basic_interface(): NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) request = create_request(request_id=1, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS, do_remote_prefill=True) request_id = request.request_id @@ -186,6 +187,7 @@ def test_prompt_less_than_block_size(): # Request will have 1 partial remote block. 
request = create_request(request_id=1, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS, do_remote_prefill=True, num_remote_blocks=1) diff --git a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py index 2f8228864e7b4..d8c56ac42f718 100644 --- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py @@ -21,6 +21,7 @@ def test_basic_lifecycle(): NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) request = create_request(request_id=1, + block_size=BLOCK_SIZE, max_tokens=1, num_tokens=NUM_TOKENS, do_remote_decode=True) @@ -103,8 +104,10 @@ def test_short_prompt_lifecycle(): scheduler = create_scheduler(vllm_config) # Not enough tokens for full block. - NUM_TOKENS = vllm_config.cache_config.block_size // 2 + BLOCK_SIZE = vllm_config.cache_config.block_size + NUM_TOKENS = BLOCK_SIZE // 2 request = create_request(request_id=1, + block_size=BLOCK_SIZE, max_tokens=1, num_tokens=NUM_TOKENS, do_remote_decode=True) @@ -148,7 +151,9 @@ def test_prefix_cache_lifecycle(): NUM_EXTERNAL_FULL_BLOCKS = 3 NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) - request_normal = create_request(request_id=1, num_tokens=NUM_TOKENS) + request_normal = create_request(request_id=1, + block_size=BLOCK_SIZE, + num_tokens=NUM_TOKENS) scheduler.add_request(request_normal) scheduler_output = scheduler.schedule() @@ -166,6 +171,7 @@ def test_prefix_cache_lifecycle(): NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) request_remote = create_request(request_id=1, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS, do_remote_decode=True) diff --git a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py index 87f7490698a31..21fec5344255c 100644 --- a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py +++ 
b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py @@ -23,6 +23,7 @@ def test_basic_lifecycle(): scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks) request = create_request(request_id=1, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS, do_remote_prefill=True) @@ -133,14 +134,17 @@ def test_interleaved_lifecycle(): NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) request_remote = create_request(request_id=1, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS, do_remote_prefill=True) request_local_a = create_request( request_id=2, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS, ) request_local_b = create_request( request_id=3, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS, ) @@ -236,6 +240,7 @@ def test_no_spurious_prefix_caching(): # Both of these requests have prompts like [1,1,1,1,1, ...] request_remote = create_request( request_id=1, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS, do_remote_prefill=True, use_all_1s_for_prompt_tokens=True, @@ -243,6 +248,7 @@ def test_no_spurious_prefix_caching(): request_local = create_request( request_id=2, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS, do_remote_prefill=False, use_all_1s_for_prompt_tokens=True, @@ -292,6 +298,7 @@ def test_full_block_prompt(): NUM_TOKENS = int(BLOCK_SIZE * NUM_EXTERNAL_FULL_BLOCKS) request = create_request(request_id=1, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS, do_remote_prefill=True) @@ -364,8 +371,11 @@ def test_cannot_schedule_after_recv(): NUM_TOKENS_LOCAL = int(BLOCK_SIZE * NUM_PROMPT_BLOCKS) NUM_TOKENS_REMOTE = int(BLOCK_SIZE * NUM_PROMPT_BLOCKS) - request_normal = create_request(request_id=1, num_tokens=NUM_TOKENS_LOCAL) + request_normal = create_request(request_id=1, + block_size=BLOCK_SIZE, + num_tokens=NUM_TOKENS_LOCAL) request_remote = create_request(request_id=2, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS_REMOTE, do_remote_prefill=True) @@ -456,8 +466,11 @@ def test_cannot_recv(): NUM_TOKENS_LOCAL = int(BLOCK_SIZE * 
NUM_PROMPT_BLOCKS) NUM_TOKENS_REMOTE = int(BLOCK_SIZE * (NUM_PROMPT_BLOCKS + 0.5)) - request_normal = create_request(request_id=1, num_tokens=NUM_TOKENS_LOCAL) + request_normal = create_request(request_id=1, + block_size=BLOCK_SIZE, + num_tokens=NUM_TOKENS_LOCAL) request_remote = create_request(request_id=2, + block_size=BLOCK_SIZE, num_tokens=NUM_TOKENS_REMOTE, do_remote_prefill=True) diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 60847c48585c6..8c5d132c00ae4 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import tempfile from collections import defaultdict -from typing import Any, Optional +from typing import Any, Callable, Optional import torch @@ -14,6 +14,8 @@ from vllm.distributed.kv_transfer.kv_connector.factory import ( from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( # noqa SharedStorageConnector) from vllm.v1.core.kv_cache_manager import KVCacheBlocks +from vllm.v1.core.kv_cache_utils import (get_request_block_hasher, + init_none_hash) from vllm.v1.core.sched.scheduler import Scheduler from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec) @@ -40,7 +42,6 @@ def assert_scheduler_empty(scheduler: Scheduler): # KVCache Manager. assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0]. req_to_blocks) == 0 - assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0 assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0]. 
num_cached_block) == 0 num_free_blocks = ( @@ -115,16 +116,23 @@ def create_scheduler( ) -def create_request( - request_id: int, - num_tokens: int = 10, - max_tokens: int = 16, - do_remote_decode: bool = False, - do_remote_prefill: bool = False, - use_all_1s_for_prompt_tokens: bool = False, - num_remote_blocks: int = 3, -) -> Request: +_none_hash_initialized = False + + +def create_request(request_id: int, + num_tokens: int = 10, + max_tokens: int = 16, + do_remote_decode: bool = False, + do_remote_prefill: bool = False, + use_all_1s_for_prompt_tokens: bool = False, + num_remote_blocks: int = 3, + block_size: int = 16, + hash_fn: Callable = hash) -> Request: """Make dummy request for testing.""" + global _none_hash_initialized + if not _none_hash_initialized: + init_none_hash(hash) + _none_hash_initialized = True kv_transfer_params: Optional[dict[str, Any]] = None @@ -158,6 +166,7 @@ def create_request( multi_modal_placeholders=None, multi_modal_hashes=None, eos_token_id=EOS_TOKEN_ID, + block_hasher=get_request_block_hasher(block_size, hash_fn), ) req.kv_transfer_params = kv_transfer_params return req diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index a1f8ad164762d..72857ee2abc77 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -3243,6 +3243,24 @@ def sha256_cbor_64bit(input) -> int: return full_hash & ((1 << 64) - 1) +def get_hash_fn_by_name(hash_fn_name: str) -> Callable: + """Get a hash function by name, or raise an error if + the function is not found. + Args: + hash_fn_name: Name of the hash function. + Returns: + A hash function. + """ + if hash_fn_name == "sha256": + return sha256 + if hash_fn_name == "sha256_cbor_64bit": + return sha256_cbor_64bit + if hash_fn_name == "builtin": + return hash + + raise ValueError(f"Unsupported hash function: {hash_fn_name}") + + def is_torch_equal_or_newer(target: str) -> bool: """Check if the installed torch version is >= the target version. 
diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index ad9854dd29c38..839297135fe0a 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -2,15 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import defaultdict from collections.abc import Iterable -from typing import Callable, Optional +from typing import Optional from vllm.distributed.kv_events import (AllBlocksCleared, BlockRemoved, BlockStored, KVCacheEvent) from vllm.logger import init_logger from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId, - FreeKVCacheBlockQueue, KVCacheBlock, - generate_block_hash_extra_keys, - hash_block_tokens) + FreeKVCacheBlockQueue, KVCacheBlock) from vllm.v1.request import Request logger = init_logger(__name__) @@ -97,84 +95,39 @@ class BlockPool: self, request: Request, blocks: list[KVCacheBlock], - block_hashes: list[BlockHash], num_cached_blocks: int, num_full_blocks: int, block_size: int, kv_cache_group_id: int, - hash_fn: Callable, ) -> None: """Cache a list of full blocks for prefix caching. This function takes a list of blocks that will have their block hash - metadata to be updated and cached. Given a request, it computes the - block hashes for the blocks starting from `num_cached_blocks` to - `num_full_blocks`, updating the metadata for each block - and caching them in the `cached_block_hash_to_block`. + metadata to be updated and cached. Given a request, it updates the + metadata for each block and caching it in the + `cached_block_hash_to_block`. + The block hashes values are computed by the Request object immediately + when it is created and when new tokens are appended. Args: request: The request to cache the blocks. blocks: All blocks in the request. - block_hashes: Block hashes of the blocks in the request. Note that - this list may be shorter than the blocks list. In this case the - missed block hash will be computed in this function. 
num_cached_blocks: The number of blocks that are already cached. num_full_blocks: The number of blocks that are full and should be cached after this function. block_size: Number of tokens in each block. kv_cache_group_id: The id of the KV cache group. - hash_fn: The hash function to use for block hashes. """ if num_cached_blocks == num_full_blocks: return new_full_blocks = blocks[num_cached_blocks:num_full_blocks] - assert len(block_hashes) >= num_cached_blocks - new_block_hashes = block_hashes[num_cached_blocks:] + assert len(request.block_hashes) >= num_full_blocks + new_block_hashes = request.block_hashes[num_cached_blocks:] - # Update the new blocks with the block hashes through the chain. - if num_cached_blocks == 0: - prev_block_hash_value = None - else: - prev_block = blocks[num_cached_blocks - 1] - assert prev_block.block_hash is not None - prev_block_hash_value = prev_block.block_hash.get_hash_value() - - parent_block_hash = prev_block_hash_value new_hashes: Optional[list[int]] = ([] if self.enable_kv_cache_events else None) for i, blk in enumerate(new_full_blocks): assert blk.block_hash is None - - if i < len(new_block_hashes): - # The block hash may already be computed in - # "get_computed_blocks" if the tokens are not generated by - # this request (either the prompt tokens or the previously - # generated tokens with preemption), or by other - # single_type_managers with the same block_size. - # In this case we simply reuse the block hash. - block_hash = new_block_hashes[i] - else: - # Otherwise compute the block hash and cache it in the request - # in case it will be preempted in the future. 
- blk_idx = num_cached_blocks + i - start_token_idx = blk_idx * block_size - end_token_idx = (blk_idx + 1) * block_size - block_tokens = request.all_token_ids[ - start_token_idx:end_token_idx] - assert len(block_tokens) == block_size, ( - f"Expected {block_size} tokens, got " - f"{len(block_tokens)} at {blk_idx}th block for request " - f"{request.request_id}({request})") - - # Generate extra keys for multi-modal inputs. Note that since - # we reach to this branch only when the block is completed with - # generated tokens, we only need to consider the last mm input. - extra_keys, _ = generate_block_hash_extra_keys( - request, start_token_idx, end_token_idx, -1) - - # Compute the hash of the current block. - block_hash = hash_block_tokens(hash_fn, prev_block_hash_value, - block_tokens, extra_keys) - block_hashes.append(block_hash) + block_hash = new_block_hashes[i] # Update and added the full block to the cache. block_hash_with_group_id = BlockHashWithGroupId( @@ -184,9 +137,15 @@ class BlockPool: blk.block_id] = blk if new_hashes is not None: new_hashes.append(block_hash.hash_value) - prev_block_hash_value = block_hash.hash_value if self.enable_kv_cache_events: + if num_cached_blocks == 0: + parent_block_hash = None + else: + parent_block = blocks[num_cached_blocks - 1] + assert parent_block.block_hash is not None + parent_block_hash = parent_block.block_hash.get_hash_value() + self.kv_event_queue.append( BlockStored( block_hashes=new_hashes, diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py index f3a16d64e19fd..a0ea4d96015a2 100644 --- a/vllm/v1/core/kv_cache_coordinator.py +++ b/vllm/v1/core/kv_cache_coordinator.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod -from typing import Callable, Optional +from typing import Optional from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.kv_cache_utils import 
BlockHash, KVCacheBlock @@ -23,7 +23,6 @@ class KVCacheCoordinator(ABC): max_model_len: int, use_eagle: bool, enable_caching: bool, - caching_hash_fn: Callable, enable_kv_cache_events: bool, ): self.kv_cache_config = kv_cache_config @@ -40,7 +39,6 @@ class KVCacheCoordinator(ABC): kv_cache_spec=kv_cache_group.kv_cache_spec, block_pool=self.block_pool, kv_cache_group_id=i, - caching_hash_fn=caching_hash_fn, ) for i, kv_cache_group in enumerate( self.kv_cache_config.kv_cache_groups)) @@ -99,19 +97,17 @@ class KVCacheCoordinator(ABC): manager.allocate_new_blocks(request_id, num_tokens) for manager in self.single_type_managers) - def cache_blocks(self, request: Request, block_hashes: list[BlockHash], - num_computed_tokens: int) -> None: + def cache_blocks(self, request: Request, num_computed_tokens: int) -> None: """ Cache the blocks for the request. Args: request: The request. - block_hashes: The block hashes of the request. num_tokens: The total number of tokens that need to be cached (including tokens that are already cached). 
""" for manager in self.single_type_managers: - manager.cache_blocks(request, block_hashes, num_computed_tokens) + manager.cache_blocks(request, num_computed_tokens) def free(self, request_id: str) -> None: """ @@ -184,10 +180,9 @@ class KVCacheCoordinatorNoPrefixCache(KVCacheCoordinator): """ def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int, - use_eagle: bool, caching_hash_fn: Callable, - enable_kv_cache_events: bool): + use_eagle: bool, enable_kv_cache_events: bool): super().__init__(kv_cache_config, max_model_len, use_eagle, False, - caching_hash_fn, enable_kv_cache_events) + enable_kv_cache_events) self.num_single_type_manager = len(self.single_type_managers) def get_num_common_prefix_blocks(self, request_id: str, @@ -213,10 +208,9 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator): def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int, use_eagle: bool, enable_caching: bool, - caching_hash_fn: Callable, enable_kv_cache_events: bool): + enable_kv_cache_events: bool): super().__init__(kv_cache_config, max_model_len, use_eagle, - enable_caching, caching_hash_fn, - enable_kv_cache_events) + enable_caching, enable_kv_cache_events) self.kv_cache_spec = self.kv_cache_config.kv_cache_groups[ 0].kv_cache_spec self.block_size = self.kv_cache_spec.block_size @@ -250,10 +244,9 @@ class HybridKVCacheCoordinator(KVCacheCoordinator): def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int, use_eagle: bool, enable_caching: bool, - caching_hash_fn: Callable, enable_kv_cache_events: bool): + enable_kv_cache_events: bool): super().__init__(kv_cache_config, max_model_len, use_eagle, - enable_caching, caching_hash_fn, - enable_kv_cache_events) + enable_caching, enable_kv_cache_events) self.verify_and_split_kv_cache_groups() def verify_and_split_kv_cache_groups(self) -> None: @@ -386,17 +379,15 @@ class HybridKVCacheCoordinator(KVCacheCoordinator): def get_kv_cache_coordinator( kv_cache_config: KVCacheConfig, max_model_len: int, 
use_eagle: bool, - enable_caching: bool, caching_hash_fn: Callable, + enable_caching: bool, enable_kv_cache_events: bool) -> KVCacheCoordinator: if not enable_caching: return KVCacheCoordinatorNoPrefixCache(kv_cache_config, max_model_len, - use_eagle, caching_hash_fn, + use_eagle, enable_kv_cache_events) if len(kv_cache_config.kv_cache_groups) == 1: return UnitaryKVCacheCoordinator(kv_cache_config, max_model_len, use_eagle, enable_caching, - caching_hash_fn, enable_kv_cache_events) return HybridKVCacheCoordinator(kv_cache_config, max_model_len, use_eagle, - enable_caching, caching_hash_fn, - enable_kv_cache_events) + enable_caching, enable_kv_cache_events) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index ce333dbe61a19..bfaa7ab08f5cf 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -1,16 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections import defaultdict from dataclasses import dataclass from typing import Optional from vllm.distributed.kv_events import KVCacheEvent from vllm.logger import init_logger -from vllm.utils import sha256, sha256_cbor_64bit from vllm.v1.core.kv_cache_coordinator import get_kv_cache_coordinator -from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock, - hash_request_tokens, init_none_hash) +from vllm.v1.core.kv_cache_utils import KVCacheBlock from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request, RequestStatus @@ -71,23 +68,13 @@ class KVCacheManager: kv_cache_config: KVCacheConfig, max_model_len: int, enable_caching: bool = True, - caching_hash_algo: str = "builtin", use_eagle: bool = False, log_stats: bool = False, enable_kv_cache_events: bool = False, ) -> None: self.max_model_len = max_model_len - if len(kv_cache_config.kv_cache_groups) == 0: - # Attention free models don't have kv 
cache, - # thus don't need prefix caching. - enable_caching = False self.enable_caching = enable_caching - - self.caching_hash_fn = ( - sha256_cbor_64bit if caching_hash_algo == "sha256_cbor_64bit" else - sha256 if caching_hash_algo == "sha256" else hash) - init_none_hash(self.caching_hash_fn) self.use_eagle = use_eagle self.log_stats = log_stats # FIXME: make prefix cache stats conditional on log_stats @@ -107,19 +94,12 @@ class KVCacheManager: max_model_len=self.max_model_len, use_eagle=self.use_eagle, enable_caching=self.enable_caching, - caching_hash_fn=self.caching_hash_fn, enable_kv_cache_events=enable_kv_cache_events, ) self.num_kv_cache_groups = len(kv_cache_config.kv_cache_groups) self.block_pool = self.coordinator.block_pool self.kv_cache_config = kv_cache_config - # Mapping from request ID to kv block hashes. - # This is to avoid recomputing the block hashes for each call of - # `get_computed_blocks` or `allocate_slots`. - self.req_to_block_hashes: defaultdict[ - str, list[BlockHash]] = defaultdict(list) - @property def usage(self) -> float: """Get the KV cache usage. @@ -161,15 +141,6 @@ class KVCacheManager: and request.sampling_params.prompt_logprobs is not None)): return self.create_empty_block_list(), 0 - # The block hashes for the request may already be computed - # if the scheduler has tried to schedule the request before. - block_hashes = self.req_to_block_hashes[request.request_id] - if not block_hashes: - assert self.block_size is not None - block_hashes = hash_request_tokens(self.caching_hash_fn, - self.block_size, request) - self.req_to_block_hashes[request.request_id] = block_hashes - # NOTE: When all tokens hit the cache, we must recompute the last token # to obtain logits. Thus, set max_cache_hit_length to prompt_length - 1. # This can trigger recomputation of an entire block, rather than just @@ -178,7 +149,7 @@ class KVCacheManager: # could slightly improve performance in the future. 
max_cache_hit_length = request.num_tokens - 1 computed_blocks, num_new_computed_tokens = ( - self.coordinator.find_longest_cache_hit(block_hashes, + self.coordinator.find_longest_cache_hit(request.block_hashes, max_cache_hit_length)) if self.log_stats: @@ -296,11 +267,7 @@ class KVCacheManager: # at `request.num_tokens`, ensuring only "finalized" tokens are cached. num_tokens_to_cache = min(num_computed_tokens + num_new_tokens, request.num_tokens) - self.coordinator.cache_blocks( - request, - self.req_to_block_hashes[request.request_id], - num_tokens_to_cache, - ) + self.coordinator.cache_blocks(request, num_tokens_to_cache) return KVCacheBlocks(new_blocks) @@ -373,14 +340,6 @@ class KVCacheManager: return self.coordinator.get_num_common_prefix_blocks( request.request_id, num_running_requests) - def free_block_hashes(self, request: Request) -> None: - """Discard the block hashes for the request. - - NOTE: Unlike `free`, this method should be called only when the request - is finished, not when it is preempted. - """ - self.req_to_block_hashes.pop(request.request_id, None) - def take_events(self) -> list[KVCacheEvent]: """Take the KV cache events from the block pool. 
@@ -397,9 +356,7 @@ class KVCacheManager: def cache_blocks(self, request: Request, num_computed_tokens: int) -> None: """Cache the blocks for the request, if enabled.""" if self.enable_caching: - block_hashes = self.req_to_block_hashes[request.request_id] - self.coordinator.cache_blocks(request, block_hashes, - num_computed_tokens) + self.coordinator.cache_blocks(request, num_computed_tokens) def create_empty_block_list(self) -> KVCacheBlocks: """Creates a new KVCacheBlocks instance with no blocks.""" diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 626aa35a770c9..6a62c55fb2d5f 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -547,41 +547,61 @@ def hash_block_tokens( curr_block_token_ids_tuple, extra_keys) -def hash_request_tokens(hash_function: Any, block_size: int, - request: Request) -> list[BlockHash]: - """Computes hash values of a chain of blocks given a sequence of - token IDs. The hash value is used for prefix caching. - - Args: - block_size: The size of each block. - request: The request object. - - Returns: - The list of computed hash values. +def get_request_block_hasher( + block_size: int, + caching_hash_fn: Callable[[Any], + int]) -> Callable[[Request], list[BlockHash]]: """ - token_ids = request.all_token_ids + Returns a function which computes the list of un-computed block hashes + of a request. - req_need_extra_keys = need_extra_keys(request) - req_extra_keys = None - curr_mm_idx = 0 + Each request holds a list of its block hashes (request.block_hashes). + When a request is created, it calls the below function to compute + the hashes of all full blocks of the request's initial tokens. + The hashes are then stored in request.block_hashes. + Later, whenever new tokens are appended to the request, it calls + the below function again to compute any new full blocks of tokens. + The returned new hashes are appended to request.block_hashes. 
+ """ - ret = [] - parent_block_hash_value = None - # Only full blocks will be hashed - for start in range(0, len(token_ids) - block_size + 1, block_size): - end = start + block_size - block_token_ids = token_ids[start:end] + def request_block_hasher(request: Request) -> list[BlockHash]: + start_token_idx = len(request.block_hashes) * block_size + num_tokens = request.num_tokens + + curr_mm_idx = 0 + if start_token_idx > 0: + # Set curr_mm_idx = -1 to indicate the last mm input. + # Note that since we reach to this branch only when the block is + # completed with generated tokens, we only need to consider the + # last mm input. + curr_mm_idx = -1 + + prev_block_hash_value = request.block_hashes[-1].hash_value \ + if request.block_hashes else None + new_block_hashes: list[BlockHash] = [] + while True: + end_token_idx = start_token_idx + block_size + if end_token_idx > num_tokens: + # We only hash full blocks + break - if req_need_extra_keys: # MM and LoRA requests need extra keys for block-hash computation. 
- req_extra_keys, curr_mm_idx = generate_block_hash_extra_keys( - request, start, end, curr_mm_idx) + extra_keys, curr_mm_idx = generate_block_hash_extra_keys( + request, start_token_idx, end_token_idx, curr_mm_idx) - block_hash = hash_block_tokens(hash_function, parent_block_hash_value, - block_token_ids, req_extra_keys) - ret.append(block_hash) - parent_block_hash_value = block_hash.hash_value - return ret + # Compute the hash of the current block + block_tokens = request.all_token_ids[start_token_idx:end_token_idx] + block_hash = hash_block_tokens(caching_hash_fn, + prev_block_hash_value, block_tokens, + extra_keys) + + new_block_hashes.append(block_hash) + start_token_idx += block_size + prev_block_hash_value = block_hash.hash_value + + return new_block_hashes + + return request_block_hasher def max_memory_usage_bytes(vllm_config: VllmConfig, diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index dcb9f4dd36f52..9810234090453 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -155,7 +155,6 @@ class Scheduler(SchedulerInterface): kv_cache_config=kv_cache_config, max_model_len=self.max_model_len, enable_caching=self.cache_config.enable_prefix_caching, - caching_hash_algo=self.cache_config.prefix_caching_hash_algo, use_eagle=self.use_eagle, log_stats=self.log_stats, enable_kv_cache_events=self.enable_kv_cache_events, @@ -1036,7 +1035,6 @@ class Scheduler(SchedulerInterface): def _free_blocks(self, request: Request): assert request.is_finished() self.kv_cache_manager.free(request) - self.kv_cache_manager.free_block_hashes(request) del self.requests[request.request_id] def get_num_unfinished_requests(self) -> int: diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 8f310023a8cd3..82e0292522b9a 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -3,7 +3,6 @@ import itertools from abc 
import ABC, abstractmethod from collections import defaultdict -from typing import Callable from vllm.utils import cdiv from vllm.v1.core.block_pool import BlockPool @@ -25,7 +24,6 @@ class SingleTypeKVCacheManager(ABC): kv_cache_spec: KVCacheSpec, block_pool: BlockPool, kv_cache_group_id: int, - caching_hash_fn: Callable, ) -> None: """ Initializes the SingleTypeKVCacheManager. @@ -33,7 +31,6 @@ class SingleTypeKVCacheManager(ABC): kv_cache_spec: The kv_cache_spec for this manager. block_pool: The block pool. kv_cache_group_id: The id of the kv cache group of this manager. - caching_hash_fn: The caching hash function. """ self.block_size = kv_cache_spec.block_size @@ -52,7 +49,6 @@ class SingleTypeKVCacheManager(ABC): # data for reempted ones. self.num_cached_block: dict[str, int] = {} - self.caching_hash_fn = caching_hash_fn self.kv_cache_group_id = kv_cache_group_id self._null_block = block_pool.null_block @@ -130,14 +126,12 @@ class SingleTypeKVCacheManager(ABC): req_blocks.extend(new_blocks) return new_blocks - def cache_blocks(self, request: Request, block_hashes: list[BlockHash], - num_tokens: int) -> None: + def cache_blocks(self, request: Request, num_tokens: int) -> None: """ Cache the blocks for the request. Args: request: The request. - block_hashes: The block hashes of the request. num_tokens: The total number of tokens that need to be cached (including tokens that are already cached). 
""" @@ -147,12 +141,10 @@ class SingleTypeKVCacheManager(ABC): self.block_pool.cache_full_blocks( request=request, blocks=self.req_to_blocks[request.request_id], - block_hashes=block_hashes, num_cached_blocks=num_cached_blocks, num_full_blocks=num_full_blocks, block_size=self.block_size, kv_cache_group_id=self.kv_cache_group_id, - hash_fn=self.caching_hash_fn, ) self.num_cached_block[request.request_id] = num_full_blocks diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index ed426f8ff452b..1e52f93a581b3 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -25,9 +25,11 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.tasks import POOLING_TASKS, SupportedTask from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) -from vllm.utils import (decorate_logs, make_zmq_socket, +from vllm.utils import (decorate_logs, get_hash_fn_by_name, make_zmq_socket, resolve_obj_by_qualname, set_process_title) -from vllm.v1.core.kv_cache_utils import (get_kv_cache_config, +from vllm.v1.core.kv_cache_utils import (BlockHash, get_kv_cache_config, + get_request_block_hasher, + init_none_hash, unify_kv_cache_configs) from vllm.v1.core.sched.interface import SchedulerInterface from vllm.v1.core.sched.output import SchedulerOutput @@ -140,6 +142,19 @@ class EngineCore: self.batch_queue_size) self.batch_queue = queue.Queue(self.batch_queue_size) + self.request_block_hasher: Optional[Callable[[Request], + list[BlockHash]]] = None + if (self.vllm_config.cache_config.enable_prefix_caching + or self.scheduler.get_kv_connector() is not None): + + block_size = vllm_config.cache_config.block_size + caching_hash_fn = get_hash_fn_by_name( + vllm_config.cache_config.prefix_caching_hash_algo) + init_none_hash(caching_hash_fn) + + self.request_block_hasher = get_request_block_hasher( + block_size, caching_hash_fn) + def _initialize_kv_caches( self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]: start = time.time() @@ -417,7 
+432,8 @@ class EngineCore: request.mm_kwargs = self.mm_input_cache_server.get_and_update( request.mm_kwargs, request.mm_hashes) - req = Request.from_engine_core_request(request) + req = Request.from_engine_core_request(request, + self.request_block_hasher) if req.use_structured_output: # Note on thread safety: no race condition. # `grammar_init` is only invoked in input processing thread. For diff --git a/vllm/v1/request.py b/vllm/v1/request.py index d1f1c7f98755f..562925bde669e 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -3,7 +3,8 @@ import enum import time -from typing import TYPE_CHECKING, Any, Optional, Union +from functools import partial +from typing import TYPE_CHECKING, Any, Callable, Optional, Union from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.pooling_params import PoolingParams @@ -16,6 +17,7 @@ from vllm.v1.utils import ConstantList if TYPE_CHECKING: from vllm.lora.request import LoRARequest + from vllm.v1.core.kv_cache_utils import BlockHash class Request: @@ -36,6 +38,8 @@ class Request: structured_output_request: Optional["StructuredOutputRequest"] = None, cache_salt: Optional[str] = None, priority: int = 0, + block_hasher: Optional[Callable[["Request"], + list["BlockHash"]]] = None, ) -> None: self.request_id = request_id self.client_index = client_index @@ -108,8 +112,18 @@ class Request: # indicates that the output is corrupted self.num_nans_in_logits = 0 + self.block_hashes: list[BlockHash] = [] + self.get_hash_new_full_blocks: Optional[Callable[ + [], list[BlockHash]]] = None + if block_hasher is not None: + self.get_hash_new_full_blocks = partial(block_hasher, self) + self.block_hashes = self.get_hash_new_full_blocks() + @classmethod - def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": + def from_engine_core_request( + cls, request: EngineCoreRequest, + block_hasher: Optional[Callable[["Request"], list["BlockHash"]]] + ) -> "Request": if request.mm_kwargs is not 
None: assert is_list_of(request.mm_kwargs, MultiModalKwargsItem), ( "mm_kwargs was not updated in EngineCore.add_request") @@ -131,6 +145,7 @@ class Request: if request.sampling_params else None, cache_salt=request.cache_salt, priority=request.priority, + block_hasher=block_hasher, ) def append_output_token_ids( @@ -144,6 +159,9 @@ class Request: self._output_token_ids.extend(token_ids) self._all_token_ids.extend(token_ids) + if self.get_hash_new_full_blocks is not None: + self.block_hashes.extend(self.get_hash_new_full_blocks()) + @property def is_output_corrupted(self) -> bool: return self.num_nans_in_logits > 0 From 3e2f7985a2fc69288c952d950e68ded7f5ef530f Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Fri, 15 Aug 2025 16:54:10 -0700 Subject: [PATCH 021/361] Support multiple attention groups for KV sharing (#22672) Signed-off-by: Yong Hoon Shin --- tests/v1/test_kv_sharing.py | 189 ++++++++++++++++++++++++++++++++++++ vllm/v1/worker/utils.py | 38 +++++--- 2 files changed, 212 insertions(+), 15 deletions(-) create mode 100644 tests/v1/test_kv_sharing.py diff --git a/tests/v1/test_kv_sharing.py b/tests/v1/test_kv_sharing.py new file mode 100644 index 0000000000000..6b01b7d3e1d6c --- /dev/null +++ b/tests/v1/test_kv_sharing.py @@ -0,0 +1,189 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from unittest.mock import Mock + +import torch + +from vllm.v1.attention.backends.flash_attn import ( + FlashAttentionBackend, FlashAttentionMetadataBuilder) +from vllm.v1.attention.backends.flex_attention import ( + FlexAttentionBackend, FlexAttentionMetadataBuilder) +from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec +from vllm.v1.worker.utils import (AttentionGroup, + initialize_kv_cache_for_kv_sharing) + + +def new_kv_cache_spec(): + return FullAttentionSpec(16, 1, 1, torch.float32, False) + + +def 
test_initialize_kv_cache_for_kv_sharing_different_attn_groups(): + """ + Test initializing KV cache sharing with different attention groups. + Layers in the same KV cache group might be placed in different attn groups + if they have different attention backends. + """ + shared_kv_cache_layers = { + "model.layers.2": "model.layers.0", + "model.layers.3": "model.layers.1", + } + + # Layers 0 and 1 both belong in KV cache group 0 + # However, if they have have different attention backends, they will be + # placed in different attention groups for KV cache group 0 + kv_cache_groups = [ + KVCacheGroupSpec(["model.layers.0", "model.layers.1"], + new_kv_cache_spec()), + ] + + attn_groups = [ + # KV cache group 0 has two attention groups + [ + AttentionGroup( + backend=FlashAttentionBackend, + metadata_builder=Mock(spec=FlashAttentionMetadataBuilder), + layer_names=["model.layers.0"], + ), + AttentionGroup( + backend=FlexAttentionBackend, + metadata_builder=Mock(spec=FlexAttentionMetadataBuilder), + layer_names=["model.layers.1"], + ), + ], + ] + + # Only layers 0 and 1 will have KV caches allocated + kv_caches = { + "model.layers.0": torch.zeros(1, 2, 3), + "model.layers.1": torch.ones(1, 2, 3), + } + + initialize_kv_cache_for_kv_sharing( + shared_kv_cache_layers=shared_kv_cache_layers, + kv_cache_groups=kv_cache_groups, + kv_caches=kv_caches, + attn_groups=attn_groups, + ) + + # Check that the KV caches were shared correctly + assert kv_caches["model.layers.2"].data_ptr( + ) == kv_caches["model.layers.0"].data_ptr() + assert kv_caches["model.layers.3"].data_ptr( + ) == kv_caches["model.layers.1"].data_ptr() + + # Check that the layers were added to the correct KV cache group + assert len(kv_cache_groups) == 1 + assert kv_cache_groups[0].layer_names == [ + "model.layers.0", "model.layers.1", "model.layers.2", "model.layers.3" + ] + + # Check that the layers were added to the attention groups + assert len(attn_groups) == 1 and len(attn_groups[0]) == 2 + assert 
attn_groups[0][0].layer_names == [ + "model.layers.0", "model.layers.2" + ] + assert attn_groups[0][1].layer_names == [ + "model.layers.1", "model.layers.3" + ] + + +def test_initialize_kv_cache_for_kv_sharing_same_attn_groups(): + """ + Test case assuming that all layers in the same KV cache group have the same + attention backends. This is true for most models. + """ + shared_kv_cache_layers = { + "model.layers.2": "model.layers.0", + "model.layers.3": "model.layers.1", + } + + kv_cache_groups = [ + KVCacheGroupSpec(["model.layers.0", "model.layers.1"], + new_kv_cache_spec()), + ] + + attn_groups = [ + # KV cache group 0 has a single attention group + # as all layers have the same flash attention backend + [ + AttentionGroup( + backend=FlashAttentionBackend, + metadata_builder=Mock(spec=FlashAttentionMetadataBuilder), + layer_names=["model.layers.0", "model.layers.1"], + ), + ], + ] + + kv_caches = { + "model.layers.0": torch.zeros(1, 2, 3), + "model.layers.1": torch.ones(1, 2, 3), + } + + initialize_kv_cache_for_kv_sharing( + shared_kv_cache_layers=shared_kv_cache_layers, + kv_cache_groups=kv_cache_groups, + kv_caches=kv_caches, + attn_groups=attn_groups, + ) + + # Check that the KV caches were shared correctly + assert kv_caches["model.layers.2"].data_ptr( + ) == kv_caches["model.layers.0"].data_ptr() + assert kv_caches["model.layers.3"].data_ptr( + ) == kv_caches["model.layers.1"].data_ptr() + + # Check that the layers were added to the correct KV cache group + assert len(kv_cache_groups) == 1 + assert kv_cache_groups[0].layer_names == [ + "model.layers.0", "model.layers.1", "model.layers.2", "model.layers.3" + ] + + # Check that the layers were added to the attention groups + assert len(attn_groups) == 1 and len(attn_groups[0]) == 1 + assert attn_groups[0][0].layer_names == [ + "model.layers.0", "model.layers.1", "model.layers.2", "model.layers.3" + ] + + +def test_initialize_kv_cache_for_kv_sharing_no_attn_groups(): + """ + Test KV sharing set up when no 
attention groups are provided. + This is the case for the TPU model runner, which doesn't have + support for attention groups yet. + """ + shared_kv_cache_layers = { + "model.layers.2": "model.layers.0", + "model.layers.3": "model.layers.1", + } + + kv_cache_groups = [ + KVCacheGroupSpec(["model.layers.0"], new_kv_cache_spec()), + KVCacheGroupSpec(["model.layers.1"], new_kv_cache_spec()), + ] + + kv_caches = { + "model.layers.0": torch.zeros(1, 2, 3), + "model.layers.1": torch.ones(1, 2, 3), + } + + initialize_kv_cache_for_kv_sharing( + shared_kv_cache_layers=shared_kv_cache_layers, + kv_cache_groups=kv_cache_groups, + kv_caches=kv_caches, + ) + + # Check that the KV caches were shared correctly + assert kv_caches["model.layers.2"].data_ptr( + ) == kv_caches["model.layers.0"].data_ptr() + assert kv_caches["model.layers.3"].data_ptr( + ) == kv_caches["model.layers.1"].data_ptr() + + # Check that the layers were added to the correct KV cache group + assert len(kv_cache_groups) == 2 + assert kv_cache_groups[0].layer_names == [ + "model.layers.0", "model.layers.2" + ] + assert kv_cache_groups[1].layer_names == [ + "model.layers.1", "model.layers.3" + ] diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index e7079235d6510..b138f11af1eb1 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -225,26 +225,34 @@ def initialize_kv_cache_for_kv_sharing( Note that layers in shared_kv_cache_layers.keys() are not originally included as it only contains layers which have its own KV cache allocation. + attn_groups: Optional list of attention groups. Layers in the same KV + cache group may be placed in different attention groups if they + have different attention backends. Currently only provided by + GPU model runner. """ - # Record index of KV cache group for each layer that allocates a KV cache. 
- layer_to_kv_cache_group_idx: dict[str, int] = {} - for i, kv_cache_group in enumerate(kv_cache_groups): - for layer_name in kv_cache_group.layer_names: - layer_to_kv_cache_group_idx[layer_name] = i + # mapping from layer name to tuple of (kv_cache_group_idx, attn_group_idx) + layer_to_attn_group_idx: dict[str, tuple[int, int]] = {} + if attn_groups: + for kv_cache_group_idx, kv_attn_groups in enumerate(attn_groups): + for attn_group_idx, attn_group in enumerate(kv_attn_groups): + for layer_name in attn_group.layer_names: + layer_to_attn_group_idx[layer_name] = (kv_cache_group_idx, + attn_group_idx) + else: + for kv_cache_group_idx, kv_cache_group in enumerate(kv_cache_groups): + for layer_name in kv_cache_group.layer_names: + # attn group idx default to 0 if not provided + layer_to_attn_group_idx[layer_name] = (kv_cache_group_idx, 0) for layer_name, target_layer_name in shared_kv_cache_layers.items(): kv_caches[layer_name] = kv_caches[target_layer_name] - group_idx = layer_to_kv_cache_group_idx[target_layer_name] - kv_cache_groups[group_idx].layer_names.append(layer_name) + kv_cache_group_idx = layer_to_attn_group_idx[target_layer_name][0] + kv_cache_groups[kv_cache_group_idx].layer_names.append(layer_name) - if attn_groups is not None: - assert len(attn_groups[group_idx]) == 1, ( - "Only one attention group per KV cache group is supported " - "for KV-cache sharing for now.") - # TODO(lucas): I think in the future the layers that re-use a - # KV cache will be in a different attention group so we can - # remove this code from here. 
- attn_groups[group_idx][0].layer_names.append(layer_name) + if attn_groups: + attn_group_idx = layer_to_attn_group_idx[target_layer_name][1] + attn_groups[kv_cache_group_idx][attn_group_idx].layer_names.append( + layer_name) def bind_kv_cache( From 236b864e4f74c4018e1111e0d5b787d54b37c694 Mon Sep 17 00:00:00 2001 From: Yichen Yan Date: Sat, 16 Aug 2025 07:56:17 +0800 Subject: [PATCH 022/361] [BugFix] Make `run_once` thread-safe (#22978) Signed-off-by: Signed-off-by: Yichen Yan --- vllm/utils/__init__.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 72857ee2abc77..40f41893abb6a 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1640,15 +1640,19 @@ def weak_bind(bound_method: Callable[..., Any], ) -> Callable[..., None]: return weak_bound -# From: https://stackoverflow.com/a/4104188/2749989 def run_once(f: Callable[P, None]) -> Callable[P, None]: def wrapper(*args: P.args, **kwargs: P.kwargs) -> None: - if not wrapper.has_run: # type: ignore[attr-defined] - wrapper.has_run = True # type: ignore[attr-defined] - return f(*args, **kwargs) + if wrapper.has_run: # type: ignore[attr-defined] + return + + with wrapper.lock: # type: ignore[attr-defined] + if not wrapper.has_run: # type: ignore[attr-defined] + wrapper.has_run = True # type: ignore[attr-defined] + return f(*args, **kwargs) wrapper.has_run = False # type: ignore[attr-defined] + wrapper.lock = threading.Lock() # type: ignore[attr-defined] return wrapper From ad0297d1139f55cbd602652afd54276a8ae217ce Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 15 Aug 2025 17:00:36 -0700 Subject: [PATCH 023/361] [Misc] Support passing multiple request ids at once to `AsyncLLM.abort()` (#22944) Signed-off-by: Nick Hill --- tests/v1/engine/test_async_llm.py | 77 ++++++++++++++++++++++++++- vllm/engine/async_llm_engine.py | 5 +- vllm/engine/multiprocessing/client.py | 10 ++-- vllm/engine/protocol.py | 7 +-- 
vllm/utils/__init__.py | 5 ++ vllm/v1/engine/async_llm.py | 15 +++--- 6 files changed, 105 insertions(+), 14 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 484640233f522..df04a14af70ce 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -212,6 +212,79 @@ async def test_abort( assert not engine.output_processor.has_unfinished_requests() +@pytest.mark.parametrize( + "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) +@pytest.mark.asyncio +async def test_multi_abort( + monkeypatch: pytest.MonkeyPatch, + output_kind: RequestOutputKind, +): + + with monkeypatch.context() as m, ExitStack() as after: + m.setenv("VLLM_USE_V1", "1") + + with set_default_torch_num_threads(1): + engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) + after.callback(engine.shutdown) + + NUM_REQUESTS = 50 + NUM_EXPECTED_TOKENS = 100 + NUM_EXPECTED_TOKENS_LONG = 50000 + REQUEST_IDS_TO_ABORT = [5, 10, 15, 20, 25] + PARALLEL_SAMPLE_REQ_IDS = [5, 15, 30, 35] + + request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)] + + # Create concurrent requests. 
+ tasks: list[asyncio.Task] = [] + for idx, request_id in enumerate(request_ids): + max_tokens = (NUM_EXPECTED_TOKENS_LONG if + (idx + in REQUEST_IDS_TO_ABORT) else NUM_EXPECTED_TOKENS) + n = 3 if idx in PARALLEL_SAMPLE_REQ_IDS else 1 + tasks.append( + asyncio.create_task( + generate(engine, request_id, TEXT_PROMPT, output_kind, + max_tokens, n))) + + # Let requests start + await asyncio.sleep(0.5) + + # Use multi-abort to abort multiple requests at once + abort_request_ids = [request_ids[i] for i in REQUEST_IDS_TO_ABORT] + await engine.abort(abort_request_ids) + + # Wait for all tasks to complete + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Verify results + for idx, result in enumerate(results): + if idx in REQUEST_IDS_TO_ABORT: + # Aborted requests should return partial results + assert isinstance( + result, tuple + ), f"Request {idx} should have completed with partial results" + num_generated_tokens, request_id = result + # Should have generated some tokens before abort + assert num_generated_tokens > 0, ( + f"Aborted request " + f"{request_id} should have generated some tokens") + else: + # Non-aborted requests should complete normally + assert isinstance( + result, + tuple), f"Request {idx} should have completed successfully" + num_generated_tokens, request_id = result + n = 3 if idx in PARALLEL_SAMPLE_REQ_IDS else 1 + expected_tokens = NUM_EXPECTED_TOKENS * n + assert num_generated_tokens == expected_tokens, ( + f"{request_id} generated {num_generated_tokens} but " + f"expected {expected_tokens}") + + # Make sure all aborted requests were cleaned up + assert not engine.output_processor.has_unfinished_requests() + + @pytest.mark.parametrize("n", [1, 3]) @pytest.mark.parametrize( "engine_args,prompt", @@ -460,7 +533,9 @@ async def test_abort_final_output( token_count = sum( len(output.outputs[0].token_ids) for output in outputs) assert token_count > 0 - assert len(final_output.outputs[0].token_ids) == 0 + # This would ordinarily be 0, 
but could end up > 0 if the + # final abort is coalesced with another chunk in the output queue. + assert len(final_output.outputs[0].token_ids) >= 0 else: # For FINAL_ONLY, we should only get the final output assert len(outputs) == 0 diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 73726eeab5fc7..84ad2299b0655 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -998,7 +998,7 @@ class AsyncLLMEngine(EngineClient): await self.abort(request_id) raise - async def abort(self, request_id: str) -> None: + async def abort(self, request_id: Union[str, Iterable[str]]) -> None: """Abort a request. Abort a submitted request. If the request is finished or not found, @@ -1007,6 +1007,9 @@ class AsyncLLMEngine(EngineClient): Args: request_id: The unique id of the request. """ + if not isinstance(request_id, str): + raise RuntimeError("Only single-request abort supported in" + " deprecated V0") if not self.is_running: raise AsyncEngineDeadError( "Background loop is not running. 
If it was running, " diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index f69f72edf6a52..eca29af50055f 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -5,8 +5,8 @@ import asyncio import copy import pickle from contextlib import contextmanager, suppress -from typing import (Any, AsyncGenerator, Dict, Iterator, List, Mapping, - Optional, Union, cast) +from typing import (Any, AsyncGenerator, Dict, Iterable, Iterator, List, + Mapping, Optional, Union, cast) import cloudpickle import psutil @@ -404,9 +404,13 @@ class MQLLMEngineClient(EngineClient): error_message="Unable to start RPC Server", socket=socket) - async def abort(self, request_id: str): + async def abort(self, request_id: Union[str, Iterable[str]]): """Send an ABORT_REQUEST signal to the RPC Server""" + if not isinstance(request_id, str): + raise RuntimeError("Only single-request abort supported in" + " deprecated V0") + with suppress(MQClientClosedError): await self._send_one_way_rpc_request( request=RPCAbortRequest(request_id), socket=self.input_socket) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 671e9648a3d0c..c610fb5eae60c 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -3,7 +3,7 @@ import asyncio from abc import ABC, abstractmethod -from typing import AsyncGenerator, Mapping, Optional +from typing import AsyncGenerator, Iterable, Mapping, Optional, Union from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function from vllm.config import DecodingConfig, ModelConfig, VllmConfig @@ -229,11 +229,12 @@ class EngineClient(ABC): ... @abstractmethod - async def abort(self, request_id: str) -> None: + async def abort(self, request_id: Union[str, Iterable[str]]) -> None: """Abort a request. Args: - request_id: The unique id of the request. + request_id: The unique id of the request, + or an iterable of such ids. """ ... 
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 40f41893abb6a..64f7426bd65d3 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1315,6 +1315,11 @@ def common_broadcastable_dtype(dtypes: Collection[torch.dtype]): ) +def as_list(maybe_list: Iterable[T]) -> list[T]: + """Convert iterable to list, unless it's already a list.""" + return maybe_list if isinstance(maybe_list, list) else list(maybe_list) + + # `collections` helpers def is_list_of( value: object, diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index edc2e235c3c3f..664fec31a4da5 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import time -from collections.abc import AsyncGenerator, Mapping +from collections.abc import AsyncGenerator, Iterable, Mapping from copy import copy from typing import Any, Optional, Union @@ -27,7 +27,8 @@ from vllm.transformers_utils.config import ( from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import Device, cancel_task_threadsafe, cdiv, deprecate_kwargs +from vllm.utils import (Device, as_list, cancel_task_threadsafe, cdiv, + deprecate_kwargs) from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError @@ -431,14 +432,16 @@ class AsyncLLM(EngineClient): self.output_handler = asyncio.create_task(output_handler()) - async def abort(self, request_id: str) -> None: + async def abort(self, request_id: Union[str, Iterable[str]]) -> None: """Abort RequestId in OutputProcessor and EngineCore.""" - request_ids = self.output_processor.abort_requests((request_id, )) - await self.engine_core.abort_requests_async(request_ids) + request_ids = 
(request_id, ) if isinstance( + request_id, str) else as_list(request_id) + all_request_ids = self.output_processor.abort_requests(request_ids) + await self.engine_core.abort_requests_async(all_request_ids) if self.log_requests: - logger.info("Aborted request %s.", request_id) + logger.info("Aborted request(s) %s.", ",".join(request_ids)) async def encode( self, From 070da660c1bf9e7a7be8b9efeff4b06f91c7342f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Sat, 16 Aug 2025 02:14:08 +0200 Subject: [PATCH 024/361] [Kernel] Simplify `get_kv_cache_layout` and cache `use_trtllm_attention` env-dependent bit (#22735) Signed-off-by: NickLucche --- vllm/utils/flashinfer.py | 46 +++++++++++++++++++---------- vllm/v1/attention/backends/utils.py | 18 ++++++----- 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 0d7d4b694f076..2e31b7bad7476 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -148,6 +148,31 @@ def has_nvidia_artifactory() -> bool: return False +@functools.cache +def supports_trtllm_attention() -> tuple[bool, Optional[str]]: + """Cache result which only depends on the environment""" + # This is a lambda, call it once + env_value = envs.VLLM_USE_TRTLLM_ATTENTION + + # Requires SM100 and NVIDIA artifactory to be accessible to download cubins + if not (current_platform.is_device_capability(100) + and has_nvidia_artifactory()): + return False, env_value + + if env_value is not None: + logger.info_once("VLLM_USE_TRTLLM_ATTENTION is set to %s", env_value) + # Environment variable is set - respect it + # Making the conditional check for zero because + # the path is automatically enabled if the batch size condition + # is satisfied. 
+ use_trtllm = (env_value == "1") + if use_trtllm: + logger.info_once("Using TRTLLM attention.") + return use_trtllm, env_value + + return True, None + + def use_trtllm_attention( num_tokens: int, max_seq_len: int, @@ -157,9 +182,8 @@ def use_trtllm_attention( attn_head_size: Optional[int], has_sinks: bool = False, ) -> bool: - # Requires SM100 and NVIDIA artifactory to be accessible to download cubins - if not (current_platform.is_device_capability(100) - and has_nvidia_artifactory()): + use_trtllm, env_value = supports_trtllm_attention() + if not use_trtllm: return False # Check if the dimensions are supported by TRTLLM decode attention @@ -174,18 +198,7 @@ def use_trtllm_attention( "Using TRTLLM attention (required for attention sinks).") return True - env_value = envs.VLLM_USE_TRTLLM_ATTENTION - if env_value is not None: - logger.info_once("VLLM_USE_TRTLLM_ATTENTION is set to %s", env_value) - # Environment variable is set - respect it - # Making the conditional check for zero because - # the path is automatically enabled if the batch size condition - # is satisfied. - use_trtllm = (env_value == "1") - if use_trtllm: - logger.info_once("Using TRTLLM attention.") - return use_trtllm - else: + if env_value is None: # Environment variable not set - use auto-detection use_trtllm = (num_tokens <= 256 and max_seq_len < 131072 and kv_cache_dtype == "auto") @@ -193,6 +206,9 @@ def use_trtllm_attention( logger.warning_once("Using TRTLLM attention (auto-detected).") return use_trtllm + # Environment variable is set to 1 - respect it + return True + if has_flashinfer(): diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 1c7d087989649..5e6bc331835b6 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -248,19 +248,23 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]): @functools.lru_cache def get_kv_cache_layout(): + # Format specified by the code. 
global _KV_CACHE_LAYOUT_OVERRIDE - # Override with format specified by the user. + + if _KV_CACHE_LAYOUT_OVERRIDE is not None: + cache_layout = _KV_CACHE_LAYOUT_OVERRIDE + logger.info_once("`_KV_CACHE_LAYOUT_OVERRIDE` variable detected. " \ + "Setting KV cache layout to %s.", cache_layout) + return cache_layout + + # Format specified by the user. cache_layout = envs.VLLM_KV_CACHE_LAYOUT + # When neither the user nor the override specified a layout, get default if cache_layout is None: - if envs.VLLM_USE_TRTLLM_ATTENTION: - cache_layout = "HND" - else: - cache_layout = get_kv_connector_cache_layout() + cache_layout = get_kv_connector_cache_layout() else: logger.info_once("`VLLM_KV_CACHE_LAYOUT` environment variable " \ "detected. Setting KV cache layout to %s.", cache_layout) - if _KV_CACHE_LAYOUT_OVERRIDE is not None: - cache_layout = _KV_CACHE_LAYOUT_OVERRIDE return cache_layout From fbd88728b3aa7add999529e2c3f1b6b0aa8e428d Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Fri, 15 Aug 2025 21:25:06 -0400 Subject: [PATCH 025/361] [Bugfix] Fix DeepSeek MTP (#22934) Signed-off-by: Benjamin Chislett --- vllm/model_executor/models/deepseek_mtp.py | 13 +++++++------ vllm/model_executor/models/glm4_moe_mtp.py | 7 +++---- vllm/model_executor/models/mimo_mtp.py | 7 +++---- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index 2e026d582a6de..0ad001be71c19 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -158,14 +158,13 @@ class DeepSeekMTP(nn.Module, SupportsPP): self, input_ids: torch.Tensor, positions: torch.Tensor, - previous_hidden_states: torch.Tensor, + hidden_states: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, spec_step_idx: int = 0, ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, - previous_hidden_states, 
inputs_embeds, - spec_step_idx) + hidden_states = self.model(input_ids, positions, hidden_states, + inputs_embeds, spec_step_idx) return hidden_states def compute_logits( @@ -213,13 +212,15 @@ class DeepSeekMTP(nn.Module, SupportsPP): # for mlp.experts[0].gate_gate_up_proj, which breaks load. if (("mlp.experts." in name) and name not in params_dict): continue - name = name.replace(weight_name, param_name) + name_mapped = name.replace(weight_name, param_name) # QKV fusion is optional, fall back to normal # weight loading if it's not enabled if ((param_name == "fused_qkv_a_proj") - and name not in params_dict): + and name_mapped not in params_dict): continue + else: + name = name_mapped # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py index 0624640054d16..322c5619c1783 100644 --- a/vllm/model_executor/models/glm4_moe_mtp.py +++ b/vllm/model_executor/models/glm4_moe_mtp.py @@ -180,14 +180,13 @@ class Glm4MoeMTP(nn.Module, SupportsPP): self, input_ids: torch.Tensor, positions: torch.Tensor, - previous_hidden_states: torch.Tensor, + hidden_states: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, spec_step_idx: int = 0, ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, - previous_hidden_states, inputs_embeds, - spec_step_idx) + hidden_states = self.model(input_ids, positions, hidden_states, + inputs_embeds, spec_step_idx) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py index 19afc5be3fb87..5a2079bf5121a 100644 --- a/vllm/model_executor/models/mimo_mtp.py +++ b/vllm/model_executor/models/mimo_mtp.py @@ -164,15 +164,14 @@ class MiMoMTP(nn.Module): self, input_ids: torch.Tensor, positions: torch.Tensor, - previous_hidden_states: torch.Tensor, + 
hidden_states: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, spec_step_idx: int = 0, ) -> torch.Tensor: assert spec_step_idx == 0, "mimo_mtp only support predict one token now" - hidden_states = self.model(input_ids, positions, - previous_hidden_states, inputs_embeds, - spec_step_idx) + hidden_states = self.model(input_ids, positions, hidden_states, + inputs_embeds, spec_step_idx) return hidden_states def compute_logits( From f6b5040590e2ca986e6221d98b736a71896eaa53 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 15 Aug 2025 19:06:30 -0700 Subject: [PATCH 026/361] [Frontend] Avoid list copies in `serving_chat.py` (#22947) Signed-off-by: Nick Hill --- vllm/entrypoints/openai/serving_chat.py | 29 +++++++++++++------------ vllm/reasoning/abs_reasoning_parsers.py | 2 +- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index b4231c6d10c4e..12349234c320f 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -50,6 +50,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.transformers_utils.tokenizers import (maybe_serialize_tool_calls, truncate_tool_call_ids, validate_request_params) +from vllm.utils import as_list logger = init_logger(__name__) @@ -670,10 +671,10 @@ class OpenAIServingChat(OpenAIServing): # avoid the None + list error. if previous_token_ids: - current_token_ids = previous_token_ids + list( + current_token_ids = previous_token_ids + as_list( output.token_ids) else: - current_token_ids = list(output.token_ids) + current_token_ids = as_list(output.token_ids) if self.use_harmony: if is_final: @@ -703,11 +704,10 @@ class OpenAIServingChat(OpenAIServing): # set reasoning status to end. # Only keep 'content', remove 'reasoning_content'. 
if reasoning_parser.is_reasoning_end( - list(output.token_ids)) or \ - (res.prompt_token_ids and - reasoning_parser.is_reasoning_end( - list(res.prompt_token_ids) - )): + as_list(output.token_ids)) or ( + res.prompt_token_ids + and reasoning_parser.is_reasoning_end( + res.prompt_token_ids)): reasoning_end_arr[i] = True if delta_message and delta_message.content: # This need to be added to next `delta_text` @@ -771,6 +771,7 @@ class OpenAIServingChat(OpenAIServing): assert reasoning_parser is not None assert added_content_delta_arr is not None assert reasoning_end_arr is not None + output_token_ids = as_list(output.token_ids) if not reasoning_end_arr[i]: delta_message = ( reasoning_parser. @@ -780,7 +781,7 @@ class OpenAIServingChat(OpenAIServing): delta_text, previous_token_ids, current_token_ids, - output.token_ids, + output_token_ids, )) # When encountering think end id in prompt_token_ids # i.e {"enable_thinking": False}, @@ -789,9 +790,9 @@ class OpenAIServingChat(OpenAIServing): # to 'reasoning_content'. if res.prompt_token_ids and \ reasoning_parser.is_reasoning_end( - list(res.prompt_token_ids)): + res.prompt_token_ids): reasoning_end_arr[i] = True - current_token_ids = list(output.token_ids) + current_token_ids = output_token_ids if delta_message and delta_message.content: current_text = delta_message.content delta_message.content = None @@ -802,11 +803,11 @@ class OpenAIServingChat(OpenAIServing): # Remove the text and token ids related # to 'reasoning_content'. 
if reasoning_parser.is_reasoning_end( - list(output.token_ids)): + output_token_ids): reasoning_end_arr[i] = True current_token_ids = \ reasoning_parser.extract_content_ids( - list(output.token_ids)) + output_token_ids) if delta_message and delta_message.content: current_text = delta_message.content delta_message.content = None @@ -815,7 +816,7 @@ class OpenAIServingChat(OpenAIServing): # handle tool calls only after reasoning is done, else: - delta_token_ids = list(output.token_ids) + delta_token_ids = output_token_ids # First time to tool call, # add the remaining text and token ids # to delta from previous @@ -899,7 +900,7 @@ class OpenAIServingChat(OpenAIServing): self.request_logger.log_outputs( request_id=request_id, outputs=delta_content, - output_token_ids=list(output.token_ids), + output_token_ids=as_list(output.token_ids), finish_reason=output.finish_reason, is_streaming=True, delta=True, diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index 4f4522d726e89..df9e84163f16c 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -44,7 +44,7 @@ class ReasoningParser: return self.model_tokenizer.get_vocab() @abstractmethod - def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: + def is_reasoning_end(self, input_ids: list[int]) -> bool: """ Check if the reasoning content ends in the input_ids. 
From e4e37ded563912d6d413cfc23cc1db098b3d2f09 Mon Sep 17 00:00:00 2001 From: Calvin Chen Date: Sat, 16 Aug 2025 10:28:10 +0800 Subject: [PATCH 027/361] [V1] support min_tokens for detokener (#22014) Signed-off-by: calvin chen Co-authored-by: Nick Hill --- tests/detokenizer/test_min_tokens.py | 50 ++++++++++++++++++++++++++++ vllm/v1/engine/detokenizer.py | 11 ++++-- 2 files changed, 58 insertions(+), 3 deletions(-) create mode 100644 tests/detokenizer/test_min_tokens.py diff --git a/tests/detokenizer/test_min_tokens.py b/tests/detokenizer/test_min_tokens.py new file mode 100644 index 0000000000000..887e83342536e --- /dev/null +++ b/tests/detokenizer/test_min_tokens.py @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +from transformers import AutoTokenizer + +from vllm import SamplingParams +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.detokenizer import FastIncrementalDetokenizer + +PROMPT = "Hello, my name is Lee, and I'm a student in the " + \ + "college of engineering" + + +@pytest.mark.parametrize("min_tokens,stop,truth", [ + (0, None, " is Lee, and I'm a student in the college of engineering"), + (0, "e", " is L"), + (5, "e", " is Lee, and I'm a stud"), +]) +def test_min_tokens_with_stop(min_tokens: int, stop: str, truth: str): + """Test for a specific min_tokens and stop. 
+ + See https://github.com/vllm-project/vllm/pull/22014 + """ + tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m") + all_prompt_ids = tokenizer(PROMPT, add_special_tokens=False).input_ids + + # The prompt is "Hello, my name is" + prompt_token_ids = all_prompt_ids[:4] + params = SamplingParams( + stop=stop, + min_tokens=min_tokens, + ) + request = EngineCoreRequest("", + prompt_token_ids, + None, + None, + None, + params, + None, + None, + 0.0, + None, + cache_salt=None, + data_parallel_rank=None) + + detokenizer = FastIncrementalDetokenizer(tokenizer, request) + + detokenizer.update(all_prompt_ids[4:], False) + assert detokenizer.output_text == truth diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 2f5504ea14b41..04ad51aae0a8c 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -74,6 +74,7 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC): params = request.sampling_params assert params is not None self.stop = stop = params.stop + self.min_tokens = params.min_tokens self.include_stop_str_in_output = params.include_stop_str_in_output # Number of chars to hold back when stop strings are to be excluded @@ -111,10 +112,14 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC): # 1) Detokenize the new token ids incrementally. # TODO(woosuk): This method becomes very inefficient when the number of # new_token_ids is more than 1. We need to optimize this. 
- offset_before = len(self.output_text) + stop_check_offset = len(self.output_text) for new_token_id in new_token_ids: self.token_ids.append(new_token_id) self.output_text += self.decode_next(new_token_id) + # Support min_tokens, see https://github.com/vllm-project/vllm/pull/22014 + if self.min_tokens and len( + self.output_token_ids) <= self.min_tokens: + stop_check_offset = len(self.output_text) if stop_terminated: if skipped_stop_token_id is not None: @@ -125,10 +130,10 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC): # 2) Evaluate stop strings. stop_string = None - if self.stop: + if self.stop and len(self.output_token_ids) > self.min_tokens: stop = StopChecker.check_stop_strings( output_text=self.output_text, - new_char_count=len(self.output_text) - offset_before, + new_char_count=len(self.output_text) - stop_check_offset, stop=self.stop, include_in_output=self.include_stop_str_in_output, ) From 1f83e7d849ccb03990bb896f49df20343a2828b9 Mon Sep 17 00:00:00 2001 From: Grace Ho <146482179+gracehonv@users.noreply.github.com> Date: Fri, 15 Aug 2025 19:52:51 -0700 Subject: [PATCH 028/361] [misc] nsys profile output kernel classifier and visualizer (#22971) Signed-off-by: Grace Ho --- tools/profiler/nsys_profile_tools/README.md | 175 +++++++ .../nsys_profile_tools/gputrc2graph.py | 426 ++++++++++++++++++ .../nsys_profile_tools/images/csv1.png | Bin 0 -> 148416 bytes .../nsys_profile_tools/images/html.png | Bin 0 -> 72163 bytes .../nsys_profile_tools/images/html_tbl.png | Bin 0 -> 36615 bytes 5 files changed, 601 insertions(+) create mode 100644 tools/profiler/nsys_profile_tools/README.md create mode 100755 tools/profiler/nsys_profile_tools/gputrc2graph.py create mode 100644 tools/profiler/nsys_profile_tools/images/csv1.png create mode 100644 tools/profiler/nsys_profile_tools/images/html.png create mode 100644 tools/profiler/nsys_profile_tools/images/html_tbl.png diff --git a/tools/profiler/nsys_profile_tools/README.md 
b/tools/profiler/nsys_profile_tools/README.md new file mode 100644 index 0000000000000..75ae0811cc543 --- /dev/null +++ b/tools/profiler/nsys_profile_tools/README.md @@ -0,0 +1,175 @@ +# gputrc2graph.py + +This script processes NVIDIA Nsight Systems (`nsys`) GPU trace files +(`.nsys-rep`) with -t cuda tracing enabled, and generates kernel-level +summaries and visualizations of GPU and non-GPU time. It is useful for +profiling and analyzing nsys profile output. + +## Usage + +### Command-line Arguments + +- `--in_file` + **(required)** + List of input files and their metadata. Each entry should be in the format: + `<nsys-rep>,<engine>,<model>,<elapsed_nonprofiled_sec>` + - `nsys-rep`: Path to the `.nsys-rep` file. + - `engine`: Engine name (e.g., `vllm`). + - `model`: Model name (e.g., `llama`, `gpt-oss`, `ds`). + - `elapsed_nonprofiled_sec`: Wall-clock runtime (in seconds) without + profiling. Specify `0` to use the elapsed time from the nsys-rep file + (this may inflate non-GPU time if actual runtime without profiling is + less). Multiple entries can be provided, separated by spaces. + +- `--out_dir` + Output directory for the generated CSV and HTML files. + If not specified, results are saved in the current directory. + +- `--title` + Title for the HTML chart/visualization. + +- `--nsys_cmd` + Path to the `nsys` command. + Default: `nsys` (assumes it is in your PATH). + Use this if `nsys` is not in your system PATH. + +## Notes + +- Make sure you have pandas, plotly, and regex installed. +- Make sure nsys is installed, and specify the path to the `nsys` command with + `--nsys_cmd` if it is not in your PATH. +- For more details on available engines and models, see the help string in + the script or run: + +```bash +python3 gputrc2graph.py --help +``` + +## Example 1: analyze a single profile + +To analyze the GPU cycles for, say, the gpt-oss model with the vLLM engine: + +1. Run the following command to collect an nsys profile for a vllm serve config.
+ + ```bash + nsys profile -t cuda -o run1 -f true --trace-fork-before-exec=true \ + --cuda-graph-trace=node --delay <DELAY> --duration <DURATION> \ + vllm serve openai/gpt-oss-120b ... + ``` + + where: + + - DELAY: how many seconds to delay nsys from collecting profiles, needed so + that profiles aren't captured till vllm server has come up and load + generation starts. + - DURATION: how many seconds for nsys profile to run before generating the + profile. This should be > the duration of the run. + +2. Run again, this time without collecting the profile, and get the total run + time in seconds. This value will be used by the script to calculate the + CPU(non-GPU) seconds for the analysis. + +3. Say the run elapsed time is 306 seconds, from step #2. Run script to + analyze: + + ```bash + python3 gputrc2graph.py \ + --in_file run1.nsys-rep,vllm,gpt-oss,306 \ + --title "vLLM-gpt-oss profile" + ``` + +The command will produce 2 files for analysis: + +- result.html: this categorizes kernel names into different categories in a + stacked bar chart. +- result.csv: shows how the kernel names are mapped to the different + categories. + +### HTML visualization with result.html + +The html file shows the number of elapsed seconds due to different GPU +Substages or categories, with moe_gemm (Mixture of Experts GEMM) +kernels as the biggest category, at 148 seconds, followed by "attn" or attention +kernels. This lets the user prioritize the kernels to focus on for performance +optimizations. + +![Example GPU Trace Visualization](images/html.png) + +There's also an appended data table underneath the bar chart for copying out to other post-processing tools. + +![Example GPU Trace Table](images/html_tbl.png) + +### Kernel to category mapping with result.csv + +Suppose the user would like to focus on improving triton kernels. It's not the +biggest consumer of cycles at 9.74 sec but perhaps it hasn't been optimized.
+The next step is to use the result.csv to dive into which kernels +make up the triton kernel GPU cycles. The following image shows the +triton_poi_fused__to_copy_add_addmm_cat_.. kernel to be the biggest +contributor to GPU cycles. + +![Example GPU Trace csv](images/csv1.png) + +## Example 2: analyze multiple profiles + +Suppose the user has multiple nsys trace files, captured for different models, +say llama and gpt-oss in this case, and wishes to compare their GPU/non-GPU +time; something like the following command can be used. + +```bash +python3 gputrc2graph.py \ +--in_file run1.nsys-rep,vllm,llama,100 run2.nsys-rep,vllm,gpt-oss,102 \ +--out_dir results \ +--title "Comparison of vLLM Models" +``` + +The analysis process is similar to example 1 but now there will be multiple +stacked bar charts that can be compared. The categories for the different +kernels will remain the same, so that it's easy to compare the GPU cycles for +the same categories. + +Once a category is shown to have more cycles for one configuration than +another, the next step would be to use the csv file to see what kernels are +mapped into that category, and which kernels are taking the largest amount of +time which would cause a difference for the overall category.
+ +## Example 3: add new classification for a new model + +Suppose there's a new model ABC that is available for engine DEF, and say there +are 4 kernels to be classified into "gemm" and "attn", where the gemm kernels +have names with "*H*" or "*I*" in them, and attn kernels have names with "*J*" +or "*K*" in them, add a new entry like so: + +```python +engine_model = { + 'DEF': { + 'ABC': { + 'layer_anno': { + 'Stage': { + '.*': 'layer', + }, + 'Substage': { + 'H|I': 'gemm', + 'J|K': 'attn', + 'CUDA mem': 'non-gpu-H_D_memops', + '.*': 'misc' + } + } + }, + }, + 'vllm': {...} +``` + +Basically Substage is a dictionary with a list of key/value pairs, where the +keys are regexes of the kernel names to be classified, and values are the +classification bins which one wishes to compare across engines/models. + +The last 2 entries are common for all engine/models, consisting of CUDA memory +operations and a 'misc' for anything that's leftover and can't be classified. + +When invoking gputrc2graph.py, specify a trace file with this new model/engine +like the following: + +```bash +--in_file new.nsys-rep,DEF,ABC,<elapsed_nonprofiled_sec> +``` diff --git a/tools/profiler/nsys_profile_tools/gputrc2graph.py b/tools/profiler/nsys_profile_tools/gputrc2graph.py new file mode 100755 index 0000000000000..8921e1f20f3da --- /dev/null +++ b/tools/profiler/nsys_profile_tools/gputrc2graph.py @@ -0,0 +1,426 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" + This generates gpu kernel analysis output from nsys rep.
Will call nsys + stats -r cuda_gpu_kern_trace, get non-overlapped gpu cycles, then generate + csv and html output for analysis +""" +import argparse +import logging +import os + +import regex as re + +logger = logging.getLogger(__name__) + + +# helper data class for annotating kernels +class EngineModelData: + # engine + model mappings + engine_model = { + 'vllm': { + 'llama': { + 'layer_anno': { + 'Stage': { + '.*': 'layer', + }, + 'Substage': { + 'gemm': 'gemm', + 'fused_moe_kernel|GroupProblemShape|group_gemm_starts': + 'moe_gemm', #llama4 + 'moe|sigmoid': 'moe', #llama4 + 'CatArrayBatched|prepare_inputs': 'prepare_next', + 'flash': 'attn', + 'ncclDevKernel|cross_device_reduce': + 'nccl_and_custom_ar', + '_norm_': 'norm', + 'act_and_mul_': 'silu', + 'rotary_embedding_kernel': 'rope', + 'SoftMax': 'softmax', + 'elementwise': 'elementwise', + 'fp8_quant': 'quantize', + 'reduce_kernel': 'reduce', + 'triton': 'triton_kernel', + 'CUDA mem': 'non-gpu-H_D_memops', + '.*': 'misc' + } + } + }, + 'ds': { + 'layer_anno': { + 'Stage': { + '.*': 'layer', + }, + 'Substage': { + 'block_fp8|gemm_fp8_blockwise': + 'block_fp8_gemm', + 'fused_moe_kernel|_group_gemm|GroupProblemShape|GemmUniversal': + 'moe_gemm', + 'gemm|matmul|nvjet': + 'gemm', + 'moe|sigmoid|expert': + 'moe', + '_fwd_|FlashAttn|_mla_|_attn_': + 'attn', + 'CatArrayBatched': + 'prepare_next', + 'ncclDevKernel|cross_device_reduce': + 'nccl_and_custom_ar', + 'Norm|_norm_': + 'norm', + 'sbtopk': + 'topk', + 'act_and_mul_': + 'activation', + 'compute_position_kernel': + 'rope', + 'elementwise': + 'elementwise', + 'fp8_quant|quant_fp8|cvt_fp16_to_fp4': + 'quantize', + 'reduce': + 'reduce', + 'SoftMax': + 'softmax', + 'triton': + 'triton_kernel', + 'CUDA mem': + 'non-gpu-H_D_memops', + '.*': + 'misc' + } + } + }, + 'gpt-oss': { + 'layer_anno': { + 'Stage': { + '.*': 'layer', + }, + 'Substage': { + 'block_fp8|gemm_fp8_blockwise': + 'block_fp8_gemm', + 'fused_moe_kernel|_group_gemm|GroupProblemShape|GemmUniversal|bmm_' + # 
this section is triton_moe_gemm + '|matmul_ogs_|_topk_forward|_combined_routing' + '|_sum_bitmatrix_rows|_compute_writeback_idx': + 'moe_gemm', + 'gemm|matmul|nvjet': + 'gemm', + 'moe|sigmoid|expert|splitKreduce': + 'moe', + '_fwd_|FlashAttn|_mla_|_attn_|_flash_|flash::prepare_varlen|fmha': + 'attn', + 'CatArrayBatched': + 'prepare_next', + 'ncclDevKernel|cross_device_reduce': + 'nccl_and_custom_ar', + 'Norm|_norm_': + 'norm', + 'sbtopk': + 'topk', + 'act_and_mul_': + 'activation', + 'compute_position_kernel': + 'rope', + 'elementwise': + 'elementwise', + 'fp8_quant|quant_fp8|cvt_fp16_to_fp4|quantize': + 'quantize', + 'reduce': + 'reduce', + 'SoftMax': + 'softmax', + 'triton': + 'triton_kernel', + 'CUDA mem': + 'non-gpu-H_D_memops', + '.*': + 'misc' + } + } + } + }, + } + + +class GPUTrace2Graph: + """ + Parses output of nsys report, generates csv and bar chart output + """ + + def __init__(self, nsys_cmd): + self.nsys_cmd = nsys_cmd + import pandas as pd # avoid importing till needed + self.pd = pd + self.pd.options.mode.copy_on_write = True + + # helper functions for generating trace->summary csvs + def gen_nonoverlapped_sum_from_gputrace(self, in_file, out_file): + logger.info('loading %s', in_file) + df = self.pd.read_csv( + in_file, + usecols=['Start (ns)', 'Duration (ns)', 'Device', 'Strm', 'Name']) + df['End (ns)'] = df['Start (ns)'] + df['Duration (ns)'] + df = self.sum_non_overlapping_intervals(df) + # get ready to print table with elapsed times per kernel + df['Instances'] = 1 + df_sum = df.groupby('Name', as_index=False).agg({ + 'Elapsed Time (ns)': 'sum', + 'Duration (ns)': 'sum', + 'Instances': 'size' + }) + + # generate csv + df_sum['Total Time (sec)'] = df_sum['Duration (ns)'] / 1e9 + df_sum['Elapsed Time (sec)'] = df_sum['Elapsed Time (ns)'] / 1e9 + df_sum = df_sum.sort_values(by='Elapsed Time (sec)', ascending=False) + df_sum[['Elapsed Time (sec)', 'Total Time (sec)', 'Instances', + 'Name']].to_csv(out_file, index=False) + + def 
sum_non_overlapping_intervals(self, df): + """ + returns new sorted df with Elapsed Time (ns) column using + vectorized operations + """ + logger.info("sorting %s trace records by start time", str(df.shape)) + + # Sort by start time and reset index + df = df.sort_values(by='Start (ns)').reset_index(drop=True) + + # Initialize elapsed time as duration + df['Elapsed Time (ns)'] = df['Duration (ns)'] + + # Get numpy arrays for faster operations + starts = df['Start (ns)'].values + ends = df['End (ns)'].values + + # Keep track of current interval end + current_end = ends[0] + display_units = int(len(df) / 100) + # Update current_end for overlapping intervals + for i in range(1, len(df)): + if i % display_units == 0: + print(f'processing trace: {int(i/len(df) * 100)} %', end="\r") + if starts[i] <= current_end: + if ends[i] > current_end: + # Partial overlap + df.iloc[i, df.columns.get_loc('Elapsed Time (ns)' + )] = ends[i] - current_end + current_end = ends[i] + else: + # Complete overlap + df.iloc[i, df.columns.get_loc('Elapsed Time (ns)')] = 0 + else: + # No overlap + current_end = ends[i] + + return df + + # functions for generating html files + def make_html(self, df, output_dir, title): + """ make html graph from df """ + import plotly.express as px + if df.empty: + return + output_name = output_dir + '/result' + if not title: + title = 'Model_Engine' + x = 'Model_Engine' + y = 'Elapsed Time (sec)' + color = 'Substage' + """ generate kernel mapping table """ + # Sort Model_Engine categories by last field after underscore + df['Model_Engine'] = self.pd.Categorical( + df['Model_Engine'], + sorted(df['Model_Engine'].unique(), + key=lambda x: x.split('_')[-1])) + df[['Model_Engine', color, 'Instances', 'Name', + y]].sort_values(by=color).to_csv(f'{output_name}.csv', index=False) + graph = px.histogram(df.round(2), + x=x, + y=y, + title=(f'{y} for {title}'), + color=color, + text_auto=True) + # wrap x axis labels + graph.update_xaxes(automargin=True) + 
graph.write_html(f'{output_name}.html') + """ + Generate data table with columns per Model_Engine into result.html + """ + pivot_df = df.pivot_table(values='Elapsed Time (sec)', + index='Substage', + columns='Model_Engine', + aggfunc='sum', + observed=False).round(2) + # Add sum row at bottom + pivot_df.loc['total_elapsed_sec'] = pivot_df.sum() + pivot_df.fillna('').to_html('temp.html') + print('got') + with (open(f'{output_name}.html', 'a', encoding='utf-8') as + outfile, open('temp.html', encoding='utf-8') as infile): + outfile.write(infile.read()) + os.remove('temp.html') + + print(f'Finished generating: \n' + f' {output_name}.html for stack bar chart \n' + f' {output_name}.csv for Kernel-Substage mapping') + + def anno_gpu_kernname(self, df, mapping): + """ add "stage" and "substage" columns """ + + def anno_gpu_kernname_helper(name, stage): + for kern_name, val in mapping['layer_anno'][stage].items(): + if re.search(kern_name, name): + return val + + for stage in ['Stage', 'Substage']: + df[stage] = df['Name'].apply(anno_gpu_kernname_helper, stage=stage) + + def make_nongpu_row(self, df, nongpu_sec): + """ this will append non-gpu time entry at end of df """ + nongpu_row = self.pd.DataFrame([df.iloc[-1]]) + nongpu_row['Substage'] = nongpu_row['Name'] = 'CPU(non-GPU)' + nongpu_row['Instances'] = 1 + nongpu_row['Elapsed Time (sec)'] = nongpu_sec + return (nongpu_row) + + def is_valid_file(self, base_file): + """ asserts if base_file is non-existent or is empty """ + assert os.path.isfile(base_file) and os.path.getsize(base_file) > 0, \ + f"{base_file} doesn't exist or is empty" + + def should_gen_file(self, new_file, base_file): + """ figure out if new file should be generated from base_file """ + self.is_valid_file(base_file) + if (os.path.exists(new_file) + and (os.path.getmtime(new_file) > os.path.getmtime(base_file)) + and (os.path.getsize(base_file) > 0)): + logger.info('reusing %s', new_file) + return False + else: + logger.info('generating %s', new_file) 
+ return True + + def gen_sum_file(self, file): + """ + generates sum file from nsys trace with times per kernel and + returns the name of the sum file + """ + import subprocess + file_dir = os.path.dirname(file) + file_name = os.path.basename(file) + + if not file_dir: + file_dir = '.' + # Walk through trace and get the total non-overlapped time + nsys_stats_file = f'{file_dir}/{file_name}_cuda_gpu_trace.csv' + sum_file = f'{file_dir}/{file_name}_cuda_gpu_kernel_tracesum.csv' + if self.should_gen_file(nsys_stats_file, file): + cmd = [ + self.nsys_cmd, 'stats', '-r', 'cuda_gpu_trace', file, '-o', + f'{file_dir}/{file_name}' + ] + cmd_str = ' '.join(cmd) + logger.info('+ %s', cmd_str) + try: + subprocess.run(cmd) + except Exception: + logger.error( + "%s failed, specify --nsys_cmd for correct nsys path", + cmd_str) + exit(1) + logger.info('generating non-overalapped sum %s', sum_file) + self.gen_nonoverlapped_sum_from_gputrace(nsys_stats_file, sum_file) + self.is_valid_file(sum_file) + logger.info('Finished generating %s', sum_file) + return sum_file + + def gen_graph(self, in_file, out_dir, title): + """ generates graph and csv file from in_file into out_dir """ + # Initialize an empty DataFrame to store combined data + combined_df = self.pd.DataFrame() + for idx, (file, engine, model, total_sec) in enumerate(in_file): + file_dir = os.path.dirname(file) + file_name = os.path.basename(file) + if not file_dir: + file_dir = '.' 
+ sum_file = self.gen_sum_file(file) + # read kernel summary file + df = self.pd.read_csv(sum_file) + # annotate kernel to their categories + assert EngineModelData.engine_model.get(engine) + assert EngineModelData.engine_model[engine].get(model) + # remove nsys-rep from file_name for shorter x-label + file_name = file_name.replace('.nsys-rep', '') + df['Model_Engine'] = f'{model}_{engine}_{file_name}_{idx}' + self.anno_gpu_kernname(df, + EngineModelData.engine_model[engine][model]) + # patch in non-gpu time + gpu_sec = round(df['Elapsed Time (sec)'].sum(), 1) + total_sec = round(float(total_sec), 1) + if total_sec < gpu_sec: + logger.warning( + "Elapsed sec %.2f < GPU sec %.2f resetting Elapsed sec ", + total_sec, + gpu_sec, + ) + total_sec = gpu_sec + nongpu_row = self.make_nongpu_row(df, total_sec - gpu_sec) + df = self.pd.concat([df, nongpu_row], ignore_index=True) + combined_df = self.pd.concat([combined_df, df], ignore_index=True) + if out_dir is None: + out_dir = '.' + else: + os.makedirs(out_dir, exist_ok=True) + # generate html file + self.make_html(combined_df, out_dir, title) + + +def parse_tuple(s): + return tuple(s.split(',')) + + +def main(): + logging.basicConfig(format=('%(asctime)s - %(levelname)s - %(message)s'), + level=logging.INFO) + parser = argparse.ArgumentParser( + description=( + 'Process nsys rep and generate kernel non-overlapped cycles. 
\n' + 'Example:\n' + "gputrc2graph.py --in_file d1.nsys-rep,vllm,llama,100 \n" + "d2.nsys-rep,vllm,gpt-oss,102 " + "--out_dir results/ --title \"Model=gpt-oss vLLM chart\""), + formatter_class=argparse.RawDescriptionHelpFormatter) + + # Build help string showing available engine/model combinations + engine_model_help = [] + for engine, models in EngineModelData.engine_model.items(): + model_list = list(models.keys()) + engine_model_help.append(f"{engine}:[{','.join(model_list)}]") + engine_model_str = ' '.join(engine_model_help) + parser.add_argument( + '--in_file', + type=parse_tuple, + nargs='+', + help=( + 'list of (nsys-rep, engine, model, elapsed_nonprofiled_sec) ' + 'separated by space. Elapsed_nonprofiled_sec is runtime without ' + 'profiling used to calculate non-gpu time. Specify 0 to use ' + 'elapsed time from nsys-rep but that might inflate non-gpu time. ' + f'Available engine:[model] are: {engine_model_str} ' + f'Example: --infile d1.nsys-rep,vllm,llama,100 ' + 'd2.nsys-rep,vllm,gpt-oss,102'), + required=True) + parser.add_argument('--out_dir', help=('output dir for result.csv/html')) + parser.add_argument('--title', help=('title for html chart')) + parser.add_argument('--nsys_cmd', + help=('nsys cmd, e.g. 
/usr/bin/nsys, Default: nsys'), + default="nsys") + args = parser.parse_args() + gputrace = GPUTrace2Graph(args.nsys_cmd) + gputrace.gen_graph(args.in_file, args.out_dir, args.title) + + +if __name__ == '__main__': + main() diff --git a/tools/profiler/nsys_profile_tools/images/csv1.png b/tools/profiler/nsys_profile_tools/images/csv1.png new file mode 100644 index 0000000000000000000000000000000000000000..bdeb47c3c2a3575c200ae8dee23bd14ca6dc491b GIT binary patch literal 148416 zcmeFYbyVET5 z{guZc)HhUKR_qPJI|MK=us0IoB8p&O&=_D~5J_+_pd7TOWiv1^ zgc>tpVR;E*VPbiE8)Gv|02r8fXhIUKywX0_K!=A7H$45fhM#FySb~y!;CU{HU&sZi znV}#;2sE_2>at1why5$Us2i~ae?f#8YGGnlF@_XT9~ufN)15*H+ur{0uvr;O=jyoM zUAoLTO#=dC!Ls+h2X;HEWBDQp)l-ko1`uN4i48=7@e2AP*@AIf$|gqRqNBsfr+xKl zxb*@{4Qm)JFn}yq9bYHUbs9vY)hZ;NXMxsb}KW{82odzz|o}?!zv8vIARDO zS4^y@L4B|hy@7iOc~;U1?L>`zH^sPdF5~V}4M{AkGyoVp%5ny=LX7(KQwXjKyuS0@ zwW+eOOY)DAk<5kYUBz`4GyyJ(v@sDIf~YZc+tJ;La{BUPu;k_**2=*}o+%d$Meic>Uxk7XA=YL06Kc$t-4E59nDsj9TN}VIe@?DcM^GiO2y^PL zu40}uvSg@-(+cAAJIqpOLyt--2DVk0)2EH2{-j~0TxXHe8v-ZxmywQ3I)fzu) zpma^MfcrQXYx}@>Gh2f`vl6-xws4zNO{0=rriKNXtpt z_^lwcakqIqhI!2k@@*eRHxcR%WT|WF6G?uQxui>r{auB@~ zyo9)*Idvj&^l>owV|-two%caF}&_y=uQ2sW9GFOWVrdcW}#qy zr>?x1e%!h8n1H8b8KULd`PfQx$6GT6ACik~8!it%&v5Ai$k ze{OefAMw(9s`3W+)_Qu-ct?j^n#i_uy4Eq-LFS8(N+^$Z#`%+69&@f5On0qAf6*US z=t0OoyFedx6k!GHk*Sx7oOP=~UfWdM6tFkNR9f}cAY*;{?Cz{*{gjNClwE8o>O&Z& zSiRVg2z^+|*VK@kFov&$A>TtZa2@DX*%FBs@q?qn!+20R+JAbiM+WwUA7Qgbgbp>=q?OHLPoM#E2tYy zxu;&z^uflg;is}XIsTyI@KOvHB?2nJpno-%37g5(PlNsDIq}B;SEd2BLY7@&Yvq$r z-X>n5m1_Xy2s@oe;%VH=i2bPj*u~A;1=?Q~INC$n8f^%g35_LQbtebiGd&ibv-p}) znrZbnw-6)#lm0G;IkR<`=9m>U_zzvT!Of)mqzVcz?z`720*2mOFLQNW#yu5p^un9^ z)THR7h9&uJwk%s8;%4Nw1-usHYl|&vo_5-_Y_*`Y9_pK|_>|wPrRs6lT6NX;A735! zEv_DitlH&|)|Z)6nH9S33cBgLT{5)kXgPaTh!^jbUlm!-mA8BJoP_c%@$q>q-~LiF zQ%+F!dPsS>%rIX)=zkZGTJ6>7#RbefiCS2_N8C(WfGmNPME2yO=H#@cK9UzJ7q&V! 
zxb8_iT;sM|9*hwcvnIUeUeLBt)mzSg7RKsR*_WJWKb|F?@za7; zFE7U3m8~Z>lq!@kqG5hIkG$ui9h6OhcQ8k=W617rbbauj@19Xj_O|27sIr0)vlKHE z`KH`eflCkfDQHz^5MmR zQ_I!5sNe_z_pEbo`<%Og;b37RTp}I;Q6G2q6d>+P(_{E~Qeq{z9+BtB-Hps3*7rg& za408E8hBq0Sn%v#2*EaZFthd12tKQyqc8qu5HF!7J7;D!uxo?wz`B>f;XZwS`+h%{ zVHrUXu0*sEa<2c=M$P5f)e%3SS`6%L@9eA|9^d^V#7k{T({oGv3$9<@!Rkf_n3xHC z2N;on7)Xm%1*l6H%gBJe2R*}qfd`v`L4lsYL7#V^4;UC^Y!DbM=ocOI5y^)5M=3N$ zHsn8^A(CDz3MmOoNPvEoKHCET)()mNj&(<0ctBmvnklP0s>?`of3~q=Ffg()1TeT- z*}gUb<8|c*Jz4=A4TxQ>EUg{5UHM4=sKE_-e$8eiCH|v|qXi$Sx{N%ru#G){n4N)% zfr*qKftZ+>*WSpOTT$fGU(G@P@sXN3I@)qGGP=09Fu1TX*w~veGIMcpF*30*varyD zYS25lSvwlI(px)_{n^Ps`Vj#*e6}~Ubu_cFCVuVLz|h9Yk&l%0b)bL#{W(v7tJ%Ls zvUd1uS)c_nzLqdDGcYm!vu{vS-q&1ic{5jlrG|)^6^J~bG5FcpS$Y4c|6e8l8u4!} z)&JF!nT3n%?@j+!^q);t902yhHddf99r^#YHGeh!d*NRVc^O}q{x?$mN#{RuL4@W< z;AQ;hrtu>HhcqI=zy!f0M1+)G!H+Va(^dMg2T&9t!l+66iOBPLpe6R>6XQ`up!QEN z^T?qvDe|=P-atV?Lnc^9%Qt(3zTE#f@oK%DQyV$CSZQCKA79jR*d6mU*;#4Y&E&$M zB`4o%mlyYi`fcij^rPa%-!(ud2LEkhg4)36_DrV!Z6~1qJIVai@ptGT0?kvWi}|_@ z!G7BtoG}Fc4*qEVcxxOOeDwcJz9>*d(pTVE0kQwH7?df_O7OQ2efjpFX7^z>q14|56r$T=0JF-EFz6J>z$IG`b`?obvM?o8iv@>b4BJn%k>x2c3^8gku z^xGQ5$?)41J($GW;zRA^5M&PFmK2f%sP@6l6kM zyL_K4ndWw#ASi7~1e%y8BB8@wk67g3uAV1N5~JSDdkTl$@uVzv>#O&cj?nMeJ->K% zI9oaDFnrS6jVjjy+-A1z1YZkAr*YVryWb8b(5iXLo&;sF54S=idh$n6K~eb$S`u|> zD2~?gK$H!f?49BKv5~$Vs38*Lv_3w&XDU={+G6!qs|qrug>r=MPT zFNihmfms5_bPDABl2{S`poHKkIjZS6Ezm{BV%dGyYuY8Uh{3>x+fP)g-Ptv&;lh|6 z+vlmya`n5CYK1n%?pRjTLEgfKfZIj<2cM3F-YBBgdd3n}UAKDxF28*|oksJA5{7k$ zR=;Wf)*ssVHjpgoCJ^Tv{cL{5qnTs@z*`4SgV~eii`msh{G0p3PmxFSfZnPo{Me)q zW{SbbA=bU21X}DBxhjSt_kk`e!R0F`HVv!Chm++kM~gm!iS$pJEC5%TXD)YTZl`^7 z%m|B)Yu)uHYTU(EK=>6Nr?~k{5rbXN!eMUjfWuRkV0LZPAmx3GrtfUAhVW0jZ&3Vx zf+Yh~av8BFZNR}^yU&nBybga#p*0LYL3drzZJtWGpQK#2#jzAyEh@CqWj@c>4UYRO z!a*nmq+AYLOh8@tb5XaQ6q7vf5(yIXLwb<>UB(0rjMck?f7 z$|b55baK4wMC^{Q1fhX>$v%*ggrpu-V>LzyvR@dH#x|~BElMCPQ>fk^BfG}qh;cgP zn9HKP3)3sq6&{MA%Bnq_lc>;b+p&Foegd+%kE+w?wC7-Y<`y{~PbgJrJ0NY^`eWn4 
zV~|Hj;IeQ&#`3+ia@EMafL}ZZTLHRq`V^0TOe$6=0#cYA3WUo9Dzt2*92SuY0NV#M ziml4H_op3kl-k#tb%5@?@lo#f^tHKDAIpUIGuM3$wrKb29i-aLSDhCZ-GPWZl*t`G z@f7O~DR4RLVtcOry*Agr3HxVtjM(@*l0aYX##w5xNTqYykz1vHV6l62-Jhx{Tx$1P zB<~Q}XJIi}@zhd{B;c_=fR)}j20R62ZT+a;hIY#FIBiEhw%RO^DPRB$XHHFz10H7H zuozxZv3Pfj)aZUUXT#-m4bd2=gDtMJSvRDR<|(_C{oLh5FQnDa1XjzKXLH=_G`Y3B zb1_jBRZa|1nzlo)mEs(B)4H>GYB|t&;R+8W39KrFI&@mXUPsO`yWEMT)dSCU6|gR;YGh)Gi0%dA~;Y3!s6?_IG=x1zb%MzeR_D~z*~~la(lAO zdMXBxOk$a>dG=xkl*H31jYX&n6HR)5DWoFkyuZZGbC|WhaE~!mEHXV04zfz3QPCVr z=OXKPdTw!Q8fZt#yIFrj_a39|+m>+hfsIN+-Z-C0(1m8k+E3xGuV zX0N&W404&9)~kaRnGIgF#pGw}r%Of$d;*iDS_bV_Vk>@AitmPfEIZ9NF%gQQ8-*pw zrK)vd@Q@TzX()^E@ev9n@^IG5L=oXLMvWSAE?X(R61D3$tlsdpe676gw?Y<8&QGb1 zrn9OWec@D3Fi0%J6)i5pRB&AOSIOyoF2Bf8+KDELV32W1ZB&X$MG>%LBx4DD%j%ZiGr=IqwiNMJ=T$`>1?+bb;as<&$SF&|Lg(Xo}P#f5E;jHGipuGydvKKP_qCXPmE@?5`4$6__R@zYaXa**%#VjFr472DE0inLg%0+ydG8({b)J+mxjg#62<_(B)fgWP zT!mfj&nW52qNL@msWK;eN|SR?z7}Q{2`_(yaRZMk?yvPTwV+Rch^IiwO#s z&vUwhb9WoK(b-RfYEAlR;LQom6YwTR&j%>ee0>CnJkRog@KcRKd`m~o*W$z$CMXr- z^^*BHt9Qx^`-G8F=K14BZ{+6(c31Z7=dKq8i@N^tM2hf;X^#e{tZlD3FM z4B>avdJ!lw@~d2BqD~GdNRV&P15@pIk1*9Bgs<9B?lHb{p`0T!S>Qy=b-(Jf)Arok z@sc{ycnIs}9^{sbaYn%i=W%~QF*V!j@si2K20AYC=o*i(G_bHln1 zY}x=HE|m&>D4HC$lPa(qRq@XI8x}eoYKgdfzlTngYMon%`&6 zC$aRsNR#zqqpN$1iq!8(=`KX15XffmI0S8tqzxRkVAzWlct5A`-fzYATjHq3t$N=E zu+K$M{;1*>PR;VV(ZC?YN=UT-Nut>Drqy+8JZb!BJ|82Dn7@_SMkW78fu46_emK@U zN;q3#dPV`YK;;pvAIFAkB^?;PZeCMZ^Z;K?sEu3Ahbzt9QpsGH>+Su*tQ+q8eW)&o-*9%F?91Ipg2+8PGhagOt`W~_jXE=}2R&x1_|{Wz7^iZ)83$|{|fafO9W zO#!l7MWKOW2|6l-C@&q_IU8MMz1Fl&Ype7ATo*oL`EjG^5Uk$Jjv3FJCIV8Wjyr91 zN@-%Yk~RS0*wpk#T(j(GK@s>?otAoUk^9bcZ>bJbVLMo{`J!C;XqSVZx?!#7LH9y5 zQM)K7Bb+Eb+$?Q3sTyKX`zl@S#(gY9=`?Ynj3$vs=4M)|_0QKy@r+u(jvPE}^cuy_ z43Msus+E1%CD6N~RR;?(1d>QaSE9cKz@{Fv!M9zW5}10usH zyUqO!f?=ZQ3Uyvjr~u31_DFmT=|g*!`kSzY)>Xx&e_R^)=;v^F3inHV4wTc;d)rBR z-}BFGoWO%`zq>z7;OD-(D5_ZB6A;_ZY#({LzFoC;0>haJK?$2G^d$LJwO_k18|aUI z>Tg>PlcYMN=PLA7=A(e+BEx8cI2p+9%lqbuidL4CeY8;dygpedXA)$Uvw3usYqfi# 
zqu3y8l`Gym&zY&FSM}4C*yHwC#bImKGEC&C2QfEada05z&h@)?m1zteJk|UXe3qCd-7XJGy%6KCX2(3Wo~xg!>xsn@<_i!fAcYPn-Ek@cEqd~ zQHge>shb$K0Cle#oert|Uip(<#`}4-LHW81;0^E;F!AWNQ*bJynW;okmoF}Imt%v; zE_uzTd7Djqze-yvPF5#NcW*2~Q<7LwrCW;`yxy0^V5K>KcR7BLeT&ECv{U=5#*K20 zO-1viv`2En&y{R62Mt!y%uDVYV@={?`!z}(VLP=K@V3Eu$mYQpn59XlNXY0XXil^q zGn{(eC6-~SQG-C|IG)aDao}1U+Xu))UVae`{)wp+Xh>0)F1qYO!Ps_{qup+=!IZw0 ztcgp=h0c9G0Wz4Z3L9G*PM7MWSoSUl70Ea9WRDl;$NI#G;DXZBA+|S>RnCOptCgAZ zlXkLwI?oZ!_iU<$&=bWRhl2P{*uya~7!_Px8v1_cS$pd%DKf5;%WB;N%Kmjg97W^l zW-cveDC(CkwlWgqRurwgMsko>z;8f|_-CeC=87AAtn2ey?opIEbb}WVnE=;bDDluE z;@#indaW`;KS!U1B*GIodwaaIc^R(G9*%xl+nr*eKc2 zz}H+~Wbm~7U!n3?hI1EtXg%9U^RRDmy&?6{k9pJ3f}mGh-A1&Z`pXv$WeyA}U3u1#oLOh*@p^=%Wl&dZj^(s7U~k2;tYBfd}| zg0i_(Y_pYORJ^z~<0uS@AhtA*hH{};Bd3ts-ZL#%T+_CGP0J>jRbwdwWg`B z-_gDKXEpi35=4UNC=x(L0JZ|ml&4_Cj5-1N z>>_*Xrm`9l@dySWioINW$R0T$ZN`jn8lcuTOHO_L>}4|6uzyN`z&*+BZ0RqQd^MV9 z>xV&Y`l&Z%58cpR5e=8v4D4Uw3p%39xyj-vIEl{V>6I3!>Aw>ZSWD~1QcJ@d27avo zID1Jv3DLtE=!D|tu8o7mfr%(r2i>c59<@{!NSoy}IqXKJXoIQ7V8(nmhzq;BJd&>} zFrP1}tM#l5j8_d8l0v|u){hVUZWzc9m)*)^vXCdy0{WQOol8Cin9%(ps3iLl22f_w?q zN|c7w>}9sQ`@L^fo0*%!wqcRi9d?KF6LqN~TEJ_?;5I~;3wgaE{C@I#=0j~0c;8iB zBtDD;R=1d6r?PzRM}h@x(6?}Ne%`TkqKMJk`H^YB(BLsnc&+!)c^p*u!DwQAB3T^C zX|Y=RD&P}5LiCk$Sy<6$A%Nbb9c2*@rYw+%)tbyXF7s$M&9!8$*vJ1`%74|+pN&ZOsl*L;u2;k z(cQYwizNE+!*AeKARfb@!339=CnLfXfiHV0zv{2>+p#@ zV);7*Rln_##JEy-GV^lTW1nY0&3m1K!CAdvS!w^fZ2huq?*M*6=E>0q_`=NO7*@4s z|N4};o|cM2FZuY+u)b##0ouZr%L!Lz@@D5Y7-KBjQ5FQT7qo<>hdv0&-kSrLeUvt# z&moq}FmIgdS5gV--;%oj5ahN2xrc^r>3trnPAVa2+tNy;_rArY(5dth9=tzzM$JIG z55$h$ha`)W;C{3=!~q(X?{-+O2x`9RacZh7xA6!6)mSF#t%N`LsHfECSu{fCOhGz)=6-pdzn%uX@( z_vmj38(tuq2sa@|iqMGCp2HF(KsGyVI-}G|u-CV~s#lc@l?@8w*lqim$7YgAAhh(C zCoPk5^0)RypQWmGl8~C+A4&TMtOOX|wUaJyaXRB?Zec@OyrT5VA^WyWtSQslH^Yt@hpaN*Gcd#1p@Ggm_5V*j+Ks;S?~Q@fk#s% z@*Epa_EElQrym1NNVOm`qx(z>!&wyE&B$w3gdwM2w0l+A9*dArI%rQ& z{j1W2kw%UEsvu4-BiaydZuMb9tr9t2&py{O(R!4Ud4}5KpF^WYx;5_C!PQ@a(J=93v|6dEtcPQ{;v#@XdGKoAK606@ 
zl!ac+!t2^^{U~&53}~&yxK@UnuRAG9==McnOT8jz_R;bzA&mzd!s->zI_h5 zif+~sY3>B@zui9Ds0?LP${RT7eW{$6Jtn+k@~m`o$n7kCk;V8t6povWK_yGaX5L{% zx!?UouN`h-{!Z=_o?iQ_&zR5sHs&3--z4mTGX7+h5G1R@#GD+J_;(H|Jb-%|ie}0# zaikB6{g?0619^KT;X#i-LJmb?@f# z=cW0HYArpnXyqo}c=zygj?@)w{i!R{`x@aE_E@!5Jp<}55;IJxcIoe8$)ysddH2E4 z_fZ|B+heH}KAN4$e!hGT98%@S%Yv^9BIQH^%)#rL|wsU!#DITSUrzuUkXZfCY2M@d!QQWBEP5kD!9x?Jh zT1Gsg;Pw22(G=E;BZu+%cDg`p7(JkMQ5_i}`hyP7>VPh*tWsyufhK^w>QTOKr^T z5zx~tmf8q?{&c%4Q5Wkbl_`CehpGIc4n)EK;7!t_)XJ%swMf1e1;tOuB}plro~KtA zHmDSumH3sr^}&sxp3kJ;F*xPd6Z5e&#=pa5pnKU?mGaYa0tAmS^+#rR=o2}~1UlUO zlrB^g93Ld2hnN&SQb|Uv-Wa8nqCg^~VG(t(PM36^?+vNx$*DV| z)5N_mgjV?@h_BN5e}9tWFJQfPtE+1x!;$fditG_h)#`i0TsT`ai-dMGwyCQ;#sgV_EXG$%_) zKI3{6wE8H)%0AMvfwBQVz3>!BkWWFMxO4X{2#b~!qpLx6{ecQ2p)))P!2 zb1=Q4gh)lM3niDyO(Bu#_9@fjq{;T9ty&cZr_Xb%cB`X=u5>2v9%dK_EY=0`?PeVZ zjH8U#^}YyK5!Mg_edqxQulz3fWt^73at>J=18(PLJ$1N8jC1tffD&y1GF;?6I@uQn zQ(6C5Tqh_ein%YxWOI^Bw~y6cXTtHPb^R}6`h3QGHyxTon6JFh*#mXgx^a21uGz(h z<)?2=gav7ns7&cDJQU7^vk)OEMgr~vB`+qg<}WkH?_b_amZ(=B6=Juk@hs|lJ=ZZD zP)zd25r)eGA@J)C>$>P*e)?8K?eY9xln)%jN(@?dgvv{J4~ z>2SGhwx_px$}y~Bg%)*0s;5@>9`s-J^~#A)Ld}JFpHqxaWO3^a2g@RCfC{#Y_ za(l=r@)AfXl7H)ZhUC^J`wyAp7Yu(^zHdOeB>V?guMI%y$Kc?GiR{vm%J$0qxxBy2 zDO(xe#L$ZQGq^uak}nZ@=g%Fnm>J;dU@D|}WRW7f4lhm8b5|~^9hD3!P$6fx=|ANc zYA!slYe3kCXJK&tawuR-in2#R`!jkJt6{&q2Dt9EYWPU+w>% z2on_NXv$l^KV&g+HXLW^8yh^~K6P|jEJ}HJlVV`mlLa!bJh^5{eHrHWWBi7RQg`C~g8Y;I|YxT*nhx_r$*$FWVMR*?4F$@7o&OXiJq6~E+4zZ(b?(mOo^Q@GrwLZoqG(d~AGwb%#Rv+b-2pC* zaRm%w7e5R{5@>lpoWmA_y!P}O zoUVJ(t*I;kdB8&2&H4GD0*hBaQj`02YU|^q)MT+2Pq9KS5VXX#A2RApVaZGgu855*qmA&wEFhpYW+mlyD%EUKv5 zr!(Q94BB9>n;#e3)mw-8iWQa`xUAN#y!S?)X7ZI4AFP%*i-!A%SPx}@clUET!Fj?o zAtYmHI@}@!c!7!{r=!%WU6rDqcwpNLBd_ONjCliIIg5}3oWGLL1^(>e zS(+GUF_>{%XhcNMl$j3X<%mZsdbwG=a6VozF1px~z{qp`p0lGlAe$cD#ogOdg6#Ed zevo9m3cu{~)c-~zo%1`f&j2@GjK~8yseGc2Iqq(Lj{N0XD(zQvo99p6@DWZNl+PiA}FTYPDF)4m>G3?ThIN zmeS_K#tNrZt4y4+X!qJ%7@OUye?jvV%~_iDF&ME+vuTnsN}Ju}`GVqp>Df^0dS-_f z5_)?-&=))VXIxqcH-x9ID1lZ#{vDK|Jun+!xHc=d;|pcT>o=<_b*e-Na=lXAUu-*Z 
zQe`1Kt<*pHZ5x2NCW3SVAztQfvRD*B9eWcJF+}3Hh{Zn5zA7i^n>Xse6-8cR!I55A zy2eis;KIbH^e9Bo)^wlkcf61|Y?fnP?KZj7Q5ki4o4kMpRzzjE1W>A0@xJ7~58#I% zM@&fRPzXG!514_Qpzd*_Q;c&y^ZMzxj1SyF0AOBkBjO-*PykgB7G2F-n!i?&r;PUL zBgiA60d=zMkgnac=z6-`W`dYX;b1&p99K6tB6}U2tZ}PkNa*)7CH=Z1j!|T>UIBF4 z2romJ#QsZOIzgpNJLH^vj7R~Eb^*@`)Q7ubB-)=UjPv3KUSzg;Q0OewLS%)k3|j0@ z4!H{d|H&j^%A~fPbW5(0KS-1~kicJowp_wueL63D?U)8I)>T z$Iw>Uj`1CyC~T}9Lzo_m2*05lR^2&eiI@y^>f};L)pL~LmBXOkfDVIyK~c&C9BN4D zaX?}yS`i#?AZSwNaG3Aa<-^{(2@k7Pyk|9W69|>QbV(Zqg@EMA#~4x?I2}$W?)*p{ z@+jAo0zseKQ=xH2k5su7QdpXlpJaykR@CWNgVhZMYwC{E-|uezs(0YD{_lNkq(lYj zbiGJO4g};oL|l${RUl38A%^8)d?GiNT_#m5EPx6Y)(>Ql4VBU;!H606*za2a8$qtz zUXT!^AN6!FD!OG2J=~%_zt|oDfdm`b8w7(X*9BgWo}+fut=mAR94QmsK*e6?mACne z?e$UAGDu`mm=G(|o8nE;M55j9M`D3pd9OO8ZxTp#xIo7YfTr-%w9<3mv3d+Pkr^|0&j~Dc3A3 zAB-$VckvGpCuS+e>aYjsfn?hM9q$PSGHf(Z{uI#=VS*d3kwa~69G3hhb%I~eNa^0T z>8v>7Gt-aR@#uzX{U>7yzFvOFuZscgEv6qfhS+j0?fXq+>OpCceEWAjxse#v$;OC* zVQ~4cUe2hYVktu?Xfh)o!G}UH?3Didp3ue!*?d5a`l}~jA9V4UEqDo>m%mgWWP)G# zG4UUrLcaZ|63-(AX*+Jn|30i>6&i>xv68%`tmB_1^KoT-(ct=<(IecDtIYQ~hOYA~ z1n-kR2w8SWc!*$ZNM(|_zsbaI`%7t>&?j|pOfuzgbaSwt_@PIh51GodpD)>dQssz{Gi;){NW^3M&4!9~ppqd*K+n>u` z#{ivmPlhEa2x8Q-BnK{x=_h zatvP?^|K%e3*vt>;a@qZ1R$F>0(rO<=HHp%Z+zGb1{$c}_oETk{~Y8slbHL}>?MeI zd-MD0fq`2y9(a{d^Ic=55?NFgG1;$Kn;#L$CJ z`_M1aI7*^_S%l6|_1x3kDo>zx)_kn zP3W(Ao+S!CK_|7n8MXz@LqXb?A0rosDfhom{9jb}Oa2HlHIo%%M`3?UE)ixB(f|Km z{>DIm(*OT)x_HM%1;C;VE!`tp7HibHh8`_9jMo!Psa0sVy&s6m3fZ5JsD51T@X@1` zOSw=#jBp@`tR8j8{oav3Fk-~24h65>>?m`#mec%PLtGodASO%pS@=4SNt*Fiw+EC9AEe^obEiK?VV#xk-=WCHvdU&#nB~X~% z=5eA=;Utm{B9k+hkWPz5Y%D{7KqJX9WVZuj!03L~0sn@}CY$hNwPnQ3Mc(qI^%*E; z4y&N!9isp@TcOA1bae4{*2{OWQC2#%C(S#l((QU$?vjn)rhBvLYSrgus9dW-8|F5C z7WLTUWU)h!L9hLY1(}c-+yerD9^Nc;d4+BYKP484|J_RLIt)bMgG0Bl z)z!&Onl@fSW(bq<%sB3AJjs4bt~}Er#aM}?@Fed|)cFIOjyQiC%mfa}_;@-l`=M+S zB6#ko8y3LAhasgiCk)D(AUCnBgx#@>8tC(FU8my(HX7~LFRk$`32Qyk6d(cYz@}>TZS)~paqtvXRKmOxl?Z1CXgkql zlb1u2#*`~0k|?o|?aV=&>; 
z)1cJwYE!bYwughHe)fvKK#>hfAS&zEK;X=7|RQ;?8&+A@oB&&78J`sz*#smZN?0u}cKjq`Qn9g^*!oetMW$T+FaV}|684Gnr<2N0E;jQuQDT%Bhs3(nT z{Rz_j`NFKL)2YSUT$w|k_$iZd@s`MpGUAPqwi2yz-PpVRLaSX(+FxF8&R<#$GLCOo zT7HxfYs1ihtGCPGunk(BWJx@h(Jh`ee6XJQ^`)iq_I~}Tc{r26r-^So3z>VjE3MY6 z!7%iL&7$bT8$2pGf$Ip4!aJ*EjDZ(sfb;zYbcYNX2G%9KNFHHiTD9OO>2%!7brC~k z?6Et=cAa)k^v_pOMZDK7=}K+~S7!Iw+aC4&Q8O4sbnDqz1stNv9Bv}Re2OzKH8PzzubPgI$E6V4fM&p z;0A@lgJRkfM^l(}kWUxMR?9U>(!8HFnk@u=YUtzpw;Q$dx}~Go@xZ}DrL&yF)-74Q z7~rJ{m?W;got-J2%`0UG9nFiqBtQSx&~&r8#?aB^x2dY?>RlI5ScFv!LY!X)+@jyl zm=6YJdB?(MLCdaxaHtr{5?2>R}i;$1rHNn6$5doOMhu1Xa0gq;a% zBG78w#{r6X@3G(7W4<^%?MST1@9N0TvR0{?(YJx~xkrOix{S-b7GCG%--tzSjZzXZghBuSoZ!S)x*TZ+ky zgCO4PUzlv z^n-KJt?BsP|1EiP#knRqFzfcW}yfoKzu!#1o{L7ltaf;Nsn%QA9v^$tERPp0%X-@Y%b zq|8-vGEbIX3Y$Lu{4+nZfoGH254}cttOK?QE~iA$`$I!gjBTbHavARv)H_VFynb;} zHw1)qntT>e49^O+-?@?~)~jnQQ|F%DYL_^!u6A8-)WGbht>aye69^MtKLclZ-hq77%E;>6Kp>1V`y;5ck9zHOO9sY|W*_=nk#$=``?v1=)8 zdbwYKDtc*`rAb4KxdI1dvlYQN1E{ccY?MCS+R@(lFO5DXTuu){L?goig@**d-RLHm zZkZR$GK~%|U?S4NgzC=tIP~2X8kA@q9)e}eN6v(nEftz*S+$6ab`Duv#% zN>A|@wKOU?>q+B~WJTebUxc%y@jN*=JhU(KAYJB)Aur?v0B8;t5@k++rl6Mu*Z^Ou z)xd(K5=v%uT&6e6Zhvzmp(UP|JHs<0{Vbnl=M~c8Zpy=zo-ni-ghEe^4?f8xT#+*QXuB!I0wy}03eoIxU=bwia>b&Dyj0H$hVwrr`EtbCB zsMg5=jw>{vD_ zEVKDXosvBSF_!oSDd{;k6 z7PYvXYIH%ZM?PXa{OpxMl`PRM(KesX_N$%1zWkx(oiOjPBk|z4zcuy!GkLK${e@_x zsb{qGLynCE=q0Ql;V+gUMn)NU|(dM$m zeKwmKH|CSt88voWTN4_}q)+2L?+qAmzB!s76+@)P$-Wcxkj&!Ga6Fh?_AC=>@nsW; zk3zLMRjHahnQy7Q+Mn;aNmvp=y*a(wXN`Rf#5Ot;6aJdhX5I4%qEuYc7K2)&Agon) zOGYA@DW>*hPRo7=xo!N#Vb52=7%&_w+>Cg6j-}M(6WBSN{B_f1Ea3bFa}_Fz(d4_FQFO{J~;IVpzF^L~r~>vi|A1 zR4q_HL%?a0O0i+cV={Xl_9Eatme>K4kyL*%g!S08gyJ`I0<-!0(xa6o8TWcCYv7vM zb{E2_`Mq{EHWCUxqtZHk4Iz~*K9|>lUX$Z~YmXTYrT6i|+ZvmVJ(7#tU#oc*tLyHv zL~-pYO8se{anM)FoiJ*U`NzrKWShNzs@w8IAz)Ky*zfx2a9`n*zTopyuhK=}Ta-m; z)JXhj{x)lUrHfa5rfTINBL^^sXu5+9#J#}CY1fZg!FN+`Di!vdt$-&$IHfsZHF=Jk zSE4L&I$WLnHb6|!rTl*=d&l5NyLWFp$;7rbu`$uan%K$2wryJz&BV5CcC3lbj%}My z-+R}ypI!fY?+b 
ztX++zKdM@XqMa_lVX`q;df^0r{{X$}^mjkZqm;WU#qd&!e<8DGeTqpY9rC&9`1D4x z?YwVE5n!&qYW&41C|6F)s40|@z%Nma$V&>e_*tY-lKidGp1KU!Sj28Ur@2@bH#q2A zoTlP~WUo8CfIiAqSmv{@HG38utEK)Z1vIlqz-$9WZhv3crc3_%X;~z*8AmCaY;({B z231$#b#%hB_`z19Ff!bUV+tqtrWbt=Kp7yWfvfIX2)hhP&x>?O^x~Xy0QZ;SuNgFpIE*IZ_mYhz7G&Z$ooDkU!KHh9y$nW?&3>^2tqgOL9CpN~|6dnC z$nMGershG}H&l}7ehdZQ`Zv#8Y1RdzT#^W$*LM_a(~qFggLSxT&xED#TFmyF?dJ}j zxWvL|hY(L^QOq&fdDhgIm&V9{oXCgJJX$$lJ+Tugq;^th2Wo4QD%3|pu6GNylBUCd z1in2NI@F<~6yd;E@U@BZvPj)g*0^>&IB6vnKH*>OMCq5JvxX)rg$D5ka_oXo(NRz= z-4>ex>gJFuS>sA6mEIiL7;3S~ncB0!;p;8EW_}3Ys|`ALO&7btSL_W|_oV?=mw-G# zh?FY`#*w=Mh5V&rtQPjlO3*!d)(La?u2ll-8P#JmZ_ZH!B~#>yY&@51EfZew*Y;KU z)N(zoo0$Lc&(F?0 z7nM*G>!ydGErfN15&hp&)bgk<&~&OaHPYknR=rKjG09eB>o?k40?IB;yN1}72PzetkC9BG}8-cDof_2XfYYoUClFlM49prkcc^_h=m@H1Q%=XxgDD zme+(b1__5=4&`OB9GzBQ@vXj)8(^@Aqui{_p~@cS_rt5e!_xA>4eHr#pUiURAh54y z^9t@}&3UBIWJUVs^&n?21xZk{nlDdU_Vu(?>yq1~(PR>Rl|&}<2@h|!qo7e>QzD5C z&U7VTbGB`hg;u|;n44dF@@?pwIdGA2b;+yM{_q4p5?#P__YbYLJg~juYVl(o_!&>w zb}JRwH#lBBzT&*B_n^kCO*6&+8Fcq<0XR;ux=>ec-}&MP zm1G^<@=IGuaQ54C)MtW?=S|x88*B!$k%Vx8764x|4UD+Ts9Ut{g{&l{QOO1wGJD~H2PYkt}K~jxVt~{q=0XC zIF*wuC+KIs6Y6@JzAsa#k`q@blB8H<9~Qk*=-DZaqT4bxY&>!G8l8SPL9i&D@CXE+ zd@Sm79fvUHsog9_c7?S}Ht6s4(+fbjobhYWsrdp4N0Z5G=r#?u-*PAyEnK za$7aOTbJMA^9|KXKuYC!3wya)d?j%?F9;?ZBzuXy&j*ygs~obp6~wc>IYp)A3QGQGnET#T=yr+pj<2JL6_IQZffp9UEdv@I*>=Ed zVr!NE?zz%158VH1r}hAvyy-NF!@Z}!d-Xyzi-Ld(+9%u1HWlek%~_4_eOjRTqN;Mf z2Wy#P9^f3m*3ZRt96K*6socsmj2R=96<)e`!K&!H@d+kQ9+7)6%w!-afMms(pgdNC?V2~f22EG%E zZ#>V#daw<2HoVyMn2m>PKuw4(eHv4>j3@}7sU)yfiYR@pi6 z6;SUI8M+x?h1*BNj7&CJfD8Y4U3z`xn;c7g?!Bcp?+T(S_y4Jc$$X}n&9)RQfSt(s zM6}#6M{Sy)B%q&CG}cY2@53V!TZ*-EY7d|l+hj0j2tl1nH;o#1xS0_ZO{!XM(Vv5o zsW7YJ$+%CyC-y*g-gNr!RAnwxiel!+^Dy%_(;b?EWxm3$ z0a7meO_pN_XZK8$icg>kGXRvS-&tGp>{NH_&p#SWQTfZ$G|%sKlpDL+TG_wlU%L?e z-r)v!!U#Hhp=TRf3`T?>-cY;@9ESWh%#sEB!?(vJ@u{Mvr^&flS=_g|_7~P3^u&kz z+h6kdoDUgocoUVYP&*z;41rz0w`jA#-w{CEFp)^^xhl^YJ~8$<4#IV_Q_0Yuw# zINZELoOQKmM^u-l+D)gPPLe039KlMPaRv9>f~#Sk6B#ViWO(|Q|$(K+W>Fi*kb 
zoYg#pqdGK|5aP#S`KD>kKJZ1U*)aHJRwJBN8#^TFzVqcie{cb2N1TJH)Bw?~(l5=m zx+%s_>&HNbKpK`kP7}6|L^z%Oi)TZ7&$Euifo^lmRefLUs5Lc_65QtJKAkG-2jO;k zh5Q8mP{fWW5iK;{H=ES!1dYyBZ8aC2_yEwhnN-qN8Vl6Qz4ekUt5NB7(wMppP@|Vc zVXOtBxoUg0IuW|ou2ByRjToZ)=Vuu0TCQXr4q#f-aRNj?S3Y0x_Ws@j`8-lL>nWpL zzdPBfezXagk?4Dza=UuaCMIu|sT$TFCZzFU(Q8ova#7BsO8PU%RCEU2Hz%tYyUuu` z28mvCBJM+O4Re}%A)J)W<${Z1DZ^t4m5Zt$7ZDacdz%~WQL=CR)b+o8z0UE;ZR7x6 z@2sBiH*d2I=w(kMd;ZwhZnAJ`#{9_1LV$Q`4yLT7R2Yagt>R(OU1%dP%rbiYK4U0V z?h2PBBz=Y2)7haJ^%&!Xk5Q?y7)BZ(l@&|m&yorK%0SZgRBE(-2){RWYH$xm!gEFb z3%QW%vHVX?GzXECPf|9wH7AoX z-X6A~2Fr@fnupTTeoOUn+4$LEqse$;KTVMT z$FT$v_}wq1;(gzB=WZ-%knBJp-p491?W>HNR4UsaB^PpAwe~f~p~`i~Oo362q= zc2|$SXv*f-+>+BBt*`DPSP`ExXQ)Tl#9Myj5OND{>y;N^oQ_nobU$`~JcQ3U`*1iH91A4VFBQgd}u)QHvsHZ-a;cI?ZgGhd; zSI)`NczSFdABzO39P>z%_uFh0Q5)Yu;efrq!Ki}+X!EyV806`%L`4h!i|8EslLt4r zm@(OqS@utP+I3bjTK@d@i@KV%9>3`8J`EZa*pM~1H@cUth*u6Dv=E`TAm*U%&yR4qyug&YO&8ucEfVqKH}|^; zpDRP{ETxvJv<1Yhu+f+O8APImG&jaTAec8v)}o(%ie`O?U(wKSW$ObS#4M0$BFGII zaOvUY%^!BF0Fl}T5IwBsvGJ>^)M>u^8g+X265UH7t_ZIsfSOXH@zvog*eO=&*bFC2 zWMZemGLnE%FWFkT!o`KEmv>6*%MQ2InZ~Qde9SO@o8N7+5>=l9DkO%SW{@FUEo?E_ z3>i8==5&8S9irvuHAG+%R@m>0wJz0P$BI8;FK&5V`_K?EVi^KM=Nv)HE6k>J82frn zvB8DPpCbxk_mmG8%S=-^D5{VkLc8}V-TFq)8{=>wNyms^h~N ztuc3+Zn%1X%KBFJxk)6le%QAvfTvf}l_!2_{U3D~$zQ)8F8& z{`uJM&&0iQ$tc-2XWpM^*QKV(tk%fSVpzVC)EDNch?>K`b+yQjkHrFKYt0K92z^8- zCZMI=XjRg%G||yxhx?rQ$4wbpy(F$Nbu9D261crP`-OTa<&P?@*|sG`XUK#B0}t8( zg2}QuGZuko#b!lh;fISiAVcSKi|^^B)Abv7T7yq#vaW^xZOvwjs1x5%saP;8!e2Co?O;sRjAtOzR1veNeH?Pv0u+$)=x3NedY8hi5#E*L$cUGw9$0E z@ve94E~lvQ&i?uY(0wdc2y;)uj(%>Do>(I8!=%4nlFqd@rA^8^&iiL3gm)znWCo14<)v39pZS{A#sEEl1mEZES%~_Or!;F3lzhz~ z5fBi@4|^s%%TqgW_{FcP%hod(;(J&p3T!?YyEaegR-_-=Ew1YARv>E19+V)pzfjFl zc>39Fw{mvXe;%k@Jz{f zXtc}PV@Rkfo=i0CcRQwjeJZ6aqAp`;38eNw+|ZbYy%|VeEXTd3-3$#-UGch8*{JE; z?+h{Q6hOOkxxyYA-()(eTCLV=>6}jL`@hjubm@@}j)l=_+QR5oQurTZ1&Y{DRd<`d zHgB5lUpT?vp)!fd+aP}h2z)&ABpK?n4)VsmI6oK}>sbDPciJ;YviFjOD-#&08QiAx#f&K3*pF$_A5%QVO( 
zvKeL@IaQHNhBkX^Tm!XMG`MwweErn=x1(2qQl#<*RN5*?s{Ggo-TrUEHKj|lB`TCa z@yIHDaqFehuT5E*IekA{O|EyuH3NQ%zQ)06R_1XwSue2khg)~)s#ks;sdnP>Tf?#Alx#Z-(oRLPgB-(Ke3{+L>Em#h8xrf7$_$N^Llt zJgGhAvzw~DIVi8S8SXdo+<{}3;m9`G>Sm4#&b zF#MLzZw0XdGP_Yh+HDz&$29sHx;ttqo)0NL`Dm$M)t)SLW+^hKJ*e)L2U|@WQLE=1zPGE?^ z<&dT->w^w@SzsD>*YR*&QppMi26;A^OQl?_7dbVZP|2_144rU*cB~EF2Bzh?PtrOY z)%J}PY?(``b1J^obja5#H<}!cADu6VVE3mo_7*CqhQR*fe~d_`@V!X2 zz^eY?F<806PlavnIuWiQ#tEHynjbnAQ&z-)q7ub|#M9$Z>vebXRPo0i^G{ix#!`Fa zZL#j*t(j&&ktyMV{oN1Uaho>S-`v{rc${|Wo`2k9>B)cw4ZBmD`vn4dAbVc89CDCP z4^PeJ@UCjztQh_hgK>Via;a;{h8e&01E1M!xMhy7k=f&*yzDoK92-Dy31OYR9is*oe;Hv#i%I8Ox!wC zU6eYj*YWh68Q^t9*nQjlPA~59i1_Nek*FWnTVSFg*r{+-IfuHYF7C&vlH~F-PRun= zgGq24{WqdTF6CpkQr^?IJA+)+n%zM@z28l$YRcRFf~7yh5zG#6)#cpWjC#1a<5aKH z1xncFzELgMY?X6zBm^y3cD-(Aw1SOYbS()yHl<2O)NiF#jpwA(X{`||o>P~K+wY7d z%a(T@0-Yo_w-AuGxKT?cm^aL@ME`wLe^90Slq75I;y2x)!2--~Sg7;ik+&X#F?$GK zFDlq81J1XNyEz8zI%C1@lcr_Jx|;As?&{>rQiZc9tHAG);T+oJWHLz%c?pfNwOKo~ zHTSCFjz@nDE$6y|xnabyuA9T@G)9fyR|U)kNA$_*joesM&d=D znn?DezrWYbBgilncgGCry%xpTACb*(ry`8EOs6d0R->uIAk*-`TKrcCk!#D8?E;rjygO7N81AY_(#rbvn&Vrg{ zvn9;O3nPWu^q5uvkxQ-=MSJ{&P}2y>+HoP7{FEn}x)yG0?Do^;dxNic?G#$&Dr>*9 zG|__*l5MkK>^-QpZ*xNx8NuIwJvrf-T-tB`{I9fbnkv0xAq8qb8b~I?X?;}0^=Ct6 z)9vwM!}*Fe{FTqFu*t0co{Crl|6;Rq%yfE>xr?F1wRoA*xn!)e=(Mrhb?gbNA*`P# z2bP}jDckvB#v=6gz$=w+FiEGwPt`~e4QP0S_FAi6qr3Q_%@;1kc^0SN<)q}R&5vbA zavA%;W?v>K@(OtOek--n`t0<@L_U38h-dXJu0(!4Zswv4{-aWc0eQKJ^ zp9g}L>JA&2JRK%{s728~A?HG$J8-H72Funik<7MVT#u96J?m>K)Oc7=oo&w9+Hbfm z7W$j4yJn=qm^4cG8yjmnP>Ako``g|!gA0&#x;-aM@m9hgi!Sm$qj-?4kFN$&PV=*S zP!8Fw(C{}pS9I~Vbk<-PAHiDnfh|gfbL}Vw%bgpz#Lw^afHxz3P!`(s6lVL@!Bzgp z1NqbwcX!R3`%R%Eb(lah{q?+pv+G8|nS_?Y2@Ls@j(fIBFVMp==lk_#cRY_OUCJY` z{jO2#7AdNyeQ$5?m?_gtS|0_}S+ajslw>8j^pBG`oz7oKeHi^TjqIuuXULvXtg^wOF=8c}>ZPEovvDL3>k- zipb$5q_NR11Bu^#%7ALwO4(}EgR9|gJ5gk4htJB%i>^A1zMah#9%qj#kroQUr`52z z35W8z%VSqsrOedo+hn_Xp_3;Cei6<|0v9MY5Q_*U%xyhUW$$COlY_n+IAUaz+&&K7 
zZ;ftPYW-MGhu*!D?xeFPOr1}t1RKK|^iKBGhw^E-**62f2xMOIew{yRpI$r}bPBo%K^}Fxvv!3my1d4*v@k#mzfW&iXO0%*JKJyW1?|o9T5*}0O zTDXwW*w#UOfpNS*i606HpcESWL5YQz9Et7l#IyFYr0;ZkAmiJ}Kvb%4HY*Tsc z#%+$)k%t#XXhmpYT!aw(<^}OyZ`ndFnp7|RuvEVx-vU6mzZUGDGTFP}Z>H$~4@&5v zhUhJny@Nyi^53|?6A~zi|6WX!ii(bUnU#!&{jZt*-$IK2VCJP!Icfiz{Rh7U$-$5K z>BIhAA&~oz-CYq!kWJxVS@jLjab~>L5T*Y06~b~L#2}Z+&n)|2g$Dp?klp>U5;y)| zUlEiJx<%HU0yBX*u2+{U?0# zKW~QG`j6^it=@R_AK38!^OscXA6a}=t0(KfpN0;IoPoAmVcq}t16BRQ->9tt{@cy| zAD{HU{)J=$-G(nMXQ@vAdY}lke^?-TJH66;a%_)Qh1fdFvASO=S(BOkq21Vb{~WrA=XzKiv)cb#cA1%!T=ho1}^#n$$C=Kb|#vbUPmZ-Xzhl7sI1$3A{u9oi!b^ z-Bu+2|I2*fWg~?~rTk7&E{-2dVFt!-f3R8nl`s=bPR`}$uvz;tYd%T#&&;)K*8I2S zTz5Tk#(o`0UT&7<6;CBfHIh^lU^12@fql}jsCU4uLxWdul0Zhe0is8oW!A%DN`E%O z47d^m3bjoE(cdEI>D4OdQ4CZ%V@*Nk@SEo5TUP?kBZNb-nyIMD01vqH+0GK76)ZgT z^{K#V731a35w0Q^IV2kpjtADGo{DO9Ldug(0EDenS3$ zdVj2Gx$sKWb#ph1>j>_wXg9YvZ%{O*=`MWwB(fSFEPG)j?1rCjw+c}tp*z2o-^565LLyUtY>QSZ zb>iEu*JfjPaY;~o%4q}8O04WbdAhp~<#(FPA4Nu{^S+Vf@xXphMzc*GcyKtR@8B?reNJ>CHoR3U6J z>w)Nr$Aa(vCuAQ{V61M(tpDeS#H+xjTuC=KNZRa{+kDQeu`2WK@u%47K0WcI?Ql;O zDHv`7o27==%};tH+au^qWl3BvV@}M>ti^U!s3ex8XytM;cjFyKV|Dkb>txG3Ww=KB&*%6AwGc91E^*{ZO+^5`A+Py1r8T{m%(l?3|eU_G=@Q$uEd+O z9BurZ;eBJ<8<>Nl-@Q_@pG~&aws=2D>O-m4>53M3c{0Wqw5+GN1l_Owo+kB8zdOlz z3ODM`xDwcq7?x`Wdm=_n!1w#3)lkwSTAfnVBvAC1N-KqF@Z+Sq`ynW)C;QD&xPcsm zeKcB(yyjyCF;bFNZi(DNqWe_xkrMWi6CX8|FId7|wO|2nCU46btei=yQ=ZT1g^($t4GJ~i;@{{FHhOq=2BjjX2k`=j|B_%2qrHvVc% zK!4S1zy5**X)8M*!L8kFDf6I;PhX!ykg&1mUOf65hsPCIYx`wcL=1@9Xj8sW&KS4s zgye70CG3srDdK$irXV;Is_&Nvq8F0gF4u%X#TK|`h-x%wUygQ@*nO@HMnU+*oypA} zyU*XP2&uQ&t<>*S#1asQcmhFGa@it>Q=b%1Qy@q62BNL|!5R0@A}=J(GSN=e8rAT{ z+Y>@|YIsaqs%@`_6Gq#Cp1^hq!g>7!vx&||eYuX*+YP=E1j83(T-<%z0ubkx2$)e5 zwlJ2{YD@&2a#)=$TwL=TQ`Uh5i#u_xy{nm?B#nX$$y>bQ9gtMg*D&BVTv9oljQ7V2 z3Ceu$?Cb8EzCXk9q^JKBNbJ8%v`|~#zPQfkTdH_+ej*d|$j0Ea=Pr==|DaVaWSOjw zxsK}eF0(F?zY`gXh74=cNt^u{J3JwE2_WCDC0lRv^~Ps4fqnFyQj8BLDR$Z){j~}5 zr|x_0kD3Mr)QLQ+(}J@0iJWrybh`PF=dUO9^vs<~kO4y9?{CdAYj06J!VTVe-|G0= 
zOjev`yG1K@Ut$Ox+{Bj8Yt7cRv<89Nz2T(HclV51E%(3TuC3;Q1&mx46{Ji<-=a|- z&VS1r!!(Zt)`;j7UQ<58%?S4nnKOJC>xgILmP~kDQt7|Gf)xk@EDSd zuKxBc$|t1yhZHwq1Ri2B+WS!_TC-fX*Jb1{*Sls|c6BYpknq?8vSre~9eE3@Kt(Hdw!4=4 zZ*DZ^G_~O0{JC{^b^y_(`< zbyk^8n({1O(8&8`_h-Xski*dr;aWWCPrvbbj!ONURp*izw?>B<^wHk}{BmjE1G5dz zvlWlnOJGI^f&=m-h&_KmhpKF5amru6Xf@qflfC&r<%Jj8;xM3otqdjNYTPB{%az0D zQVYS_WS#TdKf#jDX23(*Q5Bdzb+wwy{dnIa4l=NyH}ifr#eZMS*nt*@k4F027-c#p zn4itxu2PlG;qlt5Q)Ui_+9$}O+oWHk@Z>j|#7GXTI_0pSpe4N=PN3S4lqAP~BE)FP z(Etlh8W5~`m+B8kPUgg_ocN0VmT}|muytBv39?211nIqdDdtDQFoixtDIq{CB?{7k z0IPYdjoP!IA%S!*LmJzd>qwC2Y{}o?bN%(zrjTb@m&;C0^?Yw`x!=BIv4^f}?giaL zba%lh0yL{1Bk}yg8kv3ADcj_mg@-Cd`Fu5ipsk;k7D^ZT9e?>h3?X8wJZtvGE!&P` z$iN``%5u1NL$dq>xMM)w1l1wSx^k%yxYy}~l1or1j7FoAOlP}$K_zqNz~)ahJeav+ zdGZ4Yzo)N8-^^#Z-9PwrBN{Cuz6om=!74E1mBs&g#@7A^fz-&!R2sJNPiCQ<6C$SQaac2eqd-) z*!}7C&Wq6rmqjh4yEJg#l9ZSiG}!DzSZ=vz+zOSuBI$+oH4UbcGY!PO{P}s`&K?6L z+CVNRE)+$85QW0$kkc4dk|Xt1OZtJ``~H~Tv&9}5=C0>b90_QJ{}^Tzu7JIT8aN+0 zxA+WvpO0}L2w0nMPyurIj2=1emQ4`YUqPt);tM*WCwVqOV1fKACpY2Fg340@Xg(UH z+gTfI^LLr60yZh#ljo=%A-Sq3erP`Am}Ej*3oQOG;lQ;m;V-7WNWj^^A_9@O5r*#r zxIAum+K`6!&(cX^N`?cqUw@``H?i(gPiIdFrtBFMOm^*K#cmPQ=ydH+b!hkHo*PKa zV@T!vZud+)9C~t#Mwez0z=L2M4}Oqfg5B}{qEKrnIuOJiVv`yrOo|;Xn}tNk7h=Vh zvtH-hVbFWYBM2Hrc1{O+FkbyAjYGtnz2#cb#x!N2e*S$(iVBa<<9xu770FKjWt$Nj z#I<*Ts%B-(qux=tVe zuJ`^Xs1s`%kbhV&Rzt==)l=`Yrl(&^*5UQpmnj9MMu zI5iQDcZPj?v}*jR>H}PS2Ga(&H0q5cb9QNEXf_~yaVS=?&Y*NQau$#F)cCXqJSO9T zVHK_;+j*alibF`Zy?`2o09P>*36IZ@)o=FTwIa365ru=RG&@7GA&m*W{b6Iy{X=s6 zl%t9FxocsWtAxPGcy~VU2N;aO{#D#~WI|a_28GVZTi9IYVfO&X$rMie6}t0KjvL?3 zLdNtiZfL$AzZ-{YBlJ?uYFdA%%-$pdX z0Z?dy`~GqZZq+q<%N6zGWTAri>iZUOP5E6srnBp`MLh1>K(+zni4@x>Vs7=G0MNyy zM|gF%{+I;A7dy8Mu7|pL(z2P|7!M-|$%Rd-VW-r2%_Y##oI-8i+egeR7~<~OuTV#o zmKZ~Qp!R36TKn>NwKIi}x5)N{9W~DO$z=<|sLFoi@~F=EhVY2wK(c3T<1nb34M6uK z24F)Ft)fs2?@5FDLwp5jHSaCmNBMxTGxIa>glq->v^zxG$l?I*G_C)4Lxvy15%nlh zvjwI}OpUK_-6C=Tk|YB%6z80+UQ6lc@lKE-vyqHX&bXcT@c{Twe)A}>&gAnjs~H3$ zKCkXkZ8SR>&lIzR4QZc0%atek)1&pI`agq|xRLeA&roW)1NJ_p$2+ 
z4;~naRH)W!NQUu6fpugl`694odCgDiYk&@Fn|X1+KvTub8(%>mg|HlzuHdmt?p!q$2) z-4nu(u*|34k6>uA@!-~pz~9Iwc+%H6;P$FvENJC@eqd_B+TAnpbk~a&9f7gLanrJ+ zDbykvUrhVT<$qx+Y$^jXx$UZhuvzf6sJs50*L4NfIdOVl=rA9e%3$A0iw#7kKi}>* zz2(XUZ3=G?>z13q%{YWK?}6&Nlz}8{KeOxluys%yNxnR5!a=u|jaT4)db?BEYw{~h z_XHP@0ah9yLLS_pi6g#voz+EPW&G4J6CG*u5xiB?XbHK?RmGrNX@YII9z|ha%9mEO zgfSGu0Ae8Nwt2-UQ+vj~<$NJM-tlI%ZfrC~;UVSX;FW6QM)*+61QO&F5b{^pZe~siQ zS4~YO;4hSr9WhWM6peru=!kAHfJ{tNd6t&4Ui$dznvm5@1%o>g{s-0osom5ERnW-L3q6s-i}r~n>$`C`BRrb5e6@y6|GBV*S}dDh7c zpt?bB!8l0+2jB*bap*Jxq?2eS4_rCn;{b+pW-||A#cC|=DNS7}{3*)gE42p6?gYB% zT{ygvLNIM>9*3`&-J8Qzd&&mG^_JLIOy&PB-?q~gO(qK;Nb z{0qnUs`JnM>nu@3EvU_+Xvd?rcyHIU?fPCi%(lPaAW{0V8|fLlC4SW%AJzyk4*Kzm zb;~2lBCH(U&4zt7!t{k&0FM_r+Oj;RGK(|5iB`M54GLQeE-uCkjLPn)=)y4MkdJ)Ff-t8+?EN7mf@#-_SNz&yS7T18;Hq)bzsxr{v_lWo?#hEV z`}^Lv3(*`?D1w-*Q!Zn3&;4yE)*^jxtFWB?p&-^91|{ih?S3s(6IQhcJ2!~Q|GZa9!}grz->1hKkL zUW7%^+w0?^;noctF&;M{o?3Q(zLV;TP1 z_8*pBxuhFyh;|1s?A#o`E}`#21JfW%CDo7-R1h2=4E*&Gl{8}ijf8D5_w}NiwlV2& zAhOae&Rq89ERn@&tYm%Sod0Y(al2~RP1=%87!2sfUB9`Ooqm-K`H;Klo6GsQq>naU z#V^M~q~M@!G=W0jwK3oeIqlw$jZL+_JZq7YDZYQs`gOU(9OrzsA-qN7UHtjxz>uH` z7#a!LZw>pr_cvM25A`P?ldZ*O=6IZn7qOR0qb0l30@pr;LA$-taj}AI3qG);E(c{m zaQW3}t~;ZS`VY#~zO`Nt7Os4thjvxo-bljC!vujP6p#(r=t^H#oO-2iy)0fnx9s?y zb2@J9b*uAmxv__qrS1_4N7nj`Q3c-kk#v58iW-!8!kyTh!IJ0yA@&WVlbpZui@$Pm%E(p$4nOSBaah60hiV37 z|MKKq=4?r@&!RLE_=X8Ta0(^;#kuc@0v7c=#I}~}qq7&Ljr(?_7D4c4qGbS@s+kW| z4ORj2EXS`oDz;WL1f5g3h#$rO(e89yf32hU-0ivgA6bMR`6YduSb)Vw*^g_Z`XbyQ zY1o!ceAYFO5f0ard85^{(;AdtENI#{Qy&hK^oY$MH6;zC)7YGp0k9ji@;V22MzBE4 zunbtCQS1VsjY@u(t&wP{Hra#ged+~`N`FlgnL(a$iW~Nwe_dW@@gr@R=B5;*Ufz;t$%MbqYG5+WjQ|3Q) z{fi@_MM8sK-r&Yz85SC*Cmq}f!F_l zz3F`VkHY-9B*9WO72;dcllN+ee>b6IAV4_PV%gS3ytVx^hf*d)4salsp+Q~HD=-Su z_%p3iU#QIf-i`)4b8lDPR$$)<~F9I^HZwleAfG^jLS5i)WC zMB&NIzMJ?G4vI46Kc}vD{!Xbm9ZeZvAX3Y6Scd5qX$UzhvHqt386W(Tz4~{>|20 zOH2Id9H%mx#|`TpfQ^3^O{z@T%*II{LHmA2c|!1Q!zP55?O*3f&T9IrU1ClaD5Q5@ z(W9Eb5`!8}@oIHO{qRrRlIE%6=pSIg3D&IUfuvrzB&kj%M|t5S`51sN6R0P5ZgA)I 
z!ku36h))KDN07EsTt43p0#N}9X`C^ncOrxqs5J)Qy!OsTGkn9bD40Y(IM|6S%)Q{* zZhd*4J-8j$1}JrY1yt-g95@pI)LWPFl}(EooqW@+L4CrXmLYKt1BSY_4G6EXb{;(5 z9}nIIe_jI=NPPSYjfPyaGlW%UB1!fXB5~x=XO<%|&PQ&0jZTkOH$O{RW)PB|ZXLYo zH0vTTLZBu#EGW6nc0`4iFA(Z{F`wTp*bMNa#BN66F=|^RVhY_o5g_D`$~B7UknlJ~ zD4z7dpTQyES;!PwFGuMMYxYJdsV&I}qpj@?C(tU9dvRS!gP?WN;wj5`hr@?pq5p7Z zGR@!R`39X^tvDb4NusW_A>?tnI{M@cyt)YaR5Xh?DukjkzhB1XVD*<~s zRJUzW`dYIWi%Yk{Dy1vJm7s6zPb= z+-{uDiq}g4iHLlH@w3BBE?vMmx$>7`exM2#D_+Gw6yl7S0ho||456@sz&%Ay*or*z zV_A~&qI5%?P{&~7;zfu9np75s%c$b_e2WTya1)A2cizi zjv;WE@4Xnd1SJB&xlM{yW&_f0*=uc;PI)#4c1f7vy#L*HV!TT}I@MzE7ng$_yVN(q zog4z~dGh%BGr=v+LLx&LOP^o?{x^_A@}o;YG# z{QIR%ju$uXodv|->QK{JFDJKYu)zUK{<-4QYA@chu@Fff>W zBh9w>x|bCprX^K}Hnlrou%yqKZxW0*Wol#-v@4-V*oK_O!O z7KBobYnVM8sQ``g4)*$gT>$=yg$^Olf$3u4KshXey{`ZrK(L*{4!jXFk}+iVv=d*_HCGwZnn^e^Of^^QMZCHlma}JlBW^Pter0q zh6QP@^fx~kevSh9R_f-)H1+e%$rqx}FY0{bFgCuKNlwWQ$+w5|o@fDl^VRms|Ii{d z4+w8pPQ7fbiWwpED#GpM1do)>mcst>< z%2N0=s>Hncp@JY zoSkbL1?1f2N(-bbXbyk{AY*teRlM>bShPrdZeRYBL*x|fz3<1QDD4MnK*CpH3v)NC;l9DHx6Y^dZa3E{ zez?x|m2FYz>&L}E<>K&PW=Q1C!`E{skk^gE`Y&o8uTcVPj}PCYF_xjwLEmsK5rfv5FYrO)D9ysaCXXzu zwWUgUdAA|lZ?T~fPrcb3YvZY^Rnn$BJwY=6hrPEBimO={cZ0iSaF-y#3GNOdcnD5# zcL`2#2~L8$OM*KD_h7+-yGs}t+=sjPww!P8ea?64{&(yCepOUaMJ;AlPj|2B?)Q10 z*VX~iC>M>V($ED9*Y?og?QCR!%2ZZ)+nW0Y>+N%9HyMJPn6bI#K#@?a+Rq_+35|-S zyn4}B<7?VNe&pGo79v&jvDy3Y0+$NFy(Tnd8P?Gu>BOuJd^AEMDV-X06whcxqsCqb zVXXx;LQWiF2>ZK(obptjWx*G-ngq;`{~81}XM=4|euEe5ibPorE)OadwN{yK_g}`- zMNkqJA4;r6OeIkjAT)oqC~o-eASAxvD+1!X+fMI$?<4t0U=3CBQ3HJEBej)uxpx@BX(0lCJAH<&7 z4EBs#1V&t5-jbg`>5VC-TA3x)41=yb0vc`{-f5+n!|~OP+NCw*8qwV zd>b6B!&T*@fa%G;NRODgx41EDp9F$PZke$cxNd*0JRo9IV7_?Zb_>eE;sWBAsg6yz zNlr&)?(^>yDXb^+3h~f4hf98dXYH@WnetW`V0qx5SEYfM&(s%D@zWc}@2ea*W8SQ& za8UF$kXF|>z>=sV7;&TPM7Ydp2{)vU8rpqJEmLjw-o;dH-kr+4SL%(!x8;N9d}qf- zqY-HDcrB*;x(WlNkTx%25^HnS!{gt4wzJsg&|LYhIP>wW@V1GUf(P~l!(Y6y5VM{Ko5E%lcH4kYXuS3sqHo5WYO;>%qY`|)7e zM>EkT21COO6?jr?U4#VZ5++x$d0=iIf68ZHv*q20U8+GQgQXgRj 
zMe-F9=#8K&V|t}%zt$X2@Y^s^=SX$HRbjTC&xlrE-SW)tCxtQ@o(R#lvo5CJu|M>g z-BR3gFx&$Bf_=`V&BhXo=2}>`sO>`{&O;9ozSstKyqGIGhKo;FJJQnpRFD`CtwVal znCBt%Y4PQ8Cw#D(48(uwwRI=-l`q+cuofo9z3Jh*WjH7HrH|@F1!PE1$6M8HvH8 zZSpx>p=jWj;UaL2U}^FB93giJL{vHFg3S+C!%yuzCsTWbm#txduC%(f9u>C-m%PV-yw4n zrN#J@?|vBw)G{fA2X`K{y;x94tIu}C+^BB*Jx!nn0X#c45C3%fy0_6}Kk3`Gbc(F5 zczDnY>Zs#&M@N@opm^YV@9eE{%sF_g61?zWvH^ALz$ku&V8dk3b?7sNRhW6(@9O22 zMj?ezYG_Bz=<{lYigBC$_-ME^9RdBi1#c*|?&uuvdVM#kQ?f0NV?!ZG%xP&PcM4wn zTKc=a$9J)te32;djF?dZ1$uDQ_d7{3ZV*x8-natSTKCL)du4l`9~7F7QDgpPsv7+a zMp1BX$kD|sLr%Fe!k;<^nF02{eqj^B(36vN_rA0p#t{7c?ewzU5o8PR>v9q9fV@_v z%!AJ;F*!M{uSu_s&1exjsNv{U{suk)|3nuFZ?0v5&Om6M__jtV?3jI`MQ z^lJkXmQf)}(DkcB-e12S!_#%D9^6j<^J@X`LwQ7*)7{&omV8V1Uv2#Ndjr}p7DWwQ zM;M|B()>T>_Dnq}ybr4h5w!oeHp9ZAvl9Pj{BoiKgA*!$fxtn=nHlM zpx#Cs`$^-fUom|=eJuAlH_c!76e-TQ-A_H6WY+hn1K+1!G5u*!{%40(NBEZ2F*3fM$xPDw%T|0+e`-akqmB7rkKya5F$^geWNH5ER+K)qLSubqGDq`uE4lpFY3!NTADl8jtF>{_3g-v9hODXocu< zs{Q3m>?R1P0?~O1k>9QlQ0xySGOb5?k+R+l#ur3iL7F_PvgyBOs_5>uV*nhIIgAW!a_F;taWixq|DIJ7?`L4h)a3`LGM zb)}k*1)x=6rFeffR-W~l)$ieg;Nkpi4CdkS_z@ZlOp7nRci={UqhFFZjzOKv<6|u^ zi__V&{cdRO7FzohRhULDl|^zNM?I32K3-*RQ?#twmVWa)I=rU`1tLh zj~Z05@3mCwwWVQ_iI8$Byc1sjFL*QhlIOX^23X+k=0E+=a zf=|BpC#0GD-aK)Qz0$;hv%tHPUFn{fnp$Lq2|Iv7$nBnws(dl`O!NUap|Lfyj@#aY z5}T)6$|uETceHqf&7$}|B=YT!Eq{q|qV=n06s(fvMCacPcIP()22E%_1fjrr8^r|$ z-w($5oZKGE*(2eZvt{2P6wcsWqE}|QueO=KvmVt0`I~?~E`j4r{`^fq;JJ0-tZ|gw zhxa8d%xjjOgsC?I=59h^U!^8MC57%ttcMTqeH*IKju3_~?0=2sJcb}V@?J%gt+^5eQF{M;K z${57Fx#+32DidD{U%G}8SpHnPifYswgF@h@OCAIYKT?6VJ!sj?Mn6X;`8JI(jois6 zPoinpyj|a*eJOvL@vc!^5|7b~ssIZyz;S<9(bM7TyVnJ={P_mXj$DBQPVtC;S3Kq$F0PX1>MV$0Fh(sQCji#pR3cy|0~?ez}y` z@yein3rB^O7rl=IfY>t2-dC{p0G;?5_CyDn(|iejXv9Z1cY~Hev;Dpi;$T1_^u&th z0E z_3B*f>c+xnd#~Zq(IupL)#(56e2!zCsmq1hLDF0t+hse$^Hfgff|$Dqu&1X}dPU1V zp^8q^3$DLnL*7M`@Z@0FXiRip?TjPnYuo**^BnIISUPb212Uc5Ue`F}Q`Ik9dsnJ0 zADj!+-$C2Y{C>G7=k1Z#@fVosI*je>aZO++HL&0NS@IVk3`9X%x8Jp2wptCS4C+@? 
zmZn_7*_*3pdbm3Ft2;hh{&1VV`K#Fk$$GIpsIV>gr1GDT>F5wImu{!W(cH~TTJH^% zn#wQ}pVl8ydku;td%sTdx$c&UeziP4llLZF)y%xb*gK*BGMU^W67Nunw!7HwLZDqO zBjbGqE(Xl$6W1L1`y+!6dsARtGxd*aak=Z5%}Jt&*yFw;d?o3Pw)qiwjN^Hp$Ep<> z{DDaA!yjFJ+J~85M-~GQd~qDdjh4gDK_@!nm*m3sq>7nB;V~2^^8c)F4P?T(`F*yS7r9xuqbX6U$#Yl$qu;^_HPsMvh3+0dU$836H zQ(9_JxNK=t=GR;f(`ovHiNlydOO^U98n~i#iiB*(wSYr^!qIl(`zVXiG}RFeF!Ot* zu9hU9s3G#@?{A9;3<|nKP*o?oLrmX-Y=3LmYEX%^W_`R1H<$g{dSf~9NXB$IUH1w;ewwv4fORZL9aPX^cjxA(%FKHPo$E*Sf#^2+!ikfBPE>yaCydWE ze1SX`hf_f{1o<*lqYH$&h3SS|GqrLJJGtk&73)}|yp~wh%9cktaP5#`w)4ZKL*7sKI7*dG<<$K{gr+uOF#GT+=4_O*)DD%F35QT_i17WuX~}~~oSiEk z?L_2hoWZl%mhJ;ShKoWx$?LoLJzHMVHOb>j{AmC&7Tp%i6@DXYsUwKvp+9lD<1W$c z!|mB%yG%#C;$dM4$`tgACy-rOvy0>PaD}vwt6+q%)>_d+2tGw}rPbSl>$Si|SwZ#h zdy4%Hc))j~6DE5tQ^g^E{oY0~fj(lnq3&L@B?T4^Owen(5f=39Sa*p#c6eqEzwYT@ z;ZqGCKm?k-BqGiU%x-WbTS1=F316pP2p8~s9r7`uy0OCe3Fa#N=C!XAY;H$7j^Z)( zP#tmNv0d20qd5K@s|c_sGXdH;*j`gi9`l*7MfaDnhYF=+S zJO+K5l^`79)#0*{D14KYBm3nAjtC17D=gk3naN+a~c`!fg zp6?DD(#5eXo7tem)rvRUuo~*y9R;?Pn)HfQf0*ZPVjbKZY{j1N8RJ0c+9r)OM+EJ|5aPk_8Yut8O>VlBPt9V6`dy z@&bk9pK0LrpVMkNmK2A3Vy*yw59 zroY+)7qvTCC@+pSov&;l7mLI&_618MuIY)S(rfdY?xCoqm(6O;x{W6G7MZ{KYVkZ7 z#%#XT%toanhp2{r0d+{|!f&cPrxihIvfX+6TLK(3RDURIiqreZ`_yuU;&`CJSO|lJ zH_-#RCs1>&4A4z_-WLqgd5xFKVI2#(uLzBGNrnlHnChBweE!T@^^ENL)`m&>*F^B3 z+SW&pmT(kSpls`Zab?QiXcf)Bw+80u2~k0uBP+3(gD>l>R>iAi7R~$ha>UYm0nbzx zR+6t}JzL$b=;QrVt=j|Q2?e`lRTvR%cu%12FveAMI#0Ys!vhtx==~qa(xQ~`$6G9H zKOZD}WY2V-`@Upv00W~Kch>_t)qbTrsCCix@CCG0UHo1#Jyi6G=WI1eH$m)Dt1za; zT0mnbqDd(jA5i{;d6&Z!{B4(x#Vg7jb9ZeJrVX=VQ(Z-nW@&?CKpeEg^`==-$z?!K+ z9t?%L=V0{_$C|!V`LtZiXp?Ap2(2`)dD@&4?mvQ0IDD&Q+WB%7e!vM0#wfda34AzU z9&7#d+lg4P6D<&{56jF#JS@uKb*%f_xmxrak@4$$a3eh}79?vAI4>n(-R_&)k!p=& zzRqb~8&mM~Y^joR{Tor5PJjOvAOBEWT3ic1O?wP#KdzalKv;aD6q+sLOSyu|u)mQx z!HVvHnb8#)q|g>%FfbS6v+l5i^qk>`XaUL!oYO9g7K0GWHAY;zmz#Afo zv718cI~9oHetV!)L+HCJw=$P`ZA{-`?#J1509;nvUHj>~3Ju{spCp3N7Y-xb=+!cXO-7i(PGn|q zewII?63c|B9w_>GMz$wbnfgTG88YC*8A@@`bBuT`mr?}#nA}6yPr5@4woKq?jP|y^ 
zs%@C3AoT1(zQv(X2l1?g)W7bl0JNHHaSsOqyNPjo1XA&CX{0on zg9Sj+3%CLXf+5rjTtQeVB!VymGz<+b1GC=9_R%V{hW-s$<^xKG3JNXUL0l=ql7@R- zDI~_Drbb&)*nM?qXNpv$bg=C=Dc9|qQ=wR7BGl-K@0&2z*(bZjFkz<^ml~Nqw>UVy1F?aZFBG`YtarRkScFq41Cc&FO_B}Whs(wIq8~GPU@$7^b56%eYJHoO ziurCBB_V>F68t*i&~TWvw)kv?E*mU}kMG}bOswus!{BBly6sC5g8R&M^)5CEWDA0~ zfysR>VnB?9;L1Dz+cTiUKRuNKewe~{Z51pOs4C;1{kJ)x+;we?d~6D@;F!)YC;rMUa*1)-{VDg}1Zq{h&uM{Lk>Q@iJ;%(!;%xcIQRs2ahKgxo~fL)z= z*&FU)>Rq;t;~B(42)KhTob?+_IZ9<@GfGlL4fKbPZ{B@H?g+A#MEf=)jY@#8t68d9 zS-8;`@z!&tD3qls$Iw?J5XPQS(>EG=b)>*icl(1TcKmyEy5-8DHjIR~ARJaN43Gzs z@2uEo#*>gSe8uSy;wt|d%X%sIEuF?bPXc(m%=UbS#AI`K2i5fhcIB_(ODWLq(6mj0 zzD^<(#i^ZGUt*15n1cNCE+E(s!{zelCX7jts_NFg5Z}&sPe^3U?Hvsr*52MIT%Lz{@^$ zF3*kKonm#}UabgwW8dB;joPOXbzW9b+u9F6OkyadsKhEr`L`BX-d6~zd(@NSvFT?F zB{04;N6YAs$@h6nzrykUd!5G55D$6X|3EY2rP$Z8KY2W7-X4t;X~Qefn|~Ed@#Cc( z=p}aS`w~MKMs-*^`MyXlyAZzLi&}8lCUP3UOU{m{SVn{q&x<^8#g8o>G>p2NHo^ zEL5h<5(T1sF}}~N_K#hH4E9Cf*DIXcF14o0V3J6%%rC)WEN zp|VAaU4(x40X$}tu=N>8W^4{a;e>dU>u@Erj5>0GvF$zpAZN`((-X)kP15g)6B>A8 z!!GIHEdRv9;0ze$^7z3WM}!BKlklnxC5)w+ev*C<@BLWfGN-K2`#@^Rk!}gc*5m>8 zVJbVuXz1<961J60+Tz%^SdkF6?M42!T7vTNy6ZU;%c{pMDK8GPppo$mb1F*5P^Efa zL9D1u>#~@@m@Qj}kF3q!#A_HL0W8UD6(A$Sn0y;`r|Sc;{teJ{`Nmn|&HI9?k+ z1G0^p*`=iCJ4;F4Wc+Apwcmh812G84f(c<)Or7PQBf5LMP;e|=+f70iC-q0zE6!%a z#>D3MaKPn$KpUAZ+lDCC2^EG%Hs*gEP~g~WziXp^aYd_STJL>r^2MS!*w%&n8@wQw z*_*`Eh^$v%jDl%w7q@?8jh-rdD8sv0^GmJ3Tf6u@;B_r6Oxdb*kPCO&g>#Gg#82$< zMLoNknY2~4V_r5yJd+&V-3HR?%ZEy9QlL5Qk-mXy@y;V$0f#3Zn z*vb2`>7M3bDYp$pq6%fE+=7jm#{)Z&<^94RSQ`ecyNM{pC%>wzz9K(f6J=_OtYL-D zCBrp=sL`Ewr(1)TcRSH_f%HH;(4KY%7X-DzSbXhDELidAlIf}7kcAdlo$}(JTxZ}l zz;$Z;cU-4+O<*9X+KQG@UqB*9yIR8Y4aLhtg)c9Rscy169;sK|oE%;_#MY@|Q3eGG zK}1_16HN%;0mLORkwK*RmF*diU0PkjMPag(*e8h61?ihb#guW;$LA$_xuIAIF;4D( zu$^55>ej!WXBdK5@y={pBY&9EXs!y#_xhMJ;C`A#OF-NSo{+<)T-ev&adEi!4#IMF z*@-tI@ZCyx2**&E6CKTF%SN6PeXu1U<97~zGY6k|DRjTwQqJk;bd1=zxZPq%qY6LK zz=;OYw-|%I)_!Gn>Ls1=DXNFx30bp4q>ZKH3I9AYvY1$rO=i$nIAoQWLxJIEAna(c z9DM$AxXitHO8D~`IkSh^dVFSbxt2KD!io?j0*`8S6?(JS!LP5hb{Tskg5{>wCJ&() 
z8~uyv+Ywd39L5oGVq{}9F->l^$^dVt4F+D>4x%Ok96})j$=_i@J z10GwcSuJBMlY-Eyx5%&h<)iO1o6@Ga2#v@DJaTZ2_G`=<%QlRr0~+JPjnv_dFN{!Z zHa6G$FVdMTJe_yx?tS+A2wp`@1yo|`l*;lU2RDYP#8$jF8gkjNZ?=1eof-d}K)`Ky^4#{0(>sYcW-wO~oX(9J{ zRpheRru}@k<;+4!Z4r~~MyesE#yYI4;klo(w`Wdo0B$Jz#PxAU2?^=X?ns&(fCc@u zZoagMoww?H_r6XUQigkl(~0Vwd?Y+y_24w|JSw8Uo{W|~1VpDu#e7b7RDzK=NOtIhHy_}mwkqBdmGys0Nd`{pL? zXh^ZS|AK?wi245w2aVFet@CKV4#X1PymF6k0L4c4@NGB6xpzV z7e?;;sp@})LaY7>h2|GdslVrvxV$)e`UWI9{3uY$A}-Z1fmb` zXQVjqF4h{0nOMpti%`>&jU|!gbH#z(Sb4aub!0-CZ=~wAr0S-0%FsXyB|SJHV1pif z>rII=d>$xHyvQq7Z!=Q>EU|HvR(ho1iW!Y{WoTyzKe2YNVR+eWA$y&ErN7}CaqH#D zDS_Rsc3<@!1c7jV@h;-!X^RvgOT0wVpgvh{@1nO4)b4E5YjY#Bp00S$Flaf)F8fRe zR_oAexcDJc$n$MKQWTDsPXwwsdjQy#WwFjEI!XAI_@1C+oI*~E+Yy7Du|yXVzsgaKYnfp6 z-{^lt8vm-MdMBc|!fx@5SkQ?2WF6VLFRv;Wk4aM%pC>bbNvU$V(J$LWTyWG10MU^9 zlV-pIKbl}!?y}nKt$R|0^1Lp1MKsqCc>jToDtTpg>oB|8OgoRzVa0~bB`v4bvvFYN z#ahkP5q<0@_M~jILfWjRFwa%G_@-{=O#!KOxI=r_X*x&^Up;Pnmvm>ILa}f%NUs4o zZG9j;7IJj#6|8OJ)af)&LGRjn({+!)>`V5*LGjw6m8`P=#W>;$2E6zfkiuh0jRuR} z{B<|QvW4zZ(8<;MPFeUOfqAbMMyF%gH^JegdAv6S(*Y$7A z=$|=Ja~JW&hGXodyl>iwVB?J2Xb1Q@t22E5{Zuf{5T}wSAUCdBTE%z-3>ggutNM{dlwGHbRv1;#nYj3(l1pAsg|pQu!7)*&olw2Lg})4c^@6q>ptH z2e`yhDNO59b{Vi3wB)?^I{TBfAr`-9YGdornBmGTO%Jf&0-=;vlTdBpDaoG;!0a->tNQ=x*5UD1&GXg5z(adELJTyof z`gx^7ednP7i5oKE5JQdr?DvdA99Zn)p#3N80?Ue7HC`XDpT~_$t#tEtgZeQ3hbW?8 zx>h`d1mVSDE~7(28|l3y040U_Pn6UM$k~EcgS7Y07zQNGQa#c zlvJqcs_2b6O&#Vqo5Ge*0-w0Ce3pE0GoWo3+dO9RrUw-U%|H|%5e?ZWAHy=Jh#Nw zY2U{AAYsnILZ4V%lup- zzUt#H>qhkLv(!_mrO5c-eq^b~!BxK4Y$Ya31_ zbEG#cmMw$)EXd25KlI%|mj(?ugJI#=6mO`N)+Va?oWTnQ#`#z}vpzi0D82)gNnVNuTFr%+YI3qW^N@G^WqOFwq_KGdb?U5XG|wEmmiQ;f+H@+I zFLukvN|f)#>a{?ghL66^$DsyHZ%g9NJK#JqZSlFr={NL(GWe-SIK+{WZ$HdhoW>(T z!#sa4BvPagqb%+JLL!qy=Mvgm`i81!rHvstqH~@5=lyd zVr~1MSgEMFL=P9E-s5g2Q8LVyG_E9-*W+F1fUq?fa|@A8F6PX6g6hAT|ICtXaX*Sv zM-|~8JeESY>I#))mz=(YHKg4`48y7mfnQ|rDoADFx@>Y2Ez20mU6ScMm-D`(z=(cAND|_Rf@m2p_HYLhNl_BSl)M2I(O5!_$fXS$_BTEZH+);c6 zUacGvjue_KDj)h}A{!Ig-X^3L07mSZ44`OW{#P8;<3LoJQVKo?#pePrS!4>hcBcHd 
zFzQbq&WF*>l;x}ot1QP2u~;#9kpcv?0$3bK_ZX?lE(s=xH_F|uW#p0_5WcJ+MVf?S zN$Gc83tK)HH#9j6SUl@)%S1Zk6`?^t?Y$!;N7vErsOHZSOAyqwx9N_&>bh(d^Qm%J zcs<1uJ6ar9BNtBoVXM=Z$R=pr{{zM&(d!(+;{vc6N{(BO&*Ph333%?Zi>)YXOS#^AC6DP8NCs8|X z%R@T;iK6~yYZKf@yoa2Hm3KI=zt2_w1b2{k8?$F(Sd5KS4sT5lL_^LIa>O1IJ5Iw6 z5N)eclFq9KT5s!ArY0ck(PF^&1aZoVlSn*uh-a%!1p@)5r{&7G`m(oyFoECsUR0MY_JK+uh3*aKuJcR)^HUtarrd70lJUnp{@-sqc1(Z@Y!!JnmkT|t>+0ER7wf1xrPme(7OLj zp~hup0!c2bS56yO5D~d+!vG{|u;Kha`!&iTOHlM7VU#B$G}*xG2}AX!`*n4qI8Kq3 zT&aiC;yHZd_|no<{2824yjmMkI{}@Y50-GALOkuo(31GXyRL z4)lI@vqu!5tHwJ!zQmVWx=#5^L>q(M6+xyGGl~LhJ^DWgomeGuAV5bIw^OPtgSHWtaTLeeF&SBmA_-&g++01lRB zt=B{{g)hV|&4~W3`icm-ePDE_Lxso*1`O#Xige-!Tjw`&PyZGm7!HtSL=StobSr~H zzgqGB0x!i4NRgs6QlP7_`fI*0LTpJKpx>N|SuYg-nvx@YqU1PGE9w3hB#`Wh;d4KD zyDIe`P5tv5d{TY7E+1mO@z<2x|DSFT{n#W{7(buV=6`9ko)~3UBfYZMMt9_%;Gd!8 zKT(E%|I|)!0zOzx^)J}aH#u#}v=Y!kFYk_@BxTiz-^~mD65b4^f&D(3Y6Av>)azRA zI_Cf1gdYK3HPm6*YVfZIyVld-tVV4#FZ$~yvjMy+`s(+oFMmBN{x5a=pC6;x->KV{ zmYOYk<*dBEV?V=0aF5yW{&hHL#zn6KP`)!hKzd@G9=J^oAB4!UMpL+d$21VF@{pRa z{eHC$@QFaYyR z*Qn4}RD0yjgaXRRMHY!4C!&w1@(XpI@;^?vBrK8n)}!3m9lBs|Z@NRqpSuy=-f>!3 z0_B6nCG*M11$_7{Q<@H%eK-3TZ!wL(L-XL=N2JI5JDC{xX&g6K{ZLwYZe4jlK<_ z7A3C&%l^KvX^Y?ONIpXQ2#oHOyswV)0l%yEHU-xQLhwS{fLHsX!}w=O!Za#1d`7kQ zyx$x8EpE{|jW)_eoE=h?xD2l=HN03>URei-0q0wU?P?E=Z?%+wJ&@&RpI++B3zfc}|O3BQrlZ5*5BqyiHmKbcp4^26aj939=lScOH+_ zu2lu7Q52A-q)cZ3l*uB?CqbYG?Omm+39JjUPTkgn?j)~gsG6wVie{xdkp4Y6w#w=Z z?Po0JYPmX4eEw*=h`6TeBVe(F!@Az}~)ymCTa6*fevsnw=fB#`8g)totb^Ty? 
zOpqY*Uf`Q<2i3#ra9`YH_;3KP-Mj=GA{NSR1OQ_50sF~M-(8PhtWz$5m&ygff)Lkn zkAtts!Gf?I>}T0miA(46F?RYL!2bA0sS$0{^NjzOtu|q6*(fbU1t{j~@dm5{EWhyI7c3a#fexfJdBI>;<#7BJAH^J(>g(d> z-Guo-)n=@hnJ(e&a^B>`<$C%;!1tQ6-Z!7N-flI3(gspEtXZLwZ$7#oZmK-qKIT3} zugg32Rc%!jINL^X!BY{{DEg|`w!`u&0uWJd^?85%{*kHh^$C~R&jh&*FklH5aNC#J z9Ed97r0<}*&gS<{Lc}CugpN~(R=EuPRDL-RQjUA?SED+$^WkLHz-qAw2K#k;qJ*Nhg{%JpC}5#ZK~igtKceVJ~=03Meb z4aT4wIV0HnyC;e1;@9K$T8aCiOv1@Vn{lnI29iJVqDFmaf8#~JEGF9j;V?x_(t015 zgjrSZKo!^@tNqZGWOFTfhOrRz1LPVUE}+4e{qa(Jq0I#eZzOwF<8-Z23-7;mm;U@X zpje)__O;h;m8wT}{DnS>h#!=`%MaQuecC-<)@`Wqq%O7lySmh2j-m7tAp$qvQCK@4 z&aM~xfZR#NPtn7%(MCPhjbpdX^9w=uIT`!74Z_cY%^g%b_O0j!R0)zE>lc$DGs$hs zo8J8qMlCU!eH2bX_5AUo3kE=tqJS*_eHzTl^>6MY8pnXf) zSZ3rnYnf^}Bduau!WRxn-OOY0y%sSbAVnEv-u$X?KR1j#?^ni5soC)4$tHd9yy7XO zN8d^`n-K_`vlKolcsR#|Xsq^t&FQy4j#>M|#qJ5&9XF~+w6=o@8pZ;SpG6b#>K_>e zK5HR@@@~Gb+yi(}8rvRt@lpgIvTMN}Ht3B7dSoLXZwalUfwep0yvvAv#BEEeZv*T~yn69nsJx+X_Kz-xFr}ZCy9l z(*!wkfDn)}Et_ZD>p$Kdi*`*wO4bA~!J=z;Gbk3nL_Ks9(2g&*SK9PXGRZp~B6n~G zWCe*w1bq*Qr9Hxk>kX36Nw}?Kc7LvBAW_;u)Yk&WLp$Bs^jdqF4T-91dD$D#Zm3=%1HYl`l3R zU7)2=c^l&)boT&+r?z+_L@ahX6|k7<+C%L}?i?0c6GPES#k93)NhDG+Gh5!?@z|9p zkCE3oPg~C}Ix2V#YnJ64y%$uQe^uCv3EvQNe`V0ih%*%6Slscb9|4a@4t+?ew$|zp zsdtzKT*o&V_^dh-M3cqEz6mjA>#i`x-0+m)dDGCu|k8NGQQf+ZECy8It-6?03FfFKTr?-ZfbiSkK=Z;HL(~#PoUK#yFf!ev`~d zn{xf?9=R0P9mih``++CLN8z<&-`0Xk%;CJL&z`7dRUMv!B@*Q0OtStLw@}AQXOa+> zjCC5>@b{z&uc|Xk#UH#9*w@Q@9u{1G7lIqCEeX7o|Cnydym^p=H((0jBrb~kj4bvdT%zLQo? 
zq%AIdqgUvhoFS;VIgK9AJp+N_u}(3V-E)h1o}6;uiXc>$}-Br_>t|HGR1*{-fnsW!Hhm##--C8WIB zWu1c(SU+kw0uRu%1g3T2D|ThbcjPpjK@1hA9KOaY{ku~|4` zWsEeg#AYlHOa*9hb)1LW7LUxFXiwe_9M%&3w)br+E;AqLRw6)HuOPD&T-q%{Dq?@W zqVvCcW!jF=*y>rbt|$^vVYv&$n~T2J+o-bb%qv0i{HBTPQ(_;LZR0vts;ZIg0Vi(* z5)=z|Lk!HWBGUhGuT0=EtI`Xb0rJ}^f`*10Wi+p)sTco(B zi_gCPAl4(>KDY`MewXTLOyAABQs9Zk__l;{G>s$k)hTHqCQ|{Sw^!Hcg(8U^h08Bs zz6@Rk9G8nQy(KhDGW_Ey#e3XxL9~j0s|!(a>-K?1Anf$*dci=0B9ByN-V-OR2YwbkQ^<=#?{{^O`&M+WoH~k!e~;a$Q#pGD6uetJ#roi~J)|B!z*AL5*$GQ%2hi*;CmYhu2B%#}Nsqt~7vBAu zjrE7Ll+zJ{tqhR1W!M0)eO3n^m&^KWa7)0c>&s>~wr^ABiO7EvX12H>kQ=9guV z==4SikbTqdh3RH8xmn(;!LDB)&lM1xt6jV6;k-4-6w6$N^T=&8OXQ%O4TTeuev?p> z<8lcaqtCbLBg zDr9s4d7+t=*-!0r1s0Hjn0<1*I8evB=76M3h__;>3|Uh&X7ukE-qau_Ly*e4%n&4X zC4>VqCi~|iukJQa&6s>=kJ?OIL*cG91eK9QrZD{aburp*way1hXdqLR^ch{TG(u6 z2F?~m73XRqFnQnkU03*5emojT>)S9t7#TTJXj>dOr|df~d*2~GX@ntx(hp111)IXZ zJ&Z>7d=`R~Qd>=?$Rs$Du5yHt&f}SJ@a*%_PsMWd?zNqzW|u&iooF|}{Zuq7G;9k9Yg21eZ1V)&7K%~@MF<1?QV zoKU43M!)mycZa)yUC3Q$>kx1h`{H(IZ+|22?Rg<`M6<@S(H@CVEEyV&UBAJrBUKi< zVka%^B_zT5(jZH-EKgjKhzG~#rA5qPk{V2YuImQLJh_dh)QZ@dYrds8zYW;UKl#Km=); zt|p_4@jBNgBiG{*EKx#-f!}`^OZ5{SwJ^AzjHPMgweG#&IG3uwhfRZFKGG~XSFM}+ z=@ z;n6NKm9y?${8GIk1U#mU$xVLuW7KqGu#j8Ybsk}j1l*4-5CAnpb8e&4#=|I>PjsgB zYwa^L?s*+zTG|DF!c?%~mOlAQg`Oez#u_mZG7!en!Wcs+MhFdmA34oX2labN@2ppO zBjDOHAB?sLv9s3O&irB|8zIMisrO@pj3w6cyOSUf($J9&t3x%lv}t-4%2WXNzL-6b zGNKbY_sz1zazdic@PvBa`IguL2VvDK>E;;(sq~s_6HwTvxJu`(llfzttrTBP4S8Jv&I8w%1jpEROxPJfL0_BM%j zuUG&6p|l4lCa1U1X(kOf@ut`Qvseq3HbnUtf=Pd_{59KeB`b5)d(OM;@0@fU!JKk1 zqLA%5o#HJa?V2d|Dc4!T_tr(U`)ybr&u170yvO*ce*29%2ZRhz+H*!#|9lQ!*ez>iMC*I&BGfWnGP-yg9!?jom$`uUQbz;&7?~HPA^XdTb{VHNDlJP+L&I z{`FNozR}8}_8znX;&9Z({LBkXF$2E`7OF!l<8i;h#pi@rh1UVk4!7q}x})9-d9v$f z60m4fM{WF#rvwvq2Qd58oW0_?@DIOwXD-$yMI4H=Unn1+Hl*$oeR;g9?5Q8Fb=bWH zyJ$Q9bHvE}vK9E2`eYA()bA@oC~A%$9C)HN5rA{uD;De^fOXa|XD#TrF5%@pqNT4z zD5Yo@X?5w;6D4h)p|I6Macln>Ey3a4l46#UiAzphHKn`53(-T8e!!2%ZC)((`F^KE zC}!sec~_(TLS5?H8b?f5to1^W_@_QWtP-sW>I7k3o}n`pd+l*4A+-uaK7+?yfpn77 
z9d9bi5At)9I;|gwP-5WfCeJi9Re%J+$hkkHr#lVe_=4zR%Z=iO*T%pkcjkuzp- zj_&rHhj#_W;ne|$MfZd8^!ea)-bqy7RYl##(^MF+=}~tZ6|<#$gY**k2|Qm>Ex1}KrIwbG5|5Z< z#>Lxz4f+a8O_}G9j2cmL?~n28E}Kn0biSo=#4OqD_$YaIzU}{E@2#TZ3b!@w;O@bK zTOeqV;7)M&!X1LUI|K;9Ew}{_?oQ$E?(Xg`y|VW?yU*$I?;ic%^+j`448|yGeYI-U z`sRG*`zW&{5A!5%13%Es_K@0BPQZ-qdRV2^!KD2t9Jil;P-!ZAKkUuHeb|Q&*ebXf zcILM|qdL9my*hjON+Mj=P;I``zFPcY7i)(*3oV0*x5Uq~T+l7`-x8xkQyf#9%coSs z&-YN{KFTNX*nqcU>99^{$h;s?7}J zk>-P|Rh}P0;7A(Q2p2DUJa9)Z1MsS<_FYSo@MOYJb2z@;bCsPR-pI+K(aR8oqT6X=?`6 zvmThLbi9Xn?OUhDfXJD#D+|L=3%-IL&EWF%p$;fz-GgRl^V5G$X7{6+gWnI&78-UO zt(Slv2oCqS9v9lGFLPeHt@64M9*F-Gr=nC`S^6j5p3aB#xY^q&Sca{A;CRogjLmkj zyYi1;wM|JZe&{+rsX6gaOh}q86FL=^Uted6Rh=Co>N;o1d@T#mr5uVSzEU<@i_bD! z_>?~O*uyQEJ83z5$NIK1=a>pjPKC~J<3;06?|f)EVgwggNp6Bk^XfdOSMa*gW-oWZ z_Qi~VK;ZQ{PE&4oa?}7~i?t!)Q8B%w?JP?Mc?M@AHDptNsm@xdQAsrJ<9kFcr?)cX zy#$0x8y9~bv$s(5sjVgUF8PI@QR*LhLR9MvNaf8AQ($0PYMwyz(7%M$n1l90VA6@A zLzbKLC&-OXuCAw44M!*uZtp9QL%>xNOp%`zKR2-2JA{9!#~nrEyZ5`8if~2RL5F-! z^+ZXn_cDE^ve5;PRvK%94RS7;L)aX6(?n)%m-XpqN+F-COPA8O0V4{3b*s@HquB;* zdau=_6S+<&3LUFRSyh;yK?nT3aCylq)eKdPdx$hHOekvja+v zkzWj{k48HB_nR7|(hM`n#D#RvB5JjaVyO_)JLEw~$y3t57+Ar{tUY)M_j+5a;*EG* z9!}#z+SdddZ#6ln^nVj^z0Jq1bGclW-TZ~3RQWgOOX}BLyo3x51fcifmxWgPiohpH z{C^V7u4wg|&N@!wh1RRSxefo#O5cEbGJLr?Sm-h)qC4Yv9L2cK^TKg@)Ke)Kvn|Y{ zR3j(`iKbf}O(=2KEBUSDCH1O2y3!GbEm4KHRWdR&j2eMXADaEy``SQz^;es9&rgdrIwaME35 zeoeJSAncWA(x3?F8k3YTNBMg%@flV&5NIlvlnGe4^m`mjtbP=YB{)sHd3m)`}yCGwCjvERs>HFX5u$j)87A zpgT5J(-9ZK=e`2#;29#=E@tfaC)_yf=F>He&xMF+&`jn=@9Hh4~qyiZ~^1^eYptLqfHEYq- zXcfr(c|-eV0Psv+kc;+TbUn4g36;#*vr!KsTrGagVjWv3vaAPbpsAo5iT>D=Yq$%6 zA&^d>DMK=MQMas^+2uL?z4U;DJ<%g8m#Uh^bHuNWdVMl8CF9v!?UC#pAR1CtyhQjc z%++Y^IXt~fpInVc*RODQ<7-9JT?vnjbKB+0E{VO_h{T(3Zo@H^CejL(yR*g-G)Y{{ zo5M@4Px_8dXsL%kh0W4NEt-Qm?i@VYbbjzrb$*L$T`IqRRshtE8u$c%2wtYw3^?(! 
zn1@2z0bCOJ8bsb_7)+6|vrT?PrpUoaXTImzMS&P=0{Re%f4*2GcU6dm5Po|b$ALz| zbMvPQG}Ek|?D@)(97N-JN21@BFiNzPI-~$6$c6BL$%<&t)L<>B18URe!s;8Uc8_5S z+==Upm`EORfu+leDjGKkNpGQH2$=?Pr1E}kzmn-`y}Kcr@p;|F0a<*2 zLM+lDEvNHLeq}`{`%gKUw;u9!^d;{6AZ0CHFv_fd#nJ=ACH&r`Zh)`H?FVd| z^gW=AXE)2(4%w!ko#VpZG{!CNg1X-uZO~=Jb>xzXRD)75PMgJ5vO&*FhI5MmXs`H! z@2t~b+b$2GukhS(ZTe<7f3So(7zZ*a|FDEgtIe_6z-wb}5g~v~K;ks)+vL{`Xw8fW zyZiB^Z-5NFjeFP4{G9Ez(BrD$vLaW8u_m1y$m;3^W|y5t3K?VQ-^?V*9f*F0Z%5!B zEM$PX?-4dSeb={AIAldQZ96p60Y>>2c46Icq){YI|1vDFI)RumVP8}PXHL$BJg8X1u z@O(~Kx7-z}n*vJ(I{Ksqm%<$krw9v>XblO%0!ZC9!hIlcraGKySfjy}CngX$xoD`j zuhA$@O~vItIlXeXYiay^zj(g~(PEGwRgmgqz|vCiJpq)rw8u(nLu zEwGFh_Iv(WsI9Y;1|HPjnSR#>1)IdevT+kC_tSqc+mWRm7H72*slqL0tg}^s{&CBu zSy7i?sbE+nLv1Djwn5m6>< zP%tuG-M^__4u9}OJ`rAC?_T4dPTAMdsg|tE6h}I;K%lidVC7782nX|=y&vLIAS`VV z4MI2Szz;3Y7c>`*O;qN_bU)mdG;0#ex>kB*j%Sq#t5#z_h4hF5kp&TW zwc878{h!+-=9AqQ5hje=0++Me;AbT^}<4q2EK7$l`{I@=SR7eAQ^oQ3+cmf^zaZ(hGN3?Hs zT+1({IkgHy3?MEra!?}l%4LeJG9f)i%sapzmFvCaJ1(-FA2BwcZt2#&tYyA^p@E9t z2xz5neU@lLP^BTzm^+DENmrx1>$wZ`AsLh;#pj*oX|FwP`?T$F$MMr^u?oeVXieVh zN$+&8C26Uj2gj^4fclhoEBRt?4Dr|RB4@GRiLTzvS0{1=tZj0c{8Y))RLT{8u2E+c z)xGv$9@l5PF<9bwrj_mnB5Z!FoQ*HpmmTD2BU=1s!}Hnj+O4Lg$(`CN>Ic_ooFp*%zA2v&Y{pCijAR`j3;q>ZJXM)f_7&|hb1y&`)Uq1s7I=6 zVNGfAl7saA`8Pi`LGlp!fI1Bo5hhaif`IMo73r!VLF|^;rmd|Tc^|M5!0*- zhg}%SxXvmgGWU0HcZk zW9-xKwC{%mK^|}ovpVvRL)&UGKg_Px+ZOiZ{($>=-dnvwjD-ulMZe!Et}(a8ehxyU z5zijPEJL}oTE}3NE95=bzN8Qw`s!VPV37PR4Ax7JY6Uo_nVj@*Vxj zI&Ohjtpz)`?|-$HW5&GOsr>pq9lkz*slu|Xt+(W!VBZBqYP;qSr6lByyakpp!Pif9y z;=jGK|J8O+`T@Z%rMj&ai(2mMRrZRbgUl$kTPYDX-GJgTZ2C_zhT~SZ|%rWY? 
ze3`~vlU^{gUQNhNlAY2}RG$^=A`IsJ+S@Dj+VK+g-{?}9eDu4jsRG}`a`1!LD}Z=3 zP>&HkT#X6r^kw=Wdb|QuX&m6&7%B(9eqckDkM_fbl+4b!iyc$2XWYgII8!E#3cG9> zH}Bu;p3=t^57lGFJ$-aoPjxMsVeeMo%2t|n#4%m2nngm4dxtd;QfZVa{Eg+-O9r)~ zYZc2q5emUkulGj%h0zIe8NB#xsKzZMKAhxluTQq)oSeS<=Nnmjf$8mzUY_Ur{k-I( z6FK4{gFzjW^?j|&eZ4!jmsIFq2D^d7)@3mr_^?+lmfkdQcv7O-stITnF>A+aoS)ba zmVEu<-T=3GKd{|Qd&zC{M`exAzT=D68H_LA=W6zSz6jx+%P(EdJTVwhFRb9zI4BCE z{p>Lc!u)x^lTp5%BN`^WyI;TN6Xyd3I{=5o@36VyO*k#aTp8etFIVye+bpj%Bwyim z>GljyDDWCK{bx>)c=)1_SNC^3Z!Y+X|NgVu?{5T%xO9vMXQS_>B}89Xjpywg3Bl~^ zo)+T`q05O3&Af_LHO5V65XrZ@TDLV1d?nVFH=52=Lr`dvWnM6p;$^%nW~VJ6`_*(- zapL|Ak)JqBpB5Zt;)?W%_Mg6lB;$X_+*%H~pzMpZU#0k|+x5qn@ zpD;`KLgQ%Aq6Jw#ezq6}4dh>{#cWbh%6If__UoLA3*4``Mz+NB_zF=l0qF7)ktjc~ z3qizjU+$Me<9+@S)9P}%xYI4vI-BRolrz@<1R`ssyJ}suC8oi%s@e!k2N)Pmy(~M$ z>eiq3lSHq6cbC6A;7H1n@`F3esT5h3)h|p@=hNf_O|36 zqan!59^XoJt9wJSPWk1&+E98zD(1-Wz78?%njjyp$>+;vY(E3K^)DK&xpN<7f4S}r z%d3sOXF!7r%L2B?A56Z@mfVTOpNy_F4ShV>FW0X7>SDD*u5s3?4*2NtiX9?pI*jq$ zdj=CGZP&A+ig&PPGcE~c#_-{PF;>CRqIaf$+w+(B_Su|vb^_3FV6(t2J1!$ zcGfvO^NgNcCY!8zxXw@NcPAeANo4``#rA19{s=#`Ff^!95MHlY}=L@px~}?kuf!>mSQNxb3IPDC$n^Q9xV1y z9}JJoE+ryE@@xXdMy=wq$H1g?>gDlyNVs^7Rvpjom#-ltgVI~N%y-|FDm6zIm9n4X zs%*z>;^bV2(93kn+l+efj-PazZPEt>wmG+Lk9uz$s&9)wzq2acFqocbUVG>MGIR0q z#kp8*@J`$(diCXrXYBdvVoQlsPgeubpp<}t_F^ih?&%g~Z-lWtsYQG!I4xCEHQ(ve zCykbdt3+zoGRmvi&j}?!P_Jn&xvPHw%Tu~*gep8O9W)5kaS5&L%WMIGgx=lf3@^{8 zBRlJJ846UGhSJbDupvzsx@wdHdJrZL$BU|&li5QL1Y{DrUi3}iEPnxcqMSJi*`o_b zP%n!Um`fd^X?iu)Kd-G#KJ0T7R*yxQm#hbZsiYGT#KP_QE+1ryXE&YM7W3~0i?N2U zjGPdr+}dZ=QW}`VK!0XSf@E2}arO)zV2^~Bv3~JUx zDF;tQFR`_}{+{S=RRjXQdsAxMdY1_Tf|{4NJcCU@skcNU?%HAIuN{MFBT$2S1!XHT zTw;yTSi$ik@20e?Ob5tYJ)9D+U6;w!^}K7R%K(kgy@UMlw`-24+o3+`M2WC`SwP^@ zXj#u@F)oM$gmUM~XRdPHO;s9InH;GdEN2Ju+P%HdzSE=I8I1qsuoxWad83MjCT!VP zXKgZ{C8T+`@`*#+w>6$leKw89VY~2w&-pM{@NJf~1h3!R%rM<$!- zcko}8`D~X$^)K-(^Q`f3f&O-iJjFrjxfLTQ1|IU zeMbfg(TeF0I?7;h60od*Ox6ej-&f;IaJyN}L$^;vL zNV}brbeFciiguB<|UnKrMg&`+J#RW+I1nsEQ2B{leh6roG;X>O8|U{bp{#6*Pd?$JETbyOxhcX5t#JU 
zTZB3vUVoBIWG_YDpPSP!)@uHRyuZ|EC4zxm93)QS{#7i}t%EQy+Sc1~(Sh0a^=YzI zOT9=VrzzzcY`*H4d$taAuyWN_9s|w_fOs_1CTBh(gRkv?x97o>=)#gRX))wHhgQEA zeB1Q=+IPm{bS|izFPy-o9e5>s!!v);gPT9Wr1K}<0Mon=%AENtwf_(kSR2+NJ&9*7 zXB5io^1!1y9cX)BJw+n=7AilBq-%c3Yn?Sx*R>ZdQC9z!0*`L9Vh}pJQdn+~ki_k5 zQKVb?Cs)?9SP@l!@M?U6sh`hvjR7zpvnFB#>EES#JY16PV`%vYJ45M9T)ilm@BF|0 z*|X-skRk56`W{qrf`!%Y(H)3V}DMuxy`SW@$wN`3M$oa7I#-J)JDpbPs3Ctm_O$ zpxhe)f_1Xh5;wuMUK%$YBLKZDB0{B(J@inZpE}9e;b^2j<@WWXDtr6l66!T@8JOIk zD6Ws;TdcD&xqPWYEhLvM@Ax&LUxi%K$DD;A{i!8~=Tyd)KC&g3q=fInfnh6J(1G+F zKKuDl%ub&MRLlqZ$4pRDqrruxLQADDC6 zJ#9R_pqlY#8_B9BFuJ#O&uM;^u zS=T3vbhNDvF*Ofg3miM&_k`m@A9*tSuC`o$Kg1^;66!z@XgX`-5I>S(n|5I53BOOwWF@x?5%2s@NZt=h(0Z_kD= z{i=gqE}h%DO!-Kh`X}x1aODmi|1FV#&$1QxYSnYpy>!|OxR9c_kA@N&=Df8FwIL=H zb25M6u->&?DGOZadpVD^a^p0!S)1lqV`;v9aOSRDn2RBd6EBspcGBD0W$+r&s&lv% zv7W!49t*^hrE`n>ct8Ky=|abjMi%SK{PCJeyUB^`{`j?Jh?h50tg&_kP^@! zo2}I6u}JtfsX~_tQf$xbaa_%ynhhFoGCQW*O25-5!B}$wLqMIBJ%~P3?x-=;Aulza z`L1_=Qm>3H*9nOnZF8aY3d*3_RW8PJ9R7v$B)dU6`n1+3*>2nQYeT0Q%x0odK+MUU z;cK5ODb2t;%xoPl%Z1NrT)1F6-%wQkHMxp^dnwq?#O7wPb{8LwR(y z34R#ei|7>ng!ZHzWttS{bv`B9BLwylHPcsh>F^(;g5oOA^H;1B5;=gFlCCIzhu>;{ zlx8@b?ypAq+0pk}xg1BB`>9TNTpUJ^1PkSPGED`vwZ1oH$v)vVBIBmWGI)hV6X~;@h z{Md@i_ib5&35*QD_{8UrO^@uk2^NEEYStbcM0ww_r}B_JZ+k%>v4=^FM*Lg!x$(p> zpG8i!xza^~N`hwR){+h?M?)P6@D|b^cT+1-$2|9}zqm$fC|*?tz@#}=eC$}0yUB)$ zvj%Z?YC3Fm_9(T?V=3tt6d52g3Ku%c1qiXz9_>W+G6jG|RVeRYyW1SO?+%!g%rMBOJ;@Qqj)O34A33B`>u;Nn<%* z&sUC8*~R#++p6w`=h>xlo#alJ5{$oKezkS`|g#yRR}<01}e|H zsZ*SiU1(dsVMzM3MM7p6y0t%D8N)n)Ucj1f;q1R%!go|_TY}aM!!uZ;{nQ~PD>KAb z0|$)ev>hYdYZO?1&6$D6-Da+}h+aG47m8=CDnR9hEQF@zq~9&VJPzs6YNW^5EDx7I zOzjXeX?DjDrDF-Pw~Zf#@x3-A2b<;8C?oMegN`Y_8$s0-Ddn-)Z?r+2RHX+K94-vQ z_`IDqGZizg*MW5WWUw24xton@gYhcOLb}zL6^0+o7xB(S>&o=$vf>^vw+P zc|7YvpsQKQ5weYw(`k9$dz56&SN(JXGE8?Hj6=8i!=)iVMYKEbJ=)lbucP`tzlJQ^ zZK0fsm=av9qg^m1+)@8oTS}c;rVmSGOkgMCPATf&cU@8_lhH_*jFojc6sA?4A#X8E zC@^{06wS9Cmp1Da>Lp>9&@no&ieqv)WxEC1cAD?**}uJUf{pQWO&GOFE~DCi2bsx(X_2Zv62MMd^a 
zv*Qr{vTt`$#1~8;E^nN*mB*Nt0QAwA7_lgI}k8KYQ~#$Zf!i z{^Su%HknG{cdoi<4g54<*Aykd=@H}`tgyFy0JAt+G^LPLqg`gYa4?zA+Btz0g^Tk3 z%V=eBS$O&yCk8n$Iz3;eB|an(ZdBZ#ok*<2yr0GQp^id7p!EIEx8(bvKWnSeib+^(rh z=a8uf_P%Z?;p+K%dWbRVfOp~QmIIXhbTtN+dGDHnayS>KRZHKINGvi)=GR2f)}ED? z!D4P$BDWc&;J0aTl#fznF!eEA?xFaxXP}(lVh(TP263+Do0FI$-(%qeUCxwq1?IZ8 zGuF7Gj{W>QUmkP?=9l~u)MMsoK}zUYLp~BI;%X@2Y~vD%3;~dKe`p6}yz9f&F0)k4 z-%zix95G9A8$XxJ4?KwtODIb?3oe&Z0uSSB*a2}80MsX>o)#znBCmzdN|?$Cq{}SA zGL4yTZPo*sWxc8h&rv1=X+1Ks-#e#?0%|KyIb&=@{P$7qNbGMjt9KO&dD9uX*1N%OIEa_9uv}|m; z8nSN?d=d)^OlGP2h6^l`yP68sZ*c5{Hu^fr;sOv9b^!VL#`vhN^e~X875{4!_0daj z`+#@oO~{WYJXCK-7umMm;Z!p*m}OD89O!dVsA5rQ`fmaH<$bpc ztjvQjR$cWFug_h9Xv?;!C-;U0_VZCh;j^72(j2A8>om9ZSk%LXkYcRC7CTBMD&{Ao zqw0E&Tmcz@znQfKsUUk!JA0&O9_dOK>dq*aYjL0cfgEbJCt{>6k$buB^IUmWzM2f< z<$xv+cp~Ll;Tu6#(8b)WY1DSNGU4bxuRrQbSr;pX1Os8g_L<&wGx&KTW>#URdy@Jj zHsWYRY_Gza_15JSKar@E^JSDI#SiLi)`*LP$E4!Pu@#3X0^>-BvF!i^O>otiluKP{@HegE=U1zv+wW2t2hoC6)X}ouO1>5P{u&j~Xh=t0c|(akEV@dPfYt1ev>}s(R0@GY~ ze8Sd%d3}QZW8i^<8VChjpmKC2@r#%{hZxbSA?;v5P*c{>b`kE)ZQ*?-;!!`dk?}y) zj+oGLwY(lH`A!D8qarH?OsG+BM=cK*MXEm%Ss4ZKs_kHu3;@8i!gSW%Q%zD?Z(hM+NN?{qY zrvATIU*M8rZvHY|l*|?1VMrJ2&Ws1YsHJ-wY|~)D74uW*cl`PD#gNdkOA67w%2$Ey zbF^QoAluAUzw2jr`n2#JpIxMt;F}?q^o?x(`f%1ao7U0hvG#h^CT;g>i531*w2B{u zYELypo&O>L180FSM^+>z_Yq4`>!Z({5ue<@nN@<`;gCFB>?^WDQN%yvpAv6mj5M_= zg7tPAOQi`bI)yDk7Y`;n&}6aes&%*WSS*ZcJ+M%qQ#_5F%{WX(dAn+$rYRg59?9LH zx->t_|9bb+3H%(lBcrndOq1t;(*OM2O|BlJVs58CEJPG=Fb)XP zII5{h)F|L?U~G-~_R-f@VFRg6hT_>>uN+9!>l%`m5x#b9(y>RCw{UlGzN?j!rc-ra zgoFEY`yBN8u^W<}+t3`6Q2al+yNY&r9G{BOUmG&`qp{-Qggd-&X=$CzwJ5aL$((Lt zwRKe?P_G%h&VzF`qSDk1^gn;&Q2qnEgWLFpOW2}GU6DP?h46jBFVL3r$d!qG0}O&D z^eL{*mqd`WzIbMDC*!=_b@2(~HQc~t>JZ@bL?Iko1h~73tOy-H7%%%V5}*4HvAYFP zyELdKW}u-phE%c6P2Zm#@Vo0n*mp|fMu$BH&H9VBQY<~MA$T|3Q_(QH;kBaHe-d|H z|0M3}p|%Bgeml+)_aaob@=uoo(47%(K}DDWDY$_eP@}{%HQ%ESG7eVoxP5Z8+m1Y& z%MbO~IB$f}c)r?_OI;cW@${WkdAA@4gI(PR|%M1p6}its6n{f9sFGW74fj63qbJJRPRT8k}7XxFB=rPY)Aw8 z1kMyV6cuDD8R&3f+6EUe+F=6pb%8-LA7-iGsXM zyE}xHU9$PRyU4X%HgHmBEwQK0 
zfCTci8Kaa1%G&%Cpb256T|ax@t&~k4 zDezT0q-H&?w>1ea5)KIzzgf~7V(yUTXPyMQ?v_)v@3YVB`UWpXD;y++O~*O4A>0#o z#s#u5{7Mb#x7$lMQD6$qhp8v)KlziA322O~A&tj56zoQkeTMi45jW)k z+4s}?{TbGj;L6m&MS45sTYjn`&Sj{#=z6Qfs>%BMQIa0E@6(4`l$Zi4d*!GtK?fu7 z9c_TZ?1Bd2`^R_E>r(~t$wiMHhMkziH;JAbzxD0i)1W&sBo1#{P&Ykd`_9ugj7#q8 z=s2^~M`BO)1VtsTQ^z;*dt4+Nxa zEmGgdec91;_!}z;o`l}&DmvqL4Ojj%`$7HAYIXd7&Q^!63cx&ZEq}s5d@m|gMi|hR zguabE(is(;B=e1Hz^5%dl`aG6lJU|^B=>Ayw@(jKA{07H%a4i#=a=Ig!+qWP-rI_Z zuJ8(sbeBGa^UMyn2y`1h6wLCKS^8smkRs$vsejpp2eNrr%8Ohr8}e*syJN=+^mDbA zK*a087Jd{Fk7>ia$2?KrkoWRMV>&Ow*NCyNVI)BhZBJE;E0kAZgK%NIY_<<+8XKrT z+X2zk2glvtO|-GT?GO5c!nxSTX};X^AuJT}!dc^5p4gwq0_C-b$(t5}Xm_6NG8E@?$`(CzmIUk7Q5*edHxqO&P*WsfUnC^?3? z=q2MxYv7BR$C>^?Llm6l^@@FiEMyF(-&>CzKVEh}eE#r{YoBoWe`0Fd0sjqC1DQ?# zsse85{=up>z_vA4xEU0#t9=ZX5eR?AKP2YFJzRL+(RAA1qL|ZI;GWzQ=MP+&lg@a? zRTT2LU|^N#g5^!riutjHdl}Bo_ZllGMe%M05{L2v?eA)nwDe2AuI2iEXrRu=)(iKm zDfR8rKiF#QyxCI}2>FxKx=cO);?Dbk{)2m{JItS-j2_FnZT-k@X)kkV?a+}d^|7nt zP)=C!2}V#dnpHTJ)dM3m$ZgGQn7JW{qnFxUpjc4w{3boF8fMxU4dgQ&8b zfLBN^{D!5J>^8Cb9_!Dzr@yssNcJP%O|xH z0H{@~y&?Vody;=)TmM*-1mzSq^OlMK-NJN0E1fKX*AfFVGo$4GEfS|FyS_@Vfywap z|F|*W4F127)%w4Lr&BDFS@vD`Bjjbf{;u)kFa*e1m41Jamd?z zuFNtAfMGl_uk<;j&b+9-Odzh{`8~{~r-Oy-io<=PXIpI1)av!46cgtv&}bL=RU0;N z2Ru^(FnWTtB&K`bC2B`@7@8viThGAX3D{7dYJ?`vzlRB`e8K{Z-wZ*SG>kn`2{F7d zNRmCYsypkhYmLOzj~wcy0OV3{6}kww(Bz$ZeIn34@aEScV%*g$JL;pfCMb$#vhR&% zk_ZT|Jw{JSe4ok2YR&2H%sfGt*tGQWY93N9C#xq9kMZaj!pGuaI6+=2G?I@fc$6gc z_x+(EclPJ&LS}D8#7e+23^A*`?Y+z>Y8OC4{TW(8t-6R;HP;PchJp9vUovhV zTLGH?#_3B3PZ08KZV#&i;1&X1OoRYt?1q@y<-Pw8-mO=WHh3s3@yQq1Sxxx{Wl)|O z1bsYB|j|I9{T07D0oQ z-t3$xyXaT)-)S+4jt)`#f0AOPb1@C6^~(-nUN1Y02a}b7TnIZheT+ILOOevl?|Vn6 zqM`*mvF;!4?LS2`m6zx>bI~5}$5b ze6@CVM8_r~)y^{bj3>WeIBZ35cr<>ummn2FxIUa%=?DF#WF+r^cNAwhlkTjQZ)lmK za;M=hH=@qt+K(j~_K~Jp0<){#{mchy`$oMlwa#tjPr~I(3`wr*+iMJER|9VHJNi8F zNPG$`U3_X!*R%M&3~w=@rbnI1ZVPDq0K!EQh&0@pt*QS978j>VGUN}nCUxX;0%Jm$ z9ra-fkWq@9j^a_sYe@G2MG@&IIi2l1MT(vASKIBWFU8yA8(ovWU7#ZahZ3=Wq2kBLDU7lZVlRbYRAS0Thu!$TFq{cA{W7(8<(woJc|P8F 
z&L?Fgs*&36PSd}$Dk04QPBVGdzs10~Osy3ba)hr$8XfJzo(NKm=YLXRW(%;&`v#hy z+PPuOo%g4?eo0;dR8#U(hZIZ>B``E@FVi5B&YFyHdG;56Tt)SID#hAL)bn9JXlD49 z7FUMy^qka-$bYX@<&=z%FZvoi2GnrP;ggFLTKH+CkH(PVLU^W zp`YeB(S|rcXqy>s;`@apPY-4T(PV+Zj!F$t;5I3dK8dqJ!R@UJhQ(=JP+81YI#Dhd z^#e^`!q8#uUg=L>>Lg?e)DPS;=O`q6k2xD?bVkteD+9^}k_i?wBQ(UTTD2*)CI?kJ zy&l=)TRk!@rxp|bEQV04d}SR^mcJvKm;Ma?3NyEr<+u}0TjjvsDSPICmQzo zL#UI%%?{(;eMSf}0i*SzE8CvEbs>w{Rp8}ZktK(6q#|R-cQO+UzoSTcjVg(M06;pP zC$+HM07^Jw3lJJZBOGh==Binx`AhJH&FH8kuey_~9>wu=%9>@(r`!X%=6o7;CC4jI zwdT|l{!RtHV=Jyl>obJrK;LNDeJ}kZ$}5FG5*51aO^Y>kz<`lHx?FFSwJN7>5=Q^3c!#WVNWb6 zFVc|$ts0heS~4LPE745T(97rrfBCb#WAXRat)JoCj@r{BG&e?AF3ubbv*G$8fiq%2 zL)&X1zBV`zt#M+7LZ?Unc5k$&Q|ne%>-93i0TJh=XR-#HBK}?Ck&vdm;qAjo&ip&+ z=GQWa&E)Dec5l=)lE>$(09QD=X?+L6O(A~qraG^{>O}OP2#?f%%d;X_e^1{s5rIsIugKiUSz%MTDp1V#t(?VKaXRR8!fKXyljHIr9#}<>+&qo-rRX`(t zqv_UUv$lYBemUYccHYXD6?4CT9w&=^jpPZR=#Tb?GVU3~>ml9rsn2f-;^;_L2|3Dm1qSMjxz&1vpE)&0i z_H5DR=Nm2!OjhoRK5qa-^Y=>Tkhd8q^}yC*r#P6}Tp#mn?cOMS^+uv0AnZbg**0&BwQN?wYu#vu1hhJW-QzKcCo(vSwj^Trt7kBJxt* z^u@_~SNVmqSuK=W^6wR)6VaMCheH55My9-$MCoiX%cf>ftX7WT&bWgg9|b42ACJy=?lrHZA}V<#_WxSS>A{E>YAeNNxESKNkK4uPxKTWC#zU zO*c^NBmVLej_Tp`gf~yny143nxbKPc!%Nhf^SV*iC*|xxLd?#G{70hW@lRYYPuIUD z^Q7=URWdQPWk+A=)yJ*sZL%hb!`hJi}aYp$6eMCPN4pD>PUaUUm{$ zKv%juMN*S>B7LBGanFaJw5ZAjatt}*k?5e+_;Qs6a9|UGc zo?Wu~ErG%MWJO5J)?iI>Vf&P4Hu3 zA>y^B>!Y)mAzm2o<%CDGsb-^sP{e( z_5gw{D!tR?-K4}8w?=_O_i3SV6n#IaKtT%UcSurQ0G_zKTr5|bA_nn;n{e!5HXOf7 z2E0SNC@*rpSkPnDw(9*P4PI>}l$MEm_m;;Cd05|*IQ=#)?spa$ewPF;K{LH765O0p zq^@3TGb(V~(@Q{KH0@?0d`dZFDy}P)7|+?Y?7x)5&@gKHTN^Vz2{S=yZcB&z7Lma zSX|w~kzXHaRmwKIzb)4!mpeD3nv&dg@lrjCr6xG zF1G}gyAP?2`V^w1W;~MG|2`-zlEm_|N@8(NVN!pEG*!oD-+lhwBYkYH2PFM@MJ3Yg z92<$i2Sc#Wn%BC!aMq$HfJ9!wC7=3d84@`|~y;&{j$I2HljX>S{~hw?RaX< zbY~Z#vVVJgoWDqL$_p^I^C%Vn=}4r_8eV-Dut{2GD3%T$Q}NeEe?D0E8;Np^B^Zcp zpZa2v;*g~*=4C6Bd|0fK_h(_P(K7L>g(j|!p_{2Kf$8qpe$)ucI)DfDkXdW5iQ@P1 z@71H(;`*o0^^&Bze;WbyCtv^33xkn3ts6vQ^GYP~xsJiYUTdIzP4~7U9rbkFO)rNME3Ac2th0ij1&%PoEws1C|RHMpr 
z)Pz&_N^idX{B-IvC#@ma_JoSJo*Q%*ZnRI}$5*%V0#(cZrLUb!+wWKj7^BJ5luapY zatn0ln6JF-DK2k4r8IhiM8aiY?^|}@1R1WumV2XitM`v@B~Pv*yZ zt$f%c>Rsgx=A^$|AlVK^1Nq`~fiw;yy3ELAucT&;)u*%JWXqB${T`=JT^X&+qq0c5 zV|&I>ro#tQYa7S^gT1$ks-s)ib%R6j5Zoa^aCZ&v5ZqmZyF+jY9^4&5aCdjtpaXYz zw^RIc&o$@TZOwn5i*tAO)kvc;sH)!k$f)Yy^F8lU1fVrtl%X`Dt!aA^W#FCV*K^eW ztzp>I-Quz`8#9;`pG82e*1-uycem8&QO~##6i6xxB!6nr^oH>gZ~tjoTp^v|r068V zjd!}*xMnBEEqtly(9BW!@gE>JWi9+T<&FdFFm2E~%=c+~3Deu2_sR&H({&Jlfap_-#x!I%{=8L5DNt%#sX5p0nqJA4D}zJ zp&${dgrp+d=|Z>S=3}QPGGBP&=#PhPJYo!PBV(>`WQN+X!Qy4Uxc=+(;nA(%uA#%O ze^7Pn-@FGpCw>)}~Zil%>rf zdo>7YBv5zW&JLq;{oV=|=k236BgxgWkC#m?OFe!Zgoh$bxPBgRB5-0n6vCA&E!*C5tus!vW4f+>ewe?~D^y*KvQ>eM~ zopUDc-{n?YvD3I5=_XK}T=CQ*%@Z`+`NpPUklf80H5 zr+OmR@)~u^FI|{}o=Ny8KF56OLU`F!N;;z;%58sQn>Y;hAmXs|m59v>xyHSB@tJ!~hH%l+_o?KO zjOOCgcZ-UHol7+bA<^X-Bc`b8Vpdv{nI#&3$PL~cFO6Pd;t5f_pWcxlJ94DQGbQp3 z6Ct9Uz0BTX2$|EFbd-UHTW(K2ZNps|OE!~H zQM4$PoO_$V7!AytHS6hSsu2HrYB&fVq(73xIqZhiVDE46(YqO20m-+4*5iYxGY56= z1~t~8P_AWZdr1_F(rhBv{G`EM9yU)!``}ob+vSLOI=2nhLZBRy8ZC+1EAx#z9=8X% zXybRQyXB5dAJ;SA$I)zj-6IxRl}d=hqCT52HYKdV=i?qY!)<_mW2&^3sZ_{^@)p$# z#XS0N-amA_sT8Z0+&mNo-PA=Rp+a`7-KzQflL;40Rts$A72~PkSH?qXh%O8a+|J)) z2)l4;ybdc}YfqDBnxKNY z%;?iBIwhC5M%MU#7PE0Vy$F?c~$BA9B5@OCb+ z6gJJ9(2I3*fc;r;jT;f{6e7$Ylnw6`03Xe0E#;E>fMIF#;`qCD?jMQq*^P z<}0Yh;#X(-&gETLbV7SRn$2|!#0%P?9jgO=q&5=9&0oTm2HA9&)ct|sTjzP0JH&s9`t~*-f=j*`N3a<#7alsXDG@uk4^|oPusal7G9Nva zS3mAy8u%7Oa@m4PBn&;~?-3I*>2G~F9)%A0y~~SqOYC2k0I<}ySa&tDYjZ?TZYZJd*{v0w+fgPg1(R`F@MD2J$B)0v|s7=AZ zQ&ukBzmUGy7U-pNt;!sAY3O1Ob$wPfP!ErHW{Lx8_*aT;wF!exLG1|yJIWk5{bJ-}K!%g%Es8niOd9f(@y5&NnO5$6yR04^bR{Ow4b3k<9%5 z+AV$GX@(W>>U&^BpqNRm9Fk0cLsKdqxrBO&+8qp+w#z^LJ3@Uv$8QBfkozR`D_@Jv zwecs#CoNQA`*QI6iayDZ$(T^BOr^|}I|e8n7$a*ZBWu1bjj2Up z$`^uQ_j)3;lnp*6Cgi1G1Kn<4n38xfWcwsTfIGHCW3Fxp>P3T?&c9rK8{D+oWi9G? 
zvd=-OB3l)U1%yv1(2xn=`QmY^?zcTk)izeqLL8(&o!jqtG|){)EY1Uzbx4WA_vsP>d2S(7faJADQA zz@7)2~NBRxAIi|gh=_sxL-3a5AuP|%(5B# z$sax}j40o+Y~5F|f;Is>ns1!0d$x*V+T{>hIuA8yy7=@!_8RP>{8wONbd@D<-s~xy19Kk)?($@lFaM%%*zeu!1Ia6>0sT)hY*ug`pCipu1_ zql08;Miv;SCFR>YbxUjOIzn8+t8C-Su6PDM`P*0zioy42eD0z`^`nK<`-iwsX*@W# zUvN+jt`pH}&E$9h2u>LMG`FLb8o~ylLKdszp$xeBsDQ3NmR)#$6>(?_Uf=K7)Y*$9 zdDq=Bq&bn-GEX0R!{7?ObT3W<7eC8NHOyuDW?b zxNq&lM;KrS`{VGzxu>@Qym-tNr)_4}E=|=B{&NmzneR4VuoN0hzjoRW;y-P3+;ZO5 zdAK(`HoQ82Ccja)i=g8&j}w`%TSa~I))C2v@GRvtB3TovimVbbGr*A140WI#I1sE0 zK1!ZhcnZ>@1n)e7F0Z z%6~l5*dk%J2sqpV(}!bT#bG1W9r5$m}$*Akh5Vc$Dj;bZKdpY!#|rV}S9(x)FLC(7eX# zcEGaGU|pKblvBLfdmbEe7!9<4&+mgcFsM3iht(r87j4{ATcM}Hh&URdq6Bt3FmlbZ z!Ey972ZMfKt~ykJ0{*C1M6Zu=|1mW|*cGr_vyfGtWa2}xGA8((k!TpRwH+79l&Ju= zAv;PaqzC@l6l+qn`-pYTD~HiVuCU+5Z=WP)2I5WY=hm&1GehMnx)DsJm6Eg)j&h`l zhj@)fW2e!SXu+s~?A?umjj@i~z|S#K;qeptHre z_HRs|qytnXZ4d@je|zx$*Cs9eFfmwcsR}$ zPV!e(S#I!gZLt%Lh+D`I4xI!qS@tiOi`eZ>7cz^2SDpXf)$So zn`?oO3r*ZVt02NCuW;9NN1@fyyF~hCBMzZv^Yo5ymaR4eA8Ngj)Og)#F|()Oo#~?S zFh5&+CPPXxgBTeEbm4}U1*(dg3DG5V2kUyze2-7(n6xrn`OabWP;M&drb3{;g1kgq z_@C|*;neC1W*-*4`{9zO_1x@MNkPvseE<;qfyVLuW{v=u-5MWK*?T|iwhB4i5WJ{+ zv=E$O;Z!@8n{(x-g_;i+nDNM>Gb#0+V`{i0nZC$a`qiS>874m@tQ6i!H6cHaxu^BX&9~zOV-X@haVkng zp^lciMeY>{fR@NZbo+EVwZc7blXr`h5c$XvA7YI|>41wb;$q79KU(;j`@iT2ZL_C^ zqpv!XrH!CuKD>jlJ^8}Xl4`ie%>UAjsY>XKDK4oS!I&`v`3|C)!PNW zj(s=CKg9~{!*D@EXFniVm&IJg&tt&kq+dnqXOTJ-NZT<5-3If&+A?kz1LM6*{5IaL zwy*~0jvPg0Sy`ik5CITS%7iQC?w$1dwg=Z*AzpkF5*v9IIvxa z6z!Zs^0xc7cV!CwZ*-w=4~2jm&3W==B%Itkc#vmaTW6&nB<8iaYnyb;UkW=S7Jx~N zmlLEy6WZKS|F^gPcNh{t$yU-6L{6ERcK?%(4){5oz?+fo`@pCFh(HpNp#VlYvKC`L z(*JCvdk35n+B=fi{{~_H`}z3By&Wj-Ea5)bU)T9xz~R6BLjfE( zkYbnS4)ecbqyNoHCjkC-zFwVhkbtuma{^Lmj%&O>LDCF_V|3xAHFADh|*U3K-(f@B1dU6vV<;lm=IMJ?n z+DLoukLy?c`t-HYKO)L$8NIpk0G+GO0Q1xJoNkrgx?u9zgYcVPZ!u3W1Y6#W*Ck9WL;am!e-8m@{GnS z|6eY>Ez2qXmVesxCjZT*cMgP8WB(sGNUGlV-#W9r8h5Gh%_ej73Pej;Od zZuR)vKdg27kG=G+oY8kk9Nhi4*O1?eyYd%m#pL#9@>4AzJ%h7i 
z6X*5UyF|Fu*jc|=dmQa%MwA3R1Q4_N+NJ@)Z9v%b5v}}#c6f!DF<{*D=rzX$-QsXQ zkg3<2ls?KUS?h{(;<1{Pv^)*n!F#WSJ-^U#7?#2NaC(E{DhYia2A!_D*mGNLa1I8) z5>yqnKsCOvifspNx7`F2MvAwJMr`{Yapsww{}h$=H(0^%8Z>v<84Dw0SI7PaJaNl1 zWtOe#8MPj9IslqLeiFGj8d`3bW2RkX&qu3{!``hZ4K{uY)@ebqNt3Ec!saMd5_~b6d-q3EsI(Dm4jA}bZKX5~3*M11M zZ$hn9h_82D%*G8?mgy9bmM}YRZQm@}L^<)Kxag#^s57m^f4$fp#vF}8jdu$ckq6r4 zjBXP>K3U9qYy<1#@(J%RZt^#zPNQWg$o;UCNsW4r>l*jqML{Bwu5{P&r$Y}qwdzn! z{mM@uDhtMIMAN!ar0vB%Q7e(FHWcC?1m#mh;#;Hem-0=x4p`B?(tuz4jXmTf93%!hTG;>zOGUEZCJrc<{* zB1VUz^+^GUKy+qp_iB^2$3D=~C7p?pD;$znb}-o-1#<01w3`6-{_Px&03h*`@^7WF zdlvTPElUSd;}8GjEUm`B20on;`{&zrL+j*F>~mnP2`y>m$rDR>m9@%FVVmsy1gc_` z!9ZzJCV_@-M~G~$jk1LOjhqC^h%rz1mv9J)L@vhSKEWRcGWjtSKF8p)Sy8zxCx4kO z(;{{;9w0g$OQs&@nbW}VwoTj`yZ%MNd2FhDg)xw*+86$V_s+eaQ@ih<0=`01z6lX$@P;<^lVM+YmG|+1nj5$Kl~@t583oE@E#_R2fkB(~d=uNf6z~9{ zz*X8X6*z3>RA%)VR+CySJYlV18*D%;&(?6P7*Oy1hMc;}XsC;Uo4wL(PJ3 zsVU-0Y2iG7GS{GYTb7ADYtcfN<)vt_tQ>#_M+oaSFA=y9RcTW<2>}Jqv0_Ba=^LKV zc!7B~>i&qY`z#vaWiy(j{~_~vthp`4>G07}sY?~8XrsLmmV)_I%3dx;E&WgHEs=sr zo1HJ|lP+(r7`|EqM)TU!`N2X(%k;f(_Le7!*PWJaf^r`(-{=atc;+2XKu-b=#^&sK z9ys_2q|2vO_eT-3?;j4t%TXH;o|rdJc-0r-^M|KBvgSc4+w`I^cYCb%bTs+Uqe)bzx_oD4=joAQ2Z{pzD0E~wXz(d@`jVHjz zaf9IP+^>&K4`3-2LJ(h>N)W3kGd~x@em0Cqt!LPuwVEFYwzq+3GmT51J?2Wf1Tp}U ztLL(x%=<^;4CAC2Dm(f4Z#aBnk-^fZjt<%NvTP3P#?ylpECerQ_I}lKvgu9e>4q{H zdjYnxaH$@+(;DOGP#_E3+G?RG;j;N2040j!g&GZw^f~;-*ZzYjAiMOEo#*x+xE%%R`rnVRsc}VU#F#!U_VErh6VL}Hw!&Jr0igA=LOU6bG1u2< zD?8o{_$tWxb+=Z?fu(3gJKiCrMer*(%MNCxV)g#CH%pSu#Y_>o7xk%7nIA%cOVBN= z!_88`>RHv$80?Ma8P!|v!4C60eTMN}2xfYl^kqV6xU}_VZj|1(Q=BP?kXVVcW)Aw{ zm|O3=T#FM6qDbr|CJRXj)v2wZPYLcVU(SRM>*TiyIPHZOJKCNfORun| z5m&<-kt`b2?a1k85RZFakh;=SHIhX;;1}mP17^Cbu@dNKIhMR;-3@J=+d2{84410F=stzw+lW`w68T+qV*v$d%L<}IMDYI ztzF$?a(~jTlV<#?K>nb`oxLi0UxIFdJbNh|F3^+2$-UhO*!;*(V2i}&=;BC3-kV-I zHJ&WyV$$x!?*jjwE1nYYnk|N7*LZJ|bKPpjwfn;#8LkWIP`cW62_N*Nd=d}7;(fS^ zdAQk|4I~&yjvY#%6e0~m)JNsdZT@XgXL;&l=-elk}@O@XiIxo1-;-=0hou>92=1L!_1-dUxuvsQ%wSPGXNc)n!( z4_6|ak#qx}ZGZ>DAh2(<&i6H{_(J$bS`NYIUwb^3t2HC0N 
zDqm|65bVrix?*vWL1u^LqkA+jftbn?e_E*Vhbln%tkBg7C7>$%%WP7){vr&Q*TqK} zpL_HNfaEm48d#qy5Uk|{`QObx;hzNr1H8F-Jhk!%X;((@NlA%!VBz}RjdDFhb=gsT z_e!#t0Q4tzx9i5C%O{!fG*t1-U_qNQo&i9x1~mUl3nGoq&7`<@ zupFOG7?;Z-!LMK6^()5A`fx<{b~_ye6oI?C@(=V0lJ0jx{tMd?>;*9o%TD9VIqBBp zC4{N?elQ21N{H)&6>VuxMinfkRX^1g9?(@mnx0tAC4l{Z-3&tfMR;imvoP*{?2ZTo zQLB1JvzxE{vhhZ5*S985+(mR5@% zy@&F?638BM+KH%(;{1j>OV|CDL~TB-l9L@&lfCk#!9me<0qzU5U#;UZM1cK6V7{nsWB(p;3>F{LL3#& z&BBkfG+7lK;~MaGINl}+DZ{X3Uc(zRp4NCtLyJ=)Pr6bj5`wN- z(zif~H!5t#Ng`LxWPAZerxu~|Ls0LcW`EX$A?Y2vnb86qbw$B46Eo$!LHrnsifOas zp*5Y?2*N{Yvd)%DF#=l4?7OGRuQ>1M{MWCaofMuJ%i@-yv z0k7~2cE`21a8NUcMgkt_^sFmY*irQk3`Gtka>U+wakfF2eZcnX>z=pU9L>jb6C|?= zeqMMZT&Z>&kT(ZFZsqq}PRHc>1;{sxs>#aMCPw*qHFC`+8v%ogpLET3a`4PJi6SOB zZi@4CK2buF{1r0I;5`!2-z8NWrL3dF)P`!7gyq*4r5KB49zOfw?t@t%mR~}}WEt?e zW}sVtWQUCScm_a~2L5$ct8}X^ULbqiRH1nIQbDs(>+`kEdW-=sklr|3LCkk{d8WQUDK%Ub0^BC0d!C?E zqD7nWlE2rpcrv*vha(>4jtmicvRnk4klb0N3E@tnxZ(zI?vm@cIb0sQ%GeAcIMmCFE7<@*SQc{)H{90 z?go6(LkV}l!LLJ-j^G|cB5Kz#Qhd=wEN=1K-9uVmcA!htO_5 zCoSCmo=3V33wZ)L*VAy6jQvnIN0i9GlYCL6Ny;t%LowBakz}$!&4*jUkSS@ZF+NO$ z&hbCXM=qTbf$uRY{3LM@OZ``;@UfXS)^{LNf*o`{Mzx0Q&NXc=sElWZltMRpOBx=l zjEJ9{PnV_QU6nC#BGp1QDh<$;KkG*RR`nU7B2`L@*N;Xk`PF)HV9Dil5LfcE#DrGl zF7-^Xqr0R8+7OZHyl)6G_Uf^{DvfQ5wgm&DDrb zon+$l9t@N2<%>Eq$gXsOJo)kBk}6BpkF6uI#CIu3^*ds|Y>7!+SnDjW?<``z7`BpM z+E(a*1>%G~RL8^gEF@o^%}jV)vcqkm|r#{+wV$o zBr12q)Tb3{j+(RImYnuu#~d_y7j;?Sx~5s__U_JqyT!b#WudyH$}hb`aL;h|gO5JQ z^6Fq}~BCW2we} zd4xZX=4+a?he#Xxa3V!+;60+ny>B=x>=rbF8sjwfkF3K$gi z?MZXl5m`srN(~SUdjGsRe-{choUkqU%VkYvPCQ%=m)FI?8snn>)JOgybz1jYypR8? 
zBvJQ6=8X86TJ3M8DoZ!N0tc&EJjcyxTjU+;Qq8)Gr~dAy3X*a5j@u{#Mm&jt-^xWp z8h_l(*e%w_j7#m!gvvB)r)<1^Ul1SS!ol#K&L!hM<96C}~r`S-DyBEK|7`}0c${hRb#n{q~zd`pMpenNwzQb-FIa_CPP zF}D6zuNL)lld2akCdfr&>M<}I*6I!D9GUj)ftAANRfi{M`P|nf!3@UY*0KIFXy%Ix zN$K1bFt)|&b7HxH&g3Uy|Jo_ERc|?HpaN*-yV}?6Zl1u?G1nqt1&5t**(xZ%iRw!Z z#nYmF-dENyQoNtL{SB!P4LLk6#z$TFddJH#xE6V~HTaV|x4Q`%BWs#J19i7v^lw-O?WdLeuwm*XD82L-UF zic)}X42CUjwW`(-t%e&!k=pG_vN+Y$%V(WbIS?LOg!iORrP;1$j3Px*AYEU{sb;4Z zEro4#-es{e)*GcX`Lx5&6Ny2C(pz7q+wRL$ZHshD&?Uv*C5WI&P;r(;lghDEm}4F` z;75MKeEmLTtFK}6KCjmloWu2-X3{9f zPcTdsQM3&J*rlKS8FbZo(N$oWrDqk(io7qu<`zUr6`-x2bnB*z^=lg$$uT%58>bgRL z4piu-^i~jtGI(*d|L9gMlr3>br=qPBVh~hR*fOO=F-5sP-N6kF)3v4<-BG|oAQ}TQ zb=&3cFk%$)rgQTK#fXCT8O%8-Chzc(`3z_EHX;R7!XPlH{>r*P z+32nEGYQ6^{x7Va^|!i1JZ=NycRromKo9 z*w+t;YLpu%=s6g?I&5^Py6_EM-HRdp<(WTp#BTvtF1ZDd1M?*pS6|^BzazCu>3S!V ze;Ri(M?KF5)k^_xt}|E=J?R}LgqP3JV*{1!MQb9qB1*AiSf+5AVJ!}hC6-!T8IkS% z)HXGyewy|VlX(WBF3fFA9wHoHXmUd=BKYbXeaopLL$TgOiT*KdH&$Dy&M1k}gFoi| zW!l~&?~=TlqsD9xquFiZIZky_^Mjgw$n30hU)W8dpf3Mc@fjF4{W&Ps1H_K`{A5F!Oh8wp-3gIX`Q zSkPsXn0hl1OSfK*quO6E^fv@%E8Z=rPOcYYgg3tb@T@2G6EkLNJb0tux`|R!hz=_X z9haVsR;~xBO8EzIwNz813Cv9ir@FlJV8Hkq(vhYGn?<%j64p1GE{;&Row0Q8{ZI{7 zGH%$9e7lDu$vJoEilWj{iLaqJ(}rAc(b5lyxFToz0aj`QhEm%54lUu2x@WSv6BsOh zlsx};5<`D)9{T$H=%YxjRYV0 zmtE^o5WcUif~c)tpWx3E8=j}5X!#FX*3<5mg$V8mhQurh>G80WS3zBw9))ZLS-unp4wFvzV{ZIDbOF5n8I zn8$()lVy$MIj&4vOrK#QHf~IxJr0<1@4GWpKY7%xzC!&r zg<4VCNt|-k`bdyJ{{WP9Y7(;zaBEb1b)uEMl|@wC3)0cd=249~YvM!_W5%u4eHuP@Qx0@8#4 z7&&A5%H@V@&uv4P$b?G$JRAy_L#Z$V6I>{Rghx#JVjGtWopOnW%uI|}I1Xtn zwJGnqCHhFm4z1=dAf2ab*}XvetN-UZr_}~Gc>OcG-#SZ|oi``SR4REZp09l}sm3A@ z;pF5Fy$g1%O0n=CDXAbZYg7U82DQe|0e+>l)^PbE6jKMahr$9NK!%_$@DZHT^*aOveDHxDMk`8< z5CdhGUeqnzH?{UQhRcbR#cuNF?u9`f=}e3B-(Hre0rXjw;}OIh<33G76bqn2=tM9F z)Hpaaj24@l7(oZTGT5#n7?#1aWxnZb(oRRHSfFZvJ6WBcBQ4%N2ozB0z!^+?4CKUC z(heuGg_!vC!r#Thed~+7{c=PomgZkQ{+ZFmzp;|oGN|J2IsfWC_g?Tld>gA7Gpo_D zIt{qnez8d#OS-f5(!s6J%Xk!lNVSP{Vorx}R}FM3uAU$`qO0Z2nQP5j*Nj+{1hTLA 
z!S1=U-?^bRkBViBx>eTHb!9(eQlmC57{QO`Wi9)5|BskfbGF}OVNpD8=v+ikS*QA04Y;j z{ppNIpEIoZBaorhO)u0R2&7#zn_9!T79i)2n#a^|rMCNFVsm-s(HIzNsa5R#Kmqs5 zQXrGma0mn(7pRX7PI30nbd?N2qsQ)72c)yIYrahvq8}7p*N;>pn=i+>jrF}DY)nFT zkAMP{t}#TdP&*`Ji>-%GJ@cu`{^a|X7!%!5Q}Zk5;Pdd*UOVBZA;PNo7lg12$7_rhn(h zfx!#XTy@DnJhUDB`Ar;pSgrS^ZxDvg)hh!O)_YQz%f9vy7FdqY<&?mPuq}Y{o}JUg zLAvqF>j!>tnyj#MX!7&;e!sx5o+1inxJr%P`T6gGs<;7b2w0Ko>sbW2;c-U&CC(K6 zeVo*3u^TXAB@}Y$O!Z;@ig{n~>hzICcImawYOQZe@AEPzocGW%bkcz1YgTP079^c2s3BxKYn}GAfm>~Zg!)Un!kC>E}sxx&?~G*4+geIL)xrU!-`l0gb$`vf2K#K4k7krOwP;`L*``^7$^?4vkzw<7%475+xK7Pnux<3g)`A`glw)I`D8!Vd{PcGY9S1>A>pLOpl?h%YYrw%MA)o8m^t zrqTsNd2yYF>=2c1(|1*-8aCbTlsM(?goLu*_VR( zy5~~A8k5`24a37dF6s%Go$tHd?B?{ljg|C^Hq)1%z6&rR)P(99;XJ@)-|MS*2o)j_ z3|j?e5I*K4&CZ70Xn$8`F{A7nboetAZ%k&!Rt#hFd?$47(kcDZ*js-b+pW_j!HXH$ zqvaBhS~!uDx6T(?n4vo$G*ugMF|WG6({ru%-X`hS_kN9|!l@)>7-E7PHUx8(|`4d>IT}q$5EWm>qwI49-chR3l$JN-E~7vY&kP@^#Hzdr2El5 z!)%w(j&L_Zv+qEGjRr*TSuq%+G z5ka^f8@#lCZLZ#J?$7j#^gcI~gm$I=ikvDY~ z$hY<H_)y??lRej0{LteM~LA}!o zNnT%U*I$NUk#9+7W}X-pG}6q?tI)s@U&(2~dM)P3pig}99>escp0E=HX=ocwH~(%c zyxH@faAn;(pZBnMtP)qx6BZ5m49PY7zz>mL5Dsn9yz7aG?Qs^7aG!V8B=guJ>}oHb!aEXJFZf>A1{I-9hKA_C2zK1H6&QE?7{XZk21F~bytT_9VzyGifOt6s|gthqyFEG2Xrh=oJU6h!|pJqUxfoSs98}F&X*rh;m z+4WSnlK^+&Wr%ZSSCgl-xa4KRhOb#qN4Hs-?`KpOrW?GbZ)$wUCanc)@j;v0>fqZ} z$lV|^)2ki%8(UsE9N1qbB<;KN6VfcBP;E#q-uGnBqs10(^MtZw&O;Vh@5~lE>1Cgr zoMh~Islb1HC&_~S{EwF&Qt+XuUeft<`H!Cd%h5Z~!!F6y;+9<3%j?}r{$2(ulIUdN_QQnU(+cupbD)sidVnPn9yo|1}W*!wH68zl&hn;%o6HueLr;CQ^-(P?ZDlo4tU8RA4KW;H$z_^j#E}M`3{jzfv z0p{WSeLY6OKR*utS{wiN84;!frav6yQpl?Mx65P${cUwpgEW%f356x3$p!`_bPb0T%Iqmy3qO|33y#IE~>v)8{pRv)?BRfS$PRzsmKn@&k`Q zt@k(FvuJJgu5kDqt`kkNM--TJI(a~Pb<8{=^H|3C?&V|&r$d=Vs$JbWhlg4eHbu>Y zYe^k>vnZtZ<1zcgP4nqqrq2sp3dCYhzEq-L-zK+U3efz_YKC)M>?fFuY8NgbxVLNg z5Hfdx;PZTqNvm1uXVStH3$TdeNh-ugkM%z1P^GICj7N_xD{Be-89pz5P*vFUx0X#)kQ}J4E zeSQEQu@AYjGqV%}8CWRl%c=fYxR<>NRKEam#sx4JJi<|NG$K)d+|zE3mJjb=@}g0s zk1pb=h4`of2!(To9xFYRyNd^Wdv%Qao3dyOBSvu>U?vVdV+Rd?snP%)onhBvmXhEHRqJjYB3( 
z#cFRi)2HqNuCV+87?L8T`}dA62_*q=McIb^QBqi4$L)Zt&-CXWV<-s<<+G=>u3T2- z5bgNN^MhiPj+fbPqV8bNFs+wmf4N%aPt3!eKIb|WZ|i&AmOJk2eg5b+e&>-UitLO} z1$J{|!Sp&+0niB8l4{c6y?@dSpLzyaj29gbuet6U8hTuI%jnH#9LA>;b$zVj30En& zMElq`=5z=azY*-zcc}Wh*k5RO+Mj?*o6q=LAFghtE`U`vLyAplDqxBsy zN=Ysd42X=kGR)d!FP4_+T%7&vE8lRB*+U7z(%+c3L)$~tne_C&`ut$K>7K%DL2k9w zmY%Uc&-k@9F41(fpnkQL$y`baIkAHfx;V+{fE`8OEE4ZFSa~VK>tTuw1KdWdIi+p! zoO8S3)%zQ$hV%JqdZFS%VkC6V1|%*N6a{-9^7^>?y0_NGJ5%hzE&;P{eXG-IzN=1qHcELo#|fs?Mf~nmYM{3-EHm%qhWF`hZ~B6+3G|I!SyR;pvyKVb-J+rgq5N| zSLh|vyJ6vhce*Y@BB%e;4*`Y!-E_nF#JOiW_$`>5Q!!u{t1#83ZDE);MHDqe84~*H zklIk_!Eyi7YD0*bChC28AzgCsg(JY$tsGE?ObZI0x9ENmo=IeJ=_{C=4< zapnS3=p(Rwog42t9RU~4zxc0S1g0^KL`&Nuhwvu)sbK0po& ziG|lpJv-jepnK$n^s-|U{8pRH`ML8H2tSh}quIpUecXlYU1JV`&*vGy<-9L>mmr|y z+RFC~Q{j9Dt-&m~x7VzoBpP^GLi%t>qo)5H0Wt`vP#**-?)8@|sYgfiM}?XR6ox`6dGugp~p*bBM?}n^s*MS8S{u(-NG)g)N?TDhPWFxb7S+obBY2+y0oip=x(_!m)b&I?2wi@ z<`lb`7iq|^Lq0fd30I2Xr^pIO)x*(-q+PPDddLeRmC97u^^~qQkqIAbn zyhSGAHqJX91vu&_ls9p!^Asm<5c14Ri4fw&)f^{#Oz19a|V8367`J4Q5tEHS#S_ ztH*~Z!=bJuQL!910V2{(-@IWhSa4t93@G`rZArLw^SVBcXb_3(p^Aj_J!4r(rniiR z)U2lSZ@*K|c?LYN5uUw#j*x92p;Lu32XmKa*kKE1TtYpM&1i&b$*_S41#BCfqj4@6 zzycvH{r<}sQec4{I;Gz?xrrM-doPqA@HEEP<-8AUsx)aP71G_e#6+&?`WE_P}gZ{ zd-&^8eJt8K#T%lb5P{2V91PFTHr(vfPD{-85oxnOJ}K+h;{SrqdWp9LZ7uues@o}J z{e&|}4Fh$Ulm9zCYY{`r>D_1g3+Nn&RTBXN{A_&s6F+O2<@_nM1OvqPA^7EH z{G5`YqS|YXoR5~{vh&PpiZpXr*}twrwb zg^37^exZUQZ!#RW2Eoz6`nX1;#LSjK|bj%Y2ehmyT2-|LTA613{}+oerFPkyzFpuLiu6n zh$-q-h0yW&LJi2+hYWWpjangv^CQ&ecdnAigZxqOepFTAC7f_4SWb-Lc1ix-`4^S^ z;Zn09C9zvxQ^&aZo`G4X?-tibs?GX(9^Q{60dcr1XWQ`c{YfmNHZ@DHmr1r%Z^S&w zot|+Ni`An#)ldbajH3*T(0zHPpi@l+*$7RMu2YTu81+$N=dL?^@qq7z>5!F8__A$8QS&Rs-RZkvLmu03 zRyV-x*J^uBGYF}7dR&7jKD)d8`2_7h@;c zCY)Ya4f$sgg<0V%Yg(4Rj$w6q%Vebae1A5-84q*-Hy2eq-1I&QPc*1Vr&+9W-#!M9 zN5b>^FiDLcyaXS~VJSgY*7trroUVvQGufXOzo(Ep2_Tl%^L zH@IcPa=hr;koCFkNKZMAx}QRQ=N`wHA=e^iCBd&J1XeQsS%D9=CN-w%T=4BT%n&yV zJ|v7c-_VX{>y^}V=@in|0pl z&;}#1UY1c^(u`93HIXhq=L|;`t7xlR8^rOn6o5A0c7@_uRZ=g6Is~O3K>xsBr34nN 
z41yCqyf_+qBXMKB4pYK$XqDo;vV#L1vj+V+rPgOAwEoyT-Q6DK#fkzst(Tpc9|EIQ zo@l#`z|H=myb)BV57^Xbl~tlEa#_jXFuKpe;7-4VE~+TGn@2P;68L+PWcCt}?uhA{5%*`jMGjnHFDB;^43z^I|a%@*mNV+~CSU5;2c^^sxWf_7O{ z8oWti2Nov|*vcDaFORL)pMyCox8hwDqDFOsHH>wF-nW23L`4oNp`y|!%samod{NPy zayxBrP+VM%7C&sk0I`qHtO6i(MhBao{WREJtx|@$C|>C$aHbpc08`6(mN(=IO?att zQPhj{q_gqr3UHM=Jp*3H@!rsDxg{yj@r}@G>poU>ZAibNtz{2bTWp+z*6Ae~HChhx zwHpwGwWNda%pp%Xf&2ZDoTMvtavS$1TEp`mocR?+-O&TR>fx_b%Tp$O*JWYvZpYIUx1i+(BBW$z?s8^srR zjE0Io@xVudAj9zLF9&U(%MyOC4gW<+!mi95pd&AvG@^=%5Ho zWUYqb)!;FxzY9Lw{O)ntUC1upsYQgq=N?cspzrR?ZItv1yi#P&)OqW)4JUI5LKTcE z$rYqx_?P@!85 zP9a3&=M{uzzi+M}1oC{##W&@yoCRA#b2rfC*&k%e8`Vq z5{kIAqw(}up>&#bUV0YwYe(5beEq7HvdZ-1-FY=j{KTUrA7=Yt*T=?fv~|>ixMgSA zzcMY2Mj$!D*Z{?RvW2koO2^J_q_rE#-uL&z9QGuE5k?IO>N4%V_ye;I1luv zP7(b&{6a7Cx$8BZ7H~|*%Q1UsvhBxpc=#3F%9QsIBXK24g(%;RnqHvWMZnZu9ZXG1KFgBt_+nahgP1aqw6U;Oo;{ zJh#G;7++BI-}~c+JJ@`Jc2@3WR|Jjw7+f#*7V1>LzL#|P@WEEjQ$^b$?XEM1OtoPK zFFPeXc6goX-al{znaH@T!mQZ1&2Y<-A13Ild4-}fYql60zG4b;7R^TU{7mJUX=Mvq z0IqNhbBS!G!EFp1R zhvd^z(V*?D4L1AW7KVw1Yt@K8`a6+|L3lChCCpqy|G&`GwZG9+Uzzuw8HUenx2roC z?uM){__3({xEHo|uc*?xXicQD{H43TDR%qv?p|N8#={wn+1>$RrfQbrR#HUT@W0X2 zHa7@gQ<-758oBOAzuRK!twuOr!=pYyE_Rw82@Al{kzO4=A*F*P1 zK}39#KgrSD_}vWopiU=fPJV^16Pgho&+$z$yZ-2x$`;{D-O5~d|2gE>3baBc>&hs$CeyREBO`{8xI;JKWX+7|v=%?=mGEQpbe)c#rT)SZ`-Yw#$FRxH zEw^2|IX`n|14L5CX*tp3SYPchZoW5J1Bx~#z))k*xa$9jXjg1OcjBJ-TD4R@y`qe2 zj4f0Sa7yPUSeKjE#U9mdlh+9WcLKnl%Ba*DiWSf&s5Dt?-Q7o$sS_#&Ois#?>yO>w zeIp3R_7B6PO;Kz7I9h)Vk0|J5Zo4w#RxSkh(iy>_K9;6<_^G@&cS&R zwKfy(KuwmrEhmo}dHSSLQ+tp&g4T6YtPDjPr|E+a86Ng=pv=oiN92>rBe3Y>!YPtcqrR+^2x^K zpVnoMmLCn#NRV(4%ZIfv)ko1hE8 zbFxySu5N8?nZ6!Gc2corDyGF4`WI=c^X@Ow6#04YKS)z0B*lM_rp*B`x}|)XjPDDz z%H;7|C^1R!4gr_xW1W)_eberVJW;X|Tx*G|I3*Z8MuuLuwg4qci%z}+neWgpZY-1zg9x&yu`pVQaWeU6DkcnI?T3j{UyJ|B6n zTVY!?V3p3*y(WxrYcW`793JXU9>#N*9dfxJ$%=K}oUdJ(cf`nP4Ip40sUia!Kc~LG z)Ly^bn__FFB49n21Xmeb-$5SClbF@skDYwebsYG@g<|xsCwjU+L#RG4RJTR(06WyX z&LA0RvM5LbCM#u0CEMXR{u^;8nwqx(chI%#q_J}ogFJxWf#{n^Hy-56)5>&4;x!({ 
z`pe8k9yAaTC*T#R=cXu?#~;2kF?HN5Buy(iJF2JzR?+u*83ozWtitUkNnOMG_xcrI{-yaAdaTug13e?zKLMeZ5OULGD0ph}U@aNkE z)0hXx>I-_Ja`34_g5*rKte2xEV-imx-iCC;dk6`DZ`oJ}-T=#P1RaE zmBEA13PC{o-Slt=-Ea1ikRB4utRbLszhhlSX zoD_SWx{rSTsud5-VL$~ICgXP&DNYqJ5Ux z7W7DWfJ8&{5Y zN3z?`I07`S*Z7|Nf^QLEX0!%$@L#J3xSZuU$ZyH=OX=^YMlVA>&bspF-JU;Z?i}H! zfI}#W?exZkvLvC&+oQK;#q+cY-v5w_eo1x!NZ3}M}ibTpI<^JrwX z6|AS3k(w-!tP{2`_~RZE^G?L(fx*g74%1+bb)`8H0f!wVL@d4+!(>%-b13|3L+L+# ziqVt_jYc~6$bjbh?|UYQMkX*pdIhJe)9XXM5E`hN zVLgP~ncaDRR7YJD6Lobmhew});{_|ra5V9&dYhbhxYm#5SQu%TUI>x#LFA7s+i;IY zLyunSE@CLok?!2G_`HdDAhL8p_fJ7X^RP65T|>M}b(pZv2pLu$RM%-4vli)H1^UIa zrdcgcul6l6ZKao41808P2p}`J&|=T{vt=J}+Li$Oa-pvT;kL;jWh4{LnMeu^6Y#<) zWfj&{DY4_PrtrN|T-Vzb0|*Hz(}GGJ=JLW#Z}*T#m_O46E~($ZjgvvJ0ZE7!f|(Z& zi$bdGI<0(Gl9nN?ICKyHP=Hx^-r35)Ugu=04Zh*ma^{E080{Z#npuPUm1X&n3^i0R zAtD0K#;a?hkNP8u=H;vy>l6Y_Nxf9+O-5bTNW}Poh!+H{q_ zwwSgTWUOI#?^Tz8@}_DRT^_Stwp~h0nTG}vk*7poG*4u3;GtmL;X-4mX{a4p5F0FQ zW`({_IG{FCFng_ao9~p7E5I|+1lP980UnOf-uv-MJ zz@>b&Yhud`_ISwic?%QOHB(u1SQ9*tKfpl~>5|Klt`Javwew2Al-i)O(G{PmG6?pn zJZ<~IGz>&(pv{WYeW$MYz$k&vteM98kTeo53+#MP4$|ohkMLSkm`MjF?LJ zsa#yBrf`EFLEH(rqtR63E-V(J61f$i1 zWEUHE2Pord*XJBG1WLv^QNAlaj9*hvFEbc}fZy#oh+EY0n$OJ!&RKe3V7F|gbZ$NM zTll;#z8`4X>KX)jeCH}O^s~d9s7-DTIdAp$?68?dI%+rK1ILDL)yk_;4oq1Bsg$9|lx!6#sI z{((4?6LTf~`K&Hm{PMM-uOnoS)P$C|zwsJD&pI-?>BTy3P{a&tD@1vIKYLq-z&~hd zSfXbo7}QSkalOo#zj#GQz8COydP@MrHfCmhtJ{*Y36?z9?=`mRBTp#(HOLB(i_h`4 za#UI2ynGXcF@gK)Gh&fW)_poru8$@Jr6Z7kylaZ9Qpnc@3280Hs;q z&9|0Q_Z;+YuGY@oBR#01x9>vmy_BQa$_V9+C<{_&@FdFoaet+O3Rf%#BQxy)G zB7C?@=CH^cF!ZM81lt+IQ;Zw%_FuH^XCLOU00W$TE{0mS-gdzD_HfDXK!M4Y5{_WY zs_P*(t+l;BKX8azui}( z>VfGY*nbqOr9Va*I7w|AhA?KP7{a|n9|Bq1o`&yy+e|YEr_g)j-pu%)$>kxUo ziErYw_w)I^9@`@+gf)E+OhW*p&#|V`&C=S-k1H!;`}}2@y0va`J~yjxjC}`qJ%uxe zr7N}u>WXc;W<%BwjD*b7M)uar=295Zv2HGb3$|w22Suw5$e8XPH~&UYmZHY<-}nC$ zaa{P{5yy*=6HNPdqC>;KAjeg+1GdZS?wC;$6#NkRXqQ@x7748FND88RTB{AOQJ6*!0*jB7V3>`3NlJzqy>_wpo&&7x^t{oC#!|(>dLC zSPLH8rDAf1t8FWaaqfD7vT?e$5x}?e9;jmy(zwRJw*?Y|$>crfm_qN%afQ_PPigwU 
z!^7GvKiYwB`x0AoUXn%)e=?#514YZ~{7Ct+{|Re!UDlCzwdnPjv^)R8q#gqDxo5*; z(fe|?!Sc55Y%|{xj{%3aI3&ss_CX^%r}LH75aR9+d?w^57VxN!ok3#rD~`{1mgfx~ z@@`0PWJN$K8!*vow(eI))0sdHQ37~{MKBvQ9efd-TGuGl?2UAGGI_boA@V@efb9l) z=?k}D-{f}cx;O4iA+|$6P?_mowDf=&gc}u=${NmXbIZyc5j!nd@|7$mlQ;RBe5%jj zWz_z)>mXjm;?4PN#aNdjfVD7P%qdZO+XBYZQSviZ5w%}!xvI5p$l47TfmyxH0w2T3 z|G_3|0&F7A1`Y+wJ>FgaJCh6?VN_{}?idh8PpZ)GHC~yqI-P63)0Jfkz=HAc2^qii z4wdljwfV4^q-;Xo!0b_eoGv2Ldof$H+R1n9qs=toqJ3}O_gscu@|0?Pq}Blx;cVL% zBK)|a5|9+1yVkKmd!IgS6=*5EPy@54dnL^=$<@ZeqNQ{Dyn;sIVyB``p zoM&#scwFtu&1)hxI2?X>VoWSIcGxCrEuEV5C5Sl8Ptuf(6>ELgrBbf%_Ejr9F0*29 zn}0vVq94smSla!P856CYZ`80$5q5rIHkiV-<4v56^oAcsgzenmAdZUD!3*~soUvUj zaGao0$jxItmhj83*6=j;M{EFzfm6twWGX~u6iZ&TOU!E|; zlELJ|J_9kFTE1a6j)e?=-pmsI3niD4rSd+`^CsFGK!Q4y0Cs>OSG#P^-au5TOvU^u9g*X>R+2FYlcs(6~=vuFyXi0FcpUGlIIoLueVRkvR02;!YGo&Z6!3*LJ znS8r}QpDm}-+di5*`=sgFDH0}Hn}A)H1gK`}eNv zPrFFq!{X_3D^d0*e<;)}x{#Rl*b7W1kDRWHAjNAJ$(qTgau*@3fC{t|(MfsV zVh-dhB@MA01DdDonq_Z|Ad>3^r`U~b--wnfs;6}gB03;9-qP}fDkhTXvu46BKPMJ( z_?*@b|Gbb$--vul5x8flAw~gNr!YDLrT={dIgW_Ov#)6R@O3YMa_zFAhx27G?d1r!s>afD%%CF8rOq)nIh8u5Z2 zO#E23V!+cn3kNUQhKtEdOpfOT5XlC&U!B{DjzJAfd%@d)yaBx#y2#vO7pT6_7|0HJ^PM^e>dFP1ytnj>f%#vDC> zk9f~^i)-cbBn^_}mOVLuk~_GvNS(P|z^TnZLJKxJ^PnvTyrDzV~lxDImtgZ|Vuu z%)c6zM8s^?NLKXb=(lg5Ajj7zV;LV7!ZdL2#*0guw$x&$ z%C%tlOhzmQk9qo+yc9Rguh^e|mRu-Mb-DKJZ;aZ;lq>$@2g>@@Q_g-gJSa#I>zd zJM%)vQD9nJ1R%-pq2b*>WH)6KV&F-^^Iv79kgYI$piJ3h1dPgl-xSkCfvVITYG6aZ z*3F;YyhpS>1QmYs=ke&AgJB606dI)J>NT z5XUf+OUmJ^Fdu^zl(y`6@v6Br0F7R?Zm41OicOm7eQl{1=w>OU+;5J)BFF=Abx0xI zjBdipOJ~QfRbm$T9E*N|cNb}RJL{d>HkqsDJB-ai>TQ2reBN7z%Hv|eZTU{`=J=ts zTbVQXV}$IF>*j;I>DK{PBgPre#SAY$Owus0G%TA;{(N7I2ucV>s-$EL$inanfaZyA zFj6n@BlM}XW50qqsOcm?Zq9ir2X)-&TZitf$9nrdUM9u9JZq{yYE<$8Qbf_C9Hp26 z+xBeKk3D(J2Ue|pJ0q*@*j=bGL4#h-)w&gaZL;kJD%f2yke7u(2%-r{z37~_Dl+TV z(qEUwA}}DM4A4B;KAS2mhWq#mtaVPUoAymwt$KJF9KzzW+of+l2%hF8<~+p&V)TCX za`m-)vR^3siwm23Y>WwtlpytUaUj2!Houa0H0ZBB^s>o3G}H^_XS*%it4fU0IHXZp zF#!)PxHknzM)tl$v8M6a{5eTG$76VcX#hl`w5EAi#pb%3_#WB<^h(upHhSaTnrT{s 
z2T>l{KA&~|2|?Po@C?9UIlYpu1dZEjF^HCW1CnR;ln-21w@)af9tXz`gH~}vY?U$1 zKqbn#0ww0@kYgb#Dczqm48 z3-B4#y1G;BHP(`2r5+g>M{>ko_6K+sfQcdj45FM6m0;9RfnNnMDsjCWEs{GeH(*)|}+aV~hi zbZFuc_Tlcy3VvIPkWFJER&V%xV4-g_7`<-#1QAkE22v>C9Y|F^uG*i5E-XE_J>%!8 zdJ*OqA5W+B0s$Z?Ew>J4$GQ%T0EttO@Vrb*9KnbCfW;ez3lj6801`U+3@V_>z9G2s z5XjK3e<|_3uZ~-7~rQ;RTE( z4-+*kB2|3rm?0cY2ZS#1!k>o%4(Dt$(-{>*)y!WR~ ztvlpG*#^-Fo&$itX(3hnk6DF8#f64iSRQMke`5}TE>Y}s^)&#N%D9aO&QLP03o)?Ba+6wD0_2MGiOgEDCcuL zMrHaee9cns8rNgtI=i-wUdIj)DPJJ=Jc7!Z@*M%@^UBpJCLK*-$!rJxxOs{>m<8IF zi5E8IvYinV0p7(%Y7d*qB@v!c3j0&z zsF?i+EP1mJ^DjQozL5FA|KtOetaBFq#at+Zc$VUdHN_tEBPaVWBWQT6E7 z^XDrVU(Rj2&drC}Wdg~KDB(bdEKwv%BI$f`p4Z2Tm8>R`jSKcm!zqC)D#y(t^=eX< zQ=I3rue2gx^*1JWCZcmL3_Z9gy4jASyKwwgE`Q^|eu#Ig&SXQDK2d`xw=w^8o8V)2@F&)<|B&r8sJkKilQ+L)U z1MlJQEx&}uCpwdU4vwfPLTtoC;3m4z|^T;4rxH{HzzUdv;~v1w(-nIZpmAR)5!>M=WWlHG{{fZdC$I5CR&{s%}C0i z^MRw1f*?6xrOyikNrw5pz+JnwH$EJxuS6k`%O+?sH1*fZU({lBreS6yU+N-|+n?FG z7N?{P@{6?wA#`d2br)*17&pqiL`?R96rGBg=E)6P!|6;-u7;l%d`~cMnRLn_lM5(i zCoFDhLd~Mv@Hcv*;5yrO1a8SedU=_;ax<8hjjtJ@|6tXhR(l5Srye^nssNZ_h{+x zL`ZFFi5f$yj@x>)*7^ZBQ!S^9^`^S?A-XNncymN)b_}mz8~}mH!M4PyNf(nSJvjq6 zD{LlV)QJMY-&Rk3QW1Xw{E*We3AufIXsuBf6^G(C$CFV1f&IcLV>jczk0hyqc(6|6 z38VL=9MRJatF)bpKXPsdececzG$|tN^qPHO&CFLxmxO|nY~d*Jb;y-(9nF4-KHjTx z+F)soba;Q$&$e#@4CXF88^hP)qC1I(F`C(qcrr0)m5!x980fw4zDpi7|GtY)+njfJ zb4wxc+uy4GgzfcGa|{xSoGiNvrNIo z>%a*Ha3r~)c$~DReTmGIzcyWqCg8YZ0xuI4X&MuJsTYFfwp%DSB4gUz)*H5M%s8Bp z+0tj2-OiYHe238Gi8)|gvd3~JGH}Ry@lSTovksZ&d;y<(xJ|PpeZ{`67vkNd#Z-zF z81l+fj2au5(`^QmKxMO5+JHbQ1XbnY<8ICt=gA4fZrO3V}C96f2>7Nmw291s#(cF}X=MAs{ht0-QvPUbJusROSm zh0o&JLuH+p^cpnPM-H6ZBZ$4!M6Iyknn4s!B10?&zslFtE8k+7)0SLGLiSq`KdHKy z1W;>Ya>Mw4lgZ2wg3B}*PRC&)`1mYWIwtEZ@%6Z;+?UeKI4@jA^Z+TIw!CK~tA(0X zA&C8`jz^|z3uj>kH>tK!bMS;sQ9#T~DdP5V=Gs9mZNrm=;~T|Iw_oNPO!b1YStNDM z#DDd5wmgGfXK9)dGdFBEJ#cDb9+(R)lgW=E!)ta19xz{>3)Fke@vDAnv z9L#%v4ABW~7!}r-E4L%dReNr+2#}NonO~%8Z$I4ih&{8Z{Sd94BcobkOIEN($RH0o z<57A+Nz*;#9Wqd-rPclOHaaF>|c@}ZM 
zQihVTz@BQBJy+MxpddFbz?Cqsk$3XG>jr5#l~b@}5a49ClFh{KSzmc08W{sS!OMbh zh`{hDkM6@{!U1dRLs|)kMZ}thJQ16a{v+K7`PkhVR1q^^l8xIMidAhy8O!91Od{(} zbX``4%o!YI_+l6XjlY5dqse);FOfazyB<_VMDDX8uOT~zfd!5}DT7RDEr*j!0(T8T zn$xz2^8Ls4MR8uZ0AHt)%7=574uiETC6Vg;?-D;?e6B?S4JRFms<_G`QcLqpS%2T7 zKVlB2Rx>bwEUVRNTMfk|fun_E7v#8kU4c9^TOoqumEE31Qcv^Qwbs0o{auG2CR!6~ zy6Idthv2tq+vDMZ&XwE>uR6!~q)lAbm5$Tt@8mJ|l zk_W$NmvzGUSZlLPA!mAB5hXI}>)N8JA0bbzaR#|W@X>m0&ALykG%Snq<9~l3XatiT zkP$L~puhqS*>1O>`&p^Tlvw~v27JRrinEoO{=OGCc*|(O@0;W9`V7t%|%z6Vs&N=tdm4HRx zvTeNe;hjYcgq6Y}Q*v<8^*)$!c&(iG9nzDwqLw7&c@^=6<_rUm;CrE{(UV;>wYQ-k zy%nv3m|o8-baTL=8J90EJ;6jG(#*P#-Rf$Aw+Mlg0Lbmj0rt-XMIx9rpyIp2z90)| zPqSRbN^nPC`hkxMjLupaJt$eb|AH&I3JnY(w>1Y668L4q-5*dX&GPB)< z36#PwX2)AwzYFa5t3&DwLPkL_y3%;_r)5*ROgf^+6^eR%g*1-$f6+>Tp!lS3(eiMm zqOj6SBYv&5e!|d~z@UgoRxRUg;A@Ry`uH(@Z^XCt-2w0mntpXfx@@veSH0R*regtm ze5<0Q>qyWgET%N$*f(`6EpIFeKfIJOmk9UMmVPgS!3&Jq{!nSY_*12MaPaGQs@?Zt zzvt@JH;P6}VG4VtC~TZ>V>pJb@G};>au6=}wAN|dK0moMVFJ`dQI(=RQ+V*1%eiYK zduLFK1Q1)1`wZaS%$RK_*6Tm*2c5| zGEFGbl}!&!O9@y21Vs6cT~kShBDGuj{MKGFC$q0ztsQED~Pe;&C znO-{Y6fCQ&eHR*t-hbkY5R{j#VI#wD*4@<|z~k78z_Sh_^m_z;f# zyVY<;xy(QjsVg1I#)&@9iDuZJ7R||jvS^;y7hQnim)=&e*xA=SqyhT0jm@-Fad50&#wfWMAUReXF|X)Xx0&lhKf_M;`~3%}sc9Iy z?Z-fbG(h3jz9t;@kkqfZ$Lkw}=b4o)_^|_Nb&~Cx9J=PV*F!Rstd~@h*!;Rjdw7Nh znM6vgr{9OiY(2j_mmpC2@!irV#AO$jrIks>S_;yQE~r>m+P=T;L=ullTfMkGq3g=s z>S>BbC6z?t>erZeRinRYLZ4aPbb`-6-~9TGYNtF_XeQThNl(gjUhGo#{`yId$t4HK zF(C@?I+9qrU1j&0B+xXJNMD2!mKYGZTa;$?(;w0;z#+9fGfDb9x^V1#47)t}}r( z^tJI5hlmpfN6nN|uAnv81Ta1}wS3Vyly}T@h_U{6J?1(q!EL*in{{=8sF{>D;Zn`4 zs_{SRF%|Q#z9~=^QSw?z&x;c_u{B6Y_|L(UmYFz#SWNpkEKkYbp>uaV%Wk5_F}lh2 zP9Ag}aXC~hj$4vT4dih3YNl1XKw6wHmeC{8;0}Qa4B5zZLj_?UNTAt_Ede*J ztxKR^{^+jJA4Y1!kl205fRXI>t=4jj_|=hcrY3T zxh_WEpN)Qt9ZWCl3Gpr z)1%OKg(PWOpyjg3Mx_!^ZK|`VW9Xmj-ss!2Y_NDm#!-M{HJRx_Uzl}{Xh@FnZbz$~ zvUz4*P?u1K=3Tg$U#K;UlVBaUzksy57%p>FYtG;M_j*dQlo< z6duhte)xo&1ihWLY$x1K31O&y*yyIEaMo_F|K7F$j1fY1w$39;N*1(D>yZyD^Q+Rb 
z)A5P`q0y&@;e;8=)$$kzT);ror@((Of&SwKE_2#C;SwtH1yd*vLAw;~bG4f{0>cB!g1%lRK0o@BTG~=R2 z<$X1_=q@5VU?$zj&J}mt(N6ROPV^(rn=YGSF55X>R+hz+Yzepq+qZGzUfcCS`bARc z?%D)c9ma^Oc_`yZMsLAccS|6PRPOptC!?P)`d^c#lUR0xUzV_g_)9K7m0y3fW3$GW zhux%hovEF}*q#^F572-Jl+1lJT&lGymLWeODZ4#cyk1H&xX^>gc3nclr5$ltzTPH5 zcYnsOo8`KJ4pLlF2`nFVUUVw2Uw^UPbmKE}Bq4m7AL!W#tGjQTD$e5SoBK9@aDD_( zY#LZX`ybLYoIsY{Yw{4q!E|#$jz=%8R+S4{vN*!NdUU$Q6_jH|#(UiF!Y2_JmT2Fg zt6F7mF&limGy55AJ&$C^!RxEsrDj(L7e32EDZw%u?EeycG7LE~82Milw z+kI{E*e(Mvs7Eub7AE~X8nHQh0q(uMy|6O;hUZaRu9W!6<+FG8H_{TFWf zmaPE<`dx2S?FoG_39uElv*2!3g4U?W!e7SNpH+LCHz?JN1+3pFwiWfXyC+>O`B$$l z2`CF5t#eZSvZNXG9!Ry^9zuOXp}Evvg9Y0Bn00Z=i%fIjxYUUGd~7dq$&i04af+e! zV+g}5j1h-zf&5Eko3|dj8q4I5L-L_G^aRVG?zUbJBzZ#-GhV_FiNL?e3oq}_n+FAFkNS(XUm|Mqo>RHpF zcLi0Tp&Fwmj>+tYaL*-P-)$S=g^SZ}ewGI#U$Fr;;8YUQw;(^AN>q0WO+rrvUUv)x z{t_2m-fKzW(+?!>Pp6NW&fD41>TL&yeoS0n`K_;Y+?6~LhrXlL(`noMu&Z&?Rnk$1 zY>D=TTd*KqWb(RB{MP%cCF&+QDvlN$)ey(&hm~lQ;Ww$w1MjNV&xbed4)ywr&sP&h zW!#Hko+$|oM5+`>p^>EN{8OE^zZ;FHHzT=GcYBfg{z5!YUTtgP&Hk~ACdF|}w#0jx zZNf?viaV5UlfR4NvvIQ6-f4AyWn=qzfhzMKE8F8FDX7$)Vi@z}nf`NAy&?=JE$z^C zUq018*VF&=XMbKgQ4`}dYO|off55+g_{57m0)g-@Gh{~m$Ey0Zv}e)b4lxN>|GBAt zjRyFlHLw4}-dhFL)va&73GNU`a19y=?hYYXa0u=m+}%9{w*Y~KySux)yDfa-?s{hS z>HY8B-Su^ys&ms9UB$(^SheQhoi@fhe$Vrid(!?#dH!=KZ!v#50ORfBRuuc^r25b} z;6S05trvw7zW*>9(lKDXxDW|Evj5+O$@(ah%wn7x@V~3h0s;lE?U~+p0}mPj^nX6$ z2Pkx-F+!WDKSy0b$dYSGcf;lFL)vCjJp_(xKmXxB@rC&q^CpD*P?Oi|!I7ekH!t}g zzqpwI)aCqF3McvhSW+Nc{=fbC4Nw9n03dAhQvdI-{JSVXb^Z^l3x-`^4?rWTw%zag zK4oiu>ms*3TPJfnJa$P9q}QrZyYYFhz2vPtP33oyZF_wQ>3K;~J-*M%$}$?d+JBwr z+e!&3U+NCVwfiC^_jAysqF7>mUe_l9?EN@;iPrD)r10|cXfw{lCkge64uzPR8dVHg zdjK-+O^%x?vK|3bsqO1eN;?zv;9w_hEtr?X@^+(}RPKxEP%OK5L|X_#>&@ii+oRkL z(5VpuEkHo(z5O`xIFst(_Uy!HfEV&fOJdTHZ}D~HCW}su%-O1lONQ3B^)A6z1V^jw zVfpR5^DSYg3i-{(Kl19yV4z+Pub&7-RS)=&Ixoedbf?S4Q^IO7!tO9JphHK zgU@cou+M55lc+;X9JZMQepD>-gdI%gEf2?iHZQph-RK8WU5o&*<=rFAvdc-d z!1Dv|{;A-P3eP6DuuI;|((sG-&Tymvqkx%Vdal=HAt@pgiEs<%fmVx8EIt&f8vo6KO--wseb(?E_S} 
z;_BjQ=bc)GZOlpv*%qY_fxy#5sO?ssjAFq8RxH0N@!%iU#P^KaDigld1_VE7+Apwa zDDX%;)ebnVKgxTOUGBDfNj}}A#eA1thU{=`y!#n%FHkl>uiYfg+hGz&xVlomD+O#h z8+Z3bEL8{+T{zi&I9q81=W0gjKpCoPNqEkW+r{kO{WL5;;MMlHx?+mE>#^jgXHLkHd^0(gryIp@{%d`fPl(*Uf{k zSrrh{JF3V0C4heaDhuy)l9px( ze2xM}!@h__yy`WH?>1&r@n#n{{tK`oS?(P$PfTZ5g*wG`%)usu7eZ>Ejsbs7%gHXE z)uIno7LOc*7ah=w<-+9LZWrIx&M9^~Eapn%+Mwaw1vZIXR>*XHu5%st7dit2?CK2@ z%c>0Lank0`DG0!3@*%%gk8P!h!*Rjacx-yjWG-6OB3wG_m13XtwcV9xyeV;> zFL{D*G4!}kpE2jkmPvuQ-B`D3aI~k}r7OJbx~aM#^%+jQZycofka9hT#Vj=GO$)nt zI59m|e^lJFhqKf$Bs~tu9UlB$s>$efz-l`FZEJUY2vm3)Po&8?EPgmYlqmxKcIbQz zYfRQtcQ5c_>-Aat@ZL0q_q>y?DI`Rr?8$L}iZZXWeU$56th zcGcgXbtq2TUfWuVI6lyIUQNoM7;bHHoMTDI_x02sB+OT8Yr1)?rAiGLN+1Z7yj?mQ zyxz;cFqmhEWfe!mf_3FPb`+;ab=#ftMRRfUP4n0CH^sp$~c zvq}`n>*Se_!wKJI%|P?C$w+*@#S9z2t*Z^Y`Fuh`u?oR>=*OkTH~TLJ=yBpB=+7Z8 zQ5I(}f6RAIZi=ez4R#5tM_V!SRKeb|qQEl33E75eTW(t!;R4s4{2gOU+sZvL_;Njy zQ8ML*Xw*&XFactTAN@L^8G4F)k10(f8PdX+!-!#jRx7fFN;wsP zo5ju)P0USrAq+7LOlE_n4gka%X=i9?loR|JkG+g~I5F}0v7KY+q_lnolkVh^)o5a@ z7tD-l(?UEPjz$$u!(6i@hwPTy z0^2SmJdBQ_d{|TLLgK3necTLc_1*z9C70pcU^Q`7B3|#$Dtr@Gwl>QGz9HsZ{vKp< zLO7kzk40@}*ILDfLi4q5mpkvscQlD6->%}Ai9JHOb<($|30R?!-8>o08m{^`bsnfQ zC6Nl$!#ra9@9-BNlF2@vv|QCGTg>$sgj;lUr&soZWSh~SEeE8!rQzAr@W7Z$U$CfT zSx#Y3O?nC@IT{w-na_r>Za{Yy7ClYEz9csf@?f)yIyv8E7(NsgTp})sYxN5MGBnvx znP{*En+>h;w?vk{M(oi)$+Ftp^ty{J#$n9FXap>3C#?|TEjAWcAs_W>>(&nPcNYN9 z38*yJ$dg%(f8|;Tn^s?BA3Ul9M%Z(1dtQUFl&r#lltg~6Op8n+Oi(yYE5kW(UblJX z#jna9u=1Pw7LfobLz;6W<1~PLN5iG;IxDZc=2K~c+sRb!h_|dhZ7s2!F4nUo+39a@ zQSzdK?MZyhAeB9mq{4VuAqV1%g(4n}FHMIAlYOo)t&cZ@hWD3)PRx*4YB>fVG?s~E z;FMv0T@9!2GU^7@H1P4cgSSAq#v)&q#e4Is(m?79z5AGd^-}R$!wotO^;)xgf5eb8 zP?Hnur{kGS5Z~2$z(vni)Nsd(7v;0pt((AGcg75a*>Y9YhpPiJ-dT_07;FJsAF&;c zd|=nrt#l-bNAzH}%&Mhp&gO37G8KY)ZFFO~2-DZCTs+SuJYUF+(aL&%x#JRfzg#)q zZgT7nxNntjkCX+IvJcf9E~qcnnN#vNyTnnFd`5K}h>oy1_c)cDA0x{3Y-H(^A^3LE z{wj8hNk)Va(uL*E-DBHUPwNzc&lX*0r7;WylNq11++XPZ{>-9(gxjb@u(dIuAAufA zuc-p5TDESMQ8%9DjzQz?kG+tsEs{${I>u(VCAGn!$FwSWen_!_x#x;fG1?M~VgcXQ 
z7uBTmyQlymne@+(H_QuZg_`~68UFTgHlmKS;kYkV+~`4gLpVLVVWJ14E3KNUh1$h= z(&^m8Ev8tc;XdhBVEvZPv*+(Lx??EyW##;y7%Pux$`}XxHYmpdPX%!4h;cfJ;MY6t zU=)14i+I(vZgwuPbbEldt|u8F@s6!B9>S&C;NZZ(p|nTrApL1i5W&HQT`3CP`3U^}4tMe0@0zw{@_#JKrMJ6t*hQRas!h0EP4 zNO8GQ^ufHGpt9(a^WKlm)+rz0se&7Qz-ex<%Pvlo%xR;uUj*`F`UQi9+mFxbF&BQZ zcW>Y_J9%IMdC;8F9Ojm&XgKCY%`t=w#`W((ooskI)EHttgd}zx7U)l#87!e+uDNT5l-{rzwzc z*fa%*UJ0y`IlAo0TMN6rA+Ij?EI#IToLlpn*B&brw>RQ^)I;Y6oJzUTiBN*A9ly6! z+h%f`z7lcUe~;XYx8CywT`;;xo?oP2S%VfHUmP4}LvSC#8`nDinh8kntZ zh1JG{YvDKH3H4Yp*o zna}&;ADrYK9<$+4hz+LlTJ?A9Rq`sQidrdl8&p^-BSRkf%mXJ*PlmoCW5i*DEILCa zLM&1YHAqto0WV+H_F18gKn2)gLZy0Bhmsps)GQvC!;Mt*8i$)BZU@kq-wupIKr(lo z`{+^uAnHhdzCUf*N}c~g`7Po5xPz=-6v!JIPE5Jzp$02UvU*pcblHii-LCz!dNnVE zco4+-?E}N*vOqg_@NPZBcY~<{*kr7>!D{-fG_TBr%VmrHzI^K)S*GJkPCNgognd>M zv7Jxy6Hng*BDmWDR^NlC$TYo8K`x$l>!}azLSvDdu9pXUFT|bl4_7Tq)CU+XM_2t4 zL&^L8q;ss)TRrY|Ue$n~&t`Q4f*fFTaew`0pnsam1cb_ln$KruUuq_l6@2WK`dX+O z&TVoT^qgml0r5KB`nq0iczKrS(Z=GYuKfzaIKb1}RI~viCtG2q1%%ZB5C0tX?}hd| z+0oW?ofoNNAmcQJ)}bRxg@XwF&Ca(e#1IM>OiDlVTJsdojBeGeJlXVj$~*f9yY*vE zTYQWc6m58qAvQb3tsbDu1!0ihxM3LJCDeH0JJ3vX3z%T+2#k1PHhhGNQ5;)#?iz1_U=%S(=8%fDsSU%N#@6b;jk|0rLj-@!euZm?lBBOv}gP5j<*yHj%xlP|=dow?`{1Zc8<4;9vak+KAxr za?ckGWWTaBk`!2dw6$5cX0oNFoL@QkocvKUhJ|;63cwuOZ*uE<6`h`;6o2x z2NABwqo6AK%I*@c%?^+l%-MRnCtT~lCW69Qf05)aoT zP!G5J{7Jhi=P73`637KffQv{O>T=l~cAD2IxkxNkBRdOE=4Y*`;#K$p5$PrH3sf6> zJV4jk@jxfy_N#hn+#Mw}N&2=P+8@UKkmT?Y&E=wyljxwj>I3aJN)T&Hj`#e|P&`NX zG%(Bem^DNjzw3Zb2EG5Fgx+8LNtY%Ya8mA_3i>5@9{8zWUarZB0_<4eH=2W!{k25z z>zfzS&ds`cn5oLJ5A+|6zv&Of&Wefa)S8r@zv+q=VQf?x9z{854B5AwVWP|T;kCNb z38(NsnseMrwrSXaqy95R`Hf2{`zQ>LU0>XYR6g=(5r&-oOAIhOP0@4M9f^xF2yRFZ zPhNsbQ|1H{u8LLd3$K4KRQyI|6`&^R)DMDc?W1>PoY!)Q;0~l;f<2{IHwd<|0aq%W z-A0WwdADQ(f++ZBYzFF6w|y%hL?*}`f*qzD@?rkUpuvTV!7VRFp|@3Wgy-S zi&~C@cob5L(jyT=g-XD)7NT5ZrE?dG>2DTO@?y$xa}STc_i5T>&$GEYRpF0-aBBL8 zp@jLn>S62aJy7P_WkPxxRdFICt5sau0U~0neZfJ(K)UYHi{it+Bc^NH8ox±ZkD zjhN1DrGA6V6AVM$XUwx|gR-~Zo08Cn=mnq70`A>(qc=KEEXw9yj+tQ*k;v%GqkdPL zjxD8|gh47UKd$={O%VpUN 
z?K<6W-biY>-z&F{YVDm+7h7`FFmc~BxgFIAUUQ|E(pTU}*I$UCVC>Ld36$*e8#b~g zv8k1`d_Zxh=L8Xa^%PMGNvr)|R{Qz*cO3UBHCK#fSC?vMe3OQ`K~H~-#@aln<)O4KY-B=Yq?Y`^yLg{fC)BS!j&laKfgl z%a5RgOb}XnN`c4`bAA@PC3zP-i*Y|Z4gAtbdyuCI^?jt1!wab& z7gaRI@V(GOK=bFio#s-HGT2HU+b``!mmMDrH5czlBCR;HY{G_!|Dy1z`P6_Aj;WT6o7NUW)L ztC!-luhgG4N0IGN4vP`6RsK}|jcW)gtrv5lczvEv-%MdFbCQ`4ynRzicqFsZC+;x< z#vVs?yW4;VnYm8o%H&^W<}!ow<~m~d8huu~byK;Vy?!mYK~GCOKaRJLX+;CRI#aSm zkEz3>_0fk4s907)=Uyha^|st@<5D(K2`ss2K?0B*Sl)w5yh9fnL8yJxw)w2cKo|pz z&vFZQE2mHPOZXdY(|T4UDP{6?&Kk@FH%&=Z+v#7k61(J)pl>QkA{WKkmb1luq6v9& z6U#fee0oeZkVkwF29q|tt4p)OEbft((3#eNfGthyM)hEHB1}4jV3h73PrW$J{1KtR z{MTZtIQ!vmLmb4#3XKaO&d7C?P(k5+a-7Uf;1O?yVQU~g3z_Bx7a@eoSY9PZ;F-uxqy}`9|j6G2a z3KWpZoRp#9>$dlsLAUUiw0ZTZ<~~f=mN}?>zi*FNW2D!A(3lKNbbn&LI@S}`XhNFg zgK|mnWxdn4SFEClb?2iWGSJj{de=wnen~3d)_GSwUYFt-VaNm|mxAwS9+?<+?qa%4 z)|!CkrA5@5qHvI|XnX##dAyx!jXdqDgJWLoS};OrMVyDn*J^3N0A|I`vat+89?w1HyRQ~{EvT=b`V({>(2TB?(9To&jDDX@JeOVZ>m z>zy0_$(eYdeOBQIskxp2mxXtrjUP~|eYd-di7^r8E1owX>#|yQoMiR7k>5niGfbpc z9@^WnVO0CED|LARQ)RYVbldOmt|_>|HHvY7D1G=$y z+slL>eJNYjErRT{nvlS~?ZNw{=0$Qj*GY%r|K#6me$x18&6@8?bAA`R*yL0!QT%&M z|MOU!5}4$IXNf{ig@#0Gp1aJ!Z7mBfNxm(pVa%hkrW+e%oFwwT3AYW|RZE*5v2T4V zO7MIHOO;OBZ?hHOiewue_pUSs*O-J4Hw)Fp^wlc~7g(^qN%(=pw?Y_;!Uy zo>Zx-eG1cXk2CCamVf*hho2S?SnXd4e2qo%qjMPiB`&?%}=`op?G&PrOB10)+@L=t{ z)t&rbs|nLw4)EbQ0kJLa$((X|Kdt?lj#=(`R!3JCThcna%L|kVdl^a1{LV1rEiR*E z&#BWGxb=ysSfjB@T^ddVrNrQvQOlFWgGs99+`2|qpwuC908^X&o%M#Q(nMImU>DHT zFa0oVZb*6pM2GLc>pHXl#^(3ANs&I@6s4EXiI$BLKjIzR7p5z?Q!K@N75Zd8tET;A z5fSKO+AL-jQGU41Z%1d@8;qz4%_nVAs@;}`*mVUtw;dBCD_GrOXKhZ_ACpg5KL~T} z)7+_fU#<$2(^(sI0uG_<2QvvcOJC9B5U3e~Bc!4BGCIGm@!aMW;>eq{=b=)UV5$k^ z)i+uKaxta}Aml~3O*{lPuhQ3Ev(B*wax|@6gy0U!mrl2)niJ#Ca=5`gJ@;oQ@7ZXg zrD??^XDz3=`Yx`>eLBpdcCXWHB9nh2EPfW&`Vd_ECh+w6j8}7R4^0AOG5q;YAU*zX zm5OyWG!PH8CTEX67k2R92I!DpLn|^eQ%@%VQfh0J~-WRr!|6o_e|#+ryMtu!Xn(H8x?D8--+w? 
z7gf+CihTHt^mAho#5T%Pp2JM+JoyY*!)pI_No!?1NCY;~k{)0*Ybc9ULT^gCKB5Bq zR5&ey+x?%49Cql{)Gl>vV%QeyG3MRSL8TMDOoZ>lH1uMsis^Mw{C7f&K^)+_aB6^t21En5PKIrUxOkOcxFUt;O0-Q?JnO==4cJ{uoYNmCAvuBxz#%_0pyqX+C<>tGgYx~7TO(SWpk<2stl}FzsD$l=x6U&1I)qeB+kL19 zk;<_X3bra`JuM6V1zob;T1brP0d@cN2>tI_vwlc3E1u#kUz0L{QL= z*Q~@_YD6pFUIkh~`ix18=2bwu_m=@d5e=A0*J_P#vipelK;F!?M5yz4*(W0u4!rUU z{WsU8TPNK+{vCPWPZSvQFdy?$ZnrFXjeIq1Xfq-S%p!COM#QD!b?j?3=|=@0$dV|xi~=x)EWMhoI08g+g9(LL$j!N`o0lLk|py>CQrS}%Gi?H-z=#EYJ_ZKm+o z{{XX45A_V!W4Bvu#_D$n28pY~% z$$GE$zQft&zxq?K1Y)|v(4IK+3y#*qrHW_c4eawOeI(p9PG*0Ux!sq#DgIuz{k|$O zHM|@8Y~^dzH&V$&j?x=NaR-2lz5lA(!9tw6@xJF}6N#GM)Cn!Ix`;Gy_DbOiOGqGq zpp^(}NAUYmwrhI9$ZGfgIVe?OPCn~Pg6*1d1((wfd!ytBTV5CnwLAH-Wzk$Hui?y| zPNo={^^wJD`^_H2LD$!3?_o<8m6fy^!ei+R?SuAMQ)NyvjJ4{n4QJBJ{uxU9QPaGP8UtqeTKm7iE^4w-uk zN$n*{s@Et>wbOf}I)oVIvmTj=hPXMZjmWLj?w>X^bUV+q_!677(8L*q*Pv5`%+6YM zY1%t4ut5%^vMC#Fa88Izh^|yVJSpuqonHJmUYBX7_wwF803TDLT`#+Iu+nj}s98XE zK9Lzv<-CK0NNp2jFX)2H2-kTL$V=SplWQCl(GTW`ual!G>D!ZEYM!kKzJJb$TZ#S{ zlq@ywCATsRGF?$(7t-x_v1#|ZbwacwF%v8bWBYPypO1G+?9=8(qO-|aXc##f);1pK z>BsUu>=8zz1kP^#u~Fkv#qfZYi=z=|(k>rs+*2_`0PHlcUAdOcD^2t8qvEk)Ys-6+ zBASUGi(f5!y~;tt4JPbXiN>>Hpu3u}hD}FQ-j~C(VbaxrzpU8t0ci|gF&0BL!=deb zoiiLy2QETrh`P)vz1enyvZZ?0}S3+TfTMoX*c_NB~EEZ6tIaSabHaV zZ1#E*oGZvKU)f4QwZVLBO}%~Sn<(5lS?H=Z6rg1U!Roh!hpzARqKFCAw*}v=V3wSm zd8O$+5T`*@=)GAKKd6nt+lN~ro# z=X@i%#nH?M>~EvdvU!V2uy+fPD8o-K*h%eQf{^wdS(is!w$J>70^{nTR>>#q{&n`- zxyoxw&T#VivA-=-zgOW9u!P*FMp8t@0&t<;pdoz4B}$2qzE-PqsXO@^X%tTL3?5ZlT-S2U8$=3Q z05?5A{v~B-0ObRgCjLiwQ;WlIJJE}hk>bB#%Kznqgd)rjX=%ms6m+S7M0OM9eEWKa zP5bF2{}El4e3!w@EZJ%;@cEwyBk&dKxINN(* zye*#2EwTRygT}rG4iqJ`gl7&H9gD?4bkdvh{(}M(p3k{I9D1`$LQrZ~}nDb!qPZ`z!x>3I3m0 z7etmFcy!{}`Ze!<$>lE5Fyq&GUEX%A$FW5R$oAu@kHk@w+iV&mE$ugelz8zOWo>i2 zE8@5O`Hs5gDlo|su2~wY{rM)v6M)Ttly||mhC*pvJ};gfmv@mEy8wENiW_XKEX2lV zmw-qSX#et!zpYl*?)hg@VMZ2F~-tbX1MBSHa-24H-qcqhAdh$)SA=Bbx5$uk z38B6H>%`Mim5inv%_I4RT1?i3Q9}OqFu%u_Hi1ttPv)lr_j-#gPKyCamqCN=q@WAk3?t)Xa> 
z!L0eG??;HARI1{)2NJ~L(7pCCcU*wxP;9xpJ{DDTuvv-I4!~L-^nQXbw9l)Zb0OH&)93%cfc@5BK?wN5s0W>cG5v@(9Iy-Zfmej0W%~6K!7t^z;Tp19j{k7=@NqLpE#QnEcDd-rEKh(LOR$UaU$5>vG7T zIfKw*J&Uwh>y={Uw*IzoJ|iZT&yBXk71LoIpg|?UZ=v}~&LeqGqRZPK6q_86^tw75 zmMf)H7b_2u#}8d&VLpO{9>+jCV4s(Szc}iK&5aZgSdini>}5#QwO`jus ztWjli%aAN!TqT`!)zp&bqq3&nAn$&C%n1E(!wjD6Cn-zB%9s5H^d>AZ5c5miD^+b8 zkLNT&7RQnF^BAwx1e{zxH(|zRqP>3 zM2?ZzUgRB6{cfM;!_CPub!~bfWA3-#Ovc-3<1P}N{lsxR?kGBQ@BhA65E458TEnPk zZ<59CiEFXlw7$mkg*)5DYO(1$wt&NW$!N?;arG#r{AS>0G+lz$s(Qv^Vc?t%T~x06 zT{bz)!-C=c#iEGGa=xmZcz)G`a>7so1%SHSkFw7vp>1#V`@|1wL@it{$a)_eK*pl= z8Hbxt4~=%FDD8i|>Zn(_#Q zqb+fg!BjVg(R;l#tG}QvtQgR!kkKOIaf)QWf)?4a6?9$fY&III*^k5to9B*8dh^=0 z(7K#6%3=EsMThYx7+dHqgHswww^}E>50Ds&>1w#?o^#o#QVPpYmicps`=vH?_kOZD zP@hllkoZfD-p0p5o#x0;Op!7>;xwBx;vW9<$n{OH&234^LIZU zwr9SOh$Yu&t4yKwwSi)c?aDm*G}c=&L)=9vEqLxLDGGQoM*V$M7vPjXUjR`5gn7Q z%l-&RP3rMR&mQ6nD;XGpE`c5b^Y%zGD^x35Q`Rib2EAo(F~b+!o`^en_%6B~Pu#WW zYLj*|M+(P1bz$w0rn5v{?-;V~;J?kP<4@Tl_r=|3sSUd#vsR@Ynoa4&hIV5u_2Fga z0m|GWb9$_R43o>AVCWe0mUI+=rU338X9Juh?L|HZDjn zzZ+#MqgAF&wm257AuE*)Iz}x*XSmlZY+0xcqmcS7Et@W2pef&!_W2<7UwSkdQdk5h^%GQNahvFoW7i#Ab@l9`3~rtm zvkI1mpH>BRtVm+QVCFDae%l%9+f1W!I*`)ont1!18^@7R)_&3?HTK+&UUNDlify)4 z9DvW;?0i501dql8&(Yz1@K{m~h~Hxl7g<*UwnSzaN17|}bfz?g8(b*D>$cYkzZ2R| zlIZvR_y>LEFxmakcGJ#WqFA@Q>}r2j#JT8Jg`6HC(=uy5Q}!8dXkR0l+3D9zvG%fS zP$~E<6T)5oxS<7*tl{SvK}88p<|tn?;{j2w$H3{F!0VsdLu0-6wpi9?eE%9ItNF@m z4qr{l@sm|)KfD7bLIUvS?OI$Ji1!;{L==w987}|&;-T~#`|?Hux5nRzYx|X zCUA>!@d$`hmv|&rlq#Hv+{GMSn#*2oE5K+vxD6<9#)i+{a`%mV?3zmPLwr1)W}hoQ z`TxYP+5`M*?BP}Oiwb|`SQ;wcwx$NXgMfYN8c z%b7ol6E+jp=uuy$*sLOxEa33lZJf#$_f=rIO@yBWKj}38%e}oxgVS?xgbYf_NQ{bke?-rm4{)B;RcF;rEE7$+G2c;@SE0A=o;yf z6zasw4ic}Y(8ci>{^mtuF4qZ~;K4ZEk~$JEw*37XA$XMF_H_@yGU@YFt|t1ldcY~v z+7pw&moN0i>$$7Na)SHC{M&d_`JD4juOKcv#gnHbZGTp4wY>MZhJ&VY3P( zghs^s6+M2@^>jMoazIn^ah(-q`fczh*+BHyz*%C*;QUf!c(6l(Vnv7HMdLwy`svcK zmVXmx#DO@|Hqq;OU}OuI*_hj`{vK{4>nzd5Aa(km3fDD&jr?^*Br7xgBJRuI<&_pT zKZy~bO;1xi};m$SKbY&O>R;$Hu9pV 
z?CA&P%!ug_fJin1X@61v!b$WvNiGuWxl)uxMbd$?>Eo<{xL*BKB}D)ISM)<{Gop}0 z?ZU7H8=MwyAR3m`u1G2Mt?2tFnQ|7so+r>{6ZM$z+u(Cd2f+AQ3mMCEv1&&#TZ4VN z`rVOxYQ2c(#rBXDQoXe{5BGS*`Bs!uS!el|&^|;~U4B8O{I=<*U}`lQmZ#jz3!|V5 zsy!fzp5oN!%+K*;K_Fg=5z%3LK*0fpSk!tjqm|#8tL@QMaYfwt`K%j#IA60)E4Dt} zVomy?7Z+*O(zvo|M)p%FIM3$M-Vx3p>{`-f=)61##WOrGv6AF5$ z7_~9@sebFGb1FF^Du4Sq0E=%%fVbxCy|e00p{ zv<;O-lHispzm8;s_kS8Lx@JG9QnpY@kwtSg^UiQ!SV9xH-RV52z@+>&40k*bY^ktC z!!tU0UTO0>AA7wKVibp#O~Bse@p8#HD`nChLLigfROt+pKE;9%JPUnrjR=bU_kFUA zZhyP#(%a&6}D{ys8wuKRJ^_@mlRu3g`D<~lt z)$z|yOv)oU)~cQ9(Ujb>NWB_=w$-if?lk0@>-=&!q~Wd5I3a&FDzT3Tt&g|KMk43c z{$wV&NV{}y%49YEU5FR8_!R3)M|68ZAZ81dlJ2nG_S`j^Nq(##o&J( z{C&m*#F1yewVE6oVn%keITyeU>kYlM`bo!(BV5UGqZq{<)=bkf3 z%(Ax9F>lT_D(4YLKwm}Q==iDWp(s)5Aw+y|glP=n+H83C0!XBR_E*u{X!6ssWcF^y zCpG}1vt)jc-`68QR0Me29>(Lc8m%3v_bDx`fm=r}giNOKX4|X?9B^-i7daJBpV_FI zB<;YM5W)VSaErW3XoqE{<=?;0vyJ^_S zkk`9s^v3HtDabD|YK*G_hE?78mxJDr3nrH`ahYVxd?Vz%e6p8J#&*kH$r7Dov!|9iPX3A-RMdcxw2QIpi6<*xf zm25fP%MeL5W*7f?ZWqs&8;8R)MO=E-L-_=@!s(>U;@))E72OG`NbSFHpbjSK2(;g3 zn7z!0$MiOM-O3d^KDVmvw*2k~RP1OHA0GYS*M%^2B7q!Od&$SR1fUm7%6oR4b{h|rWg)i{ zplQLUw|Ic9EB>Bs3Y9j|eYYyCyOOUd(?d( z4aNFjGSH3DYsCGx$G+MLq!J2LS}01ZI<9lZHRU2yyL`5`VJYZ4;vKR^fXhd>tyrkK0>>x zk!<+w7Qm53l)e#(VTKwWlLOjvNq?ZoiC2}l`C`dKk`uv_Nhf+nYk~!e`eHPqb;(x;W?%&{g;Xi#2%ffXJklr1o8r7v|#&v zcbg^?8%)(<*s2MO`H`W$7Yb1)%xn_xhAJqh7C^7_HaNc`cK{Kz2T$r@PGya$Z^t_) zaHZul42YX@hgFN9l9rAP%bEj8asR+{YKbMeVqdJroQ~X>KTD+xEL8yMf8Vm-Zwbi% zVU8#Kfc)RNyEV6C{(z1K|KNVpbSoDW{2$>~{{h}jHEDZjx0be+=(K%jF&UvzzVcuZ z5qDzeq)Q54;}_58CqCd1kKE%vXUdr=OAX^b17!NDtR&wH&Y84~usWXj0qysE@T0eS z-FB-KUm3~wD2mSpFxeR(=<|SFsZmcsdA9P&{{DJD(VP{3eNLu79pr?UVZtp#wDfDR zIzrifkxt^-)2=1)f-eutxLB1L?_iK+)H~&IXm;5sG(JkY5PaHcNfdCNmYD>9BPF6^ zSUVXtckC~CPv?C1=lMAst$R3yI?4>eU~FAQgG6Rp;XMbdIHFN(QAXd&XPZ*-& z87XX%@c}2E7>Fme%4N{!t@@ho{!%VBK5sr9mJ)%xN5%g}q#{c&xHsdg6KLvBEmu$6 zD&4qE><+nwb3cnAT$En?38?^4g1tQ!V8)ZqUxX3?1DxnLh%gk<+K?P?n?cO#0wkzT zLz|rgm>;?j;UCj*72ZM_!lcDR|u)5RN 
zU+h$oM1qs?TgqBGp$Ji;qWPn#W-34Gj5j7pvcz?aCDMgoW;W)d37m>(>Q^wf0s+QN zA2$oN@w{(u-_EiDnDjt&v&99L3}5Oxuate}&(r{J)R;j%G*N0nE?5j-<#$TaWvafx z-jLiO#d1K%pS}beBPMyzi+z)70CICm?5jH4cZ`G^5NSdf-~nmE1^q5RtmP%XLKERW zESfLZZ0{^#LKek}X#4_;oYYup4`I{`9_a9Zy&TQx`0|v*NjJfL%inTSeDIMw>FGnU zmxl2you?=+qpE52pg@@aJT9#1tyDUDTS_NJ|o~j%+BE z?`(TT*R_H6{liRQx=0t)DF)jw`Nb`-F9$P4#=G=>^Y2yoNHWr}a38dC6?>yMQ7AFO z88Jp2esW``Tg>)-Z%eo$UUoTRyZu^DjIRzGCcrT5mG=>0hnNF}Z}99d;+AaKiBmG6 zo-M2GmML?Ep-2$9+#~a*8*g%PS_22dQxI0FxAXle^Ad52$%LUR?Z7w^vDUOe;jNOi z-ian1XsMpFbxp1Jx&ZC4zdm>)$Z5s{8zht&ul(P^sJjqUyrU7B-+$36b{O|5=2h61 zi1p5^@8KI=c_GU=;-s-dKnS+bgBsJ{%lLDPZUrtPe#n{H{@mr}6&}-{gi`s*&x#?E z&y_z3RW@rP=JcH21=ARMGtR_KNa`HT&VP=_v2FrX*bH0LEhYyiQfCl>kEW!s>7g-|L`Jw{7K;b;LleC)q(;! zZA{ZI3$P;dW@-tGe?qZxZu@~^YQ2zC7>b<@6@uci_$1ukwe+_2Vc#GPm4Iu_tMiJD z>*(%4_3VA1-@E<|AsBAYyNa<C({wBC6FxCky3S>gF-j&YQp$vsYr|Y}Te$$+3!_{g^)=W_@U?RlDLsmP zm-QaZJCNNzDqu>SrG=g~obi5rcY;Mb`HI!o+^MC2&YvV2VVSaxPh_IfEQVTGdNN_{ zcM+)b|Frj>K~XL3zc!nUL`8yRB#VGblq?w}O3pdw9EJe_0g)^qAW4$soU?#}WXU-* z40*@{4B=h4ANRBO`May$Q|HV1aHy#%ie7ZMSFc{(cVGRxwMemPHcq~wE_q-o@9J>( z?;H1<8->z0m3hLVsd=TaV@hAOITN~)hBx+ujsQK;ryBHr*84P7GmLjIF+&t5#<8@G z&v4uZQE`^3=dzwoUle30ZI*BPrfW`FVjowg9zCs~SE5?cJt4%Q5}wLWd)M8YE;By_H>d6b&(&{ z9i1#jm9d49lHW6-`{f6o?r>uEhe6c`cV1+V0$xKs5eflDebaTg$5EjAW?!m+VnysNi*yp@>v%PK7*j{tDV3 z-g)vLH#9?!NHC&-UHh2VDM%V`r9r6S&vDhoX(N2Bnx?<)wvOVZirY*DF;PkRMLgi{ z=pViKSXz!#kndUqcW1jKeV4x8>i_BLfRG7s0z^uWfG=`5Zg5$Y%v5uRzgZ@oNgZWs z+lvpb`m8kGKneOd$ebFg=HFO|Lrzv=m{rmY5HTf&jWU2pM!z?o$|u*xVB1oCSFsy? 
zSA|0@g)^gz!j`DnoEJ37c%hMUWYJ`2EX(J{AP`FEp>R; zQh$ExKvRsM(6~&^Wt9NQ0vigcViT-$3F$#l#^yofzXSta2ht{t7xvlrE&C5V6+W3R z=RkHET(|!42|0qJ1{QS34lN}cL30i4rp|NG@C{bpaHOqi#}RWT5Zr?=T+j7ig6J#m?~tu zKIcA^xizS5;C}aWb2~l4L7&ioqiCYoUHYyqHGfw$hfw@dUUcp>g|0VL#pV&pTE@cDuLLwOF@Z8m7uAv4md0;CWZj;u8z62V9p5N%pJGfAT~|o6kVKFyf*=W~j*9{#e;q z^3zutV`bCY@90Z0VxBNf3dyD4;AxB!^dEKI6aQEY#r+DY_qv27esAet zphmz~085boXE;h)3|>t3to@P}DRF8{P9?nVr|D$N&WM^a*+Tt!e+|;{Ykf4Th4 zM*o-D`_p4$JC#if90C7xQ)OJ>0bK?%hoVb^(#XuF;{Pn+@tc7T`` z{cps9AjL(ezA7pz&&)55Z*0>K20Ar&jm0DQ0AS>+8RwKb{XU>i*X!7JweB6dDss(y zyPIiAnob{ZgAwpXdC+jv)o_sW5{P762`JSipIw-p1B6Z)7nZSM+ey`s*}#muu^|)| zm{6qIt*v*x0{yuQ`IZBjfS5(>EyN-h&%zWZ(ale9$nqkMJzDJIeymJB#dT_}S@F&Nzpf216$vsJ z?oAUOp2*TJX@cNA7=0hSg&#%GgAw@g@DILAZ#oftz*PPSVFNo`UoHZ-865yL6Y5Kj zAw#MFG@Z8j--y}iZ?_lgND~8Ec1ASiCb$&;zqIVG<$BKlwf07#xW=QT@%;WgXBXAdT<> zW8ge#)a@FJVV?NV<77N?>iftTmpUQXt7Ge+&rL`$)h8yzEV>^jt3}0&ikkkP2kYi+RWPI z_bzwawIA0lBTbh0-lgWY(-p3XSfjd>E`<~M41oIHw%G(8^3W*&fB^BFl*q0e7Weeu z(}C~`blU<}1Nd&)YV$q~?JAQ7iaB_~V;@j!GtZ`bvH*;Y5PnK2kPFUZKiv9$*JW*q z_wXQeJ2KR?Eg-1hyUfGslUp+OIfv{|(K~AY3t2z)pp{AnSYI4bZ@14=Y4;7Q&G!ts z00n_8J_NW#Z$l@o0dkvnB4V&!D9gCP#&rS0E3bt(ID$mxFCxc%i}yaxRj?;k2d!Xf z(5E`nzY{qd)};J9=BsIe_55yIc5@#U%r0OlC>Z@$OSyqmc=ugul@IwX%iwi2Ef1F* zAkYkdo`WVKtDWcY31N|tia`8pjbc59{Vs`%m_evc4YAjG@I#)~gE9rK$HPsi9F*nm z__TrDN`U5_d_1X3yBk|N6=U4*^F z3r6+4hmTObaav;ElC*^H`9{OM%TJMOtrv$=yK4-*2N$T%@zWU2qGCDP-@LEJOyzSH zd-hB7&T}=AmyDo0ijsF0*mRc#;2HXwN7dYJ)Pa@QF2@`2_ZNc#xnd2KE=L2rFQp>4 zdk!JyH`is6m7s05y07mL*EkQVk8A>f4Nm2e&kD9HOCtB`^^jxL00EkZF~aER?QV~C zHPKdsmtB7d(9lV@*_=Mb!OMPNFd6b$S1z%PhExnEK>E%x{}{H0At0q`1-=Vy*?O2N zJ@*#hT|L7OW$hPwv-NORM#?vTTbabfYA(jO(+m)cqV9=@=I|RC*`ED-}BikO=7pYfy9&Fa5hiKW0^L~h3 z0NHRKosh8P%F&8_cz9vEI{rOL;dbkqMKAx;RzkAdMTr&u9uQ_zChv4N;>97ruf_Ez zKa-t)l{=<;?dsa1DZr=P*dKhEn1e;nY--YlfF(j+F2toiuQWV-Ws}d_)o-mz5iN;x zce;4O)At83*=?r3yjevmz6u{xp32nM-JB_ii^$*`zW*%GKingrE5`mOf`b$MM+dEM zp2NJni-5Zo6TZ;dC&7cW)o11YhO+*P;PmJKgTQ|hIK)1JpX@rUw4_0-0PXWX2%I7E 
ztscNqSP`K`8fDIbrK+YhK9+~jJ{ZMOooR;uz|6_Bazn$O0-;Aw{-=~2Y1BWYnZ=z8_J$?%G2E&$=^Oi#oED!s{YioKU-b_l#x z+>oCHj?3A;alvAFMHo7~RXmda4__7`Agmf6n`eO#-f?(V!kg-Jw;BK?Ovsz2uh()-mzzt9 z%3%zi^Wek+I01UvANeT5KfXHgZm2rE(04hNKRy=TC!TJ0PXT;Nuh4C>i7r-(OuLhp z0C<2aOF8?615a7V3gML+4a~klyut>}ShuSc{9!F{ zKI=d_jfqfdoR~xtJoJ&zEIN_{^Erw?}K(Wv@hWEcHJ-e3MbCNs1=^4}3!>JO)VI zCatp6SY|4OM!AS#WBRVa=Q!0P-)`onlHG4cU5IeB8amjxTPCBwTkzV5kAf_7e?+X4 z(7=epWB1{=cf?QqbH}PU)eGZ4Yd-(+ybQELao>oP<0*U|d#L6~dm@`I`|U|@K(OQa z6lA@bawU#Q(?L5|&b*F2E!A==zlFze@BC4Ym))oHt@C7^7B`xGRBXwRHmo`44r2Q{z@8R;JVN zEO#JNPp2V~jj~0=`p~SPWRwy|Ifx4tJ^nqUwRZmXP75H`na%m=Jv=2kAD67Ye=_FtUa@sa^TSSXuW!FDYl}q-Czf3_5KKj?s>W}g|7KYuueNv_0w?eZd^2^6)}>`H&`g$Tm66&RZ2~6ga&DdDt>wk>o)^CZw=8cF?Hn*p01 z0I@q5xfF%!Ra5%z@db_>GsxXf#GTm){ts#sTdySaxk~sl#%|`gB7*Gc93_MSk>pw| z4HJIV<{-3NKgbGD9f$9q2>v{MlX*|{7idF&3)+ZS9LFQR(z-K#`F^)ft8-zwb5YjW z9n}T9cZ+A2Kk2Y^W>hIfHOckvg&*jI&%gCsRTs#qaO)z&;au%Q3%vBp>=QNZl3mNG zKNyu#X#j21aQ6ojX?!HB--JP3ZrS2pcLO?IS9ZL+4D>Jjp;-aC$QkO4Tq(n5w6-aAOj3r-bO={c)buNtcb2#Ge9W$E4eu zWzRx3zS+EfLwnbq?laoCtWYR=5s6YzmxB4Sz)|EMbI5wNCc!EXEr)PQc4{Y`Um_=p z)6jl6@`@6@NZOq{9jRas73Z5YA`411xHbfNB6GN&+&Etb$(&%b^}hFGp_Maxc~=bE zT2kfRL){O=)4iqlRpi3&O;^>(EGO0)niW+uZ;`x`#iV_=;=cgBh-#SZpKECJyxwam zTXmj?`L->d55f_t4V;$3(9|LCF^z^VDL%zT2XX;?30fJFLUYXcVD^Gcc4j${lWgQx zW}|7j@C1x8lC0qZ%D2c|BB*zvr?#r!`+mk!%UyGba)pdNzH6i0R0iRvKDxR+=wo-U*(C!5|kun2zCLWbC zv|FYkz08m_wdMua=WDi#4|CbHtz_oLEIJC1BRB?KPv?O#9+s#&isicbbe|C5@Y7j? zi?g__>Kfhd+Utu}8V8|=?p*X97qmj_LyUMGy_;Z5$~V2N?|1W#?+K@`NP!C{>Cgf? ziN3qFe`Qh4`}lA((y@L}(+LmWF^cdz$8V0^P+(^E;lv6lLfsyVqxa$*ZWMIT%9E@9 z6yU>)-u|d98==HTyN!=JcV7mJ} z$o&2*G`8_aulCmhTRr6!c<+h|GAQk>LCJ@}CcA{YUI=^t4F|;BQA{Q3o=yg5cV_3! 
zt3B%N+Q(`D$9el}tU>#w?g)oLWKwLGL*=4~aXBz}1s;d-MKh`W?{) z#sy3bnPyu;Qax12y{3c)_xtm=kptV6p*FH%Zh6=acP9ez+q;_AUzLqCCSlTqrt*}He$aMza@EhDZ$ zbqqg=bQfG_MeSDzoYC?ZL}+{o_=gWBI~$cVRl$uE zV2vUSGp|*{-qr1^!o@Eygz=~5ypJv|aH-XXb9icO3`~dyMa~}V@a|J1W6}_ai$1?M zeqhw{lJ~9w;1pwG1r0u5F1mDao_t}Y|4!{L`P|!k37m>bR1qMv4b$ru*~JEK6ME)$ zxFk0fhGYw6!sPx-_%*506BW)W+rXhBJpz@uI*CVQ16ja2fh}YGWeas@P?%hRt6|-q z-_$YWWJ{7Cb(`R=E{!T0A<_NqnU*{RN}*$PO9Fri8dO zmu#eqoXs=x6Bm`DZ=e3SD$$tt;aN@4^xX@wn&tAPugT&->lNoklr)qgdsP~0ls3_2 zT3Nk+=kg#yfP+Q)8+^NU=!Yt+#txUiFN9h(S4%R%0+(U3crZQ)83f-E@}k3i3DO%9 zAwttKYqp#2&Ya)_KhWKB&=ib0qQec1Y}g*!p15dsr{woIWeYDP;Bxa#P0!jL^fCiUr^)JG7MpJE#ajQ}y^g}ijGrd=A-H50=@OB11lON7 zsvdgb1FlaYVHqmado@bs6xGE=HZ>eRD(| zQJ|@69_lWPrUdOgU7*vUWECopaxE}!bTR{|cRjjOSC|(gMS?^Av;#)g++{$)IT7Uw z^F}2!#j3>p1N9+RJd0|V!uRKWQ2sHG@!a*sSgMPtl*)L%ZTXwD>rOOmahAAs3adG0 zzpnPX9v|IK!k98DWmmg^<@|Cti+*E#TSFIavWA}+(KuY4T2=XJP1f`WN0-e+w*@63 z;m2wiV^{GUy4l)Xf5^uW%WV&NEQ+5{QQZfNjIYIdHlgnb=+t6r9FYff5`Xueg3Iqy z5AR*`tuEZ=Q7*x{Jha-F{ElTevLwXSVF}z zBpr4_;#d8BD8jSM_Cda0MOKPr{oBQo9FVJu*f<)yw);NXQ^=LOZb@fhs=GT}6FbPK zEI+G2;{$e)#zE(cUd_Ul(C^GW@I=GX{T6z6UtFTFTDB2WNgdAtbgi86N6m(R*Fhj8`j#C?cP zv{Y2m#UV1yRag4gL5%^Rh-i`XvEwAU!eac5LXydH64jiCGJW1w>wH)9wa%Q$wjCr_ zY>Q5-3KOTL;zx72kouhzZ`3i-Y>9;}aP|Y$lk}2hCbBG-b932}u9L0tgE(mx?YfZq ze6f5k;yZ80ic}{JcWQ9SXU?z)=jk;#E>Kr4D6AJN;$T7s+{KzJ*~fDIf*euu9-osp z!3NdCTc!o$6(g(GDligAWV*2FXKFl8|Dp8E&PQjOuy+@J@A)CqBlIpJ^$T7vI$dot zB+0TUd=(27M_M!;Z-Tjxk(}AN8n1moHfy8TRk{U)4->qEJI6rEEaY6V$z&NP-?UrY zELtxx!bunq$s4WtMa`Rx_RUq}lzz!K1foa*)tGbJjKccOZpj(_ zY6WIK4L#3^;#F5$(X6`x5k_R4vhfTJPfI_{ltCm^NA)gEv4wM&Mxg9Cjx4 zKDmVr zAmQYzV3#YE1Moz;b?Ghg(25DP$0yTS)n=N830(Je2*2qq^0AOK`LtlbCmq4-$^?8A z0d%#NCQpvJNXd?d-!o)xldm-5hbFo}(`+Tf>Csj@CYgNBKUgTtr$}Itm?3_+H1sOP zkvo(#@-C+CaH)B}Vl56tdt`H>j=aCbpTgDJj!*5d_Ff)C{CylH%}$r9lCZJ<-sYAr zX|+y!Q4e9)`Se?Ex`R#>j5BSa1Wy%7@fnB4iZIyjefyNfr%^Y9E^J(KwXNrq1Er8$ zlYS8Dq{Z`xFya|MBiF%H6%ToDIA#-z_pUOpYo;*LdVd;}wTPG3ejT|Qb_UAxeOwaZ zsoxul-&WFNrO`M?%BEX$TEwlJXkmoogBJ`-(NE;QZxC>F>gisM;p0|^=OA_D_0EO( 
z)Tca}y0Bm&@M7f?z9KH2veT>ft~@I0JtmsN?(qG_@5ywUg81&=g~9CSM>B%TzrG32mvM_zJZ_$) zOd}E0q!T&ge(3>zLAkWRq{d~#Fku-Rl^b$eigAdrxkrKZ9*fo z@q>{$aUHyKQ7dxoTzY|0VhU{%cX#KGnge+z{;)23s%hRXjeVt5{$dep@#9J3jUr!H zZlU!y@>{aVtgJ!Qvij7C3&D44fiMDA&62|arQOj+dxTk&qOWyvM*jAJvG~sQo{(?4 z#O|l4yS-_k8!-Ih3fhQA(Ehe(zwtAV-_dwt&qyI!^fruixoxagx*ofzGG7#rm%_uJ zxTUt96sdgJbsR$wWnGjf09d@R)a%*p2}&CYYeI{E4`+mN}92AP9XG2)Y=Emfgb_qY>A zf|Y+?#a>@+F>N`x8*jGWIa}#1<)#o2>0vTmu6nZdn&&1o=k2qsTQ5! z?Kq_5o%F+gJee!|ayh>997o{&R(h!Gz#PUOsL$$NlNl$3D-*`P@wZgdmlI)WE4$or zVW8SxD9aviUl~3-=+4mdxVh4J{1^iy2oJZU)*oWz{UgJoP|~l0sk6$uVd(?tUgy}mp(4S*K%iVFW1l{w;sa3leCF@M>()Ds1QU6<_A8+HI z2-N$Z)wY0cDtYzF60{_-ZV2zn>p^skOwz&`o8J=2t3`{R49Z8p|KO?;EYv)bP8FRo z9u@mx(!*)jM_BdraBNXlN96nHp&oiP<*ug=wZM#X$0VTy&gA#>KDj%k8CncAlj+wy z_YQ<9w<)GSw4g;D!_W7_Ub>Z54D$YILD`bQeWeBbF(i@&%sfqq6r6!m<4O)e;qc_Z zNw_mAr!lWIEf5KL`lTptU91anA$x8{JcH1dX0L3DisYnfz0-08zhJi{O)`=YA*Hm> zu{@4iLiSc1q;FR#(KR!oFI2T|FqgO=JwM&!BV%g?`L+BDBw!^ddiy2QjC4}*HRb3n zCe3RkV0s=?gNn)62;~$C3e(<(w2%M!*PluM{c=*KfwoJ2_%nx}+5G)0U;`PIFnhKf z?N=FpPy7S511KQGxcu#HY5&dg*KfmUn2b_&#@N42#C(HAAdk~&boci;XJHe_M>up6 z|EFpHES3P1kS2*Vr2XOVbCw_hQs+b1epe@8(ZVSD-OW9u-y7@<=w@BeuLl3f@mJ%`KtQdguNjbj{o$YOJ^;E| z#86nv@61Z@{S>HG)zwQRq~BI+<+ht;5Q1ob*QA(TpnF>&5|NO8Tdl85F$FTHQ4 z;IKLw5f z>p{zCN1_^QPqzKb4S7nV81m1zK74TQxyR`= zLzcs2xaVP?>TdsvhgZ)c@BwJsp~QL@g!T`U@ayquyX_46jtjAHK0!s}Z2 zO)Dcw=Y}$l*gUJNG03@G-f-R*&-4 z{*-BS71b4W4OtBb_PpbIHP&gz+ssS>E>KZo&Lho#SdYTtNj+-2R}wX_&0wo zOF3GGet;*LUb+ZE$@vMewWVn01H)0l$T*(nz;gHNKbrjv+!;jcJ=m8S; zvu*x{R*5CEel?19yhn9qjn`7FV29@^M;ADVjUdmG#Z$%B_eGH?)b-uMzHob0$Y~|| zjhBpe!wTp2aBigi{Gsi~TFOdJ z)?ZIr04jD)u5gYjtKJ${5lFJr%(7u?n*;oMFJDiVHsR@z&@xI`p(%HLZ5X)5q=C)L zv9+e9CUyX^@v<)ty7A-6bbND+v;1XI%2lfkpYTBrq5XOv;SS`0V@5XbekJ38&PN}S zQ`p3}4{QcTfTB`OxZdi4m?l)!=m4H3VE_F)$<Q3u*HWQQ8gUAo;~_q! 
z%rEM4ZSC3n9{HFq#-aV|A(T3zmP4m<3XO1JYpm3qDb+~~({?5znFrj(dU{Thp$m~q z*jfL~I}J%3uX5k^D_>`Ggd+xgOl#Q;D#xURI7j3X9$Qnh>sfV%h&TYlggKyCRvUu5 zr3;Pn)E-Pt_Icy+{SDgfuoSSvR=x?v6^@yzhNPX2HrAZBUwVhWwwmf83_{FcF{PW~ zI@b4e35h(L7AC7K&U&{5Zb83*yviM{ncy`vHDtf}db&4l>c4k2igB;Td83L$JNDr> zy}q->iNnq(wln)uuW~p0RrFD&Y&Q_2SSXKw&*tBm~QD}jHLE4Dfm9Ad-0gQ!_N z_etgPYg-q%@ltTKwH>z^J4BS-Sf}B?Db=azz#t@PhSw4yJe`|r)IsF&S((q%k4`Wk zZI795@9CJC0eIKFDR2AA<3;eHNB1txlTzIUUaZ*TbeQ_ohP{}2p?CYJ4>gps8m&P2$1I5c3}b;Va)3!juW zcj1RkS05}XC;OfsA?$(3xrYG~^IE&e?_Qj*Z*%O7TNCa^aK6;awqV1}&0(N!lLa3hATqs+0-^Ygar$Imrg(NGviA*P26he?l zGf;_c6~9|k%=R>C{MeS;xa}n86*KAi?5Tb2G_HI4&4lgs-iva_-J8~vy6y3{nFVi9 zEt{`@YZ1nE9~uouZDZ*|BAqlI_5P&=UIc+A&H#hXqJNI zXf#DSbbcU!FsXRYOk=ygEY+?dHO>cbks} zYvPpndm@(??Q;i(hCbAPdf5K8_~_x9+|MWLvGT@P*5?c>^PVhx@OHV*H`|Gpj?YRD zJinLrW*p6(B~KK%bCO>xXZ-Mh9BfYQ{^8$LUbJ6`rYjyty?yhEyY<9F_h33N6AT=l zgC=r+^?74^rLj3ul|kt~mZ^@0#*v+%HJddbdh*vZ^AL5V1Yc4hvF-YzuS-y%0~qz( zL*Jg{u<7#72I&i&fw|P4f7ho}N@Mq$bGMy*<0E3DpSK4tHn_6ATCXW;J7~*OBwh8# zxEV;fDl9R$DyeBGDzbMXEJe-L&6~(np${i%cC50HuZZPPILcNF1TTCP>Vz7N*+;s0 zn;g@#dUPfG9Wf+Q^_P+EWE^@HA1O3&)K!Iuign$AdCc2!A00A`+n$MLZ7A{|NECS( z6X`bEX{+^!1uNEMNrq3Y6^!}{OP<)NP1kHk=fjOr{_I%VlA@hf>B2u)D>}hK;FqpB zwzK^?`V{?PLiI^is-4aN@6tSF#Gt!i93RZ3-vo^Y$`4h)A#|28Nd8(zaOZSX1iI zA*k?HvtaAnMD5F0ICKmA`cBQhXotFSx;i@J;kFm*A9EZBi*tTDuyRckJSn9XDg~1syXcLzq%}bKap>ZZOAbu zlh~>+FjL3cc4v`c-KF>_(p#FxLMK4HiQMc1)caTnaiDK0gr> zx|l9f?LrV+4)kBHZLoY`)nw)`iUWD&dDUs-Y>m`{DCK<0j;9Q^Sc97ag zYqPJrqt0WYp7~aqE&4*A=(x>widY9X!g7D+IF)+ znjnB8qbXLOdcu$4xE66#SN*Gg$Sqml6!Fv2OC=#6$v;l7;=@n+fMbl6_bfA36Hg51 zFEpnQZhB$=q@!M%_UAtVqSI1#c`E_7U@0-%+dlG}5FyX1~ecVJ% z74wA@N3HfiGZ6 z>pLq{hO5)42;Ji@NJ9<_l5SqwGUT{ci{szcIz9kaibA zgK>3Mz6+t-c0Pi#E-}42V4e5ROEZ|mc*?CK=}pM%z?Z~}C_-Uzc$dgG3-i;>X7yF- zTfoP~hWPh``YZfq%TjnSLL)8&(vJq>LXAw+zQ{Mz(zZ*Sb2V*Bkpml#^B{lTBK@k* zE!SP-wda$DCEZuDJ?qgfe@H1*AD&*G^Z0d0h+w@m=Iz&poRf5G0msqoj;9hY?XUk_ z?a~Nfs=0NVGBzJ(m?d6pjfL@m54D<(E72zkS6lpM8Qe>qIL{R7l=PxIsXQ00EfXKD 
zS_G$)HyVDkAvvcpT`-M4dC~I?d$I22mYzj8iSJTK$K_t=4^-I}ZoTy~Az8XRZb<5l z>ceU(*A!^9EPsuw7Wb9AKaXvRZ3etVR zqfpuBzY&1&T~cIiJNF5$vG)U8A?(@fOlrv7R|-c9NHPP%Y$Obh<{$MLE}>=!=whhO zb10=}?{;i)UpbjYKDPGkJJHw+lq+4}9TG99d1-@K%5tc-oBihFW^JFHLFKU6n4COb zU)-1_7m4CcK(06YHJ$oA=>+B-eFZ?Lv%6AJ>^EMrBf{?6=Zh8Bz@p4_KEpm)lSVu+ z?D^wR5*6XVPt~4PPaPLq=YjZ##+(M;#ZcB~WWmVvJmeKl$S13i_5G|&o);`GGn;!_-59**{~b;E1Jt&~r6J|2cZg04EbgSH zs=@BS61=>xw()>iaOC)>*Q(+rBV(xMh{0WA$F$P9^Mu{Z<)P`AKZL+|+e@Uqq4Nc{ zuPz>p6Jp@`Pf%^n&qvd%;7Jofn)W6fot#KWz7(4aU5JSB};JI znH9xJr^`fz2XL|c>wKzcwpsUzkjaJn{2z#2)&%(=QiCWJ>A<{V`bB-LIjT#&ZwP|W zoBJb!%l3r5$4VHJgs+yp=H&do%ncq1rATj!6F0V-)U8C))HSlD+1JeHjDJ4cSVTXZ z5*KUjDpavauppbV-pDrklly5|->%c_Fco{R>i=sC^#H9C+Q|N?8c^O4=h=<>m2Rz_ zix8TpcILyp;E|~u!9@Eu`wBz$gqpi zZ&NEj0M4u86JHnZ-&)ix38`H|TKAJ^01nFfm)~DP6p+23+sa?0*C)~(RdTa`JxBT5 zMt;SPf28%y82_Vb8!Hxwiwg61R{;-u2~uY&=dNmZ7~MaN@SnnCv;dD;*vb0vua){g qa`|2j_{9F-KmYwk{a4$~UkWtBZ#o>rc z2j6zZ5p`5cjiBDU#@TBxPzl{qc?JvLyfSP<>f{1 z-Ka`jX$`{0hNZv@}%t&_8;^Vt;L;d~tq7!|quD(eXqt~6cM)F&&P?%FSE z(4UN-J-hEdF}hhFfZO}_3B<;VW98FqNafvhhUdOTk_@{L(Vn2sR>+JCher2Zc*Lbx8E17r%5OpJ*0ajW1;bjW4~b(Jqo5jiUwP& zrP|JN*ycr4*`)Oy-HaboB)JJ*VQh(GjL05{`e1FE>V3=+of@tHAO$=TO=R)c9YlXUw^RT77r=!nr)O!bnosyaYhEDE>CSI z-;nANsaQK%Bl_SjP-Vf`{My%a-d!XiG2m5ggcav2zjut;pdI$%D!1=UP@B)(YqM34 z6ZurEF%kW|zwqYiNY933ZdBmbetURmr!DLKVW6)sYdU^S^C$0JDG{cO!RId2afA0< z``3m`IZF{Z9~#^F%DZR8M!m^28T$d0PjLjhH1vfFc5mVc?>}sg6y%sosEuUt>I|3nA1FW*)ls;@fQmltvieyIMAg^ftXThd!P#-8CzmyWQ8=zZq%} zD1D=-z}F6Ds~~xVC*SsZh+r^S(3Qv-Ur2_#o`mq*Qy-$vZ~jk-WJqZ~Noy-y8wovl ztkwO{Lb@zoi2TM|nXejn$+wBI`pq%oxwnKL@2(Rc z5WO(xRS6R?$DhQr`KBU_6C3=Xy~mXqO!DA+#)4Ztp(LKrcc)(h^$%!*&Au<)kj^EQ zdrrk8`^P($dodl472n-3cuUS;tw5|9S524M&1r>0DJN;+`UhKhEKj?z<>}LeKkhJ- zRo$$*s}&;h2GHSQ?)-|7$MQqD{!LwKhbYhQl@&abR*qb?H;lq1f+gNIzeD`EV+mh! 
z6_x`(9>iUXi2jkcn7Smmc(m})wY1)Omel!CTvzde>Mqx@xj0q!d+zVZ@3d{%U8seK zUD_L(1e~D)Z4spMq@*`*Sifn1DtqpT16H zl;>q^h^JR5%&p0F)0Eht-@x4{O-@dZP5!L?h1|^ky>`PGPhp>Cm?mw{o)uXoi|<_n zmHFqA3eHS$rq(=-CxqnQN{^T_DkC+O8>D@oqv=#r1VJ{6?sz5rt^3LnOOquyXSl_<9Fd&ph$g--()a_ibtl zX{r;~YtabIKD~y1%=T_mFblbUiAk32AKvubka?e zA4W{7*2PLmy|OwnMUnPj?8`Py$D@lr+w%+93d98zgf{3>G$Ep?GU&L8}Aa} z7YGn|^AN(%Wfx=5X5Z+LZ|@8fFA(`8H|{pR=->nsEk6t`leQfh-B>xebMm3Ku{OI_ zA7Thm_SfvvV9_j)E|6LK$fXGSn2`%HsWb6bQCD^enZMaA+U$Sw92g!Lb*^@%c1DCR z8)g@FNT^B(B~BqW2uFu|hLglRdExvb?S;ULy7#X;Z!hsLNpuRSnv;noktZo7QRP<@ zHnYM#;s7Y|ZSgMOP+x)llYwpB!zdK7*XuPKZ;MLAmxrw#Ir zcIuj3m|Rv|bj-&s4m+ni(k)E@z?pE1CCjSm{HfKN(n3zCS2==OtY{7oa_EKdGTWx! zmODB;Vz}dXhv*K3`h=4I#SKe_rAmg0;r7=xOO#!4YDa2QYF(M$BB!lg(Qjr(8p=f zkY%Z5u}c^=lX9DtTaBdrXj?ti&;0Xf>Y+y)JVi6PlB6PTGHSp5pXr$5TeWY#eSK#@ z>CUMukV-rAC?YO8N{m#fWyt?$?A!L}ZHoJxn?{cH!Y;chOhw8gAnILQ06AcFhw+h0_^n^VdUrGCrkL4;qg+nv}ru^x_(guzYCylNL!!rQMX6$Y=Q{$ zO;YOa&`Jth0o&1`R~wC!3a4Q{PrC#Pc-Lf|wRa-L8^mSi53QN|1UdXukx3VQZvAe9 zdq?-D*(b{WFzzunXufHfQeOnD+1dO)-tG_pdt|6=m{EI#!s`ni3H9Fk1gjypCof}t zbkc^3Xr$kufBEvlZ|zV=$|7+2VzQ>qroD{VEV|)`o-&7Wuacz8vSZUp(zyDH6mTZF zy3nEOY_-|Q)ri38q_)vXLYqx5-AuIFsjapXae(+S14q1rzs~EgEwN_-6!@%3`&853fD^yzBo>nw)nU2L`uv`Nchj8CiDQ>DcZo3w2S>rd-!JO!?5&9U_DR- zleH5!4M*Qv0!`x=U01pjAo4_5*!9V_hJ2~46XMljd&bs+=9D$nASl8H@zWnbZ;egRy8)ek#j=O}j zgi~FUvq|GXBd}EHOC5x}5~+6a={!ViRt$>FhXaA!YxB!z0wfhAQMA319R9iIUsg$$ zq#hD(Umv{VM{FAO==}Jc)OLL(se~mv;#Rh1R;t9PpDt$h1f6y_J0XLcqUDc3)&LO^ z!-Im^Y={|UvnU#bF@tqkbcUNnn+Ew;?Uwe=gdrW_HE|I!QhwRHfo`9U!+IlRX(?&N zq~wD91k*5-7Y(P;yP(u^Tr=8$zT+d+Zi=r3^0)+^!mLEWLf3I~I`E{MiEscv`(q@W zhWal;0eBaYwyVDYRo54~AL4wU#l7*u@;=+fXRf)M(l?6a>hUK#Uo2Mb|2o)~w62uL z`L+J*S1r*azh`(C)oBgqkd_O|klao9Vk?fkEm12Dt&}_##p+t?E83{4;;><_Z{XlY z0B{JfSGd^EL+l3&b`!&Ku48}iVL#7v@c#Mq+IkNDKd#Q z=>KdXioL$P%}r1DXA=(x33`214LVsDH)}dUuBTj2=_PN{(b0*!S=oqcK7a9#?%4lH z(A#-kb-;B@zO_ITyP>Fmz%*C78K=ef1J zr5nK21K{FJcRB7W3l~oh33~cVLjU^r7oXNXfPYhRcK^q;uoL9Ie8SDc^_2TxV`ICD 
zU)~kf0Qgut8axL$Va0=`At@vxApU3jzdiXk#sBE3|L>kW0zAC`+4Vmj{dZR#cWXCU z7bh%D56OQo%|AN-=fi(=6z9I2`hSSxFFF5t7b|GVo8sL6S~SU)?9XvsVeThlMOLB|dQQzL z>z)>>PIHcS^VOg%_hwk;YunYq&BlGP3uq>WYYmKQf};jc_nX!vU555*GM2V<%b$=F zkV@m=Uj5Y_5c2Nb4We7Ok{0@<$mM_AP8uhq1CL;xEV?2V2k!=5@Ks;Z6gNn7pmYds}Siy4q?LI%qE0` zbh6Sqx<6YAPNN87*DbL#yU(UwfuxjpLoMbt$<>#_4q7dHz-eeTGf||Y{7hC>4l~=} zUO>*KtzbV<0zj1-RK2)EE6HKqmwI19*TbXw6oYK_GOMx~81ZmK$VMM+B_DtzM&)yw z#b8|`<;hK2M(_>Pvt}q*M2qmwqbL5T9o2DCr4%+MWDBOuV`GF#2XRl2Svp()jpvmL zGs2qFW(NIMK5Ak^7%cGgVjn6c7Ey_Ombbm<{wnI3yvR-S?4CI9bnRR_!+AYbLcrCJ)avvE+NUY@P zYT8v}3#A#Z#V+F_+w0p`@)AI<@TmUC(QCQ;E~>;8ir6qYq5!MU7LYYT-_ZdV-N(Mw zrdAVdc&;{4u+WtO?Sf~k?1qfZ)P@_?#B7a}kXcnbH9@wWEu(5s#ptYPvM=g=XyLQn ztU%ngbYWOqL&K-+Yis+7^PlZ<=2(BYzwK_(;_<3cv3ognANY+6^h8&Q{s^-9HvB=- zdiR+Z#MY=FZmfkbOtQve>`MKnqr_#=&s0gdJ(tI~oVkGc^|L)`+rK;hajof#T*z>6 zVGEdYB&#Yv>5=q3v8HQ=WG>20WhTBesU45+Ns`1CBuyy54m(q6py-J(E?bY`g@BHgI%VYHz7@bYCIIH}?%-@yK;$K( z_6rG2{y|VKD7mSra27qtvZwI=-VPnmuS+IavN^Q+D(1T<@FK&KdS2ND^AytAx*-y0 z>O8k52p|Ha+4VkzJD#=8tNAFTVjgZ$>-G*0rS8BP-R!-HOy_w`kGlv7dc9)o7WtJqs~eTw3De5H7c5lFpuBe<60I zy2J#NogQ`!;n*yuO=pD!%$4KHJ8e;}A8fb7tXaC6$`3~kxdqxb$HqKQ$2wr2WQ+?; zlmbO5I1AG}OM&(m4?Xf3$z3HOSulOj;r{lv^5l1v078DQCOh=ntwvgck`TYStSOn3 zl&(9I3r(@d3LB^N7Y&^)?kGWs>&Y0a!Rd+>tn>B>C0OQci1=z7`UfsC*QSb-YTKs} zD;DVz8MC(a<8z86jLclf<8zn% zoWpOTY8blHLIWoP+XJEQdyYsa2${JmARN+|Tf06GH&(0cH0Vd2GE^kK=JBP!o^Hr&}o%MmM{910pwLp@{`ftDex;K!8%_|V(gWMo0RIy zSlWJ;T-aVCr+|zF^oX)yHeWR(W_u8c8KHxm_)B_d+X5VNgbKG3C~?hOBN~J8$EfuE zBn~7RrgpPr{MIt=Qi^PA9kNAp2n)HxH^dnEK~*1qUeW|7-?_@4R8~nZE;LytXg{5t z%APj03MsMR(n9h$AD21r|{HL?}GBUmA$Qk zg0^<1>P)Yz`$K_|phNxiB*~gCh1s||PgRG{t?F5Fqq9CUI|eKgeFW>HpXH6N+%flv z?CA8vA|5t*;DOZWwD2=1P56JjAB?&{uVq_f?H1)+GlILOO+&)WY(|(bpPIo;{TpCS z8A#mk00eFb40$#SJ|v%@Vo`I|M{P+GrD)$`gPg}`4e(1lme>lzJo#jUj zhQ2w52**$$37X2{y;I_7?%tF7gi0wo<;f5BQ z8g+rH(qZOV*4m<@@}`7lV9b~aT#+$%vhe3guqKa;FjQ`Zr*3>#xA8( zFBQFF{97HxzUt~IFEvFM1qLkm1h6b~~^YFX}#iLI5wBsgL7fO4cC^vpUx9RR6{!KT 
z=)j3O#oExhn|al9M4h5CCkF&gnf!ce^WK?}rDsKP`=oj%FvQ?%eqm&_GMgQEOvJ8r z@HnZc{pc1${FtYrz5k1tN~@m4N^j&mzo$--b|HVwzT@-Yd95((E6I=0#i!wR@ZV;c zVpfYcW%uP`nd$CRgSsCXA8C%43yR2GlswY5ccIVqa!i|RwPnjnj$JVjs#$yecr%Vx z$`lKdK5Wu86CS~Vs5uW35j*tsz^-R8#O-AsL`d|v*bxhtt*+72(=&2#sDvh!{!qJ; zx4-ODH#rt(rhQi`xJv2%y2r?k#hJdeNA$l(ysTKn%l9@#?JE287aavtEaKgh+?Kn- zr2jP$I;~6WeUm-&f1%Ta#oo)n$>{&NG5>=v>HN!60J*N{|3b$XJL4Wq#e?4*O0d#X zxl1RV&069S^?N#d*cm6GUU6UTN-vRr8a-BQTKXI0zo)Z&?ZyxbvfcFe>%%LD6#Q1)VxQ=X8mC&(?|;o|)L1_|w95va<6v-od4uu%k$3az%}Ac0 zB)7FerUv(hbC(?mZ4l^-Y}fJx4T3w$-6H;PGl-YCREZJKmZo+b5gK>io_RArn_v%n z*fX!~X%@?_ATD(Y*DX2f94SBX&a4Q+&cs_o@FAqZRE4#CBG+K-^f+5@@-9>vWR#u! zt~+dRZF@IS%FNWZ&a4SJPsNIokkBpC=3toYJR;yliT%uwG@)Og;4J!rZ()YVA(mC zOmZFGU^i2sL0fkkB?`Ih5Zo7Znq!5*&tv9}G%je|Y_rvfj%-(YkwK-YQlQe*`MGJu zo<+H;y`cw)uT8*MfhQMZ(MZLMd00=F_hI$g5n4#9kJp_q3g=ADzmQV+zcm4u6{}}S z+{J_?%_$>H(f%`!e$5z~y@bxbdxDgNsxo&s`khe}YE>m_bLdSsH^&fBH~^+AQXh$v zbBeehn+fI*Bh_jw1Ge%aa?L{myhMtF2(Dl6tLQ?;r#J}pcEh9+%ePt?efJn|OZ!tL z1un^i?XUEu#=~BS)1|N{r-Tj5H|sRh%?x&Ae(ic69jCr`{CaWPo0*HNN3^khL)qoS z0KX8gpO!`L&fx&U4|$R$XK9>RARNsk3`%iKg&^z)0+i| znR%GG!F?0~b|K5jBMm2`=ed$ys;a$DfiJHBfk6-Gq>&MNcB`n?9uS-}-UPzn~R`YN~d6prI$NOV{#zoyhYfW?j<_uZsI_xImwq4Kfpgoh{&-$|D9c1T3 zb!NF)-L@x6hsrTixi>l4=DkA?=UkcEQ!Vc{+P>u_?$zGbt=h?2kkA*Vu^_XSte$&uAmGb;EL`4;0;=Wss5wj_D1>AdZc;yBZw z-g5SnB!gGpw(m|0hMCo}CGFSwvWxXR_eFQU69mzJYD?RqYOh@wZ?2zCvqC%yKxGubI__~GDWM9nswz=jctt=(F>Wt zb-9XP_L#u+ww%OJQTe+=274HiM~Ug41*X-iR z($-d%!Na27*}L%3MJcSWNHMOgbfp+c(c9~p!N?yJ=zIP(u(q{BYToDK)^OT@y3{Pp zxWIzQD@XwI0NrC;1;KbGO^-ti{2CFKV^RT8h3R5X$ah3ei>2vfRb4 zqMD2q&V7-SrR?!q9f3v zs&4*5)9(XSP^lsRFF|dTpji;f9I^ftsDf7m>x2j`Y_#>FK_w}9>_U?{ujl%Ty64|1 zWW-@WQmMT$kqmQuwE~lMuDeQ@5laWI1FZsRw|XmA!Z@?5uz#-|Cb*+?##Qnc@4@|0$cra{InGjFwYQTa}=>ddE*PHJWx$qQ88kpMCp)mi{fO)8soxw}Or#PqnN#TugY^nz ztfVC%`}D}4xA!f6{wi|=)^Z==hoq{AP}J_zeKPOtU33e5^Y%e3EF<_QSxxo z^c+cuwnh6Dntc8=R`|LQ^UI_R@Twdx)dy4DVo$uIv}wz@XW{GC3F>)9+o0?Lh(n^HZwgW!PU^Mm*GrYRHMqBG}Um;{$1roJ`Tda0n 
z)66Bs4g*iMb{cD{bnE283SEMP+vkNxw8heN4w&1q{*;qe<$REEUA;4_Qpo*4RF}<+ zc|7859DE_69`hB!|A*<&EckfNaWw6Cb0IlbvMQaV83|;t>dbwBsQV+9RthlPIc?(y zJn*v<;@xq68dSwt>0aUVkr6 zoXl-=K^wI(>gCW@$PaE6(c(R+X!QirdxJ zr)>nS_ES>wcqLNLz?h%^JqVOc(dwxq;AZ&)=-GnnuT zTPeTqkVHHL<0V<3K05`v?SEU#1EmQ_tRFe=yx!z+CLTmR^_@PLNOOwruC0 z^wAb3xN1bS=6rJZdrc4q&dW2`%qzfxo{mGRXy@nm*@`nhExTXHhSACoC0iicT13B$NZKW&%zp zs&)*1`GtMnFO7rXBeF6#!{QEoaR=+nC6Bkoe4W1YqnG!Q2w7*$QT>b943}>)83#4Z zl75*1oGxipkNhmn$Kqv1hG)q~D&3)h4ZE?n%RgX}se*d^$yk7DwRc;u8@l$or3;dYNqD*H zab7zfg6UKjz4@MwnJdh$W!8vrC&Az{-1R7C=?(B2leO}+3y0%mr7Zt(x^d(?j{#wB ztkMn5blwu0gv z&68~(kMJPFzBp#<#8iKkB1)Fb_k;u2=lz=JB%!(TyBvD_4JxKo=fV6%MgxI_WChRm zN{Xu*4?p&Zp`Q1Sj9)Z(1I19@kbtlr;_RtBTTty0X?pazEMJmo*yR6pA7QB4jDC3hvug-J zv(0)cmap+6aK?hPI;gt+VAloS_IIBO}ldH}4!MrL}f>|IowU(1=%Ai5i{djaks{6j`R`1H8gA(Y7bCxev z0NRks65i@y=JjJKVznJs>AB{NnXe9X!t8KM?M2i(`+#}iw|{Y5gE~vu*kn4qc6MD4 zlQ~=|-Ij9w)%0j1>+lY>*R}QNhzCcrjv%UuLi^DSJGDKEOypf+=?MY!8YFF@Dv7>Z zZwmo2hAmebuGy4idrQX=Hw+9=p7m1;`}1{Kf}qhED$aoj2JsS&P{9=TM$K=ILPEF2 zl_mMu#RT#)e7%D6F=($rmys`O0+AxGMSPOqwr*?j9o1!YS(on88 zpmfyM>whFSH6T!&CAHL|B&XM#W3oDfXUISPe!d=p-1sUepe?(9J4Bara|X0uVQaQI zIjFhIu!~)-!vAlpMVf=sVT<0Rj*(=$JZecymr9f;uxE@Vx#hL5bPq3$xPNmmd?+^v zE}NPa9rfE88{Ql2C|{9JnpnNQAFl*_kNL%pUVmTUSQP!a%LI2Db29ftu3Z)?rWTk~ zWvG1cN{@{cSdlB0l2b*(tt!qufY~vKO&5n7^4Ix=`RPb_H-MtSnTp%<@pf$u*W>G5 zO7pAYY=NS^;_=@3{b{HIRmEIDWT)z>Qm154`PoEh`9zoM=3UOmzHU>flH-pglw2RA z;Jy(f3#}9b23c7n-^59EvWb6ML%!Q>S1)UU6$UB#?rN}wmqWRe-ICjt0p;o+vYnq~HS>K`BNZz)qlEpu(A&>6|b z8Dm%zdX`%ag~8jCPhLs=_c0Wk_oTrFRyKzF-qiLD;H%Ww(VSdIZf;11^39AkKTdl~ z%}m>r>Sk*@vQ;T*tWEn%21-EL2O%U0idd+Z?^y7ehX z@$Nx)P6k4Gt6va`^T*jBc^bb{Eb$P=-lQyfEB)9HAplIsp8%NK>=x9ltM~#d$|R+A zY<S`3Svf_+ib3(|8?ubbjx#>!zboxnlFfqcfP4UesVe>{G}$rTe`KhKx_&B8PbSNTGl*FXOPjI})Tk7=Zb}jOvY$n}&&P=aJ*Xx>zRJx!ZHXErK zCTyt#LAP?f)=diFrr1fBYi3zArg15#l)SC0MJ^&|Ym5>^@9n3#(Ag~*MVC+Lur-YI z28bgt1nBLWxEiY3ETKS_yqCKcYX$Sef-T8}%%G-JmjQPY@xB9O;MZbL2^!L^1}T9) z>T)lM4(-;q-L*u4PrkEX&Y 
z+SGl9DXrQP7#ON?GKN`h-O-z-G20aG`n57Q1J7Na&RY#g++xWCUOJL763pM@pCJb~ zo`y+E__0=xPxV7%zza1K#F^!Z;h;>Z(T4dcob_Djn!?ym2F)VqVFud|2YH(% zEpD~=;&MkuUlq3Gx6z=PvHR~g(`H8owq}_5R7Kz*O#7P5K#G{>e64!raQgNxl}c8RO<1Y@=;*11V=UD^Hqv50gSIdAT|gqd z!&KwkM}t4Qxz`=td-#w(h}*yxe?7aIZ)~|nN{H_S_M_PG;{BF(9TI?k?H?D*3+R1B zd9?X_%7sSIAMkdB`CZF|sSEH1AVN@ebi^rC&~NmUNpa`i5*`d607;7|VAgrjDlzWR zOAY*^LXy;=FI)Ni@WfxYQTxL#gr~4+1c01EGYe=47fUHd6U48*XZk^#Usd% zpcUu+U^w2YHwuf85o_rDGN)W)N?09FmJsalJ<(|1Rt;yKCOXkZ{Ca;KV0H4f@#u8Q zvo4V64(^{gB$qM0K?vg~xkCuh|ApPl#3rKze~%RZG6=?LhH3+fh4&-iO@fXSler0B zaZld#6bPJk@{z!qxzCXdX`)LkGrI29aDU9gb;+Tw z=0KhO#!A0A=HawSwc&3$ghdjgQ%#}=&S}D)vA#f-^4N@R?V7f(T0Oyvt@I zY!-t+4t^N2!^aDf?4BMU5{>rzKuwacyd)>mwE32bwG&v?@B`vKqUepLHb7TVP}LnC-nG4mj+zAy7$Cm=iLHJfguryT(QU{k-K6HU@aEa?wC z3{R*VY0)8LHJVP@W1`>7@xe%p8mv6b@jh{ryg7g3Udx2a(JhGGpKn#SWi_BzX%T9@ z?vJ9b6tX_(oQVg`C&f3d1u;mwcdCOIPrRa!K4vHVRG(+U#t4N?lecYxkK3;+k-3yK|J5ik76G`WL4Gl#w?xdN(EJ|V3RK&&1&j3d1!l zA3Ih3IerA=$vTf#xm_quwU*wft|Q!9)3>85Z8wE~X`l1k zPBjYTEimG1*p9DT3~`RsP-~a(xcZ0dO`-rLB?^)Usxv=J zk1a|O)$=XgvTlpZivEbbWyMK29$9zGMH&q89xPMmz3=r!+hz0=o$&JdfUHS$^y+v) zE=Nvw0T%3jg_Hh)IR(~+>>Kz_%=Qmck=kq0lzryG-*Bj=`alM z%k)2*ia>87^VZx6z>?L-?k{#CKI;|e*$4a~=A+3g{4iN zC>5@kj4j$sw6r9$`L*XfYgmp0X&VJi?-VO8@|aCk$5som$}k6r_T>W$HG5KK<_imO z(KkWKURaNb`e8`r(0uPllUdy7E(@!U3rWF{1J@1MRwH9v$~1L))C)6mB;G?GScW!e z{0rmBKo~yr`q}FBQKyxyM1^a)LQt>=-@L!4)A$xbxw!+lj$C1~9i3vFUn_Bdvzi!T z{ni=N;Z7esH+Nce1}H*X;OsYl7@&2M0m?cDT#-{_WWr_^SSf26;zPeJp=K@i_Ai*c zr6gb6TMJYF2na{SN_ZO5(42(ZrGro}q+Vso#=F`8!>QlJ4w6kaIn~7k+V=X+U=HlK zos0xUUBsxa2!<1oVyoDqGCJrV44-UzW~{Uhh;8Fr0MjUg?q*KQ@r^MRpC>tq9Vw(l zt!%4omeADa%b@$jpVynWkS6K=IE;su(8zj8WaU>G)Y^aEj(u0nYOO@UO+U_V<~A?H za+IDdh%L}C+#E0ANMTDRvg0~uzIL#(vivtr=b7(hAVZM#94dr?&ohOOpuRFReftJ1KKE)xesc zfIGi`UXOz{a@;tg5zl{T!K8z6B(i^JXtABRg0aTwR`B5CzaLot*^5pZYWv&aaYAm< z#qZ-iN|OCuvCbM1Sokh`E2rsWY)n{bXSN~NXZRU5#oep#QS%g4=}s?0PE)9C#FEteO9Oajt|R{v%d;e91|JSJIHC ztHFEydUtPI898GE4?IoOuFtJDnJ0TKEtw^$r;JJ58x}Pc9Qkl|CD- 
zZp)Le3%vj$8&fs!`OEZVO#_?~OZV6JU6d8(jokomEB9tUpIMEJiSPm^0;Yl%;^)i? zR`V{aCFJ0lm4Tyzl!p^mXlzCk!x>EIS=1ZyMRxBp(`4LW{BhFkL3uJ!c7SD<4z>cu z16#tg0WXMN?oVsq2uC?GV(|x+iKE*6e=5!YaHK)xPvWJp6$a&8@P4!oSF!0!L%~w+ zx6k&og~qhL7C4%ViJtg`0w2DK6DQ9!pV=bI?@@)-rJQ&c^+Y7KCW+DbDF&YJ5%*+s zmx~KJg16`WrFRfPVaoyyI~S(4rmOuJEkgkpx`BrTzK6E(7|ecl=KJef=@WCs+`wcsR+@$~QI`2n$!G8cW5Ei%onR#FgY= zXyP2UD&+0JK3C=hz#xY5)7J6r_g4ovFSGcCe1VBu(*mG9Yvsl`e>k<_ z%l!$mH2y!Sr<#Cum#OD~lNA!Li)tcnu>TjQ_(@BV%u|f+L!>F;JL9{YleO4 z*|WWYSu>&`F@>q^OdOyfWC6@|rShIXaW#0abeY#gyZ0EPz2}(GsMgZG0aWW0%!uK7 zAGQ!9-+mJP?Ah$0gI>8g$Qu!xys%&nfHy`Ge=mnusB*dJj}A`4mRR)ofdlJx1?jt` ziW-QHNG&hmStmlX*$wU9h9;=#kgWDd1T^0N7C-;V)c0DvYt;6 zlumYKLEpF5YoVc`xdxxpkMbDif)*+~HupaS?6;Q_;T`Pm0{rb9H8d)&`K(OIto~7B z^^=x;c?kgTgJ7$=#GD({XlRB;*)|>*3Ru* zZxzMA#YsxKh(i7H8WsZP%_-+iylFRr@|cq^NEQ4{ZB4O{w=}UFzYK{e2ZD`uXqZv+!>aZ=#f)9CUELH(@}{Gsb_*@K zHBqɦXO{QT_f+b6?t`4krgYSVt?4!0y7qjG;i{%-DE(A7OLY32cv?`ycg|Vr0 zP|rO**h(bu#Acz`xSMw)V>l_fOs0i|sy3nBl{!>bKR>PT1|C=vZB{1!SwI-K)o*a6 zpIWHH`}yiEHnV!fi|LXgBd8Jha-Qg$WADFju2(QSTwE37w@*ej1aqoGZPKyB93{Dx z`y-NUr_AdoSdG|>=V=7=_m&5lc5OO=h4@rk~h& zKhK@h?(ElxVN*maR@G zEYElK9~id>Bep!eDtQ9^Od5ei)X^aCa~l6RHU;) zrR@nqwY1V2uWtD#wvXq*4#(G-V0twaxrAQ|DOjYY{rV;4l`i)3d>RsSS*^?ydk}h+ z8T#{{bP<7%f@4wrmzb-YJkX?*G_^Wxwbc}O6LsHBAq20`{+TsF<#TM1BK4^|>D9;J z?i(RFbP0e_OQy}7$T5t*=^qa*@DB1HU&ECmqx=j@*Lrfd?Tb*V?C>Bk+vwzAo+`^B_o&pNZjIk_eHq}hx$4b{N!Qeg0l zXEsobRunlKokU}G0#!LgX%FkWs{&6=0@0Rl$vNWHTQX1)$!wz;i=*|0$^Wc|myW_K){E}P%%i}ztX>M&{#!&`_d<%xi4so^xf9!XRR~6&RJaY; zL`I4*g*k6o_`L(kX9XFxr0o~qZo$}44iyyN_Vo=ND>8lQyzZ0YINPYT6SV5N0mq0^ z5nOEd(dJ$(=viU&=ZWmLYLv(8#hP_C_tOqKM=L$MXhvbeX?%H~k~wuariPt(+Rpv6 z9x+Hjs#70iZF578wv)QdeDo~TI;XuJYMa%)%7}UBR!5M=c-G#fmi+7^x_=;y;+`*^ z;p(6n$C;mJvXE(jet!S)OXt=Lxl2$H6X2J_VN#K}1@y6}yiPs7CPd?m5^DYhAVB@U+ak z8R`_zrggS^#>)idMUCfFoO!FpYK%woBTuUwn~8^@=nZu;M-mV=u?;}!JkioLoO5mE zFx@Lp8_3eDMa0Cf(LzRR%8gkIO-es;7*{{LOiDsH3u)mQIR5%SH7I$Hy^Qi+tn96x znLtXa)(W78bl4L1V&dMdPHgdRU(;J<$6!;KC&lSPh|tC-uq5zB;E88sjPH&q#B^rX 
z*ypm=YLYFP9Z*NSJ}2kr>EHkot9D|e5=6#dKS5P&@$m5M%@&g(c%?=z-`Y3}Klm$% z@V9+G^_-5;j3yGLkgq{!_701!zh#|eHNRwzHyhx5$le5xApPMet z2W=k{MZOca+iQ9dFW#`P)DkbwyL(u`Bp>@|N3?}wtjs{M>kS#>!WHclM>Hez+az#{?Xp z9sQ#Y?e$VlKsvR3*kfk}H_&HG{nmwI!bbr1k8xm2Mk&#qEWAii>q<|;(wx(? zTdSbX)wWXZ*2BW>NMUoW$x}#RbO1lX-BdjpnXX7DEUC#X)efgw-({Z zR9e22Zxti`*Bk!n*KswTtEIC5cZAGDdLG=Pe-mpv2`a0wf`?SzVpZ(UP+ zN|G%B(lU+ne{%BWO~idS*g&pU?u^IH=g*%j=<9!Y8g|aHJoJ*bJ4WQzTnRB2@MB+_ zV8o^iC!J0_le!|}_{=0;OH|7s^$)jar*}rnbHzn0nl7{VCgqlH&zc2KACq-^uLQYy z_u~Ca1*9wSY*}b}`V9)KW{%u1q^F#1dK?**u=hpX%<;k zcp$*af-aFsyUg!`06)wrU2SO^8f=}@#K$rC6-{0pFaGT=TFIVhiR$+zA3DGete&GGWZpQjbquCzns$ivAmB8R~%QuzJok}HiBw-VBP5Z>$}NnF&&Fq$MnLf z#HuWOt+Uv=4cWK+5nGYJA6~P~J)K-0m`I#a*Vn{tnskcZ>&2EZr`FA9bG2vHk6uXO z#*qFK68(}5L5M9{&7HidTJ&6vAAA=V69XE)5KCD6LM7m#`ZfDtS5d@|aelB8{_3Pu zODsY~?W@53f4hKF1M{KaZ%SV;<0P8PG-BJhQw`qP%l&sbunv+<0O&*K7?r8{GZbsL zr2L5p5B?u%R~Z)78m$#U6ckiS8bwI~X=zXq>F!jzo1s%g5e22Bk?xU>8A?UEW2j+3 zdWHcchGy=@<5AB!dY}6|_uocj&z^68Ypr*^?^^3ae`&ayE&pU<{%)v2`630Ep{-@l z4sv9~pFhjK^MP7}-T6K9eu8B`Q;`IbfKs={lFC{U%}-UR!TS1c-fgItOBswtX_c|` zr)!>8Om|G1botN~fgrieZ0t|Qj<0aM`?w{zMS;+fzj1tpe*Mk1q*b@iVx)!E_jAOjnLwdVO^>vcXdXA|9zUFRbsf z=uwKiXUd-2^+!d(RF6}3yd3F4Z0ULYd)9fx@>Es(R*zN zo+5)B?!K1WdyFXVd5fs-jg1!X!JSSL?6#)IDQx!a%_v_*crRv64vAG4yKdzb-c?Y@ z2v2>f^WZOvG)Ut-XAltilUDtQThGsI)fsM%68rRT zlaArwg_sRJyL8Z}1DE;UPZhL9_$gV`{tMmtH0tPMqvhr_+;9rnJJ!Q#pFitdI)@7w ztx-wF!b`ADLxtT~)x;`*2NX`QjXD#aQc1bB!`8d^KtqxBj>79wkUCG9Fk8NOmCc>mY$FKW@DUhC0lJ>Wv8H08=W82T?V?|Hd>~mlAf_^r&M_@ zMbf^Arv~@l#FmIL-jv6kNZ1bRE;_iQO=J92g8q7c|JMzq4s4E~n1qas`KH|2PTK1E zQmMoC!MCE$&78OKzrBEScTOJ(ayv%`LW+_v0d)I2Kx+&s&bOL6aLbwOn~`I+-Dm~x zITrfzE^617Ycu<>Ci3MuAaZ?=qPH4mY;@NTc*bg()kVmGbBcwVKkX=q8~!Le*DHWli@1TCQ# zjGjgK9!v^S-s`@##}Ju&YkM_TpH0)vzHjQH8-Chx&MfqMpw$AuC2QT7Fr^Xkzv~cSl2tTO(=GP71ZHsU9$l>=9}F6| z*T@lzpXvv%$1dX^$blW~(REb;zN2?^Ydo_J!2lqI3Ve||=^Qge&@S)mRYJH4Pv3lR zm(aGNdw5SGos*6ETe{{Xg||ic;%0XQh(o0N_B6bj4G{^Bvw$IUVCjS>*Ud zt&Gw!Gu;Dny}zOdk50O2E(I{za;PCY{v?Ql&6Nb&X2QMx}252N5F4prPWfsJVMMUhD;$G|V+XFVBP 
z%c|=q%t5!l^uC`FNyA&atSn!eG5NByfS9NC?~UC=(${qGD#g-uRP7YMV;63ZUv>S%g8C<|yk z^=Y$87Yk^}TFb)2$Cn4W~GVZ4^EF*neRx9>}_g z7;Y-vEC&40Ae+qZr;=Pc=NUB4&(^-p)Mvz3lzzBzK;SjE=e#k|rKeV{MRuYp01QZ$ z0Rpfg6$bEJN5VZOr34Oursm==jER5K@}lkCk||~^K3k47HkTf%=&>@z3?y@Jfaa0Q zJTAD0t;{VXHzd=pU;ZWB{$sr$1Y5`vtr5pbPJmx~Gnm_}^AsrGkuA0%yIJ(`8c8~F z{O9dL=IQ)L!J9o*_xh924KNu|1}P~i53j>kMB6|tDjU#A&X1SAIXdYTZnW*8{5`@4 zWSZ7jX;DBz%7GHFix=Ral54*rZCA+a>?~fPR43PrbM1=jhS>nwXte#wos#6cpZ}hl z0*D=QE?s+zI>#tPqu)fzs|BDHN4`(mH<%wLXCQoL*Qm}+J&*twf62>P@2`OKA5l=U z4)^ZejWG;RTo(0_DgJN{yl`^~tTh~7=DYpiBq(GM(Ul(XRd#OxJwd`W_V=V1gaF6f zzWn|V^%G^qP4gFR+J-r(+h2l+UbELtG6rz&!SrySv&Z{F3aL5(+Adr5;CQ1i3bxW9 zE?5!G%DW(YRJazU+?Z^;(c`n|y1(8KUpyww@O{dFb{FTOk5rE<$gB%bh!ek?D#*O^ zzD4f_#yqD;Nc`_;@*{Rn-#zcPLV9BxWt^AkI$3lHZ~NwzsT4#r&-T2btN_5_au^lqk9SE{h>3{eV#aS+HOW<-*q=R@ z81ru@h&8pas!Y$!P%Ji4^5ZdAy*aOKk~c_%m1kw9OuzOMdN{ z;OC@YP*(>vk!4Ngbu1WHp#=U8SPrrdnlnG%-#@fop8E`8xa~(10nzr%k$aPfIPZGD zlBAJvZe}I9RRf`u)Y}?;T@hV{o@9sY&_w~6Xm-Be1OD+kzv}z<>Eb%6huZQu%cUi< zYTC);6QX}FR0ma@6XkylwVNzs9LoVR0l8>f{I(C$>~{6U%mK_2w{(*2k|M@yJ(u;o zP#>2mhYb@CfLIm_hyp0S>fVxz=ld*S?7d#Uw|9UPD_=OerW*DptvF2t#F>DZx^1li zu0uP{k1GNc+bZcn4h~LM%?$5P82-;I&&yXuj$I+!wc1P%XZnH%Doy3BHL43PcrE*R z9Rjpm`_|e`o{e;%!R?aQ@=N8pe0;}PTF7;?eu+Z!6My|8LDd-W*IqNvMHHL-<0Cwe zOixtS+6rIm+G=A#YgoZ^bij~)nru(9klALO;YM%AHjtU0b-~iYvkVfh7(IwR%;~i} zzU3NltZwAL;J`{}q?6+(Vl18@p#6&G*kBm2!46tOS$L8&0Rd-P~#<(Hny z!j-n1>W#LrGavJTSyUt)br?w%M1gR$bR?^XCPCay7p^b;SI;B$#&xKdRd^| zNnDo7<3En-+qRYTAYU42s1-X(f>g#ghd9KMGxho?C=?YLHA>U@_$O-!1Yxw8-j|i9MJtzPa{)q^vf6^82YHscXq!FQ@yt; zAjr7i5ko1_0X4=dbzjO)dI+#F&n0z!D=h-{ppov<;zFMBAa-M*rh(nB``uEv$Aln# z zCP~C6gdJkEhZD3mASa$Au6&jl+Fp(_ix1FQkG}x)o%)bzNB(Apz#lvf!`|JE-J(mEy(B*ngpE%_2lI^31!b1T zG3ANV^(M!co(o-$Hqm!s2O@NwOjpDq06HIqV3puNmt8v$R?H4c>U)?(AlT`d%+>9f zo=lU|ym8UYKtIjH35(Awp-iDY*FgvC>(^J!$<eWB)|i8B<6d&#n(1EqI~AR&diztJ@!>cO$|BV1-jGwrOjiyRqszZkc3_DYqjVFqAO&y9MzLzUGNOmdK(X+=9!+=?J7kOUINzI zYk4_n<2yO%VHj|srn21716c!lKOOxeM_E!gXUkSJ0cEZ2Wej-W>KQ)#jv5XJ2102r 
zwtml-e?%Py6z(^cPPUb;+<}3Hbr4rBEX+&~vAkUKAzayfqTN&+RGMw%m>&fRJi!e^ zV~eU`uO9Yaf42&>e3Z=EA4zIdC|XDJS^{F^!I-^8T3C`Bx7X$ZX5>b*KuWg%%IHAW&|M?E*pifF>c*#IT@h71v`)@#M0AOHJY zQWA^4gFtDr-y?x_`XJ%G57NfEn!zC<_P2#ATiL0|6Vsn#)T=Z~aVOsB>sC6HL+veA znKk)fhr>)S73HKumTrcJW@~gW9(e)Eao@48xYH@E8R8BOWu@u$-7e|OIn49PRA3mmu;8W}K z6+=HJO{zz+E)P@xuj(U1G>O^#kh;sQf%{0mTd-FbF0)>mGqo?c7Ud2E_I6?0SASfc zz(9SVlsGB)KU&#Oe^jOvNGhmn+5P(GKmB?&eVt?!Ks(UgedZ0I3+;sUVZ0A@m(k5_NbM_aoB?Wb338)jn%Q2SMCWuB^IHSW2h{9nXQ7&u?dJh2OMi_*)= zgt$FgBp$l+RYAyeH*c%9w0_dv){>nE)ry!0<}W>L#@U;Lcm+IU9c+z~IcWb9pw+pE z=!(yfCTRBq8F1N`*txwv&oeo9;zW{(4t{;qpU{qBf*?6U3@GB~e$ex&rQIi!KHie9 z4#?UjnE|uXix!f=vYH|i*iMQyrcNSkl_?m?`_&f0#l)m_5@Fe44sRYs?g$1Q2=KTK z;qy3m(41fyuZHCYjmoJ?N>R)(VuKXUsM)l)#YCFcH_*|dglUX~#nPb`s12d~16;imB$1Ck_ zk#)qDP3Q>}vb$2;isQBHsX;~1dZblYsp;=e4K+JZV|Hl=N;FB9fqE*9d>y5@BS^nt zwe`nD9?xk-7q1YiflOCwh+miMPTlyBHhB-Mb2VpgW$%bu7%+-bi6?c?aqa-4i6nucA z2%Z%-lyf;6!MyE?cJ`ITrnoo(Go*lRKd$)08c5^t7ivWTGaqgO+QfN|w^(Qz`T_Co#yEM5mDt`egRJ4voR2BEks#{JXJhb%+248&aXalNCyVj zjF}JewOk>!TWC)!&Rc7ilI_j)KxU7!l5w&HrO(?6w!(^yzGhHuwFr#DUvpDTL{U1j z;l^>0B#5C5R^MyzW&bC;qNtQ?RgzL`JeKw&xFC*qQgzEZRC62B7_k3@=0(Kj-ZbDP zUi;37-|wp5$|Lsee$5yLcLlV*3iNS0RLJE$U%oNiP#*>KN5$R6BYYOkK2;x$hYLOT zldd8pO8RbF)7X;%+TFw0)~`EVZ~b@)KaTF)lz(z}EL`n%j1@4^XkMo9dPYD#3&(h- z@wLmvyU!m|-+XebUHmEKL4X7w!Rk}n1wYSZ4K24yP#{$-5HW`y8u=9HZJ9(z*EP#hT zX8qwZ{$t)wM7R%~U5WZ?Z~i{I#t;Q1;YSYNq(}uzclr-b9hKJQEZ#&8*Jm=-A3YZU zURfao)OQAC431)ud39heQ9Q!F)-_UP*lz}(HA>1sQj zOCD(Q+iJ*k2JGDv+^+8u{~Wo&47|ugG1muwD2lRVnMN%ZcXg>#;P39Sd1J2~G&LNB zDKa#^-Zay(in{*cUIiEC8ML1E-KaL=Xvk}7aPVHcg@;)7A=EolWhswI!Hi5=C1nZFy0 zePUGMH=LXyapQHSk?p#TYON%1wq}aJ6}J-_Vq< z!;kHZ173RP$fg+9VRb|~-!EyCZ|+rt#a9?Brh(tof1~DIf!)BQ&U`Ti zeSh8}VmP9gT%=b*&?TzAJ92|{y2@h^(-rp;71P@v-9ji%QcK^K95RVQM^(c z#9Ru)kMpD7yeT*n_Nu_ z8iqu9g#R$dvr5wGV!L;()mIEgZM&^w55vjQCxLKHW-$i*W+!aZ;`aeL#`|5_5^+lmp zS6iEk9P<3$z9+>= zg&Q5iNkgmQQo>h2&^DM$bHa%n?a(yS?E4nUmyGI9a?|^(y zkMoWz;JcG+X|@00^(ZMuA=p4BsKCdr+t^=Sra@*;SJzOkF~_h>8mGqL*Bjfl8&}|? 
zi)xzrG&zl+2g}dDt}S)Qa}8|uk}u!at9uHcP@XbM^BwR-~bsZX=Jx)q@&-713X6l=$SBnhcA4J0j?GB&k?mR_)S-@)828@ z;4jqdj}ifE#uF9CBqtZNuXT?cwy=|4p-K0zhb-3>3U{`)vd5ptSKRa+;07Wb|`KM5LKI=^O%?82jRX|41=Y);xj9=KMxf+`Ff)2oQ+!UsQO zv(KYY#qG0U`ARQsysbMF4(g`Fgh$uTl%U_u>|9i76(ZeEc~?wrNP_?wMN^zZ@xV7JXjev zzOB#yXz0lM%ex6a*Qsi~!;g*_A%9XsVBhz5*WmoiO@05dBdDoBqRL8JBcg;q$g6Ou zL{+n!YPf&H__&RKUwMH;Ye&f*Ej-e$gx0s2DT|d;RtAlv@L^87+rbp7x7V!csSlP? zkPa5JJ1Zkb7m-7Ki4xeV$h4B;7o1iFmzhpE)+45{E+V@Dw>YJ?6{)qX2h!ult89v> zhOZBOhU!c04Ut&u?GLV5kV7+#N5Y5$4r>)@x$G>AqmRP)<)_#h8X5}JdDR5pI$qS_ zT`!(>Z7|9)iZ`~iSU+H8>C;9}2n>FiZ?tSoMqfV{^(E(=j>hl#&!7#)OSH3ZS+Ph+ znlL_kdHE5}Fl*0I(MvLdz&K^71T;LM4Kl%dY6cznc+x?&)kVPoEUyT1DdMV+4 zQ1HGg5iR$*qknZ~Ms+%mJ&D2Hk+=MC#9z#1c!FZ~A+am+$wYp)Xcd7CdnB@!NVj&O zJUrJD&v3G&)}s5MM||kLk)ol!+Kg}MG`>B&RHir))?0w|tuCSDKlF(Hl9L7`!#VMY zV-0nMy3|&B=vc{SM8X~->yf>spFocRz#LdQw8sA70iWWP1;hRh%;p?@b!K}72AQk< zwZ&6O$FU=h;|^S(>wFv|T>D21VwGK|5f$dyRl2q=ZThTsMHPZtwM)Sk`!4;Fbp;Sc z8NcdR6ia}kn^^V^#yN7`UI-&Y@ddPVVXjwEB=QMwqqGE<9iR$w)Bz&yJ)7WwTc~^Z zD7t_|=r+CoexrfCQ@RsQS2|ZRN#(!tjqg9EC!g0jykHw^&SxeWzb6S_15}f8Fb&&`2g%34kbA2Y zdG+nln(CEJ+HpSKe5aD~>NILEa4Pqr_ZGdwn!dzY_;}~HJux}rFrGSA=A4pYD2={; z{yx0|o0=b6JZBKUZPuP~S_g$zowu^%&pM{RzTX|!Aa<~adZCu3znFGi0{}f38*2-< zD<*P0rE|%}>B_TY0}H%T2yD{YJprh(5~}s_Zg6t*I&{30JXFkc_igZ*Gjp@EZTxB4 z^&ern;J%JKt)}hkGH)Nyp5v78(@= zg$UqSot6vxcG#;$n$L)7#c~Q_Kh8Udm$KOirAB3Gl&jpeaxzin$`=29hCr&8{yH*@ zYLB$cgINfkQWV^yCz#BnY2z_D?p zM{Ij{w>kw@ZKbCTj-bwwd$fl=)6;)RE zxw;&88xN0jp8{E|m~f@~CX;(guj5h*EMErZ(Q`CCeagpYVe;mQoJij5-*Vj1I^AkC zfLiPBA8lQA88QDui~?S1&Ti#>2zD*^Kv3A2-y^@6#nhOlpfn75|rWt2VBfob{t)6WLkO z?rLVV(E(e$Do^vPSFnX)iwiEltJ^5U6lm6m`}yxo9Tlk7mz6ubY&+*8a_pHwkd)Nn z#ZCSDr=9Nzl38eO&f5(%-OHpReyG1wSYi-Je~oduQZcM_1n)dYILC5#Ih5 z+;S!6&U0DT4UBN*o6`9BkYPQ(5rRJa{pOV?w?e4JQz2^1FeRs`=$xvos90WI>?WvHswJ8eV}3e#(|m84N8q1$F(?>!KymV;f>qYH zL^%01P>3IGrzM<-pXM#yE}NgYKOWuD!Ic(Y061Vxg2KGHIkN+|ZX3-CpS9DL!(#p1 z7!Q*`Q~u?eCnPpGFM}C|>Jf&VGLOmk~QT2f1xh)`lf0zkU>VB-`5-Wf`g- 
z&29iix&2ix5yZn7MY&i~^JL%LLiECoYRNz_;Zx#bcY49q4)|E`t05lN{4+Gs)`SJ0 zg>dEcn((lM?AAD}tU}V&M3lLFo7z)ZrNk`FSvjRfbQWj*hM(O=hmV7Se0zA_aDq*= zBICH)EmXj6?A_&i4izFk((xXNo7_s7WcE>&u~_(th^;J3GZ z-=^F_b8~#OqtOfq6E10c@uEJPV}fF&e7F>akCGeyiJ7La5PGgoaSSs{8e`Jj_UPWl z9(x~l(%ZDGQEL`@KVb{i*1%4|-K48u@ExGcnXDR&8CPjd7|}0;vSu=k2O-gP>Vb`5 z@x=Ov^gW5-w;?iS=BTTExk;=;=0Z z*>xL3?_O&@jkMhNhHIwG&(6kXii?y0wBZzg>X~|$7Mz){uWk665AWXnyGZW}*)BJy z^vi&C?Ur51l(sIF9qHo@kKQAW)+=aI!M|p*0GNXYsdcg235yioK+y99?TD$CD#Z+uFt1dy7yEwI0(!nPQfWMrY%~hZU_MSm zg*M~wON2pI`5|FnDx8+(;Nt1IOp)|Cp@`xkd36gLj8AV~wJyq~;CvHtO;uQ;KwdUm z>^)LPi#~nSW4#1%y<}KZt@y&sM;ZRdU8IH{veXBuBt~f^-TENw4|xY@Vw}&4i8kW6 z>x08^+4I(CYDbB2wm318OG?Wmqe)TkJj+*4{aq$0rB_-Os+H3*Opt zgYPWKBOl`&gC^Z##eDo<~XJ4SvXxC62ad*e6FSf;K%Ui7E$O6JrtGYBX*d`#an8ySH+rz z51xp!d$MfKt?UO>0m)t4PFhT|U2gr};vN)}tB{nNx1`hUy-3LmK`0)W5|0lrbQ?QZ z%wfMkZ+rv%Pi+B-5mdY=<@lE!{kwReNg5Xk%P;wqMXF0!p(iwt)tyufZz0KNKK2@& zl5o4zL)6nKxa=$fhPtn>p;f&`y4Os!3X-8cTnAn+r}mcE{q|RN7RTSdyVmS6Y)MzI zu1Gb~gW`Er`!*2rVmoa0PXsRKwZ1$fMW3_vl}kvVGot%(iA3VzYVOE&(MGR(#d_5( zw@9fo^Zf9;g5MLS9AP$Gx?a9|C6@l=lbMgy=r9WFk}3gz8ar-*ep)OiFJC=4Wd?SH zJ1(_pZ>>)!Y1G@8x6mtBtX(Eua?JhYb-W#E&Y@##c#zw!kDbZIcKQoVaNcOT;7#Fu z)$Q;7^{@3YkZIB*#e5BO=wDhyz?e7V@-)l$f%M(gUAGB4d)`+wDo`c0xEl4lGOi8taw%#?9)Cp7jzmgTV_Y1{XY_C! 
zYz6Im?Z)2XarQ2lJ{N!MCXkeicD=R!Qul_(7gd+OwAt1s^uJ(wPf%b=$P`v1@GFg5 zkbbIpP}};XwPDW&2O;cm)Oz6cY-pkFFqN1JffyML*J0?Z#1Vz9tT|F$shuF=SFm|Q z&H+U~E>{85?DDTWo3A4y<2Q9ip7Q&MbBar8GJYr{BHM?{IlJy!K|kCLjv=ezT4`Zr z%@?Ku8O*$E-dJB}Sur1CzZUfN?bXnyPs1skdHixT8-i0<~I0!sS=aTgk8OTxk57G?FB2K))P)0K0?ZYpm46-`THUs3p zPc}~MAQTvSVEmA!;HTJM^GeeDX~nWh0sP%X?tHy+F6poN3k?6;*5W4!$)944uS3wtmn6BS7wNiL!hDq<*6!Pc`rPn%wz8_s2*DDe#Ih3YhD^en9?( zl9db8_qW45rvHZ`&4eg6LVafDdo!kTw1Hr~T#2 zRN`M%%B38bLh*B2U-kdTkJ3WIaO<~M$F?YhF|(|X_kL>PeH#|6~>#Cx2#^~|HH;}!r%dfOUXNBB`rK< z;_W5hcfNjpd^6k2dJz9YrieOYWl^^zi=G3m7XiE7t6czxxHS+%S+|Pp*`Gsm*Jb$; zS~5p6->}`TEAH{LHJvqvxE*?C_^l+hNZ78tPkKmK89-{7b-MF44=Fb=xCJs-#ec*MR= z7ZVU|ltcO(?0Gqijv*zhQ>x{SdO1c5N5>Q3%HEqda+X3U5u^o>H2<$De(3ZG?}=@F zzI@BZMj`uD8_mc*Y5PPZz$XhE4#_$C?Ped3H|W1a4zp=YXt*c>Z@Og9VCjczP+7(0 z;TfD?U3k4#GsvsFp(^2_rlD={=-hinMHMc>74 zK3z4MP(|D^u#^M-si-`78x!ch*etV2PZ|=2V7^A9n3tNuh{m|hTJo|oaoo|ne+hz z-;L1{OjjnTK=$K!EvI#V#jD4UiRXnKXxH;poIZcG(PTrd(Y`|XlnY)Wr^_PLPs`O# z^J6)9RMi;>I2NAuOO!v_LR1p*#08o+c1gIsh^~oqXq9KsY49T~kfi?A*P)3^_bt>? z-O(WyQk=Hr#}|i9Ra+U~M6SYC_QcBNCh9IZIj zqG*6_?nvTxi<-ODpaJc8YOXQNPWAEQj5;@~;t5OfLU7}etQf>FE3@^=7Mi$lD98NX zFxU|8B}SdC4Di^?J{DV7|0V!f)Nx=Ml;GX1tA3Y{G)kP0Jk076_ExhC8B?8e8ov(Z zgDDp4nrNKj!|sZ1_I&huv?OV0V$l;&7oQ;GHRgQb@B2ssoMG=vsCF-uPg8847-V#N zX~YQQvD0JE-#Yh>{*D{89`R0**OFGEylG{6;=pR@VQaNG#Ldb&&6T&YMQ^@`8mzT- ze2?mI_r4FJ7dGL0&MA%4YD&elJL1SWT6~D4x4|^6+-0D;F+Cp;76;gW3X4;hV}es< z=x>?CX|pa#7~ayEZep{ZnCuGvi^@0&!33o5Ejj|Ai>U^z4ms$C;T}PHoofbPt?|iL z4V7u{t!b!)J`{jvw`8f`+iy158c;5**(50wvL@^m&^(JI>D3kR3 zOL?8XfQQ_6SI-u(^yH3@B>(_OFtbc@p_DJhLC%|jNb-FQoQ?Q3SVrX?ilXYVoLs@H z&$QZ}R_{ecm5Os?tQ*E^xfY@TbTNlU#}?KC5HLHT#V91?&^4u;C@b~h;Ooq<79RfZ zC*jK3Ev4gPo_Ca{jL6Ko=W=6=e&UYo9m)aMzuA@aF2Wnxt*oC4pCa)L?C>YGY4`4@ zNfX2D`EwGQ>McvY;(KXn%?6fiVDj>6D*W{P*Eyt_9aVPZBIgP^f4s?4K?Ce2iqr3n z7VC7Hcb>_;tF4<^SjcH(+uP|y^p=XNl@Kb^t#1^WjJw|wPO5)3TtZfn8$){%+7@+r zuKaiy1%iqK&lIXXP-QFm47{&LHemOYlIDg@lr2=VJ}KWOTiy7Y5opEr$j!5W^CEq! 
zExBLG3!$I2RVp^P<4-Z{{C?S)=SXijArCI&4S%8e#hw-=8JfDnh2LD6hq~qrpwSgw zq>H7&0*BoZ-i2B0mFYIg|D5O9^Bk~l*AG34KcQsViC>IKT|SqtALM2$QOLKxRN~CoOEMqbD;`39Vk@7;{Qmib8*tCvGr|>#NsF8UYl+yN)ht=V6#m0K8M>vhL zpv>w}M3!D`yuye!5v9yeYVQ$oUQti_Za9w$o_? z8*?#s5VeV@<8VcI2BqrG#6jw6LtO#VN_S05$m22C3^U|PBnt-{Ofm19sMXSdyePMV zGzY3`b8*hhn@>i36XRb5T_5{<7Qm^XH>3{*UbN&`++b_NoxE9%eR06AK@oMprs$ zPevO5=Zo+a4K}MMP%m;5AQLO3$YFE;CwOyZ0~GJ%4c8xA{h#_e%bUR1HWSe0a?1N} z9UtwJE^}%;NWTwgLhNWCD5J;k>B5~jgp!6^Q6h+P$#)y-{NzD|-#rwN z(5zi~Wk~|`)Labt1I(68hZ35RbB9FD^KTEo-bXg}E-#0zy$OSaIn)IdX@L=XQ>@A& z{(vqW_l)4@p854k#R%^NwHc0Ak7TR`3&tulo1#Tr&Ek}rL&(m8AMLQhI} zP*Kj_tN8~SkOyRtWx?ETfM05KK*Jcp`edlmXYa!cR5|We+CD&j7*QmhUd+Tscx!ep z)Uyc+vU8~(w@|@O=%%Jq5`N{omYT&hIjY2ncd>;6RwEShWaL+h@K;wd9pWoWsb!Uv zhUa!HY>&T?5u7Jdk(hM__$0KY+E7*1+GOF^Xlx;3#7?VT@AZjpjGW3&@k;{~+-gMi zTEs|i4p(vFFl?9kY<>=`o0MT|L#Qtt7;QB3m5Ij>joWccT$fqVW;52hUw2u z&$4rhtF9gH$RA9$x;9jXY=VyvxBb>TNNl`}{Ah2_ve-O2>!;e1_nR|FPE2WDZhBc} zm9IRPden4v|SzH^UN9MjPqQM@3RW}r_s*0TVDQ@3v zutquL&Zg^oqk1_+$ST*;rv#e4Y~!jl3#6ZlpmeAy4Eu(PS;M}V4%6Uy2nQPRaM zHhJ5{z|5lvx`Bln10<*+TE`$5gO(s`lkhHD3R(!5>P=L1jD;_pd8#)|%e#-&Wj<)& zFPYiFR1@nW&gyJ@Tua>Dy5!TSJ1gwHmaKr&rLs-dchI`n=g+VS?N$UaQu-}e>?*&G9R+lpkUJu}3D?IB9k z>Afg_U3&})u37JGkX%`~C-2|?1ZHt(r`ODMtwHCYOt`Xue9wiRWVgqtY~Xx$PL55$ ziqydnbg#3r&vz+G^TSIYqwMsm#}Oc_76y?!Y3}H~ZmPsZ5b|Sq(d>v1ZFEEHzLw-l zb0*TqoZ_fqmCFzWV>zJQT64``r5}}P5T6?35<&}{-XFZ8`lkxeHjYc*YvM#-N4>S{ zZ6o4sRn`Vx=m@(HaAa-Om1s2uwpd&|ynRcF*hP;a;JlursA-ehw64w6#KcrfICM4M zvgNVx9n;k-2OqEtgnbjSwWW)L;WMpGWr!#tN@;P^T zM=_p5af^K#AgfpeRBA-0`Q?o|TKhTmS81AxVfS0dDO}r(Xm*sNeXt*uvD$M!mA9>5 z6gl34N^IIr*yYO>mRo#8)YaI_Tk89Cg)xkj8c*>ul<%!B6<#*Y8Kux|o@$?cTHqqS ztW_kZ3tMofuyr1eZxzdfO{EoUBa$+S(1}Th=*t@$;$)vI#V@_z=+STK2*8fJevFC5 zS-|*<@mqZ1s~0??KB&8GTjg~e-M@(p_A5#kUjTi}&;2^&z77_6-Rt(YEGlxuCC$;9 z^#FASaMrqu`*`)-qB-c2S0B#AIxWrsn{lzw$1JiG7pl=kKb=C-N- zN^Ui0mBa3Eq`3y2Da^8Ww0f?ZVlCo zb6Xp{qA1A895_Y1yrpXy-e=NNa6&seE#;+jPJaA2i877>D0}nDN8OVRTl-;KuLURw4luBP5K`CZY^fiOno565T|e?-^NikMHbk 
zw(mgQGwzJHt(}cK=&zY2Ain64IImruKC5+%wc@3gnDkA5z5OUmGoa0sWVp_Y5$@&( z^2zZBR9DsUqN=i@-nnC7uo%I`i{@wj$eGsu>!wb=wu&6?-R=lv0DS;wRkOE`PrkR= z{9|iim@$||{F>@WMs6b*o?cD|*XOD#XSrU>Q#;WAM*Si;}R+h|)mAE3iD;yho0w)_Y? zcp%;3qAr(Q`#Co|*h&#%X$Xm9d89RYDyWS0{SB8e@p^0hF&di8e{!6F@OLVR`Yy^k zP-o)(sq0E7m=y1XNQnxIK9_97z;RoVJvvj__%+qP*FJ_js3FL!cZ0+9quCwcmrc4P zPB*xOZMz3=|B^iY`>*CsPWIp;d-X@`%it7;2b8AA1zFSm?b-fm@8Q}Eo1=->`IpE& zQ|DGSvuyJsBxQk)Gp_UIt3f}kK!q-d|a&cI4+$sfr5Y*bqyM zn%zB)<9DyHsM*w!>{8V7~sdAz-{aN3U?nY>~n!uS&!?wjRnb^qUm9kG;+DAcN z`uM?vr)p}-ZLn6zGi|kcDA4bc0~PoB{G&Ie=DlQ!dD8GwC%v^f0D7+okii z>OnvZ$?+6Z;M$_ErY=P0G&1BhSSVZYcvE1}7 zvp}~v{OP1u@kyWnsz>U-&KPhA*fTL@BZD`om%rjWXw9!j1=oG3DDXh-;S)reT(7$i}0N<*Q17v0ix#QJ-a4nMJ#P3k;_1Zu6kPKZOd@;O@H` zJbxm~yP?TaR!k}QKni@Npk39Hlccz?wcF(H)H_J+x`_pZ`3XKNeDEVK0*2IisdDqF z5YAR*yyM7MHZu43kH~y(5BpCz;Ll`o<)LUM_o=yD9Q1jYxcnalHBkE$Z(jYCeof8~ z1gBLeTnj)VPe~XrcI~RmWS^kmr&EUfyQR-DGjxU%%Mv?V>Uw&K^#~+%f2^?FPzg`x zMpd%7=hL62S|+(Wa)jLPTJ$i1i}4%UIzmRr;5LnDjkYQsAA}xhRr81K@d1>tZY`4a z7hjfaf0}sbL#T$B7`xTb`o#8ZygJ|f-sSO9S76rTk->?cOHqxE5T7v2K2kgBT~k|+ zTKdt+sxpO3G#uF%j?bm;lAgvR=5l;`Yw#I+GL6=LQg-ISot@3`heR0I7fWz2inR4t zE49UuDALK@eS+(b@(kjHrso5v-Z>1wJ%RdKL{?`$7~7+p2e9+`9@_=`-d}r#5nXv; zZOFbZiufWVN3y5U!=OBJ>*vWoCnc8UGtq2k=LmY@dCxxmc@&ch0dCQnb6v`zfF)+l zO)F1D%G5Zqph|l(ldukvZQA<9eNj<+jFAfaiNV~$47m?^P}n`cvs541_$D{klJzQw z;Vp;L)s;zAmQ{R(7yb!aB4?|SrDqzGH~stmTdRxI$6jfk`7`cPI@xh zC-a_x+DF&l$GGJ6Y1aw{sBRE&xgC8ecx(M;m|pk_bASXrQ0KpBgYmrkMooQCVVt_v zTfUafDZPu}I~4?|faQ^IgMJ%#R36~@+CCiVYk3d+l(q`n7errGMyl1dl1rPqb&|G(fDG^8!1xXbUyAoS>UOJ9g5y&iM*6YlcPY zG)j?|i1LnWKYD}}ukr+n)t3_}3Cw{({DdnC!d5J&jx%tD>th&KlswW;Us!5(f7Vy{K{2J5=U!=8qlKSJ#&~>So#03_n&c1 zZCf8Gysg-=0V>r3hzN-E8r+B|2uPQ%AiW7lhXghok*)$F9YI8T?}VVzl-_%SgdP%v zgai_jyepoqqQ{)){cwNxe%apwD_LvJF-QH6GOJ2#Mpd9&`0TJTYjBRdfuPf{F#+LH zz^()Fc~1f=QN4%w+-r-2IZnG~+6_kxjB|o>;_Rsd3__sKvd)g8D_8H(N}{AI?Rg|b z!sk*ypTNyRcrRvX-Hh^?b#I;BiPO6R?g-bW#iJEYy$Wa``^8^Z6`#HNnullZ$&mAC zA<(hVYrg`1)BAy8sZZVU6RON&;u|X}#d53Oi?Ys 
z&d#=Tmah{&>3tzXGy9r88Hl+yKpZHhrB;aOf(zaWQ=}7b(0p$wZC8|`RUHgFK)kp7 zcfM97XPuA|vfehuD$sZ*XUE2k6?Li#(5a`GGG3+Sy0*W-xa``A?2gi1xS>5j;i4$E zAUp15a(;KirfCpo+FPK+?P*>?{;tz=s8@O|PxIG$m#H6zTaAL8>UdChLSfx6>ww6O z`0NJAFU04OojqjVDxvG5|5eXWy~44D*wG+1j*{U!VW~`XcLW_BC-5Gq# z{A@l*%I~sizkEW`{YG&UsQ0x4-zm+`kZ#=74NbSQI;vUR5Y%jw!pWSu1d}_7hw7 zHof#vI#4oZ&T`Gy4HuhmaiYw~x2`s=C>lBm)a6JCM(0m&oLM+eK)jB&7V3N4IezjO z*!9iLs6;8xzW#n1{E)b|wXdcw$;?aR1<+=y8;>ujSRf&QvRm*ZP%`-V&@w+Kr;tGz zv4HW2{H9;%^oir{i_MO>k)9+bwizM*x0I$0&qwZrBXe6-9`>FSUKK#|hFIRB@ycrJ z3G~!frlOM`EU!+oTf+8^kp*X6yWhoUOR~&Us5~GiJ6d!<$f+m5xAmeAb;tZ>zvG0P zs`FPYZvtGfG%$_I7i3DOVpmcUMXjFHdfMtwxg9!uq#Tq|J26mZHNI_tw@P;78LV$Z zoRt-j%jB@^>(>#5MB^lO;H-E$)$pZL)#dMZBMPLW=dlv%LupqHR)*G(@di;^Z$AW+ zKc19^B-moZE!YUqfsT=t1hQhlvV}m-E|`ALcc_^ zOk;L0vP*9LXvAwf`*7+(@ubRq?K`*BZ#|1UH}|F2g6KzKEM4m{+CdOQRWUzG@OL6b zL4l&$$Cq^4@Vb35`f}v96f2Xiz4%ly3rO?QVUGX`b`ychZI@nnY#L>-nv;{`QF+Y% z*m(X`@2#M&Ue;R?N;3j zynn$iT6_;tQ%QxAEX0sy&9&&q?cw!$HWp=F3kV(s1XA=wMGE`H53HlSr&6%@4f8mG z^XrW|9el8ZQIo!%xHk^7lhF-*^}+2a%B5yV*|r~HqC(Cw8^@j4;Y66Hpm>aTswm<> zrE9BEj$J7WKZx9ZjK?}FVO9R)%(LzySxc*7Bk$$-mpbC0m*V-s_X>{ttvRP~VZqA9 zNYf$TgKjI!*Zhs_+gum1G33PgeT0d-%JK= zU%>gInUj@T*}r<5LqaF|P{T)g>w>e1lxP>L2a@2kMN`k*QoZgzESB4AojL1uK0D&z zwcBaf0&z-(>IaNV{)6N}F$lyg5=`-5Uv_<#;>?GV+y*CQX>j#ziP!eOPX1_`4>D0t zK9p|_a&jCNT`x$l)8!>>ol}k?l5SrCk4yS^6M{Omb7KqzH%fBsy1IoPBiXy_2>mIX(f3w$hx&B`kHWhu4s z{$Q?YGrSUfJ)vAA-m|U;US!IB8$sSk!W!Qf2a@*{zU=Rw7L0NOJt#}%z31K(oZeZKPvcG zJ~rOnb;h@p`9}H=8xZRhsM_J+$*(}oOIUTnD5Xv4+i=s%J?_qicE%Bu2F)yUKFu!aBOeEudmxF+m+ptvJhZ48gaGF)-x>j3LbJCY?*=y) zD>PG$`{1_s{lRh5djV-TCL|iw?XjQpgQWpnb21GW3X1x29~2-U&-f~LxzFuhH=U=bJk-4izx@z37 z_CZ{JQNT=Y<98MRN1)dEdjflb#FMLJ?}5^^L*)poolA!`3a>F){I#a1PR!S8-og`$ zkwz`}S2k|&CQQn#M%mH)$rn=QS!Ldz{Ish+e*D8PJt6VxgrD1p%CoYC2XuuFv!+K%KK;=&*c4&U`Ir8mCS&Mj?y~dWV)yk+jQd$Y2fXhp-YcpRu zDlr>eJBr|!tDtrZ0HZ%27Q^6PkU~=*w>Q{F@kay(Uzv!^gTbuNbl-~s$VX)2J?1>X z|1s%Y8V=87m~Sg`_HSIp<{tLJ1+rr8bB)Y>2rM~ki6 zIIo^~d?_j2M~|40Pcjri73?uBP`VeSjwNgGTdGCmu$qsp%5Ao)>d64+y8iVpo?_?O 
ztnMOpJrnppo`3qiz!T)X{N(ri2MU`;wFHiy2CwBs3W|qmT0FnqKv>omJAj!I$@FJp ze$g%TWRF=Qa`~)C(!?iDGrs%`kx1331ACU)cDhNAR!0_g1Mw`a6TtNV6J$Yh%Shke)Y_!nr60MP$VfCT)p){7E+kQWgZG}aqx6F%$ zc2T%5+|o~qdqk*2OPibNi+SsxO!fl3m_ignko?Ql3QR+mKIXtC+x`Q%`1MPeLNDydFVHILw}*lq z6W{6+qJS<6j~c(bhtB*Eob8`pllG=`SlqpFrM|<#jbEu#x=b)TE*m~hdeGH+^=d=@ z{Rll;uJN;x>Gl~^K>!$@RP`EZ*H^&*)%~sLuDfO91zgZ|-RSAdkxTl+6O1kPgH-t7i1Nxn zdBVYRaK8tMd8oq&P}v(%sB({w3_i07ug*xgBxYn-ps^zJg1tlNx(Y8mKXEpxYUVz3 zxjz{y)jTRU_G@b4dzHgQi=Be3>1krlp@Ptvo&GW;%M^&|E?(adaWypht$);1HMR*h zamxI!724Pstc?3&kIJ#nXAMi1H0pHic_8LmVNUgUE#JXuRhbISf~8|bN<3&8doaFm zU|rw6LSL+sS)=-&4$7+{2Vu`~ux0u9?{%QxqRMaCf!M1%;?-Qrmg6gYY2z%wOwITDv23bx8c4Lq)PoZ@{pfdQ@ zR0~Ne+`IE(p(3Ha)lFsTCxE1t z^d5DkWRraAVU+H$bDl)rN?eK?ZaH2TwM@?E@`{RSE{=KnG9T!z(nm2Le)LhP6U~f7 z9-l=aV=~>Hg*4cKwj}G1Mr9}{RK0)=Jv)1^f;sXqQ206fG{$?mru84s3c&@5Prnhbj&nb{-L&uOq+@_c1pYG zq7B!hKMwnH_r7wv$juhXjv2ure>56lf0u$2qn5wn)bzQ1`?Z?*E zyph!F#(sa5S}m9XWf0bN|Fm8vY+>O(?pumhkI4_98!d-OTWuju+Uqx4-xu$?-2+}H z3e@H!u8@q&Zn%qXbTxE&jBI=IFNB#&qcIBvGABWFY33v@WwOKqb8!L{(*{3hx_D5l zfU2pI=*ji&lH^tJ^{hJuX88Mo!=BLU3d)94ZidAXzH_oo2}>=tZHZ#kmcNgc?j-Z^ zQ2nxsr*@7{U-&)HNKd!CSY8twaCM+ zc8kTncVvmDQ(cdP2!Vqx_M}a1fiy>cWNm&~SetgE|)(4tX;)gggVZ&G*VfEu0gzhSrS&{ zxjLI(#X7@4e`VYkr%I(3oKc=xn5}L%!v@!sPM6k5IW)ohAmk}9mFZlbG_#GsHGZeB zhj9D~Cx2CH3MDk=XmHGHNtAG}A99Aj8tP;P%Iz-U)6c(G6r+^X&CV{Cng~^fDAI>} z0qvEQIkAo!c)I?{)TyD+7jIJe4etDJuDlqxQ!p-wGWwA^7-iBM{8$$7yFawnn=TM_ ze#GOy5L=;9cQ*hRq&@fha~Fm+Q=?%+eQ|Md&lv%-ONCRleXIv&%zn!dZtN{@HD)n~3aA!G{M_`ko$5dnJTSB5^#>_B zY6W=TuIE6(a@lUxvS01w_Xh!m(LiQ{i@oc&>65Q}^zg-Q6{P@qDJflB+ePcp0IUDO zIo>+}CoDTV8&zP79jNKPQvsdk5HPNNKU4$s@`ySNpHw0?w!ooFvO632y#PZN2)&Nk=0rotcrjE zP5qpQ|Ai=7YBYUeRHaNS&e-(v@%5SWjRNfXFO2&A&+6KM_Pq-Q32RsuYJNmS2is$y z|Hf`bKRiDfDM@F11?|VNJb?!a$3{sErkNEQ|T z_7x6->!z`j>3Uw0(17-?^s2sbKRiWpFvmdYBOf0y!^aLCE|-zbS)&NF;|?w=-}oO! 
z(D-hnrFD;ymEKcI1(nqaYiVWmsk5^)m~Zh6eU%o>@^JAFsMsNX27hxc*TQ2!_GC`F z<%hKW*C!RAYJ2ZOCZ4^=s9%1#svRP-gBWX^3i21D^|jYTx&69}Kko34298H;>VOJ2 z6oZ^YS{ujmW04ZA**)BIPO- z3SG8~f}*j2FKz=_KE|?i%!Ga#E`4J+t@ejq0^is83h9p4P0``YH!nP zmm_rfhU?Cm-NxbyoQA@8fT0()=)hHP>C&4Hsh8R4YS)Y0U%l*&xkEsiC+Zo4lh*aS zZxaO4L|CI?X&QWI-z3pz)0wY+)*F}yszrYC5n0#7m5_Lz{?}t&1C#svOl+|!=LQlEdX7!eDG|<&sK6967sVa1@hd$?ak_L z#A!7uv;0oyP|~2JfTFTo>K|LG($63DnMkKQytVw4YWd{k1ExC)v03R#N0pQm9$d;e z_ckD4*U^)Ke;u%l>n(k-&+@=p2($EJx!fE;(tEaxCPc9SYQz7#bZmcUxTk&$IsP zYxmp_%nWki4xjLc3^8b$YW6-HStg~slWw@UXqL_E9RGHspXn6n$I{jwKxxkuG|d_B zCz!HM)_R!Q${eP%LIwE)GnNVviUDtvH2;2tOkCVtX0|YJsy;mIMTVo8`Oc(QWV?SGt)%zLN@_ zC+J-Uu9rlY(c7&G;8hjPSuPtNXNuD;NVGw~QP(v!HS@;ofODRI`%Y_WTtz#HSRtxn zgs;z^c@e1%gfXC?&F%mEiFSkQLpGP5FHH+Yp3#xbs_`v+t?WZ>%b!qGW-<{Mp>CNK zxy1f6UwnUqz(0?-t($X23A{qHEpX2%5aj%M0!ACb^d~smJ3jBfF&N z?Mm>ur&WxhCiSmn;VEn26Rq-4-wW=}FqwjnQcQHnlYVFSns_>5vn(yN4#UcM!qivB z&ffk+cUV|%^=4GabJw)g$%D-!lKX%E)(R&A>&K>fd>>i+k=wlR&D5sVY_r5M=Y|VI ztRo-_scYu-M_XPt=AgZ}PNzaTOt<@h3YYUPVna@WL(2paTTP!l_ex~Bj%H(@91&K>>gkl8jR)T<-k#KuCUqG<(I$T+G4)DFWDk`~m#520P<%6f`>2W`+ zm_xY2aYnUI{qVGu9BsOr%=5Hr2YJKVG)m5MR5~my>`a0OYD`S3FxPbGEga~+|J?(R zUc3-ke|SM+(&o;V?|VqnU3u*Lyjzn~JIIO2+SlmF!2u;MXU^lKEJb|iit$7DM~jBc zWpf?j)Fe>>C*t_7j}&YO7))5`tI59;5gGFkAu}T*cl6_1 zAiDVNyVInJe}U>X?!P)vR@4No+LFD*pEteH4>^vNH-1^I?&IuZ{0RRs*xVtR-aCXU zzzfqMZ{B=-K%W9M0#ZH%hi7Q4 zU_z1n&gv0hsBaT<(xFgv<=6#pn`Nq6qT$7smRrAH#_q&@GmZ+d)xEorCA_8)I1h0G zXv<-_^23XdQ`TLHN%iq@j~uR^{$C2hgOOOo;eIa}m(o5RI&Z3WSwq(9`Wl76kOrtU zZsB{A4gUuX_Tpq5>b$qeEvm>0Y6I)IV~uS-=q=>x2C$NrENh5>v`5?cD~W4*^jPr) zOB*WI7kPMZ>kY-d%^~40nzn+&GXVxzojosKwZ+Cp$5I9UbpoJXLuI9SA#`!Yr9Fj_ z8#rdM{Y_P+)aFZ}rd8$WC(onU4;&c}pr_!1zDg>Y+*?iQCNu%^8O~nCeg>^=G`5A> z)JIl9YWd-5auR|Y@X*6wI#MCMP@wcYzq&ZF*;aE81daIj6xXLjP|v zQs}yxn%ensZdLvmq&sQdNe7|FP*LIj3{IuQ)H4h5MSCA)_J6KwqWJqIq+JEX38*#C zILD)r#Kh}AU%Jo>HJY>VcK$@VwYk+!eOgagfLC%#LHgqVQBvZeY4vE}#rVZ={ghtBH)6$c!?Lu5w*^T6c7(H){_BxuICr zS~J$}yIij}0shdkiekM{HKM-)pR{P3qNsI&Br!FTYw6HTvowy6HOQz5P*?4jvFa}j 
zyh?z}5re-(Z*PHx9p+T?F(crZueQVb7@s+|f2qqaztCrvA|3Y{4=plksGHd+6hjB^ih>TnKeB*=^GWn4#sWF)v(B{Pde~&L=Wph)cdivDp&CzBtXk zk_#dYQS2y#evt7^2Ao`-dq=YNUD9ql6`AYg%<$ocnuKFNKlko4h4=wj==euDz3Ve< zz%{*#H2bv_;QTLYo*Ao@0*)!1h zR832Vp_V`E5)&7S#nn5Kf8Ov$mEF18RGz|TzWLX48Z|rxzO1s!2g>6H;(~nh- zFHFg>xO7OP=uX;Is5}mC(^mWODQ|-WR4v5R$cS}tu>CoCK3>4+MXbF&ija{e>u5FM zM+y>l!%y6;%;&-QZY(}PrJ$==6C*wce3ofh5RPunm%^Th7IYS5wzjk#^Q~Ou*mRUO zkN^e+{l@48iV4zeo-clZc}2}1v}4RMyI{IIlQXQ~vE%V*IkJ{kyy-jmi=ti&j%Zb| zyY}1*s^3CR^#TKLbK5S0%92wJ_T9$pic@>C20$RPUJ`e!a`c1{Pd#35*>fP_<7#oI z_!CWD|9WoqxpTo4T2r0PNiB7(#+7M7hXbPeDPufm-QkJc! zsXXep_CnsXC^y9`o3Qz^6ANZG+;SKy3&)Fw5Aasz^1{5@;Lj*_mjYIe)#J|k~Ks<8Bh%XIk1#-(t zM6k>aQY)JcrIq^6{Yl`fyDx{!+|U=38Q#UX zaFii(;V41w4m$yKH>G?O0z2Z;NMX;gXnitp{x9?Um+ru};+a-I|1}r1-YzH*?qq`E zG}Vu{yXx)gxHbJLY{t|?YaLWme0j{89O*@j7^~i5Vba9z2ix>4U-&6Dzqe;+Grwd! zR$H~5f4V|~R|znWeT3&(ksWVDeZ z&ZZ{;*E6{T-Ra50+sY|;_jv*e0{0QhQ4JydZ~}sPANHs5Bdv3ER|Fg#Mea>bOK$%C z+-vKjWhBpVNjaE%E1A*mJoF6M#V$O_8;b%Q%NJCYq$3D3?+TvL1H91Vpvn0tnb-KC zThP8dO#eKudcK-ohenDX%mIAuVBRY0;g+A`j4_4AwRCZXM+d*LTj0>aj!iBoyPC=I zQXz=04{QT>faHlNP^)YhwU3Ub*4937F{vk*IxfF3{G4jKNHMir_gyhnazS=>IzKDh zf=VFrO-^Q)ZiU92Surd7{J7br=@q-hm#OIs`4IFv5iw^>@oC-eC)n5rzKCb7aM!>n ze^+e44ljP$N?wV7Qz@Dso>%BZsp2+!c_(H!3F^b#{6%iOwQoSt9FbJNrU3P=%Gj9W ze-OdCu@9r}i z$KBpDD`Gv!owvZ6h_3M^UF72n<4w(vIo3-Q zKS@ZGKAM7HMlE{<2WR@)KV^b_Q|1hBbfoCBh~;4V$LGAZBTif_JH1c$aC)yrXDE_9 zG@OuI+zKNn~l}v|L(r^-)cDv@Z?3fO0M>a1f2R2T$^) z`hyK*Uyi-tf$IBR57}4~=5|4^Gxc&;Tm@qXE)kV-po^@g@y2(V4{Bed3W0TxrPQ5l z-UGT6o}mM$WGC@w+S?^1;q=umA<|^C#68r|mT3)?S0Q=Gpc;3Vna}Lcxu}D~iIk`5 zpDFO|UVYCCn$jaI^@>GFd#&ho7FmQGDX`1_?yo3-_ZvS181mq-aVdX+1{e zOyULK5eTXy>((9EWF>mKY}_B$|Cdgg>J>ACX+igkU;DtN5wWOG(O1NLp z;3MBEwSyfS>9ca0`5*^9erP(8h! 
zy}Gw>T%vN!{DaMKp^2gh-N=iJ=8F^+nliIcrdz75m$f7B4Ra0%71nRyMjm{4_rF0+ z0KJHY)JW`~*N|~}u#6jhRiZxJwK|QWvPewQ0^e3xBn0l45QhuKrC8BHb9!#l;%A0d zInML=N$v)htyc*~_n$En>bWwr}5p{;pgjdMIt&e&9OK3~&JJZ|`vTFijr>EWP;c zRW5~lmOIhGH0d z8hZ+u2y+v!eXTZw; z`$fzECnJiojs--;g}8l7>xz5pq?L%4uGFc6N4SPL-H%9{4u0 ze*N*f$gp>L&`_8puqHp^h0DKRMk$C`dzbcM)N5^25sl zMG-8f{`@p(eyqyAegu*bw;sGEXGI6T?1`YQo*;|U@pIZZ@--y~pco>G`bxiB4nU3r z`su5sIN3yH1KV2;SE^wT<)h_K{O{E_f$N$Sk*U$SOO}ojMxMWfSl?IS4PdcG6PHa& zMQP}h8L7*%w@jJ9V+@ib#4hOQjlDLdXAx)$^YB9ETS@$cZC+DxZEo$+%XIZ*xgm{K z&TJ5yJllJj?>QPpWv-Bh$!pLw@C=JajZrj~xv@okQQ!Mt%u;%zMInYgGR^0YTo% zzbGUW9R%}H*U`~YvbAm1)6h|*bC0JQJH6}vl8+oHQ6zUWnu6ESBM*Y!^%iMF4+L5G za)VduB?QvgB7=XUYiO)Ya{>y}5q1kLtMPRlVIJskWNL{wZg@lpun5Cdi4%8!) ztWZ08OtS{;kA4q?TrG5Ae8kwbS*;gHb8akYGF)(ypr9!;UO?&#`?2>fotW^xtzs#V7p*N|#lczF2p&=jwZz}^DY`Hs~~oq87Zmd{I5;@SRvzCDbmCt$}0 z^^6<_7Dj%GM7KTq;`Oh9^-=H-Y0$hv}th!Au(2A&qBh~_yfs0Gg(Rp z|E+Pr1!qy_DEGvn`p?%s5)7y1I56O;(3?eV1bW816Y%;#T73Khp%i~#rEGO>wPdT# zl=$6i>@t4!=X)Nb7$XumJJiP1zhyF+AD2>58564=dN;=1^U8$y>OI)V98J{e zeEgGa8lDF(YF;7~Q_95j#sH$FOYPXcO*_lf_~nL#~0U1cgoF7xf%i_&p% z*L8KtA1wMKb8}yCJNA!HzBV&a5#~K=zsSixKK$D1#gRqm&&s1QTQ%K+!hNO782W3txK0B^E>8W6CFcW|r3X;-mTc=O18xq#67% ze?@;F-98^&apPWSv}eKQWwpP5T8N(7y<4-TFYD65ucmWNa@=3-JG5W|`=jDci+%F$ z18R*cT>iMw!J#3Z9e!$RCYOyWFY^J$-Px!V>5;@+;6!T8)YR5K5iP&^=WSnV(1?9} zK@4%%v`qng#1FGa_4~|m8`cu=P&e7h` zPW$c7%4)s=jjn2}$-SX`gWfXh?&80|O!7I?)6=KQozj?#Xf;CN^U2$7$ab2H?!>2e z58e5WvHbEqspCg`=Jy#&{Ooyrrz)3`f=o50P_+f4qn{O>>hBhUd5gUZKO={Uq~&w!gN_tgGA%-OMX6o~2^4qMWbZRUrz1Po@4EI!te6 z%tw2y?JsVJNS3JS2jJ?`{y&)v8Nh97eI69pR296j1o zwBwNpo$zhP4(JLU3FRu@3y84&A13kNU*zWx3JNQ}+hq2q+vM-qv4ea@VVEA+3M`l7 z-{pSQm;dhWPipM{ab^zGxNwIMTIl*U8zl6#rNE_HiSz)wd^d$_pLouASz zMRUCfSf#U-g>+<5Aqa?Cuf3P~Nl<=BYa0;M%7={9()0Cc2=3nf^b!eNAv9}kj-nS5 z|GV&Q&dCpk3uOC>MEI^gUwc_~Pm>kiO9zv<=g4FOt5nY~n0Fspoo{OTC~{`fV_<(s zvxJ7E?28xrO^iZN`4&03vr@3_QK;PtJq`Jv4KZ8=6K^RuQG8ck+Kxhf<6rxYPe!+l z$GmAe?`g`EA`{p0whVGwpL+ihtsXX^^1#n3`1{Sdj2f9GM$ho`5ipZ-j}KweHkWz} z8dtvpj83~K 
z=)|I_&l)Yit~K$9w9BP+MJNgg!&K4N%uaK&t6ue77%52=BlnCvQa}`0+s-AwALH$* za4@jb8nj0=bv*g8b2KT+zn*SyK+?gSH8tEH(+`lwMzaxe&FHnQpZP%Oc@ZXF$mY~3 zmkxrt@ujyQpz^#oC;vij?j@1=yxhJ(#|uDIi2!YULAotk6ji9+yA@mzF;u8A;P8|D zrdRA#z4Ri>H9B!fe7NwBJ(pz#8W}sE9#m0KpzfD1Jioy6&@!)Ec|! zcGWXc+ConYGnS>jW~$BK*g2ISyoYneaN||t&!uzaDqLh|^BU#6x1xa5E0pczI6i(^ z75NE>R$Dr&K&M2kyCZ?D>~up&TGTMYN1_tpWYhQM_vL3dfCK`PRf?YW;M{-r8sd|4 zuo1@Z3V5?_{AvLVGFDRZe-x6@!w9|raY+Lc;sAnZ z?{jZixu`2e1)1319Tpt^;<}|Bb(03lB(KrSeO<<+(S_k7CrRNx-tzwyt1tAu|8-kOC!hBBaW|VZsRRAqcAR`0z=U)B@@~?DkhzL~&9~Z? zqTeh+FkwN5g(H_h5l%reb#!V@@(X}aUVP&LCmX?-&ASIqD?~=zLTQ!iwkOQ!M`D;>8 z6)=!sjeT_aqw!>kg`7R{RSD2dAGHJ84%Gh*$p?m}yvX9wt-z~wMEra&p#P@Ued*~Z zxh7Tq`>wQVR)GT>LN?)|1u~jCIww;;0Zm1@SKY4Z>Piqem+lhAD?!1G5$cT z-&Xqx`;`lwKi_4a==mu`Q)l{HATDaWM603Ok(c#`;SfUjMzr^3tIh<@MMp;`b~*7I zQ6nOKd8l8Kir?Qz!424xIUT(c%)&EL^JCQEC*&w;5f7;aR!tL=6PfB~YQj8wlfQjE zU2LaNJyEY73on1Y=&Nnj{Vi}p6AOW zGr+uZPM1zK>dnH%8uh9Tv_^MC!DFX^C10DH-=T#71qv;8e4KU*Xrc_pIxZ~g{pW=n zmsswWKd_aRr64qo?3M6KL#aUVAka#dFFDUEk?*Q6M4eB$kPKU_ns5F)gnB9@Vnm>U zghUW0^bGp;{8;pwYU2Uo7Ew0hTc+tt=j zvxT*vyKZxWq_G>cvf7X95Gd){25Qp6qHBUtn}FubPI=^Y2;n~NH5V)~Edhg_=gF$f zSnRh?kz5Xto3?LduYWHE$5k<$W4AK<0zDyucG|$ol~mWuHux^ZUv+P1=^KY9yDc7e znQD+}HN%`?9R%X`4(;eLpk3&kKY43UII{XYKDr2E*-Vz0AOOQE!ioTN!fUNY9|>Ez z9iFW3=#UsJF9$YwWBRNw z@9Z@cK9m*W?<9*KP+iT2McFRA(Om(W$;_9xMqjOt3P`ApaZ0RVUHTmvn~YBIwgR2a zwu5pstl=A{a#45Zr+34;o1T2*!MqD!oG!(3u7BX4E;DjwhHOqoBNsm4BWd`TD2+x3!)6>mz>MGUVe%|WMKuAf{n(2|Z)qb#WCTf_0xm7XQBPpTzF;=@1-P)q+{D}4>yS?b z!DtgtHfD%gB&seOEDY2&z_tpK*qo~d-w6A3Y`m26b#g5%&6TEjjlUf631@Pi*D^JY z+r3!SASEoQ61>>TR;jVXa1;{3s(lS_%gez5l0}YebaE;y`yhxzz>t0mt*i5R_OWJ) zFZ!gVWmXQS=t8^{$p?>Hq;~=v0@TYxJ7xwYKP~g(AnzsODOr3l%EhjkbPd4GZ-IH#he!69WSS zho^=mK$H4<-V%#Y4ovx0NK}av3|^vX*S+D7ACb4M%K-I}a*KdMlnKI@GMBY@#xUMA zZzmM9IGZIN15j`IFnB*O0Yjqrji8+&8j(Q^FEkCPvUSz 
z&io;CNxxo!ZK5p}rP_UysG)C6orkpit#c3LSB1Vh54v@hoS#V{`-<0=Ya86U%x5cawmb5u{#bT&_#GQ0sZOJJ2X2=52q;;wcv{5|D&F~!*UEFyU?Yq9r3{)mOAy{`uKdMfHw^$C0>Nkm#Qcqf&JW_kjDLCAr@HXsl99wd@sizcv4YdHb*1SmUVMFPcD zY-^j7`UCaBZtHILpJ#Y;)t$Aq zRL(tra^N~C5w=B?0U*q{Wj@5rv`#Zd#)%!6uOikG1suQF-C2c56CJl2dGidcfF?)F z$H+#wXAcD3mViIJV+X@W<@hgvo)FQlmn?uGFK;*=)*9%%){7Y)$d^rD0jeMU?!!!w z3@m2xvTW|;Yn>~Rh>5%U){{|2&K%Jol2&QaRuYw9x`m|h8cv${a6^Dfp}Tt8F~%{1 z!+tuZqiXn3blJyUVBUJX$7NNyE&Na&hYC38epzJS_-XHLeO!0m0);3-v`Ut5RS}%~ zX${`VkfHr9^PtvdduqZdVDyY*MC5#CxdoY7XwiyM7Jqxf9kDSJ#$Guo+a*Qjzf}W_ zTJZxK;l~;Fuan8jwV3h~$}I*grb#;4z(L6C+}X;A0da{F+Wd1u z#Mwv}Joh>Yo?SJ*i64%Bf=7=xLm5F#`Uo$5*Uq^d=qdzEu`Ph-&V32OhY3e-@dNEz zUidp&q`Ip1{E+y95$G;)H$ky-H*VR|{DZ(;AfiWWHG>gvYkB(aunv{o1`(d_%dx4(h z*tj8(Xys>lV4zjx(!hp~$4a737Ae>Gow%@xbi8=2=Li3>H8O<#L&vCFrbdzb*n?!- zy5Mj_@Q6K;f!%f($B(AI+uBp&gn+-_tlWD1aGT#I#r#stUJtiCDgqc&;d;z>TdH_( zSjbPLal)k*f8TNQrrWofc+4#KWTZGfJ6Oaj5O30NvzNx#Y~&GE5g zG7%ko{yEl`8b~IUj|^>x%n#5UtoOwFh!_8u{hs_rpxT_H->NGLj}Eg4S9{#lqZaaD zEUSBcaZo^f(gu1v<$KfN*Wctz?ciS+PKKs_V{Avk;_;{thV}DdT@_x>zWV6jEne~1 zo-lo1MrnEpAz*IGzJ;WxZ|PmnGXv@!h>`2xZ}`iHe}DczNS97tTKSXlyLY{L`HCem z|D~j@A#Mf1UpeyNX6CD|L=hqQ#!6j@dbCpsx>)T@icwq;t-yBTAqq78#w?A6T{?fO zDE+G^MMW35%r;wJN5JI_2X#NlCfe7J@}OD+D9|LCicjvdOmVbHJ3!DO{v=Z3RQ;#n zx^4Du$Iin$`KiO2imLzQt=V0Jwf^|ykjJg-*K`2Jvd?PN?uIhQljmM600GD;-`1iG zpw_fQ7UKV5p_=eH+aknKA{u1kt^NkAdl1Vk_(UJ^@?=Ft1#m!QZ^qhde_-}d-dHxP zSVv~fL9lDrF!Cv-PQueJ4mAgyUUi`xL%mIxQEwA5i z0PP{E7jOGQ)Q@+uocTx_CI>{WtcUBWtI-}F%ZMs{S(k3H-kkE+nw&qiO%&$>@u4nL zf9gLHR{Y`sn{N|%HNWJ*iQZ*QOP~b=F`Wq$#0T}gscN1WW zZ|V(0R+3u3isA#fx2K&kc-;&+%7VD5H^;az%dTB@-u-0#+D)6fv|vuz7^ z`@BrrV!9j7WOg;2W9lG;=LyQo$xHaPR?oeGGzoah<3NzTNUGS`7P(FH%%amGpqm&O zxpku*jL}q2a)YG;aQ(dTYzM>4=eY>Q?&w zp=Iw|Nn^FOGa(1CESGOa0i*Dqvr(s}QRdhIQv!lQWIjpz7n%1*{Kk*uDWI7ZPx9XR(b!Ok<={QViN51w}4@v--RnSlkzTYl)Bb+ z3B-w*?2d@YAd*Sob`P$LyOCT5_s$~9bcC&zIWTad7 zhKk*+zyF9oArce{ScnUbVPWCs@AI6kF>bT#o=t^0y2~Ik6^{Y6plYLbZ5%x1xg{wi zC8+@3cj1&~z|C{~S=Sv4K~K&|CuC)=wI+885Odm}hn1eW+<7_a^iX-MgM-?uiw7ph 
zj)u-1Z_;w9RFJI*=kzp_MPoOU38RxPX)Bn&yC*RJzjm%Ps0kwqSC9g=HmwfBQ3=%w zj#@6GUkUK~s1OgLK zAg4eegpj29VKC0LzxvnxwYxJryZiPX-+S}+0R=piNY-YL7f2*xTS&^(iiAv`@2)1+ zujrD@#c?wZ(XZD}*Z9irY_>F0ozj;11mw<*$26G$Bc9T;sxbj`%l1dgJA#@bo?jX5 z8;C^^i83m-(8xM4BQMRh$f}zi**UfS?enQOvWeiv zSc)lrs88^U_Ejc=0M%WL$ol=E`90ck6ofa(1(VbxrZ9htL?4K471=D@4Zq7>ibPm7p@x`0ObWE-SJbDurN(hZZq74DNz2nM z>mchlt$4YJd_1}I*fgfP@bTx;o^QI2Eu8wYuD3%h1a)Ke4N*anK~gz)hrAb(Fftw? z6wh)v>}63#OmxqN=Di8^jRPWE9F(DhhZY(k-wi*gk+WRPcPt=v3_nDX)Lk4L>>u%) ztCgAKXLRx)F>&`rsi15d_E5NS@Kd7zCw5&$OlME+wj}7(qlafEM91U;4`NiCT`^xK zi*LQVT?q3}9?b8BV`xJu@lwclMtqk8grD%VVL=QhpiUN9CY_9eGtSZzi$aErnfS@u zd!c=5je9CP@1GN7!0$d?onn+~hqMC5RkE^!uPO|`W>029zS8|+elfXO=6(w90>|zZ z%~^NMQ4+Yf#0~C4<3S+m?nU4aHc{sI>v+L=fC~s z#_Z&uNa}+&(w$d{DPTtD-1$ZrV@U+94`%k)Oi%cw=hA4$i;&g0(`0V~~fTTJITQrcI5tB z3Yk*>w&8o6sn}ypQUWN-(GVVP$Djg;p|M<7AP_~FYUdqcjNwv_lF=OV3j^zxPLJfW zP;a#VCNI4rgcFq5?M{~eX+~MQ6tY62lr)7$;+h9+(0;=3vW?2!l&+G7`A|SQ`T`wf zdFkcB`bBp}R;zF@tj{cCX8|xXC#co5mV5#B$~efAE2FQ(iqHT`A)Kt2e;EgAQuzV- z%t4X=3HYOxYoG*fl!F$fqt5FHjJRt3CEJK%P204^U-BlhF-%lMC}UAQ2$!;Xtwr#B zp#CyvExK{DZ5+hHVr_xFaZB7h7o1rZePV*OhL@UXR~o&XUrcg{4wR4qVPQ5VPtLEp zgP)28W7Yxr8bUjQj{uU;=SuG;BoeSOSbP0bhjLoNyQu3-fWQ^Vx3MiicnAa75# zv#*;uBC>7)>xBzM7mgsCnVpt_XOjHc)Z}*;GYmn8qV8T18>lD+b@yPoOgb_={IzVK zUXqBZqg2jrHy!+P%lu%u8`&pMZdB2zI&WbfPy&IpB3t+N^sMk}<$Lw9vUO8hCiaq; z&3u__J!%MPiJmB}$&#f3q(62W&nNL;JRueRJ&{U^tz5%MRQb|g&1YDIM=d%lC#9Y< z3nTOr0&lg7ZWX_98+dl^PF=!TuxRD2_6R?3=9lOLR?q@3=F&)c-g~31K*%MzHoto!Pl7! 
z3CT$a2@%OT*qWGI8H0g|hbN`L$SEFR_IG;Pa>CK%HT}uB#^jgW2mj%U5JblRi2({C z3}0RAS3@pIz)(Qtw@*JY`DY=*3^YG{sHO`m{&Zv@phSHJA!K(q>S?<=n90%ku(y1b zb(ZmDEDM&qUlZKzq=xB-DA4$6WG;{Z9apSB28^5E577>c(@Hiu78?x>RxTsTx9QFs zEd5*4NTJ2c)7{(5_Rluz_h53cL*dKRvd}}6@^3?^-)h16<)B-d7rH-p1TxA5v4&ml zf1v!zWK#N`#y-S+iUUaNMFD}?wtCndFo**ljJrc(^xjybeY<;AhgaTZTJ>Gc3J`%K68hx zBZe$AXY^aDu}-qGqcEh~t^0jh{kC7|$VAiBMkt2*6f@H+JFN zm?;Ulrj8B|XD`O?DQqyJ@^O%7j0xM~$Bd!bjqFWS&{Uj&rMC1iRShh0O}U~gkd2sQ z(}S`0%c-#!UBH8}U*WVzbL}#>%Yhkff}3TGqeeKG)E~5lyd%4A@`qOPqxkbqvcG1%T^e#wl3ke9kV<}3)yQ}dV!!PFK#lpa+QDf- zvI)@rM27C7cZ2Ke@}!3oKq2_fuO$9%GT@q6aRA4FzcQBX18lHBjT}TGA{-GzJ%k2K z5`Tf*J0+wU!A`-VJl1{qX{424=seVYq-!`)Jx1w37Cp#$2$L>pcCcu_k3EBS6q(2$ zyEE1uenRs?uytFXv;6!>;HTTY13dFAl1z8F8n|w zlf;^0BH>t|Dy1sPCV3~`8BaX4KTL0&Jg_~SIG7pROw2?jmBAq!Gdn}U~m!P3I4c()XJag7e5VhHYv%G$P3ZFng=Sh zE5u9&)uz|BGJb7VYZho$eMo@U{;f$aV_YVgFeIcLSsHFpY*BfrGZRsoZ^6uB$`Y+n zvuNMod05>N?veWFaN~VAH`C9;%;Loogwx7QYZhriWzk|;XkjzQUBvNSWX54;)6#m5 zv+5?GlHYW4>R{*k{bOolOJh!>TB}B@l&3910@ z{;Tlqob8^E!rmc1-(F>3WMAMQg#yh2Z=hwM$B~kd)I%OZoI{W!KZ)9irirqMHbvR@ zA#O2m@$|9D=%I5Zd`OT?z%8sUY5%Hu}(ydz3*@+(ehAr&= z7};Cd1=;?rR|i%{Sx4UEXRnS8uhr+5t57W`t$ErRTF6SLIjFfDZJ6cym3bQ$o0t{n zmF82J@Pf;?I?7*RR&eGw(e)*XYglYyqvs)yw1Fa z9WEWi-kLAfKHxr@FOTXt)bGoZS$5CXJ109y{qRr-2M;aBDWWpu9(3psCTqtZEe>^v$gL@*5F)?TkHLZTK z+g>D*mr6~hWJaZgoA&33cTosa;>q5Ryf>Mv?nhS-Z$ZbvcC{4^|!W zOx})HO26h_Aa~ci6#u3vsXp{+EkXB}Z`NUoMiOTnW~K}MiKd5>hY&O|De+c4zfLU0 zzFKKBv#oj4A0;(1yaA`7hN~hu6 z&D;X3H^vmhtkj;#X9;h^4kHd@m$w*;UuG+@v<9`*+u=2mewKPSoE~=1^jLb$;b}-| zWHjF1K@0~>2Dl!5pKJJF@uBiF-s7*k&=!&d5_$PIkG&gZJ_DcaxA}%&COwr%x)II4 zRi&t*J~p;k^C(fNrt5OnTmNe8JGnmj zy|i``wq{>2(pYXmX

E$M3G^ens1^t?A-jDPFQ)ab0XRU(w;&a~jUG%){fkayP4L zu9T$Y{h0Q4m1VJZ*oPCCUhDnSo8u|_G-h$_0bwg;5wa9U^1Ts-u;l#O*wj*#e5~c4tJjq zOEyJ|?bWx`6eXwl9jzSi%-^O?XvS!Ye3;%&*0E2Rl1wqiY_T$1NVedJ8b+M{dL;&uFeFmzBBStc`gT*ac()arBS z+Pe1DdBK%s)$Zqn*VKiH)ZO!`BaaP_byJ&-cFSmscLiH(agmZ-1+GizHGF?7Gk{rB zBnA%U%>Eh9j|~Pqw-c^%C901jC9GB~WLAqLe! zKJ5xTKWwSUPssT`(apN6^Xp??<61GW^ZoPlMmRhVL5R2dwC2~=jyG)oABSt3onT_7 zaGhX;d}08KRWVkRFp-e~qXOQ;f`NycgFylBz=0PI@B#yaj1K{W0sf-_FX3E>|2hg@ zoeTNj@9jY+3MdLmNC5v84IPY)Z5+*PofsQBNPwnh&6U)g)MTVN4Q;Jy^^I%|jA`Ah z?Le1+al3H>Z>^1;^oiW8t!x}Q-FQg;p1}#c2YpRPLiG0(Crcg@H5oY~AzKGyB34>@ zT6z**cp@SqZU-Y1P6c7nf0qNFcu35goa{L1=v-Z0XSM7&tgM=;#^g7#V4R zGiV&$ZJhMoXlxuw|I^5SwIgioXy{;W=VWecLj-D9-@w+{iHC#)bff?N`_Jz*b~FE< zJJ~q?>lV;KI?xe123mT$|85(&lpFLdr<}Q)v6Z^8xiv66z&&`GSUI@=p8tOx`JX%f zuPfF5=SpTK#{Yfg|2p#jyi(cG*g?qF8n~qs@BciRe;5Dnga0n%rUUi-e~rX{hWYQe zz&P{5bJP9zGvkGCfQ1PH1LFsi5Ef8!13%7!TUK?v?e8>%R*W>l{x;BI#GEL|U?x5r zM7@v@*KIV222Cun;M$k1o%~tzlb{8);pZpX+DNRC_3@XSJM2BKyN4Ui(=&IU@vG5p%(LdTtB5_*&Lk%t$Aa~3SayhLKhkwrW^yP@Gs&o%M$Z3y zW`_vzW1<)8j`@3(9YO{k+*D7dw6(3~-xvQsKHODx{A2ziQBn3Y=865k?nLv8ptwEk zr1VrYt)`chOCv_czh0(v8d3d0=lHJ?X%Ja=?R?VSEw2q0H2fMS%= zo85Q#nZI+9Pby~)HLGD+QEYN2UgU;rON9ir3X>o>1s}24w41rzx2mf{&V33`l!EUqq_fE+y4m7L0q)q^MDO zU#_PlF*K`op09G=tQf6Xh$>r8Tjq-bVUXXC^W14wziJsLDO0N}6Jl?#x@|?3+8Ekw zIoMZ@a;0gSG-&(2z1(fwF4@l68=N%E(YCsyQ_4=?z2yeJhmWA^aC1BV1v5-bstawZ z_aHY27l_JpJ8#G$2>gN}oH?!S5T4ijE&Ubexhy*E8b9M?bqeWJdSVKRc(Uw0F7uTZ zXDk*ArAVrz?4Z`O4$q$(A-Dzv1XQQCeUaE3TQPj^f$kykz1|x}Rvis?zP+5O^&;~< zql}l`1c93!RW!W^KG`*}GT64>E|})vj4yc{ReXd-!bxPaZh$86zMjxOp0Al%Y->9i z`rXWWX0#emMs{xN`=mMVIL^}pi~|nV)KEc+j)w&0&XP@Q1!LtX>k{?#=}PPIpm&Ey zmj8|r`Pca>-k%w!*SlH=f(M@|z{*Z&bTE}*<{7^eAnb>1zo+%sLG*rRTmrF_)bFQ`b_wr<;gqXJ+| zWTB_q6Ges@SNOUe<17wGlN%OVp$JSJw+);Uhbl{9m4$ccHNY=!8q!Fr;Am!_+sztL zQS2!aBn`T5So3+BDZK;+OXn(W+oXT$D_^_HP5~1Kq*Q*~#xv0PoYLFlTxY!^4Z?_J z(~1j1Gji?ke3N;LvI@ZzTJ38XNa>@1v(__SBryoCgC7ahn%wcXZC7ImmgO}euOW!o z-w%6Ib0H{Wcpr<$Vl^5pFl?H4FllmF?~WG~eT%_8ZVoZbw(>#=a5`QeEh~~5SAa>B 
z=lk|rnl7yUyC>Sn)ctnV(>4k;k$PC>w9to*{%~IOqO%ZMNKfMCNiOP#;zR79rih&I zXD58+7zZ&B*Y!AhEQ3DfFvMSa~GZut|Muc@;^KQzH!D1k}KAQVFj_`tU zr21oFqbJnzLHuH!-FmU!FCFo7psByzkM@atv5%!BA^9Zr9qVCKsXh?6Ljxo%SI-*Q ziJ_%&uST&3g^@eJQ$ljgU5LweTbXr5cOB!aHYb5&jM1|?LMYDOoO=980Ae*A-_~ zx5tB`9@ZtRd~f?@`~G0+^be22i@!a)u?*b2t|!Did}nvZ`QEhFz*`G#o9S9jjv-tx znffAe*=#yIHJm0y_7J{cbTPS~wclCPieRb>$FS;~v=&p}snL%#`8M|>c#Qi>8Yv-^FV0_nd}n8h-QCXc@ov@gYmekh z`|INo1M<#5Zq}a;K49*%)bgHpK{8|&3!y*gkm~3H0Vh3~b;?B^Y5yhsL+RX-ZKpTM zQD`_K4ns_NsChOL>pW4&pkM6u{)AA~FHE#RM@cbNVtU;dhd2TDMY^J{1K}c znf4t_{H$nz53I|Q-MUxeQI5ULgaKia!aVEC&HkgSaX#HAC8phnFEdr&-%r)LnS62u z(!^Jz_}Br2ij7S4CSdM-sdO(bMkNar=ynuca_=zO%)QY2jAqD9`MMs6JZp)K$im}| zFxV?qB>~yUt=hZLAf6LW|LjI{s5Gj2T!r>?YV8*B4|Li-S$Ei_id07VC=huYc@66c z@dgxO*S_*T2%klmVo~=;VkcPD4pO)u4InY&m4v#&kVr#6y2)m4WS(W!Kat{}`8=v` zufz>%9cLU&6Ze<2dr0?6 zcSMbZLC%B~iF7ClOvg4!*Ww3<66>W#hW*Q7I@JRdWvU<%RHG0I#^#A9Lrj>O^I@Fz zcq!VFxdXmMszd<^2fdaY%8L9&Kq4A<$;e92|Acadi3XY6-K#uKcuv@bRnZ7(lpzR2Qk-KjHX z0nDQ7Xw-j%Y?kvr7YddSbNiLY zYP~qk?%*H1#Wc>HzwxD_KAynwZPoj31uKXXu>leiD=1i38*b(uG(s`UH`F7)Sfqf$ zTESq*U)4m4J6&LsaY^Q4llc&WJP-_j>^g4?I>dQ!vd@sAErd0te@m#PqdHob>dfLs zm9#(XW(bdpm9PAoKMs9_qnfEIE+(3FR&x|45AlF*K)N0vje&a9B2!nsGt&#_!|bkT z2OgBQ-X96R81mY=FjhqA2BA)aIE%=jUiC2uqBA7d=whm z6w%T+^6hy9r%O#$$?}KDUo;F|=y543xbJRdQ-*S)$A4$BBLun&wL$KnjgwmoVhkJo z&@ynaJ|U`VU)xDekky6H(F#c)%h*}QR>Mqmm@NOf`OP9uh#nHQ@N&crUVafihWQ|k z9MhIn>T~^h27#~|*Va4CK@DGA+cx_=i{baCSwmq0OJuMJU!1tNV}+bpBb|DOtvSzH z)Vi>lguiY2=+iFORVeRRA_R`WQD)}K(g14C4PD9k0=bax0ds61QzWQK4|O=YGPDf-KDNG}eI9N(0!V%jS zEXiBx=Uk7(4-Q8_UI|$nAtv2>8SP|PFtZ^gCrB$C_Xlt(mFnv26}Rp))#)TT!znj2 zs_PN;ZlVUD8k0wUjI8#kI3m(Bp4m|&{5h39m_RAE{EIL=DM~kwne+(=4=jn#N)st^+(go-P^so->j)s-2#XH>5zZa!`H!PKbiWOD`u_EpQuGB5aRDrkQ<- z#rznbk;3wR_Xq)@&afrgYfpvMz7LB!tD@N+>B_{bh6NbIfo$}#8j8ldxT9rOy#`o+ zXOEFTT_nD@*(2FJnu_YURx7RIvy<{R!D`(jBHxYsvS8~JhOs9aC=EKw)3$qdpoZ~a zEpgn5zdJ_}<>)npVi1r~kwhiH=+f*oE;~pyP>{(IBti{;xjZM0CY|L~laTkdd8ij6 
zkZdqWNTJ{TeOdJZ4pJ$Cy(}glF?XVNxLd?H|4whNUP`I3hIWxMP=)@*kj3QQfhJPemY6N8cni&4pG7uRy-lVrG6}p?v{{9+A1FA~rWB#) z)p8>ZW3$ z70+mM%Fgy~zS!w@gMI%^J*PHu@w~u;K&oUZvuLjWJz44bB)ua6j=v@SVeCtAYwAtE z%enN0&Ib}|cJrokxI@qW_JPL~3c4UVmAdVXY~l6_mEQ01v`QkYcG&j~QH*~5Zq$bE zDu_(fT#joJs~iz~VM#1Js1Yipv)L9U z2qqPDIp45NGxKK4lCxbEC^i;noj7_Ea_*SET(n(wT)h7i;(6L!Bs%ML<`@O_j@Abp z@~F+kXA9iC7;UprU5f$1VGbii+Fi_qd4$(Wo|COf!nL?Owrzyub5?X3I@3=86F{2; zJ%lwtOfl_6^j{8`v_5POBMFU?23C)msORfvb(^Rk%Nj84It-KZq{5*qk-}=OQ*f^_ zNwn8P=3^=P%2$Y&LPY~Rs_6ED7WFjW}GI8-fXHiu#I z9byzh8#2L9B0aL19pAaeeHm^vO#Uc+7$MmZ5$uu1K7;ML7{mPTbzS*e?D?-`)AuFQ z)AzyW1y+bEvC(R-qK_oV@o^Hx4~N2#@vNf!B(`A{%|q6z9gXOHcVUqjdP7)>gQHMG z=A>u@ib52bfLK;k_7$a|tWIx#U< zRqVqjjV=$tQ)eg>Zn3adeO$$9jT-BkY~R>x5cRY5zh`09Z?sggY_?W+o>7M6D;gqQ z|EWr#NLRxQVa!A?9d?5IZSuD#yjbolD!{0`GTh?e`CAC*T0!@{ddf!gvkpzZz#&y; zl$Jelcjf!%`O0F}-jD`!Rp^G#Jy(C5`xB31Hh~zaoxafSRX56tz_#jU$PQ2(Gf%c0 zp6_bQxZdMM8BIxWjd~eH9i48bF}Vx=y`Kg<1WmgCMl}jrC+EZi+50O!@!;j* zJx`qhMa}LT$mkophmC)1Uo5^q=4h9dF1J?e)&mAzb@9c0C2UN;8`sH9nd<&RUD4~g z;85j^i+2wa+ej#LwkwzO;Sb)<*!^?)cQvG(1is-%vlSTx>b6^%k9i&5PognB<-ay= zoPh%RPh*XGWpgbhjRG!LSAK`E!7@i2nIoFnJ-7CaM_YHXNq5Wl0c-BQdN%|O_KHsdRyscX9RQ5Mx0uzmDODjGGAs` z*|}ODPzIDbr@ETiS7ih2kiZs=EIRWApZ@B_`w>z{#B+3DFwGQTr*FM&CeW}%KgRIf zrdImH1E84eAv7`pp2zimUbb&IsA!n^4bz9;EgkUl z;{8giEAr+yoteag+GI@|j+@1ZCT+4$iviQ7ybZ8Imx4Fu1TnuC^wv##H5MbWnF83I zr-|$}E78b50=!-g`BnbkNi0JEdRBH4(T_->-uK~>t)~4Pgd_P{-De+ymaIsz3bYB` z*8PX<Drc?dW9;X_~?e=}^wi4kBY7*zc1)1x(dcF!o>v7ZF8BK_Mj8dn+L{Zk|#|end zR;x3Jsz3n|Ci(!Mhxk1pbKH{Nr|CQh-ky6e)A73PYYuOVP6(9gbyMdHO?5WKBh*Wh zVWxrc{ls5R6Ii+;O!S=Fz>tK$Otv^5+0Ck(zEiiHtZrd9>uo{#PK^a$dCbt=bsk7R z?N8s}1=S-E6%Ky^&|;Qq0E4wF)%bMUTBZ~VkA^`KdB5s;HskcA@~43`AHY1gy-tV` zenyV~_t^w3SAKL+&>Ptrcrhy{>6dQb0|`eK|ln^vUW4uIZ-v`u9g& zv)Geu27_K$mWjO{vrTA3EEL#4J!!XE7C{Zcm6YUXJ^ro!m?koR&d;X#QLOOA?@xLx zf3#I~JW5X0K${-^(eDZQoZoiZa?}kFeTRfnVq`D9K!?-0A!ruhQHN6iK7zmE>GAF? 
z2N92*KHK+lkkaCJAVx0Y^Tg0stc=135TU0MO!7dcM(jXiscgX^&T*+n))R{S)hMG> zxPxhmyY_u@ScxY|Y47$3)A;mdCs1^m$^QhXLP`U#I9S8U%TIx8^EHNB4fI9AE;=zM zHOwv_gpg=lcSq9TZq(WDKDT1#sO?U}o`a~$?Y(hLPgch#0)Iu^jz**0OB zNueyBc=y->6XL4LP0GDpDHwZH!pp*=* z=sR+pqU9jbU}e-4V_Df;ShAYjwDfmA)}q)!t6*x)#9_gm12|u`&Nq_t-p1u9%Cj(J>kq^^yIn9n=d$4Vma!Ry3%v?<6W}kf`{K zD}VFjOHf{9Sfv)r%|`u{;lBb<#W%}jbgQVJkimT^hI@t;xgbX0`D4BjJ31bScOw5M zgyTRMOX+p&PEHcU9k%KICtay$d2d%ZK-Gdogt}c~Z6IGs*P=tgP>SVFpbTh--k8=- zv~+)QfjW%jKYhw<;8!I!5gD5LCt4_B0FpszDOVfAKb=Dz)yx4g5`W&dvd?qd za5aQvVN1@{-?sR%cA8Z4doaMj(G6sWGH`Gm7BwY(L&f0CBkcPA`47O}6}H~*S(Y6u zmW&BqIB7?I3e+WrK%jSZ0K|$Aq1-|G>I~Zs^_2{#vsIo<-`5BI6m1tcH9i=W&hL&$EuF>F{k(5jPc^`~pWQn^|nE1tRxC&$D?vp%-L$ zZ$pkbt4`&xu1o#T&whK?eBF{4!!KCdb3R>G?|HmEDa$T2r{o>z>ayFwQ)SS)JDmBl zt{}?#iX?_-FDxyDw^}L|g;UV9DM-&rNH|3ShPM0d{$gInk8X z*?@%49_aj|K<=Vz{@X!O4%&ruIakH7i^SmFnlIssMDSEax50U*ZHrB;DPPI z1ZNnxVLDif1PYnSALgp0IX;gi#&cUh0xLL`n)DBW`AuoyEcz#%3T)BmfMcj4zPT)S`C7x4yO`+v}vCdts2cy!79U0)Z4i!4JLC(aycc z=56}7qd8f$D9cFY%RVsm3s67Z@_roFzCoLJ-^==l&uLInyB>fbD*M{c_bOjdgGEGX zZrl6AIK?ef7-xeSO+!Bz<2uZByM9U%BAB?e$xw#KcN3TT_~%x%0Cd%#Mr%z3n|jjH z(iN8_8D*9FBV^xuAuEK|jp6l*I~2&B-qRF%mlo4;VLUCILA|P?VVb7QcQak_{4UeF z0N1U&xrIX=D>;vqL{XbZ~1H_?bsqWj+sDEw9PEag81+J00YX#GGo!f8fmPRW48-n0_Ib!D`JM$v69vSibCV;tfzWb6-)4J)ljR zItEB0=FNAvas>jrCBIbs2!xBrnT;rbzFF=Tc`H@12+H?h>V`>~xWCxy0T!A~iUlg{ z7WZ*=4>MvuYOz=hfk)7wK*7)-VfRzzq}GRYld5IMt(T{og($i*o`TJU?X z_^zyv0?*fApULa~oQR-88N+KcoU-&o@HB!y9~O~+9=sPI_CdsJ((2Dow7QpO*W}z+@05Wz^iY=f+8ZsunRJ9otrNyXf*#|$4rgOR4 z@Z$^}W~pJ`8sY;y+LR-nI3@`+4YX`Nq3<(8HMD`r469JnO&y2{<&|piGN$hl>#ogl zcf^GJP*xfuf)@`RpHi?F29+umO5jnv+cQF79^24co~Tl!PS8^rOEosha*aAekeq~a zWYA|dTF0_saNeC4l2`!F_okl&kG^ErgLKo6gIhnwc?6{>7Qrw)Uh?)O%|1@$OP~1P zu=S1PoG?IR~ENHeF!gZ6?8lf8rxyg*eYhqEGzN zU@r4iW|dw;3HQkCq^H~+j1F|d}Jqk@dtPvGNEPlc@p);{9iQ8ht{^>}Ql!&C}@Fo|ygo zoYKri1K~%*8oC67;3vsG+c69V`De!zOi^ql>Zj%!SXv2Ma~Tbwu>8rT!LmM=@V}vh z<^z(ui`#abSingexDUPFrNA;Jq5qOb3iJ@6C}DX?JmF`=0EX`YMm(e99qm&-W!e4_(?u3wW7?BS(Bo-)Rpb@}d2KCYg*7FDxFMpLzbk1&n2>Jstp 
zl151R;C}zS`f%|?Y$ih5!;x5BeV|A0rYv9Ejz+4_bC((-#-`Z#?Cy1cGska=?*>xa zo^;l9geSKLQ(|YTSPmqJ6`~HjE``zy3*SZEymiCCCCdesNQ>l`3AKJce89&Z`(VinmwWRfM^bwHI5U$MLh^zYtmUVx3U}h)NE} zA7FzHw7$QZ!y}H^1ZANLdx#rBR|LXER#cKrDv~hW31jXxNXO}2Vsf|~&kOTDkK(FQ z$u(k1>Zutp%Tt+GY?{}ldQe`0?+Li|bp<``-7!Q>o0W8D3ufX^wAwXsn@ zUGt_4>CA}YF``!rl#Au1m;3YV!?~`*ok~mfPivAF;8$E{ZHmKa>0eptyYzx=OC~L?NB7>p$busO zB2a3hsj>K_t*TbxEKaFeuS44ZL)FZf5VZa*n3pi z&Y;BsoNXxSpK92|(XRQn*+GyK=Q*vsaP_>A9RexTuM7qEy=B!P2<8L#3V=c)sX0o};LrIR64+!d9wu`Bw1m$GFZARdw8rQRW!+0sL99 z;aZ{Q?T&EE54ssF=w=Xx8-l67?oL-UvnVvErlkQA*))~IvGCnPjUcRE)TfR%r%^6=s=^BuN{&AI72CB{6(W-|@W5_3#2rH8V@~NxwqIa=b1jPzWt?l*)OBxBiBI zLeNN|s+t@C6oV`ZOkAE~kt~&Rmium+KQ^y|3SesrU=N$tIKTD;Y!F98RAUS*h;!5< zmpPuN9Lmahm4d&Wx1!ier~9JMnpgdg=C@6<@C)B<*k@#2+WV8fr(m%HaH7iDtOv0# z1qZ%e7dWQU);+)_9I$v>P3H;0@L=pEdr+(7fDB&iu+zvue-Fcsf#h_t^X1JyS_rfSY^^Eyu@iTmG+dVxswz1U19#gC=htu8gtc%L?@ zQ7L0ZrRdt7s+oa2^W$CU#a4e=y$LuL96N>>*k?>mfDP#Zp(S)kC2#RA@j=ne26ajp zK%i`uMTw1!OLc@3LZ0s2wCCgo3|`=nCS#mPu#+T+QkL zP&&b$XYHy|A5yNJsnS?*lypP^V^x0EV+|Mt1%93j%i&sQXLe!1r#4&Cv=zWlroaCu zZ)iQ%>1x6_MY~Yi3Sb@)p8-#myi6m2UV_SRcD~H@fR-^3*dWVDL+CMQ)mxknzK4MG zcw847hG(}o!gi$+X*zCuWO}@B8EelJamcOIjrZO0I2O!WEm##S+eNhhW2LVDCV6qzbQ`ruIQd$oKl-qGl ze>Y&6s+=oEQD+i>pzl6Jis-3VUIGkSGu2y3syZbX))Qa(UX-?a_F78GDv(AW?jv-c zNiw}}7amViPTKrGKX<;~RM8s^#DaxT=6E@z@_M&)cZF^Pq((T-Ojj%s7J;Qf<7|Qd zWvKiB0HEu=BKdL^Zp=ykRR-%13%21dKiCJQ2+_k%`LdrR;=?cuNZRxHYJ{bGNf>n2 zT$ZfEu@K1^rRM0P4akeC!w(um29No$xm>JBBI)C7;|*4s#@I0Hb*a`IKD^ z{ESa79z#}D+$=5pjOoQ+46zCO!80df2XUsfZ@qr*=5t^|FkRu?QsF@MekPhUQXPTlcM?qZ1$qX07n-Gx2-t;$2qxq!- z0Y$?Fh!KrsUe7uiU=Xj;tp2i>6r2z@en28S0#~A&{FlgHfn9?TU+T%fzOc<$+CjaB zf>{V9v^}Ri6Vm$Z!ZM3MO`bV&oX}E&*lvF9V1mOp%&x*RW&BH z{$EuMHI*OfYMjd}bwWN;x*(|L9M5ED!aWw6niJg2HPi+E5c9>tOd$)z)Knx;nueZ z*aP>8Y*ZKvE7K6o!NsEFq!MsMV+bmITJ^X$TPUQ{7|FFf4hmSnCAEJYp8>SPIp7~C zTdP(|%m(>i1lpj3fU=8*ZKw(#m=$rTX7Y%LjF{WvVOvgudKq z^!fqW5jYaw->s{EL3>0s8N$Z2LZ}F4`@TJ`9sY6!x&DA4TPBPZ*M}Ojw++~jn25(J zft!vCrwX@WI%g%P75EJORkprVCyAM;$As%A#yCpLJCcJ^&TQbENv8fLeyZ0qf#)$p 
zMM@)Jt`$41p4hBxIi!k0pD_naa2fl}D|O=3B4@hjst0R>B5gsw0D?G-!bUs~cHuY+RL5*Bn&!><<388m0z1s`j4hE2+i>tox^WJ^NtOG z+}Q!Uq?l)UNNmgW&w!D2mWA=660cA;3kOr%X(H8UdmwHj)i4?k*a|#Xtx};;GR*>r z2JNDU-%k~8GMIkzdOt$_s0PG<-_MEuEoII}vp9fXH==|XVhk-5aJTAXVS_Sgx-8`O zS1h$y0Zx!V5eb!EPkbtq!;x7B%L63@v&Hp{fVS0p$XhVoFgJUJnD!98FiFs1G~@j zy<60`i^2>Wc&w7`{M(bI{`&GeJ^9eXgqxi5yPT?J3nWc|=Tp*#rvMqJrNx(F^EEt& zmv&&M3H(j^yC#{eXZeL^FomIJ2;-$`!x!ZOq2J>bPfh`ba~2vEzlJ=Aqj3$G5rygO z^k*QW%&^e?_$;mzFyzL7O(zVZxFmn{GNsx!%&1u8sU1AgDft=(pVjmeXgGHaN*Oi` z9ytTSVTkC@F&0+Dt>{$I*QO~VOQE4ugRXH(EIs}h?o07V)9T?Dg8jX1a^#&^-P$fC z2Cup#g7ybae@j55NPfq9!9?dQGn!9!6lw%R{l($MtJCR`^ge25 z`Wuvym7@26-!(skE)>r}9I?OqYpQ+#M($20z!IV}$i&R4J?gis6~7);b@CP#L;{$O zE$*fmOH@6Uei^F~FoTmb(p$3of(*zc&>t@}c&C-iegkZ<3AO}&J+7s6`6mua*+|Xl z@A2HfPx-V>8nX84G~y*BND@B&ey?TB7&nk^5><%cIOmoI+iS;A82$iTloPd0YWf?O z`twvdC(SBo|B0L{6ktJBmqZ$k^S9qZ9I5Jj^&mN{qh=>#g#6lTNBXP24Nb}BrP=u# zyyEDFV~>b~;*Yb=Xe; zD0mE9hpm?e2~M?n=EK56XFy2-*di=i`*3$*r@uf{3YM$7wc7!B1C%)*0aO7;ePUbE zfwQy22~mK;Z*3Nz!k19+=h96qw*?hc^VZk@-}_Q}{y*+ZNymc#=7hC8W!^GM>v$dd z8$gf8J-h;Y80yY9LJ3V|&)`7$)(2EB@cr~Ci3$U=F$6I_)yK8NnAb;ub^olyGR?Yi zfuaqVf%@U($)DyopUygU+ika$5(%G_(nC*y7&aSL$NW*tPTcszCmVNOmg4 zhLlZMB?9mZ2$bG;-x$g9e0NE?owrm^%In79+3$l6K9Htv%&6+(1ECzKD@GWHg8=s} z75CK$(_sRVrMRKr{re@L`~YEp`Ev0X5UCDN1fV(yA`jG=K(N*ckkq~bP{Htk7MxS1 zulA+84G3vi(LnHc*mQ)nv!d$QcAE3O`0{a*ph@Bj?}{M6b{jN;oY!G%4f_Y2qW9sz z<5t202Shr*#9!wCX^9A=5@dW%Cs+_TKoLt2$P@|zypNV}cx;Piq)}&pU|(5M4P7^* zqWNxG`Z@Ihy&usB(4Uaqz+Z=TU%$JHhr*cb13NEb+|Ma^+|Knz>i}8h-Njykf)KFd z!YNibz-}nokD)@R-6J@Hwne-K7+E?5OrkNF9)L(i>0L0}2UKHrU^B*a9UJz3AoCT; zF{s+#CC{(~Sy_ttK=n-5EhrOso;0Z5Ur&8?W*y_FSj`G#4_8`K1Qo6G00HVuk4~As z0c=)5YWKW*|Fepa*UQZfpw_QlM;Q8{E(*(!80WMcW!ATN%nFiJ;E{Q*<=7v#6K)=X z24mbq-s6`-QXZCtqf>@Nv8}Q%1G_KsfHGNswp?AHO|*k-VXO z=qWDb@aabe#Ke_=;E$Xy7BjmaM=Dl&$e^n+1X9P9yN5rX>4O4@`rj^G0Z#G>rK9q? 
z=+byj*VFR3O?(azq_m*t2OELwj|*zQnbp)koG#HHBu|;Hf7S*nj-fwtA_AaI4N-A% z)kIhcZb8wOG(ik{-S@GtxqkrwJqQ4pWc8D=A)R*yz;XjH2gem)QlTTMW4rdzAK@jw z5bBptw0pXquwBeo9R5Q8%FR&ayhR$r{T&^L;^SW&{RXW5VLyga64;P;-hhwFgSl#u z$#iV`$bPFY*k+|ASR#SqfCW+~9aF=MxLwOAPUISvDL|virb6sBc5%{Z)+QN70x>_( zDVxyeo;d6GI0BJhij`Ru%WT+ZD+50P(a#!=qbxnBVt9FjY9EIS;Ls_8fG4Gm#O1K{ zya-xDF@CuAMqM9R@Q+lB(bBZ*?2wUgn8jnfq@eV%V!0BU)H^B8J4HqGf-zM7JjVTu zOjXiz^54z~miB0b3*OOp2yD3koDa1`d?3WgL%NQp2JMHv4j3tNMbWWzay>S=z5{qw z$Sc| zW7tV+i<$VW66pmk@eeHDprt(Oy;(R8KSG@MOX+KoKLUrgpW~QVzeufqt9F*5i{c~t zMd>X=X}2MnToP_SJ>Uno=o4htZvzM*nP~SMc+Ner zCoP&k3NKS+P6`Y6mUu-#BV4RlpHFtl+$|hc)3Jf@gYmH@vpdedTdx#~XP#G$n}~;m z8}gGTTGuB{`u>II`)bq4s(a00-8$S`}ZrJ z4j#f7@6^m>hQ|};U!~!0)QC#LmFvBF_i;Ol;L&q$>-hN7w|+t406QTHm2wu%6`vb= zcc3H_q@Q($hkqopdgD9fADhir35*s$1!Kjj0)Yrwd>8p4 z`(F*o9UawZuYzoEVw=cAg-XY7dVYNI89LnPDApU*-`zaFDM?eCuXd&C^T`QyRePV0 zU!~`dh=_wRzx?113B{4YnMGgVJ-Yq*p8rC;zethW;^BvUM*5R3NBfP@p-RneO6PdVpj#(pxs>mPl=v%nyvn@;QjB zACQuK%Aod-7`yFVy|@T87gzoYi>hmX+D$~_a$WrW{qu249{jr8Wsv8!d9d^ zWd<$R76CIRz^K>;*u#~&$E(x`*B%vmZ^huHc|)+o{1E;44ycX3xRbMDvtArAobo&K z`2)0i4SXxQ9aV_njq?b2p!Z4KPN$Pn#~bnu$|E6)k=!4W)AM|44UCgmPTaxvck!?b zc@_%sfrqGCAx=&tUCOnLxyio0K4lsRX3E^V<6)#M+td!C2^ZRT>aB-ad^;G2X42;X z{K8xPT7?gwND03*D-P>N*fouZbFuG3PYb({fQeI;8UNZ3wr~oavG}eo(6a{cwU+Zg zwLk_0edw;b7k&a9j#A+}GvgJh-=66k$6n)q6%_HBlybOaqZ3Wvps$yvreG02XEU~{&q55 zuS%GoUidek-eo72<3zq{-w;vK&G=R|93C#NF1R6%-#d!pYp z6d+hhb$EwM=~UcwdPofrY4!C>6?x_}t2)$ZjRygsD5*;J*j<_QYHY!xX)i6l>P@5Q zdjuf3a)m`@TYD>VF{rN~7o+_QOpF@62hejDYw6&pErVehmF9zOb6{>pEVZ9W{lM6z z6VxSt0E`XBnFy6E00-T5fk`6-%y7Dw zSo&Z`e%-9p9estR?vsS)Paa^$hKDOS?IW{?wj9#gmz3n&xOXv>3>=eRO)T zfNkt$?0Yar-R?w$C>MMatn4N{1Kkr4uV!Fed0ZXuKBcHjCbb>7bj8i5eD(;zYT58E zLk+Ce@ZC!p64~G4BIWz*!V%SM3f|vRkdJYu^U^o2i@``~fyGWl*-zrzPO4@0Tvz7Pa8$*1|*{lIkc7Aa3VYC&5~nIA9MrBc;~ z!D(EwER;NInKkjj@f&4yPO~8;2OohvYId_lpfzC+Q$7_T8k=5PDiN{c*fG=FaXOkk z1mqpdHy7vn4dv%|yCi-hmjeeb2O;L8Ec;TIz*31`CHxyTnZCc%c>3G$Sj;H04({e$ z#XXpj?1k5+VgbfGCV1Fov12Vhw9(2O{}+n2L$(9E?S$%of5C|DUq0Sk8iF(u 
zw}?#@rOZueB;}i>d#Q9#^Z!A-!ZuGEoRrL@fUHa-CL?yiG{l6x?E&eJ6_NAbBeNwB zYg-iYW28|iUZ~1aHUTvFv9rfXjLeU7p7Mo`Fh=$UA`?y%lPF=)bj5TVH$O=!7Hu~H zF4OFstK@wA|Cy`YeJiMO!N4g~(~I`GFEj^(hz&ucamXNzgE6xqHALwd;d3Z-le`b- zdsPz`R1l?GNJ}7gD7c|Rn~vvr%n^S)A}!1ND-jQm&+b^@$gi8T^1uo?CeEiLMFN&ocF3TxuhInu3SwsWEI%H%!{A}3m@TYd+;*R>O{ z5|1p6ea}Az$r}MkX|(+;QtWwc3SNJtIeQgg=s&5SX?4VhcsU9Av$BP0ddZckuJ8)5@7P z0Gewbc*_ zLXx$H-{s34;sLnQy!ja#A9kT)jySS5+Ik~hA)lD?PiEy+GyIB{?X%PMGr?%SH(*Qs zQ$<$`c@(FlCG(xpG*>j0)S+e$hbw@TttYk_z}HMtk6NViw}mqJ=8rP~5jg!)#b9U( zatVIrXU7bccgE1sAoaI&F3D(~;(X2=#KEbd(q(ye!@h;O%7W5td$oEBE0RIf#Y_}v zW3KL4HWITRF30wU?j;?;O?=T*-+gxL=lRm{}$9F8W8MCjWY?oM>W06rFj z?O3+?5Q8)yqwV9Z0SCk4Z;X5tIK4}IllK`Ke1WEJ1!E~mp4ux4N>Su{A>3;^#$jaR zLo1yEt3@8JzM&mk713NkxXe_e&**br6)(PaT=;Y!=UL96a?`O^Ljz}3t-nq6pxIx~P z6K)P6v%fk{4PKf0hNsOxfUFZqSg~YU-}#q%GlC1-YX8Rbz&~ zATGwPc-N~Trr8@7ftw7haP2B_5G1bpr0c;ITdAa)Ba54+{iCGI{yfk zxQ|5316CbOyXbWOSn`Z}KK+JeMI;pv0P}C{C`3YebVuoaQvUlb#c(})B3`_8aKo84 z>Priwus;U0ee{#Q(=ifdVinu{eL4W@*GbhOO@iap-@NPO86pYT$jYt-k1KibCze;t z=%(JAx6H1b0TlKjpSTKGCHdjrw3`Ay=K=aE{Mb(nbMmyB8B+20F*y}HeCT=2p4FiY zfD9N(?186=HsNJNEbh59p<5BY;K?}85PQW7;ma}NYdE~|5O898@l`|+qHG$t68m0Q z!eSQpsJ!O}KC(klLen= z4lRoMfe^ehN}gC!CmW$$Hr|{7Nq5n8@{R9wezxW@q5?O{ITtuk;RqVxuGB81<4Zo> z9bt%Jbd?I%Zq?N@U`z8oT&!y>1D-k*0%sH@87n9(6n>U`_{#BUap;yGBCCr=djS2# zP%o5-?$L*64i3(&nAULXx9YDe*O1;a-Te1WakPZV%`#KLKw2b{3IqK zcYSyJTQwkv8z55;rg5r~{dT7|O|I?9V;8}eXu%KCrj~PuF4N4(-L57 zMQt07DF~j1Urire2D%x^snC|+8WXBl91>TUi^b~ z=ADV3=)KC_h${;UBj$WOc1SKr$;ki+smbsZqIW@TRkyNF#paVO<<3q|go;5m#Ni%I$tU;T;3$R9 zG9;Y_t9Xu4Emcf9ol$j)L#Dhlke+YSae zy&Y|jMIi=hi=F3x9&K67R3VItj}+kw_I?8r?*PsAG6pXs%^O;f$FzW5EF~+>2 zDIH&HTxZb>YXEGt6HDn^E}Jl2=zlFlz2$^6Y=1jP=AN@GIZpVCx7Odr7mIsx!euGS zKeT;Ey;q*f_z^qT(%8(%7p*cuI z9yiJ?s^l~K7AxvxkdM5ZPZV~u&U;tUfj*0)uY4^r_D;t_&+Iu?OzTg<Ylb@gQWaQ+6JIs_1DB~GNv$}(LKZV5A?9+E$E5hp;I^vT? 
z_ofNSc_5vT&KHrwZyjON`Wb7qjMV1uAhdsb*)6)1D8_r1uC|XrhySfWT`!{JQ_#5$ z!J28Xm@1iTgp;xux8*_9b<%5*JJk1;5{jczB+ibvsAJivS@Dm=Fs7isY*TM7Us4xk zQhgD0*W z#uo)c{5$42Keeu)R_Igj(AuoT?QkyK)>V~Tss3tTL`n<0BU@GfZ({ePT@;@zFOVu7z-76l&M?dQ_|$2fxeoC@&|-UMec zCWKx0u3VZmB)Cq-byhkto0`$vJ~}Z6ckhUU){(6k*C#6@rszu{+S#;!yOiEj>o2VX zD;(`^@S4^RV=S6P&v~&1CKP>cJiEwX%N-=iYP?2T%ZCkE$tS9TiF3%P>Y-+`cBM#} z6T9aYA8q(J%Bs6sE!QcVW2S@g>Ici$?qlH#X=Vo3@4ovv4l!fzQKPpvwt@|UyIENk zEvv32gjxj>3g0G@+Bd7(PUAI~i+b55$My1lZ)vZj6u3K;3-(d@8DO-f^|x4m$Y*YIegYG8e>?GUnFfXJv6D5D7F9RlC!Vtq?y5a5q<8GyYPQ<$-$=p*Y|i1 zNi!jl2@Z$aic{>>07PIjbnB@nGJLRuysH~?1sQ1oPtx3@su`RPl~+h(nkgM>6@6FM zNxdXej3S4172n9QYSjC{rM4V6DP&__&THxD-l#caYF7LRn9}P*L-9YEf#^<}IGhSF zUljW{>`Z|eQCqDtoIw*gN!b78t zoN53SePXnbe;Up{V5biduf@dJNta8T61+C#&ZHfm&&=hd!NiV{|wp zss12>HAmaQm$s?P*PmE$1{2I^jS zv;+9Bje#%Ql-l_14QRrm)80yx2aG=#A2%tLVA;Sjop#8V&4slFBdo?~dcIW43hY*+ z3=`CP)8inR?jsJg*0%8OVDf+N1nGihb{&VO?Uyv7tcMiqX}69ug{>MV35++6W%8T@_!jy<7^4%rS>GOmC z$R-sPGC$C#SnKcniKd8cy`S!BvrSTufW?>8s4k3VmwY#2CZf&`HLU+JGbyS6ymWfu1BJAb~g%&RD=W=KhfytIEgr}C1`!B&|%5@HQiB`Ihq zUJ!rr3&I`NULQ`29C_=C5WWvE_O7b3n#^__weG=!*45jo&3p;Ww&?{A9g4lzkaLo*qh#XGe_0eKz~GVjHnf&+$h@cUTb38 z1$Q}+ePmb7scWydm?bqzewbx(F1zSfV4ONi%MH^s0$G3k#IenljrHi`+dI6xd_>`MEfVE%#Ta(V;H%qQm9vc#T3>@dzbYPPC^C;d=Qa=Xp z?i261wX2RPZr!K#ad|n5R+x9LbHQX-vFN3xmszixk?HuGrKnrZm0uJ$>z`kh2400r zU8khm+iEv^*%Hb8isD?z5%$Tj2|$sxa`nRwh2mZ|1)ZpQeZM5K?+X3_1UynplqC28 zH<#k?G+xryp-Y^KESx;6cCQw$un=px8uFhr;w(&>?+JQYL$=LqFe38X_n@IqSm#NiUTilbw`jYUeqjT>Fu6?8g1Dy!n3SEF>0oy)n|9_&0lSd3G^uKOhoCLmm>M} zLwaPlK**f9C!pejsIk8mf&&x-5y}Y)jQ9T19$>YSvcsK2P3UhZP`XVpLivJDEjM(L zPtakkxVayvem^-V@=K_%N9p!YU?qV`AYON?$JM3bTuxy^3E@?~{AwA>?LZBm_8- zN^1gY7Gal?X|wEw<>sOlBJhT#%*~iF=uvj(Gz1HarV$Gf!IhlF!#2hX@@U1R+^SN$ zqe7xwpHY-o?ATi@G_X<>#+n3gI8fC_dh#c?dlgHGTZp2kkYc;PYC*T_H47A^%s@$f zwGP3hwI*Th@(!vpu9Ns{*>Fu&ZGE77_8NvObKsP>bq+=lRneE*PK{7$_1g@-zd#ds zo%^TOdx&H`oFCB|S)J^R$C^`$4T|V7?3*R?A-wMD)wBd-5g z=rwJwb8z^I--=s+w2P18lR%e%X7Re+m5B{5gtnKl2I;@lmBJFe(`G9yBMNiD_z%B1 
zI3K3BuPA*^ec5}wGi#vY21+4Cp*I9~sH4P)0-wq4;I~BM~iJ#5C-b zlQA5=S>sa3|1}hX6FfwYC!*3Qw>cMbfAOhK3`$^rH-6RL;*;BwFlU#yryIAf7Wtbz zF2(Okxni#14}>$Uo6Z(epaW_T}V9r`5olr$}Ta{vPbRY3~wC)tQz!xpkyQ zMuq5h-#ZYHRlTHUx?K3J_bYzKLP*b^mMa~nNX+t~S?j@4fdgd}MUQ%z2FrwX_6Les zcgM&D@Dupwh07#ZnR)ZHudTy~Q%Fs73~;?g#NtofyeA>oF6$ zl5IVNIuK$3!Ncqk{bcOB)b*}!W332*73^L!^^Rpj02$)>vL0`oM^k5X6)q7uMD^BlO^+i za8AqJknvTV5kmj(dp8~IAYvPEOpxH6M{IK-T!QWs56geRXF2mU0T<`p=;Rp3vVGfc z1rnGOimJQ3y)SSFu)Jw`p1w;|js14z@#1+hMJGfwp=M_?svpzdPhICHqXiIbNelIT zVnJ=)Ux>8%M?aE=7Ng<#u`u=e>642r)PQHMg@$9ONQf+~sf9aGJ$_g=1@IeP_Yd?H zKcT}E*hX{^=rvKUTyBn0sFY_K&^lTbLFxPR3!IAkUEVm1u4R(~s@GOOJ!lLfdsf*& zpR!MCInb+>t+<$;3LH~WJgv>=F)NP;^Py+uph7HEq&{EsLX^8l5Mkq-A;i0F{}$Nf&+e z?G_cTvLGis*wvk}yRDCkzOF=>=KB&OH&Y$cG+FZWC$lF&CWJtWR46cI1xE=8$|WS= zWiy39oJf#NV#_q{*VYF=tXhL`&l4|Px~ZhRYrF#UBHO7fRC&~|>N<9?w}Vf?h(sFo zH8Q_^gMa`KEM`Vr&FX!@;p5xrB+mmzUprwBp$U5awc0^NW7~D}I=50U`jPr0LUv0N z3Y<%U8=Zu|NDYslU99(bvNFM!_Gn1CrT(r1`nh+hQy!Ud@o!Q(MEqh=iqYX|n29lY z#WThFr)vAoBe$?|vv+{27TRqb_XQE7LocGO1^lx9CkH)v7 z7R+Iul!~zHmp(hTf>uTC^fRX9!4STHii=6Oh%F~fol|N^s8mDek$CpfV#w{ z!Y?$Y?WIwb*xMGh?X5nkZsRf!eDJC+vqs6s4(F*2vkWDXR_uc~Lot`9wH>2eG1tda z%k<;pRYBUR`*wx^Dz-w*33+}`&O5|dXclC4_bXtJlrCBUU$yE9LkqOu=0$F7N#OJ* z^Ra*-opZ)#70n z^qjZqLRylt2L!kqQUsG)-$$0+$E2~eB`_m%G7UC35YNSOZAnHnjfnba77|wCf@S)) z6~|0Mz++?j*^=lJH;dX*O$peZwx7H6opXN1=f9_{kt`^D2WU_#ls*D`z01~p^Igum zlsJ=*Gv^)9_x5)QWHbFSu(4PpY9B5XR=wsEjoxlLAQ6R3)mBjW7oCWhT5&WVCv%s7 zW-ic_MrfW+86?X|dl>B$EGWk1#Ju~TmCEPI)Io87lKuxk3<65cktFr*Hna?`uejWv;jz+ySF1Iq3-bn`|E=!`B$b(Da7?vV4RK z?1ddGw`F1dzE@Atc0sb5W!a^+J(c_}8b!n|U2^UfjM&d{?UV}ArPhQ5%QawRQi?=! 
zNnT`?V|}Vt2QQPY6M5g^S5}9F+p9L_^ME-M=A)RTw5Dls55Y5|TO`kQ2`JeXMnyhz z*k3se5|Y9(^s>PGhx_uJ#+$&YA-8z`Swyc3NL_}K-rgmRLv}-^?$8e0vA`hL?=VX4 z7)P-_hibs%2@Y9H-{MoSRTg`2FSCaizi)R~BEzEW$?h$BS9gcd(Q2GfO^ME;JOcsm z`4|?1%0_GQ=q!F`ng2y|W+p(D{JHHQZ4k?8)jn+17HPoUbswut8W_f-phkh`T>&$4 zHIxq&q6j#}5y-3yfiPw}uCaG<_jGS(Bhmmu1UxHy9w<4Ys8GZ?c}I4=}|^!&kkv*~GcqHKnlr2LO|T4mP*l1wt$8 z|66w`0u8kk$VJZ2CjO&7`akO^f+0AHyYSa){e~iO@NcWzYqu zhE%D|d{qcW4}_uAA@MI~E1f!7dqQ6PGtv#y{=as>V!Dv@9@2b@^LFdV0d~NBB+*oz zE`5wNUlhG|x%5<3kJf@`kkpnJ(`{;@nEWFiALrfleh|JO1+5Sf>FfUZ6%F}DNDuyu zn2h0YiaPe-XDjB(<0-aTul9i(MsJd9-?ac2jPD~Y?Q5??5eIA=f()gXbynR0`e5`P z_7Ux&q1p-hDL^Yhz^~j`X*xSSx2}U2LXEMQ2LEP1+s>D3UtC56p&$oM36e z5%k759@d~xdTCh?(CfK;3Hjmch&~R;QcLb~jfWC@3Ej-k-oJkHf5jX0Vef} z!uQy@7=9gOA3&}}T5j{ct>TMvGq{s#|m&cug!*^n& z?7B(nBu&NBy)`skM@TKwVYeX?uv9!>$gz}Q3PJna(UsfZuEn!j+D7yX*ahvKyCBjJ z*hj~&F}S&2zcu26cSC36d*mmRu#_Vp*yNzO37&!wrbr7vkbUm2yyOn3DMR3p60s1z zQn}lgYd&7R2ljV_LkisAAF8p~aR{K;zZ%22^G<7v>X$1`DsV|4(@n2rV&!8wBp|#0 z5>eHot@qjm#LL1R;UP9LfKYyAIH%%TVTvEI7@PX1+bSNZDe9QTT~O@L+>Zw4|s z=_kiSRMn#SbhE!XR6|f&7&+D6(hY%(I^vn@H^bh}QLhxI-2qw=VqWJ)Q35iFp4|D^$c1EXL7dYTWFkDO*L=wdhD4{JunJ$yoVNTghBU z^G2?&2|*<&(9C2GXHdnb5U{}ZpnkJ+PH&uRhzEqwUJRii$@%2ga0L$F--B`b4OEH6HO3Xy=Ws85fENHK8Nhe#~8nYYD>mcJ@yUnxq7JDf9 z+5)SiOkB(z;+kZaw*{yZcGsNY>K6||G0E#L!k>*~AvOo#(Hk_?gI3Vp=Lk-q=dcmy9Dhh~#mbMoq@arjy&8D}!g4og8pY*3|T0QLm(eRD& z-|wfDtQO)0EucO)Q{_|x`OwKOOfkqJMtKj=hEek|%S!mvL2q|R_7=}uZ08q@4_8qA zggZWVR;yHIUmQPnZkpJ|x$KpplK2PGVEE@)%qe7(6&kM*wkgmT;D;|D-ex-S$5DBs z`TM*TN_BEW_CM;Xe8m;%S#+Y4=Mgte7KBOEnLGx`Xwj|g+!jHG>?laPP+CTTMJ~`_ zl)Px_Z*@CMC(TM>i#j#sKqX*~SOo2szhyII;7k);(WU(CR*s_LyDW59yvG=M2EBis ztDiu!4eF=eK!e6PineLoHoiLoJovN8^rC#92H14;;nae=kYaX2$4_-oPNNr(Z)&yL z@tZK#7FzB3ouY>6O;~KV`>Gl-u>f@&3qOqlMCx#&Ds)FdHKdUD`EKc*3wj1;NUuF5 zd;oeLJj7NPiQSKuPmg5Aq&IFPfhav3X}Kq-|GVYlmHofP{uArIdmr)i)^FWJuaf_# z(HtX^ADF|@!AD=*t4$L|Hr!+AcwBFZG?u~h^`AT$PNK3ae{cjT&t3Qt|EFx>%np9u z-J7cGycy)Y8smZ_&53FhPlOHD6@}4=pyf8MM<>JoSiljyfZiA4|4W~9wV;Ob@q@S? 
z#aa)sQ}9(;-WB##GyN3iB~}aBq$527M|AYA?W8+8DytFJ0yDr9seXI;^L^RlTp{Ck zU}q5l54yngbsFp!`*!h_ljae0`kjKXmVGrQLn|^@pwgNkW)}V z9rTXu2SnBus1n%y_>ZdFpM9YDzt(%s2oEVmHr5z6j0&gz&38{>6)4~+P3eiW~Mi;EN?n5@~s)^jO0Tc0Askycd2XdaNF~7YLq%C zbqJ`*T`|Zz%v!OtvBs}){KnOY+VsEEUu0@WcSQ8iroZ>Kc)EBsN6Mm9gU*w)0oCz> zwI%J>KUf5QMnh6xN%}+U8D#}VHhzCL{;an@A8iJ4@h;zQ3Ah-pdUp~08wGooJy+g7 zU*e|H$OIpESW?2tezlyTO`Op4B1a-W`?AwKr6=l z^roHr*6}f5A6Jp&y}yW)&ijk-knpT)gxQ_H+waNN5%r>HTeYoDe;_A)MQ!ZkrauCr zagSlB)29m%Z5b(ttJlZHNsZ!%q=sJa0i?KmCB3%i*WZ>q>|iE=`|8QPC?drDDGb)L z!M7R=Db=T}l||M)Nx`t2hhPwiW~$M%StvaRv`3q`OQJ&~O;n2t@gq@3WRAFiLxH-} zu{T9P>A)ex+Xr?Uiat63V?>&r;l6l1l_>N#utgGzs zjluajSfu}6c$RCt$-X|(_3B~kH6={9WN`PBFy-jwApiquDJv>^@Vl!-#Jp}yGrn5C z;HjInL?0q+4+SC5jIGHY^)Ok&6+Ig+syVjeNt8+8gOo0bgLkMhi%XY|VG?6*f2s)Q zL-l@pk@QO1uki+9#D{G32@Xik{6sO@@S2cD_>a($q$|yvDm%&D2i;n=r*m`nVobG$ z!QDa*N3W4ITf)d(_et3l^6Ipzdfxm90XJiTxssiEl=lsYTDT_%C%HJ=eLy63&C<_g z-i|l_H4E7&OjOlu->GvyqIoy*obQIVvp0;=BZZVNlS7mO43x?IK&qyTK{9H{{d-U_ zdZy99u!@&w-QGLmMHBSK0xW#RzZ&*EAA)-<~C2#PFrF#(^`B20(3K15#O8S_p`R#^mHfF{Ok)JLK0 zjG=PhRaiy)Z3 z=A_cBujL8sL653by+NuMof`_>b3=n^aI%D?OeY@qUdboSHY-q0hbPBfEvL%J>(X`) z2&QID@G1voVyonh;IFXbcL*{F z$H{!LzVWLq0rm*x11lfo_&z(&ug%@OA2?Wvx(+H?Kk&A@ZNdt8r`?RDWMINNmfexV z`K;Wd$CHCj#QWegYx>_Ca;!b}ivZP)9x-C^gi{(n7}6!}6iY{pPJM0miBpj{WrEQ- zJC4V>zW{s~&nq@X*2WVYtCNf(*Y5;q!fq302Yxn}-q$q&cU5^o&*j#mdo7%qR;Bj0 zk|dw6&@Nfj_>-p^2pQUuDx`+`U+f9>49BS4l+w17ET&X@%GJl&p!kp^i`)xT_#1OQ zF~9GL#VFppf5Kii{T*x$^10)tS@92J`~HMHJp<_>c7K%l$_#>y_uXP#J~J-4!oB4z z^zg4eMt}GHa9hpFb@G>OI!)<;Mafg2l6?6zX5eTn@4PQEb_N8(a3tQ{)bDZ2F~5Jr z5sftmjr;4eRIpdMeLrdmG$9`$4usFz!E&3*~D>S%smQOfkh*(bBuD&0-`2g_53$@EbtHxM@X z`?%=E$n@cqKCn%FdB*M#GQcT{9xTUrDm^|_mU@x0H_{`nHc9sBWPhy8qEzu2yZD*p z4(X$Nn0L+wJmWXm+Usw7-u(AI<(Q_?i#4m;#Q&SJcLnW=nP>c8x7T=RKcDQwjbOc- zBC_+UZKzl7c?->fEBzrTqVzfP(s4O9i^eh{dL@V919vlP-bUL6-%8TzgIooZM>HP= z3Unx*u-2c&5Su*yCST*o7dx1yE6rZyPvYr~&7cF(9UL<#f{dVgPReow1At6w5L=i!wK zm*Ka3rVIMHh**ivU8nKqnQp|}SwyA`r!FiMK0$KuT??KUJ*bkaA3X*Qy-MV!&DNVi 
z_%ICYYN);73k9rt?mUKv}**tNt zTm!p>$Gf@AXE8 zqJ6l&eZ2aj0%Anv{eq zK$s99md6-BKz0yzftaW1xZET%u$vcTqopvzEn)L~^q{mkaDqe5a9@OORKh6~I?<5= zay!M|Fb{?x+G*r`Pp*4)I$KKzZyI2SG4gC#2ciZ*`q~!@^`2XgQX*UjKLM=Cf#^gH zR{%jD+>Cz${lvD(V8*hxCrP8D%Yx)ns+dWo%J-lRLE^G;H+Vxu1Dw1q+NGpF6TZ|> zWBa+H>H5T|OL_=c|y~owP)I5f7OF zF|oXZ z zE&7$Qzk!mvBY9I`SAtbtv`Wgci_E|cpl~*i$`B57#vWJ0*fO__KNybP^qT254mY&B z0+(Z@Pm@()3_4;&=OASvn2UYlcu4-Vdwx%b`lQoH#8nDBW(Le(Q#ub;!`nvG4pGq= zx}TR~=brw-dE}1CagxDlu1!fuFwj14TdM!wqz-N1^toq%B1W6h8aPs|d~tt)CQgU5 z0($o!AyfDg8~!%q^Bdw}k%NdqdGF)xA{(_N{z&GHkMpSxg4Qsm>$c%Yh*ZQfqYH2x z+#gAw_`<>D>iqr}SP8Su!G)zg@Fi*;gj+g<@v4XQ9-A|dQkFptOyu#HLiWv9%G3aE z|CVvV7zkSog0+GxjCVx%9q7Lq(Y1wcseL}^NGC-e3e2R>6Y?-grLA4JHkUbjS2KfG z61v*~@maesDIF?s@!OCC%Z?f>I`BqaYC}<%<;upbBhNtKil)Yp3Q8?RTdRYC__lT1 zG9dbIzQc5M-(a%%6xE1ScufZMU{W)KAEuSI@iu%^auj6uQ%Y&eyh9cR0%R(gUdI(1 zo2cC4)c7p*iZscA3zy$c{=@J42SMQYj&nM-T-}E{hV|;WJH_DJslkdk!QL(oIt~kZ z{l$S)A(fYN7Q_RUfg6F8>#*wOxHc*z42xpnSH(#(GTxq~36NH)NaU6ILq(5B<>yh2 zZTyvhzqi0fQIZ`t{N*0}M;NERoYuGax4JxLULuYdG3BPF7 zkUBslzCtqbEq+egF~o;^cZ9>7gyzSD{6$ON)y~5sjUW_>9EyPXHlxANa2(HY(je=Jec6(g^$T2r$wjl4&kIW9b`LKFun-Ag(IVs_SzH zbX}GuC`${@P5T0|H_l(0$xy`|9fpdAFyBWLo8vv+_2rMUvrsF%j4`ZW7Mo}qkAoJ> zJ8zwpb`UVw8HK=$(cS5`W;dy2ZLsL9827EtIPVo!c?FH`cUW`N?|mAQ_|La|?ZpLw z;m7Q&2U`n83wd|$B7Q}VdZ1~;?Q_!ps=i0*@v?YQ_O4I zmK>8yvfTSKKdXWmgAx7ZHjL3ERAG+aN_d$FcGgYTFuf?_?bJp_mm<=qK^xXo6szQE zi!!z7BhL|aL16jRgOTANo4+0-xVP9IoaizzcU{GMWl^|DP z?bn0jJ+U>Dh1N$!NWXfXI@93eW&&DA1pd~1{dSR&4&s1q6 zcq?)8M9NlVOgH0m1=K!qsIqHUm9~+QJnzyTw%VtT@Rw)K`;D6vm9t6uaC_SjQ-=TP z?uuZnHz{*>(a=kr^!DPjde86*v>Qv$@gx{CV_jX`L83kTP-m3HV_qX2 z=%fUUd})IYTvy7t+6y%?E9S%PyzTGZmgM5FZc^T|>f)bMN?%F1z77iEBJ0-_g&S;N zvt!hVyso-2ijrm}e-88?``r3Wm6Rf!mQ4LqS2ALM#N)2wQNh&hHfs+frCzAM5EAuj z)9rm(dsO&TCvpH~jhTN0TbAuD*LVeboJy&rs3=qVydm$Uwu~bJ-Ilx~r|%?`vn!Xhp0s z8$^5k8e2EN%}pb{)I?sr&X}80tNiiQOhFvcqcJ>YhiVz3g_?H~d7nw|U3aRe^ksQh z;k)tO^iXq6>eH&5-^%SPaYbhp>bO`ijd@!`%JPsZ|s;>!&Oa=Hxm_ 
zsy~sUiL;Zuyzd-aZQX9&Ro!p{e^#uW&*nN^MWT7J@UBOAqBoJsr=%#oPjX58otFMxem^wkO$w_3Zcj3KhNgSr0$PQ# zf~=1AJ0hMl$fZW8IRVCLbk#L7XX*j3ZyrkD71CEoSru(x_Pv?7TV~+sC4N5iKf*-6 z(JY2O$|1ow`Ck^&mH}M|pSOQ_;vjQZXtfGF>5S7&U7gc^jA{M*Ly!@F@9ck#_?MR$ zA8v)La{nV7A3er&vootVX#MH+a}s;ykaW?vM+I$_Ceg|(>RZ)FpZzN z|3Ck+Cfe>1gJ0!XzW0A#0ZxqAXLzNiXVqDA*h?5t+0K(zxiNK~ogDYPFA?Hkh(6zc zVq|()GSMHb)|s}h`#pGY&~kVI)+S^RGQ{?G`nJ1PkZrJ&QB@+q!GHvGA^Tzv%it=U z|D7nyj>i3<&ku6l(2oA+A7S`Aoh^NlBv_f!P^xg-p}t~K;%e%1`TT8HN}!)%<36~o VbJ;o8^#c4+zNN14PR=ag{{qS|ao+#{ literal 0 HcmV?d00001 From 7caec10e7b978853f8f87fe1d0cf77aa85066cdb Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Sat, 16 Aug 2025 13:16:34 +0800 Subject: [PATCH 029/361] [XPU]avoid circular import during XPU init (#23017) Signed-off-by: Kunshang Ji --- vllm/platforms/xpu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 66ebc8ad9d22f..af24437f649f4 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -7,7 +7,6 @@ from typing import TYPE_CHECKING, Optional import torch import vllm.envs as envs -from vllm.config import CUDAGraphMode from vllm.logger import init_logger from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS @@ -105,6 +104,8 @@ class XPUPlatform(Platform): and not cls.device_support_bf16(): model_config.dtype = torch.float16 + # lazy import to avoid circular import + from vllm.config import CUDAGraphMode compilation_config = vllm_config.compilation_config if compilation_config.cudagraph_mode is None or \ compilation_config.cudagraph_mode.max_cudagraph_mode() \ From 5157827cfc0fd06d361897b2cc912ee1b5bc6277 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Sat, 16 Aug 2025 01:36:27 -0400 Subject: [PATCH 030/361] [Build] Env var to disable sccache (#22968) Signed-off-by: Lucas Wilkinson --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 919300e143c1e..cc3037ebb72cb 100644 --- a/setup.py +++ b/setup.py @@ 
-60,7 +60,8 @@ MAIN_CUDA_VERSION = "12.8" def is_sccache_available() -> bool: - return which("sccache") is not None + return which("sccache") is not None and \ + not bool(int(os.getenv("VLLM_DISABLE_SCCACHE", "0"))) def is_ccache_available() -> bool: From 78863f8c5c67367f32533dd0230faae51ec51145 Mon Sep 17 00:00:00 2001 From: Andrew Sansom Date: Sat, 16 Aug 2025 01:25:10 -0500 Subject: [PATCH 031/361] [BugFix] Add support for loading prompt embeds tensors serialized on unavailable devices and sparse tensors (#22962) Signed-off-by: Andrew Sansom --- .../openai/test_prompt_validation.py | 49 +++++++++++++++++++ vllm/entrypoints/openai/serving_engine.py | 6 ++- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index e31a1d077608f..4197583074dfe 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -1,10 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import io + # imports for guided decoding tests import openai +import pybase64 import pytest import regex as re +import torch + +from vllm.entrypoints.openai.serving_engine import OpenAIServing from ...utils import RemoteOpenAIServer @@ -42,3 +48,46 @@ async def test_out_of_vocab_token_ids(): prompt=[999999], max_tokens=5, temperature=0.0) + + +@pytest.mark.parametrize("dtype", + [torch.float32, torch.bfloat16, torch.float16]) +@pytest.mark.parametrize( + "layout", + [torch.strided, torch.sparse_coo, torch.sparse_csc, torch.sparse_csr]) +@pytest.mark.parametrize("seq_len", [2, 10]) +@pytest.mark.parametrize("hidden_size", [2, 10]) +def test_load_prompt_embeds(dtype: torch.dtype, layout: torch.layout, + seq_len: int, hidden_size: int): + # construct arbitrary tensors of various dtypes, layouts, and sizes. 
+ # We need to check against different layouts to make sure that if a user + # uses sparse tensors to reduce the transmission size of prompt embeddings, + # we must cast them to dense/strided before passing them into the engine. + # We don't use non-CPU tensors in this test to avoid preemptively + # initializing cuda and break other tests in the suite that fork processes. + # We also need to make sure that we only use devices that are actually + # available in the environment the test is running on. For simplicity, + # we just test against CPU. + tensor = torch.randn((seq_len, hidden_size), dtype=dtype) + if layout == torch.strided: + tensor = tensor.contiguous() + elif layout == torch.sparse_coo: + tensor = tensor.to_sparse_coo() + elif layout == torch.sparse_csc: + tensor = tensor.to_sparse_csc() + elif layout == torch.sparse_csr: + tensor = tensor.to_sparse_csr() + + buffer = io.BytesIO() + torch.save(tensor, buffer) + buffer.seek(0) + encoded_tensor = pybase64.b64encode(buffer.getvalue()) + + loaded_prompt_embeds = OpenAIServing._load_prompt_embeds(encoded_tensor) + assert len(loaded_prompt_embeds) == 1 + loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"] + assert loaded_tensor.device.type == "cpu" + assert loaded_tensor.layout == torch.strided + torch.testing.assert_close(loaded_tensor, + tensor.to("cpu").to_dense(), + equal_nan=True) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index d6f92a63301e8..0f4a7c0186b65 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1006,8 +1006,8 @@ class OpenAIServing: # OPTIMIZATION priority = orig_priority - 1 + @staticmethod def _load_prompt_embeds( - self, prompt_embeds: Optional[Union[bytes, list[bytes]]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None ) -> list[EmbedsPrompt]: @@ -1015,12 +1015,14 @@ class OpenAIServing: def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt: tensor 
= torch.load(io.BytesIO( pybase64.b64decode(embed, validate=True)), - weights_only=True) + weights_only=True, + map_location=torch.device("cpu")) assert isinstance(tensor, torch.Tensor) and tensor.dtype in ( torch.float32, torch.bfloat16, torch.float16, ) + tensor = tensor.to_dense() if tensor.dim() > 2: tensor = tensor.squeeze(0) assert tensor.dim() == 2 From 6d3da472bc8f202229a8e178671f4fe72037cfb1 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 16 Aug 2025 15:26:10 +0800 Subject: [PATCH 032/361] [Misc] Add --save-dir option to benchmark_moe (#23020) Signed-off-by: Jee Jee Li --- benchmarks/kernels/benchmark_moe.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 13bf1be836f6a..b4a03665ef10f 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -3,6 +3,7 @@ import argparse import json +import os import time from contextlib import nullcontext from datetime import datetime @@ -542,6 +543,7 @@ def save_configs( use_fp8_w8a8: bool, use_int8_w8a16: bool, block_quant_shape: list[int], + save_dir: str, ) -> None: dtype_str = get_config_dtype_str( dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 @@ -552,7 +554,8 @@ def save_configs( filename = get_config_file_name( num_experts, shard_intermediate_size // 2, dtype_str, block_quant_shape ) - + os.makedirs(save_dir, exist_ok=True) + filename = os.path.join(save_dir, filename) print(f"Writing best config to {filename}...") with open(filename, "w") as f: json.dump(configs, f, indent=4) @@ -707,6 +710,7 @@ def main(args: argparse.Namespace): use_fp8_w8a8, use_int8_w8a16, block_quant_shape, + args.save_dir, ) end = time.time() print(f"Tuning took {end - start:.2f} seconds") @@ -748,6 +752,9 @@ if __name__ == "__main__": "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" ) parser.add_argument("--use-deep-gemm", action="store_true") + 
parser.add_argument( + "--save-dir", type=str, default="./", help="Directory to save tuned results" + ) parser.add_argument("--seed", type=int, default=0) parser.add_argument("--batch-size", type=int, nargs="+", required=False) parser.add_argument("--tune", action="store_true") From cc826a202b7b66af222374129573763237db3c1c Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 16 Aug 2025 15:44:50 +0800 Subject: [PATCH 033/361] [Multimodal] Update Tensor schema test to cover arbitrary shape mm inputs (#22867) Signed-off-by: Isotr0py --- tests/models/multimodal/test_tensor_schema.py | 143 +++++++++++++++--- vllm/model_executor/models/keye.py | 22 ++- 2 files changed, 138 insertions(+), 27 deletions(-) diff --git a/tests/models/multimodal/test_tensor_schema.py b/tests/models/multimodal/test_tensor_schema.py index 92390d8c2f7ee..036624431c20b 100644 --- a/tests/models/multimodal/test_tensor_schema.py +++ b/tests/models/multimodal/test_tensor_schema.py @@ -1,17 +1,26 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable from functools import partial +from typing import Any, Union from unittest.mock import patch +import numpy as np import pytest +from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk, + UserMessage) +from mistral_common.protocol.instruct.request import ChatCompletionRequest +from PIL import Image from vllm.config import ModelConfig from vllm.engine.llm_engine import LLMEngine as V0LLMEngine from vllm.inputs import InputProcessingContext -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, + MultiModalKwargs) from vllm.multimodal.processing import BaseMultiModalProcessor +from vllm.multimodal.utils import group_mm_kwargs_by_modality from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config -from vllm.utils import GiB_bytes, 
set_default_torch_num_threads +from vllm.utils import GiB_bytes, is_list_of, set_default_torch_num_threads from vllm.v1.core.kv_cache_utils import get_kv_cache_config from vllm.v1.engine.core import EngineCore as V1EngineCore @@ -23,12 +32,64 @@ ARCH_TO_SKIP = { "MolmoForCausalLM": "incompatible requirements", "MiniMaxVL01ForConditionalGeneration": "broken model", } +ARCH_NEEDS_EXTRAS = [ + "InternVLChatModel", + "Idefics3ForConditionalGeneration", + "LlavaForConditionalGeneration", + "MiniCPMV", + "PaliGemmaForConditionalGeneration", +] +REPO_ID_TO_SKIP = {"nm-testing/pixtral-12b-FP8-dynamic": "duplicated test"} + +ImageInput = list[Image.Image] +VideoInput = Union[list[Image.Image], list[np.ndarray], + list[tuple[np.ndarray, dict[str, Any]]]] +AudioInput = list[tuple[np.ndarray, int]] + + +def _resize_data(_data: Union[Image.Image, np.ndarray], + size_factor: float) -> Union[Image.Image, np.ndarray]: + assert size_factor <= 1, "Size factor must be less than 1" + # Image input + if isinstance(_data, Image.Image): + W, H = _data.width, _data.height + W, H = map(lambda x: int(x * size_factor), (W, H)) + return _data.resize((W, H)) + # Video input with PIL Images + elif is_list_of(_data, Image.Image): + W, H = next(iter(_data)).width, next(iter(_data)).height + T = len(_data) + T, W, H = map(lambda x: max(int(x * size_factor), 1), (T, W, H)) + return [d.resize((W, H)) for d in _data[:T]] + # Video input with numpy arrays + elif isinstance(_data, np.ndarray) and _data.ndim >= 4: + T, H, W, C = _data.shape[-4:] + T, H, W = map(lambda x: max(int(x * size_factor), 1), (T, H, W)) + return _data[..., :T, :H, :W, :C] + # Audio input + elif isinstance(_data, np.ndarray) and _data.ndim == 1: + return _data[:int(len(_data) * size_factor)] + raise AssertionError("This line should be unreachable.") + + +def resize_mm_data( + data: Union[ImageInput, VideoInput, AudioInput], + size_factors: tuple[float, + ...]) -> Union[ImageInput, VideoInput, AudioInput]: + size_factors = 
size_factors[:len(data)] + if is_list_of(data, (Image.Image, np.ndarray, list)): + return [_resize_data(d, s) for d, s in zip(data, size_factors)] + elif is_list_of(data, tuple): + return [(_resize_data(d, s), meta) + for (d, meta), s in zip(data, size_factors)] + raise ValueError("Unsupported multimodal data type.") def create_batched_mm_kwargs( model_config: ModelConfig, processor: BaseMultiModalProcessor, -) -> MultiModalKwargs: + size_factors: tuple[float, ...] = (1.0, 0.5, 0.25), +) -> Iterable[tuple[str, int, BatchedTensorInputs]]: processing_info = processor.info dummy_inputs = processor.dummy_inputs supported_mm_limits = processing_info.get_supported_mm_limits() @@ -40,30 +101,69 @@ def create_batched_mm_kwargs( seq_len=model_config.max_model_len, mm_counts=mm_counts, ) + mm_data = processor_inputs.mm_data + resized_mm_data = { + modality: resize_mm_data(data, size_factors) + for modality, data in mm_data.items() + } + # Mistral chat outputs tokens directly, rather than text prompts + if model_config.tokenizer_mode == "mistral": + images = resized_mm_data.get("image", []) + request = ChatCompletionRequest(messages=[ + UserMessage(content=[ + TextChunk(text=""), + *(ImageChunk(image=image) for image in images), + ]), + ]) + tokenizer = processing_info.get_tokenizer() + res = tokenizer.mistral.encode_chat_completion(request) + prompt = res.tokens + else: + prompt = processor_inputs.prompt mm_kwargs = processor.apply( - prompt=processor_inputs.prompt, - mm_data=processor_inputs.mm_data, + prompt=prompt, + mm_data=resized_mm_data, hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, tokenization_kwargs=processor_inputs.tokenization_kwargs, )["mm_kwargs"] - mm_kwargs = MultiModalKwargs.batch([mm_kwargs]) - return mm_kwargs + items = [ + item for modality in supported_mm_limits + for item in mm_kwargs.get_items(modality) + ] + return group_mm_kwargs_by_modality(items) + + +def get_model_id_to_test( + model_arch_list: Iterable[str]) -> list[tuple[str, 
str]]: + filtered_results = [] + for model_arch in model_arch_list: + model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) + if model_info.extras and model_arch in ARCH_NEEDS_EXTRAS: + available_repos = list( + map(lambda model_id: (model_arch, model_id), + [model_info.default, *model_info.extras.values()])) + filtered_results.extend(available_repos) + else: + filtered_results.append((model_arch, model_info.default)) + return filtered_results @pytest.mark.core_model -@pytest.mark.parametrize("model_arch", list(_MULTIMODAL_EXAMPLE_MODELS.keys())) -def test_model_tensor_schema(model_arch: str, vllm_runner: type[VllmRunner], - monkeypatch): +@pytest.mark.parametrize( + "model_arch, model_id", + get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys())) +def test_model_tensor_schema(model_arch: str, model_id: str, + vllm_runner: type[VllmRunner], monkeypatch): if model_arch in ARCH_TO_SKIP: pytest.skip(f"Skipping {model_arch} due to {ARCH_TO_SKIP[model_arch]}") + if model_id in REPO_ID_TO_SKIP: + pytest.skip(f"Skipping {model_id} due to {REPO_ID_TO_SKIP[model_id]}") model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip", check_max_version=False) - model_id = model_info.default - hf_overrides_fn = partial(dummy_hf_overrides, model_arch=model_arch, exist_overrides=model_info.hf_overrides) @@ -119,6 +219,7 @@ def test_model_tensor_schema(model_arch: str, vllm_runner: type[VllmRunner], if model_info.v0_only: m.setenv("VLLM_USE_V1", "0") + # TODO(Isotr0py): Can we avoid initializing engine? 
with ( set_default_torch_num_threads(1), vllm_runner( @@ -145,12 +246,16 @@ def test_model_tensor_schema(model_arch: str, vllm_runner: type[VllmRunner], mm_registry = llm_engine.input_preprocessor.mm_registry processor = mm_registry.create_processor(model_config) - mm_kwargs = create_batched_mm_kwargs(model_config, processor) - def validate_model_input(model): - for modality in ("audio", "image", "video"): - method_name = f"_parse_and_validate_{modality}_input" - if hasattr(model, method_name): - getattr(model, method_name)(**mm_kwargs) + def validate_model_input(model, modality: str, + mm_kwargs: MultiModalKwargs): + method_name = f"_parse_and_validate_{modality}_input" + if hasattr(model, method_name): + getattr(model, method_name)(**mm_kwargs) - vllm_model.apply_model(validate_model_input) \ No newline at end of file + for modality, _, mm_kwargs in create_batched_mm_kwargs( + model_config, processor): + valid_func = partial(validate_model_input, + modality=modality, + mm_kwargs=mm_kwargs) + vllm_model.apply_model(valid_func) diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 40c66c2268507..db9ed5910d78b 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -30,7 +30,7 @@ from vllm.model_executor.layers.quantization.gptq_marlin import ( from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors from vllm.multimodal.inputs import (ImageItem, ModalityData, MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargs, VideoItem) @@ -44,6 +44,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope +from vllm.utils 
import is_list_of from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, @@ -112,8 +113,9 @@ class KeyeImagePixelInputs(TensorSchema): - g: Grid dimensions (3 for t, h, w) """ type: Literal["pixel_values"] - pixel_values: Annotated[torch.Tensor, - TensorShape("b", "np", 3, "ps", "ps")] + pixel_values: Annotated[ + torch.Tensor, + TensorShape("b", "np", 3, "ps", "ps", dynamic_dims={"np"})] image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)] @@ -145,8 +147,9 @@ class KeyeVideoPixelInputs(TensorSchema): - g: Grid dimensions (3 for t, h, w) """ type: Literal["pixel_values_videos"] - pixel_values_videos: Annotated[torch.Tensor, - TensorShape("b", "np", 3, "ps", "ps")] + pixel_values_videos: Annotated[ + torch.Tensor, + TensorShape("b", "np", 3, "ps", "ps", dynamic_dims={"np"})] video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", 3)] @@ -1295,7 +1298,7 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, return None return quant_config - def _validate_and_reshape_mm_tensor(self, mm_input: object, + def _validate_and_reshape_mm_tensor(self, mm_input: NestedTensors, name: str) -> torch.Tensor: if not isinstance(mm_input, (torch.Tensor, list)): raise ValueError(f"Incorrect type of {name}. 
" @@ -1310,8 +1313,11 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, f"Got ndim: {mm_input.ndim} " f"(shape={mm_input.shape})") return torch.concat(list(mm_input)) - else: - return torch.concat(mm_input) + elif is_list_of(mm_input, torch.Tensor): + if all(p.dim() == 4 for p in mm_input) or all(p.dim() == 2 + for p in mm_input): + return mm_input + return torch.concat(list(mm_input)) def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[KeyeImageInputs]: From 933f45334a79dcb69aa93178b3bbf3d9e0d46f09 Mon Sep 17 00:00:00 2001 From: Chengji Yao Date: Sat, 16 Aug 2025 00:46:00 -0700 Subject: [PATCH 034/361] [Core] Make cudagraph check cuda platform only (#23005) Signed-off-by: Chengji Yao Signed-off-by: Chengji Yao Co-authored-by: Chengji Yao Co-authored-by: Li, Jiang --- vllm/config/__init__.py | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 280ae60c91ff4..72fec5e205e34 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3535,15 +3535,6 @@ class VllmConfig: # in V0 means the compilation level wins out. self.compilation_config.level = CompilationLevel.NO_COMPILATION - # if cudagraph_mode is not explicitly set by users, set default value - if self.compilation_config.cudagraph_mode is None: - if envs.VLLM_USE_V1 and self.compilation_config.level \ - == CompilationLevel.PIECEWISE: - self.compilation_config.cudagraph_mode = \ - CUDAGraphMode.PIECEWISE - else: - self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE - # async tp is built on top of sequence parallelism # and requires it to be enabled. 
if self.compilation_config.pass_config.enable_async_tp: @@ -3552,14 +3543,28 @@ class VllmConfig: if self.compilation_config.pass_config.enable_sequence_parallelism: self.compilation_config.custom_ops.append("+rms_norm") - # disable cudagraph when enforce eager execution - if self.model_config is not None and self.model_config.enforce_eager: - logger.info("Cudagraph is disabled under eager mode") - self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE - elif envs.VLLM_USE_V1: - self.compilation_config.cudagraph_num_of_warmups = 1 + if current_platform.is_cuda_alike(): + # if cudagraph_mode is not explicitly set by users, set default + # value + if self.compilation_config.cudagraph_mode is None: + if envs.VLLM_USE_V1 and self.compilation_config.level \ + == CompilationLevel.PIECEWISE: + self.compilation_config.cudagraph_mode = \ + CUDAGraphMode.PIECEWISE + else: + self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE - self._set_cudagraph_sizes() + # disable cudagraph when enforce eager execution + if self.model_config is not None and \ + self.model_config.enforce_eager: + logger.info("Cudagraph is disabled under eager mode") + self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE + elif envs.VLLM_USE_V1: + self.compilation_config.cudagraph_num_of_warmups = 1 + + self._set_cudagraph_sizes() + else: + self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE if self.cache_config.cpu_offload_gb > 0 and \ self.compilation_config.level != CompilationLevel.NO_COMPILATION \ @@ -3618,7 +3623,7 @@ class VllmConfig: current_platform.check_and_update_config(self) # final check of cudagraph mode after platform-specific update - if envs.VLLM_USE_V1: + if envs.VLLM_USE_V1 and current_platform.is_cuda_alike(): if self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL \ and self.model_config is not None and \ not self.model_config.disable_cascade_attn: From 2dbccce8a67e8004b365e7e533107c54c9542ce7 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 16 Aug 
2025 17:44:19 +0800 Subject: [PATCH 035/361] [CI][Bugfix] Skip Ovis2 generation test because of broken remote code (#22954) Signed-off-by: Isotr0py --- tests/models/registry.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 3efc9a99ea415..10e29e01e8a18 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -196,7 +196,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { {"alias": "gpt2"}), "GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder", extras={"tiny": "bigcode/tiny_starcoder_py"}, # noqa: E501 - min_transformers_version="4.55.1"), + min_transformers_version="4.55.1", + transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501 "GPTJForCausalLM": _HfExamplesInfo("Milos/slovak-gpt-j-405M", {"6b": "EleutherAI/gpt-j-6b"}), "GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-70m", @@ -408,14 +409,16 @@ _MULTIMODAL_EXAMPLE_MODELS = { extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501 max_transformers_version="4.48", # noqa: E501 transformers_version_reason="HF model is not compatible."), # noqa: E501 + "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501 + {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}, # noqa: E501 + min_transformers_version="4.55.1", + transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501 "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", extras={"2B": "OpenGVLab/InternVL2-2B", "3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501 trust_remote_code=True), "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1", trust_remote_code=True), - "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501 - {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501 "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501 trust_remote_code=True), 
"KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501 @@ -455,6 +458,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Llama_Nemotron_Nano_VL" : _HfExamplesInfo("nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1", # noqa: E501 trust_remote_code=True), "Ovis": _HfExamplesInfo("AIDC-AI/Ovis2-1B", trust_remote_code=True, + max_transformers_version="4.53", + transformers_version_reason="HF model is not compatible", # noqa: E501 extras={"1.6-llama": "AIDC-AI/Ovis1.6-Llama3.2-3B", "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}), # noqa: E501 "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224", # noqa: E501 @@ -482,7 +487,9 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"), # noqa: E501 "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B", trust_remote_code=True), - "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"), # noqa: E501 + "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct", # noqa: E501 + min_transformers_version="4.55.1", + transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501 "Step3VLForConditionalGeneration": _HfExamplesInfo("stepfun-ai/step3", trust_remote_code=True, is_available_online=False), From de9cb617637deabab4e34db05d26c8d4d6b2ed98 Mon Sep 17 00:00:00 2001 From: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com> Date: Sat, 16 Aug 2025 03:21:20 -0700 Subject: [PATCH 036/361] Add docs for PrefixRepetitionDataset + enable usage with `vllm bench throughput` (#23012) Signed-off-by: Seiji Eicher Co-authored-by: Roger Wang --- benchmarks/README.md | 22 +++++++++++++- vllm/benchmarks/throughput.py | 57 ++++++++++++++++++++++++++++++++--- 2 files changed, 73 insertions(+), 6 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index caff8f0342141..1d715a193ea14 100644 --- a/benchmarks/README.md +++ 
b/benchmarks/README.md @@ -40,7 +40,7 @@ become available. wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv - Sonnet + Sonnet (deprecated) ✅ ✅ Local file: benchmarks/sonnet.txt @@ -51,6 +51,12 @@ become available. ✅ synthetic + + Prefix Repetition + ✅ + ✅ + synthetic + HuggingFace-VisionArena ✅ @@ -592,6 +598,20 @@ python3 benchmarks/benchmark_prefix_caching.py \ --input-length-range 128:256 ``` +### Prefix Repetition Dataset + +```bash +vllm bench serve \ + --backend openai \ + --model meta-llama/Llama-2-7b-chat-hf \ + --dataset-name prefix_repetition \ + --num-prompts 100 \ + --prefix-repetition-prefix-len 512 \ + --prefix-repetition-suffix-len 128 \ + --prefix-repetition-num-prefixes 5 \ + --prefix-repetition-output-len 128 +``` + ## ⚡ Example - Request Prioritization Benchmark diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index fdf6548ada5b6..0c19fa6dcfdd2 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -18,9 +18,11 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, from vllm.benchmarks.datasets import (AIMODataset, BurstGPTDataset, ConversationDataset, - InstructCoderDataset, RandomDataset, - SampleRequest, ShareGPTDataset, - SonnetDataset, VisionArenaDataset) + InstructCoderDataset, + PrefixRepetitionRandomDataset, + RandomDataset, SampleRequest, + ShareGPTDataset, SonnetDataset, + VisionArenaDataset) from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, write_to_json) from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs @@ -327,6 +329,12 @@ def get_requests(args, tokenizer): dataset_cls = AIMODataset common_kwargs['dataset_subset'] = None common_kwargs['dataset_split'] = "train" + elif args.dataset_name == "prefix_repetition": + dataset_cls = PrefixRepetitionRandomDataset + sample_kwargs["prefix_len"] = args.prefix_repetition_prefix_len + sample_kwargs["suffix_len"] = args.prefix_repetition_suffix_len + 
sample_kwargs["num_prefixes"] = args.prefix_repetition_num_prefixes + sample_kwargs["output_len"] = args.prefix_repetition_output_len else: raise ValueError(f"Unknown dataset name: {args.dataset_name}") # Remove None values @@ -356,7 +364,11 @@ def validate_args(args): raise ValueError(f"Unsupported backend: {args.backend}") # === Dataset Configuration === - if not args.dataset and not args.dataset_path: + if ( + not args.dataset + and not args.dataset_path + and args.dataset_name not in {"prefix_repetition"} + ): print( "When dataset path is not set, it will default to random dataset") args.dataset_name = 'random' @@ -432,7 +444,10 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--dataset-name", type=str, - choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"], + choices=[ + "sharegpt", "random", "sonnet", "burstgpt", "hf", + "prefix_repetition" + ], help="Name of the dataset to benchmark on.", default="sharegpt") parser.add_argument( @@ -521,6 +536,38 @@ def add_cli_args(parser: argparse.ArgumentParser): default=None, help="Split of the HF dataset.") + # prefix repetition dataset + prefix_repetition_group = parser.add_argument_group( + "prefix repetition dataset options") + prefix_repetition_group.add_argument( + "--prefix-repetition-prefix-len", + type=int, + default=None, + help="Number of prefix tokens per request, used only for prefix " + "repetition dataset.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-suffix-len", + type=int, + default=None, + help="Number of suffix tokens per request, used only for prefix " + "repetition dataset. Total input length is prefix_len + suffix_len.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-num-prefixes", + type=int, + default=None, + help="Number of prefixes to generate, used only for prefix repetition " + "dataset. 
Prompts per prefix is num_requests // num_prefixes.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-output-len", + type=int, + default=None, + help="Number of output tokens per request, used only for prefix " + "repetition dataset.", + ) + parser = AsyncEngineArgs.add_cli_args(parser) From 4dff91c93da668f4cca3f80aa3a94622d21c34fc Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 16 Aug 2025 19:30:49 +0800 Subject: [PATCH 037/361] [Refactor] Allow optional MultiModalKwargsItem in IPC (#23022) Signed-off-by: DarkLight1337 --- tests/v1/core/test_kv_cache_utils.py | 12 +----- tests/v1/core/test_prefix_caching.py | 12 +----- tests/v1/core/test_scheduler.py | 12 +----- tests/v1/core/utils.py | 12 +----- vllm/multimodal/inputs.py | 62 ++++++++-------------------- vllm/v1/engine/__init__.py | 3 +- vllm/v1/engine/mm_input_cache.py | 33 ++++++++------- vllm/v1/engine/processor.py | 10 +++-- vllm/v1/request.py | 7 +++- vllm/v1/worker/gpu_model_runner.py | 4 +- 10 files changed, 59 insertions(+), 108 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index e0b91e6dd7ee4..47c74aff1e753 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -7,9 +7,7 @@ import pytest import torch from vllm.config import ModelConfig, SchedulerConfig, VllmConfig -from vllm.multimodal.inputs import (MultiModalBatchedField, - MultiModalFieldElem, MultiModalKwargsItem, - PlaceholderRange) +from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.sampling_params import SamplingParams from vllm.utils import GiB_bytes, sha256, sha256_cbor_64bit from vllm.v1.core.kv_cache_manager import KVCacheManager @@ -42,13 +40,7 @@ def make_request( if mm_positions is None: mm_kwargs = None else: - mm_elem = MultiModalFieldElem( - modality="dummy_m", - key="dummy_k", - data=None, - field=MultiModalBatchedField(), - ) - mm_item = MultiModalKwargsItem.from_elems([mm_elem]) + 
mm_item = MultiModalKwargsItem.dummy("dummy_m") mm_kwargs = [mm_item] * len(mm_positions) return Request(request_id=request_id, diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 28cfca6767b1e..89824768ed909 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -9,9 +9,7 @@ import pytest import torch from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved -from vllm.multimodal.inputs import (MultiModalBatchedField, - MultiModalFieldElem, MultiModalKwargsItem, - PlaceholderRange) +from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.sampling_params import SamplingParams from vllm.utils import sha256, sha256_cbor_64bit from vllm.v1.core.block_pool import BlockPool @@ -37,13 +35,7 @@ def make_request( if mm_positions is None: mm_kwargs = None else: - mm_elem = MultiModalFieldElem( - modality="dummy_m", - key="dummy_k", - data=None, - field=MultiModalBatchedField(), - ) - mm_item = MultiModalKwargsItem.from_elems([mm_elem]) + mm_item = MultiModalKwargsItem.dummy("dummy_m") mm_kwargs = [mm_item] * len(mm_positions) return Request(request_id=request_id, diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index ac70c90d92add..23762a0fb6223 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -8,9 +8,7 @@ import torch from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, SchedulerConfig, SpeculativeConfig, VllmConfig) -from vllm.multimodal.inputs import (MultiModalBatchedField, - MultiModalFieldElem, MultiModalKwargsItem, - PlaceholderRange) +from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.sampling_params import GuidedDecodingParams, SamplingParams from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput from vllm.v1.core.sched.scheduler import Scheduler @@ -1328,13 +1326,7 @@ def create_requests_with_priority( for i in 
range(num_requests): if mm_positions is not None: mm_position = mm_positions[i] - mm_elem = MultiModalFieldElem( - modality="dummy_m", - key="dummy_k", - data=None, - field=MultiModalBatchedField(), - ) - mm_item = MultiModalKwargsItem.from_elems([mm_elem]) + mm_item = MultiModalKwargsItem.dummy("dummy_m") mm_kwargs = [mm_item] * len(mm_position) else: mm_position = None diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 52093d3d381ae..849c3f59ae527 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -6,9 +6,7 @@ import torch from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, SchedulerConfig, SpeculativeConfig, VllmConfig) -from vllm.multimodal.inputs import (MultiModalBatchedField, - MultiModalFieldElem, MultiModalKwargsItem, - PlaceholderRange) +from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.sampling_params import SamplingParams from vllm.v1.core.kv_cache_utils import (get_request_block_hasher, init_none_hash) @@ -143,13 +141,7 @@ def create_requests( for i in range(num_requests): if mm_positions is not None: mm_position = mm_positions[i] - mm_elem = MultiModalFieldElem( - modality="dummy_m", - key="dummy_k", - data=None, - field=MultiModalBatchedField(), - ) - mm_item = MultiModalKwargsItem.from_elems([mm_elem]) + mm_item = MultiModalKwargsItem.dummy("dummy_m") mm_kwargs = [mm_item] * len(mm_position) mm_hashes = ["hash"] * len(mm_position) else: diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 0bbac45c121b6..a33ce146995d8 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod from collections import UserDict, defaultdict from collections.abc import Mapping, Sequence -from dataclasses import dataclass, replace +from dataclasses import dataclass from functools import partial from itertools import accumulate from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar, @@ 
-218,7 +218,7 @@ class MultiModalFieldElem: i.e. the name of the keyword argument to be passed to the model. """ - data: Optional[NestedTensors] + data: NestedTensors """ The tensor data of this field in [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs], @@ -315,13 +315,8 @@ class BaseMultiModalField(ABC): if len(set(field_types)) > 1: raise ValueError(f"Cannot merge different {field_types=}") - validated_data = list[NestedTensors]() - for i, elem in enumerate(elems): - assert elem.data is not None, ( - f"Cannot merge with empty `elems[{i}]`") - validated_data.append(elem.data) - - return self._reduce_data(validated_data, pin_memory=pin_memory) + batch = [elem.data for elem in elems] + return self._reduce_data(batch, pin_memory=pin_memory) @dataclass(frozen=True) @@ -643,6 +638,17 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems]. """ + @staticmethod + def dummy(modality: str): + """Convenience class for testing.""" + mm_elem = MultiModalFieldElem( + modality=modality, + key="dummy", + data=torch.empty(1), + field=MultiModalSharedField(1), + ) + return MultiModalKwargsItem.from_elems([mm_elem]) + @staticmethod def from_elems(elems: Sequence[MultiModalFieldElem]): return MultiModalKwargsItem({elem.key: elem for elem in elems}) @@ -654,46 +660,12 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): assert len(modalities) == 1, f"Found different modalities={modalities}" self._modality = next(iter(modalities)) - self._is_empty = any(elem.data is None for elem in self.values()) - @property def modality(self) -> str: return self._modality - @property - def is_empty(self) -> bool: - return self._is_empty - - def get_data(self) -> Optional[Mapping[str, NestedTensors]]: - if self._is_empty: - return None - - out_data = dict[str, NestedTensors]() - for key, elem in self.items(): - assert elem.data is not None, ( - f"Cannot get data of empty `elem[{key!r}]`") - 
out_data[key] = elem.data - - return out_data - - def require_data(self) -> Mapping[str, NestedTensors]: - if (data := self.get_data()) is None: - raise RuntimeError("Cannot get data of empty item") - - return data - - # These methods create a new item to avoid mutating cached items in place - def with_data(self, data: Mapping[str, NestedTensors]): - return MultiModalKwargsItem({ - key: replace(elem, data=data[key]) - for key, elem in self.items() - }) - - def without_data(self): - return MultiModalKwargsItem({ - key: replace(elem, data=None) - for key, elem in self.items() - }) + def get_data(self) -> Mapping[str, NestedTensors]: + return {key: elem.data for key, elem in self.items()} # NOTE: UserDict is for V0 compatibility. diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index b29394f3e6760..f7ec982db41b4 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -3,6 +3,7 @@ import enum import time +from collections.abc import Sequence from typing import Any, Optional, Union import msgspec @@ -47,7 +48,7 @@ class EngineCoreRequest( request_id: str prompt_token_ids: list[int] - mm_kwargs: Optional[list[MultiModalKwargsItem]] + mm_kwargs: Optional[Sequence[Optional[MultiModalKwargsItem]]] mm_hashes: Optional[list[str]] mm_placeholders: Optional[list[PlaceholderRange]] sampling_params: Optional[SamplingParams] diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index 1fed74330f0ec..aa7dc62fd4acb 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -1,11 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Mapping -from typing import TYPE_CHECKING +from collections.abc import Sequence +from typing import TYPE_CHECKING, Optional from vllm.multimodal import MultiModalRegistry from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata -from 
vllm.multimodal.inputs import MultiModalKwargsItem, NestedTensors +from vllm.multimodal.inputs import MultiModalKwargsItem +from vllm.utils import is_list_of if TYPE_CHECKING: from vllm.config import ModelConfig @@ -58,21 +59,21 @@ class MultiModalInputCacheClient: def get_and_update( self, - mm_kwargs: list[MultiModalKwargsItem], + mm_kwargs: Sequence[MultiModalKwargsItem], mm_hashes: list[str], - ) -> list[MultiModalKwargsItem]: + ) -> list[Optional[MultiModalKwargsItem]]: if not self.enabled: - return mm_kwargs + return list(mm_kwargs) assert len(mm_kwargs) == len(mm_hashes) - out_mm_items = list[MultiModalKwargsItem]() + out_mm_items = list[Optional[MultiModalKwargsItem]]() for mm_item, mm_hash in zip(mm_kwargs, mm_hashes): if self.mm_cache.get(mm_hash) is not None: - out_mm_items.append(mm_item.without_data()) + out_mm_items.append(None) else: self.mm_cache[mm_hash] = \ - MultiModalCacheItemMetadata.wraps(mm_item.require_data()) + MultiModalCacheItemMetadata.wraps(mm_item) out_mm_items.append(mm_item) return out_mm_items @@ -91,25 +92,27 @@ class MultiModalInputCacheServer: self.enabled = mm_registry.enable_mm_input_cache(model_config) self.mm_cache = MultiModalCache.get_lru_cache( model_config.get_mm_input_cache_gb(), - Mapping[str, NestedTensors], + MultiModalKwargsItem, ) def get_and_update( self, - mm_kwargs: list[MultiModalKwargsItem], + mm_kwargs: Sequence[Optional[MultiModalKwargsItem]], mm_hashes: list[str], ) -> list[MultiModalKwargsItem]: if not self.enabled: - return mm_kwargs + mm_kwargs_lst = list(mm_kwargs) + assert is_list_of(mm_kwargs_lst, MultiModalKwargsItem) + return mm_kwargs_lst assert len(mm_kwargs) == len(mm_hashes) out_mm_items = list[MultiModalKwargsItem]() for mm_item, mm_hash in zip(mm_kwargs, mm_hashes): - if (mm_data := mm_item.get_data()) is None: - out_mm_items.append(mm_item.with_data(self.mm_cache[mm_hash])) + if mm_item is None: + out_mm_items.append(self.mm_cache[mm_hash]) else: - self.mm_cache[mm_hash] = mm_data + 
self.mm_cache[mm_hash] = mm_item out_mm_items.append(mm_item) return out_mm_items diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 376c76a7e7285..c6a23cdbf65ae 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -17,6 +17,7 @@ from vllm.multimodal.utils import argsort_mm_positions from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import TokenizerGroup +from vllm.utils import is_list_of from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.mm_input_cache import MultiModalInputCacheClient from vllm.v1.structured_output.backend_guidance import ( @@ -295,7 +296,7 @@ class Processor: pooling_params = params.clone() # Multimodal related. - sorted_mm_inputs: Optional[list[MultiModalKwargsItem]] = None + sorted_mm_inputs: Optional[list[Optional[MultiModalKwargsItem]]] = None sorted_mm_positions: Optional[list[PlaceholderRange]] = None sorted_mm_hashes: Optional[list[str]] = None if decoder_inputs["type"] == "multimodal": @@ -308,7 +309,7 @@ class Processor: # in the input sequence. 
sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions) - sorted_mm_inputs = [ + orig_sorted_mm_inputs = [ decoder_mm_inputs.get_item(modality, idx) for modality, idx in sorted_mm_idxs ] @@ -323,9 +324,12 @@ class Processor: if sorted_mm_hashes is not None: sorted_mm_inputs = self.mm_input_cache_client.get_and_update( - sorted_mm_inputs, + orig_sorted_mm_inputs, sorted_mm_hashes, ) + else: + assert is_list_of(orig_sorted_mm_inputs, MultiModalKwargsItem) + sorted_mm_inputs = orig_sorted_mm_inputs return decoder_inputs.get("prompt"), EngineCoreRequest( request_id=request_id, diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 562925bde669e..8b703b6191fe6 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -125,14 +125,17 @@ class Request: block_hasher: Optional[Callable[["Request"], list["BlockHash"]]] ) -> "Request": if request.mm_kwargs is not None: - assert is_list_of(request.mm_kwargs, MultiModalKwargsItem), ( + mm_kwargs_lst = list(request.mm_kwargs) + assert is_list_of(mm_kwargs_lst, MultiModalKwargsItem), ( "mm_kwargs was not updated in EngineCore.add_request") + else: + mm_kwargs_lst = None return cls( request_id=request.request_id, client_index=request.client_index, prompt_token_ids=request.prompt_token_ids, - multi_modal_kwargs=request.mm_kwargs, + multi_modal_kwargs=mm_kwargs_lst, multi_modal_hashes=request.mm_hashes, multi_modal_placeholders=request.mm_placeholders, sampling_params=request.sampling_params, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4c919b392fbd9..5ee44a82574c0 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -500,8 +500,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): second_per_grid_ts = [] audio_feature_lengths = [] use_audio_in_video = False - for item in self.requests[req_id].mm_kwargs: - mm_input = item.require_data() + for mm_item in self.requests[req_id].mm_kwargs: + mm_input = 
mm_item.get_data() if mm_input.get("image_grid_thw") is not None: image_grid_thw.append( mm_input["image_grid_thw"].tolist()) From 829bbd7882222c85c0ca5a17fbb2f70e543f50ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Sat, 16 Aug 2025 20:16:58 +0800 Subject: [PATCH 038/361] [New Model]mBART model (#22883) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 汪志鹏 --- docs/models/supported_models.md | 4 + examples/offline_inference/encoder_decoder.py | 235 +++++---- .../models/language/generation/test_mbart.py | 123 +++++ tests/models/registry.py | 2 + vllm/model_executor/models/bart.py | 444 +++++++++++++++++- vllm/model_executor/models/registry.py | 1 + 6 files changed, 717 insertions(+), 92 deletions(-) create mode 100644 tests/models/language/generation/test_mbart.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index a24fa4bcce333..a514572945c3f 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -330,6 +330,7 @@ th { | `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | ✅︎ | | `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | | | `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | | +| `MBartForConditionalGeneration` | mBART | `facebook/mbart-large-en-ro`, `facebook/mbart-large-50`, etc. | | | | | `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ | | `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. 
| | ✅︎ | ✅︎ | @@ -418,6 +419,9 @@ Some models are supported only via the [Transformers backend](#transformers). Th !!! note Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. +!!! note + Some mBART models' config files do not have an `architecture` defined. Therefore, you need to use `--hf-overrides '{"architectures": ["MBartForConditionalGeneration"]}'` to explicitly specify the use of the `MBartForConditionalGeneration` architecture. + ### Pooling Models See [this page](./pooling_models.md) for more information on how to use pooling models. diff --git a/examples/offline_inference/encoder_decoder.py b/examples/offline_inference/encoder_decoder.py index 0da6fa5c4af5f..df6c1eaf4a21e 100644 --- a/examples/offline_inference/encoder_decoder.py +++ b/examples/offline_inference/encoder_decoder.py @@ -2,9 +2,14 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Demonstrate prompting of text-to-text -encoder/decoder models, specifically BART +encoder/decoder models, specifically BART and mBART. + +This script is refactored to allow model selection via command-line arguments. """ +import argparse +from typing import NamedTuple, Optional + from vllm import LLM, SamplingParams from vllm.inputs import ( ExplicitEncoderDecoderPrompt, @@ -14,119 +19,175 @@ from vllm.inputs import ( ) -def create_prompts(tokenizer): - # Test prompts - # - # This section shows all of the valid ways to prompt an - # encoder/decoder model. - # - # - Helpers for building prompts - text_prompt_raw = "Hello, my name is" - text_prompt = TextPrompt(prompt="The president of the United States is") +class ModelRequestData(NamedTuple): + """ + Holds the configuration for a specific model, including its + HuggingFace ID and the prompts to use for the demo. 
+ """ + + model_id: str + encoder_prompts: list + decoder_prompts: list + hf_overrides: Optional[dict] = None + + +def get_bart_config() -> ModelRequestData: + """ + Returns the configuration for facebook/bart-large-cnn. + This uses the exact test cases from the original script. + """ + encoder_prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "An encoder prompt", + ] + decoder_prompts = [ + "A decoder prompt", + "Another decoder prompt", + ] + return ModelRequestData( + model_id="facebook/bart-large-cnn", + encoder_prompts=encoder_prompts, + decoder_prompts=decoder_prompts, + ) + + +def get_mbart_config() -> ModelRequestData: + """ + Returns the configuration for facebook/mbart-large-en-ro. + This uses prompts suitable for an English-to-Romanian translation task. + """ + encoder_prompts = [ + "The quick brown fox jumps over the lazy dog.", + "How are you today?", + ] + decoder_prompts = ["", ""] + hf_overrides = {"architectures": ["MBartForConditionalGeneration"]} + return ModelRequestData( + model_id="facebook/mbart-large-en-ro", + encoder_prompts=encoder_prompts, + decoder_prompts=decoder_prompts, + hf_overrides=hf_overrides, + ) + + +MODEL_GETTERS = { + "bart": get_bart_config, + "mbart": get_mbart_config, +} + + +def create_all_prompt_types( + encoder_prompts_raw: list, + decoder_prompts_raw: list, + tokenizer, +) -> list: + """ + Generates a list of diverse prompt types for demonstration. + This function is generic and uses the provided raw prompts + to create various vLLM input objects. 
+ """ + text_prompt_raw = encoder_prompts_raw[0] + text_prompt = TextPrompt(prompt=encoder_prompts_raw[1 % len(encoder_prompts_raw)]) tokens_prompt = TokensPrompt( - prompt_token_ids=tokenizer.encode(prompt="The capital of France is") - ) - # - Pass a single prompt to encoder/decoder model - # (implicitly encoder input prompt); - # decoder input prompt is assumed to be None - - single_text_prompt_raw = text_prompt_raw # Pass a string directly - single_text_prompt = text_prompt # Pass a TextPrompt - single_tokens_prompt = tokens_prompt # Pass a TokensPrompt - - # ruff: noqa: E501 - # - Pass explicit encoder and decoder input prompts within one data structure. - # Encoder and decoder prompts can both independently be text or tokens, with - # no requirement that they be the same prompt type. Some example prompt-type - # combinations are shown below, note that these are not exhaustive. - - enc_dec_prompt1 = ExplicitEncoderDecoderPrompt( - # Pass encoder prompt string directly, & - # pass decoder prompt tokens - encoder_prompt=single_text_prompt_raw, - decoder_prompt=single_tokens_prompt, - ) - enc_dec_prompt2 = ExplicitEncoderDecoderPrompt( - # Pass TextPrompt to encoder, and - # pass decoder prompt string directly - encoder_prompt=single_text_prompt, - decoder_prompt=single_text_prompt_raw, - ) - enc_dec_prompt3 = ExplicitEncoderDecoderPrompt( - # Pass encoder prompt tokens directly, and - # pass TextPrompt to decoder - encoder_prompt=single_tokens_prompt, - decoder_prompt=single_text_prompt, + prompt_token_ids=tokenizer.encode( + encoder_prompts_raw[2 % len(encoder_prompts_raw)] + ) ) - # - Finally, here's a useful helper function for zipping encoder and - # decoder prompts together into a list of ExplicitEncoderDecoderPrompt - # instances + decoder_tokens_prompt = TokensPrompt( + prompt_token_ids=tokenizer.encode(decoder_prompts_raw[0]) + ) + single_prompt_examples = [ + text_prompt_raw, + text_prompt, + tokens_prompt, + ] + explicit_pair_examples = [ + 
ExplicitEncoderDecoderPrompt( + encoder_prompt=text_prompt_raw, + decoder_prompt=decoder_tokens_prompt, + ), + ExplicitEncoderDecoderPrompt( + encoder_prompt=text_prompt, + decoder_prompt=decoder_prompts_raw[1 % len(decoder_prompts_raw)], + ), + ExplicitEncoderDecoderPrompt( + encoder_prompt=tokens_prompt, + decoder_prompt=text_prompt, + ), + ] zipped_prompt_list = zip_enc_dec_prompts( - ["An encoder prompt", "Another encoder prompt"], - ["A decoder prompt", "Another decoder prompt"], + encoder_prompts_raw, + decoder_prompts_raw, ) - - # - Let's put all of the above example prompts together into one list - # which we will pass to the encoder/decoder LLM. - return [ - single_text_prompt_raw, - single_text_prompt, - single_tokens_prompt, - enc_dec_prompt1, - enc_dec_prompt2, - enc_dec_prompt3, - ] + zipped_prompt_list + return single_prompt_examples + explicit_pair_examples + zipped_prompt_list -# Create a sampling params object. -def create_sampling_params(): +def create_sampling_params() -> SamplingParams: + """Create a sampling params object.""" return SamplingParams( temperature=0, top_p=1.0, min_tokens=0, - max_tokens=20, + max_tokens=30, ) -# Print the outputs. -def print_outputs(outputs): - print("-" * 50) +def print_outputs(outputs: list): + """Formats and prints the generation outputs.""" + print("-" * 80) for i, output in enumerate(outputs): prompt = output.prompt encoder_prompt = output.encoder_prompt generated_text = output.outputs[0].text print(f"Output {i + 1}:") - print( - f"Encoder prompt: {encoder_prompt!r}\n" - f"Decoder prompt: {prompt!r}\n" - f"Generated text: {generated_text!r}" + print(f"Encoder Prompt: {encoder_prompt!r}") + print(f"Decoder Prompt: {prompt!r}") + print(f"Generated Text: {generated_text!r}") + print("-" * 80) + + +def main(args): + """Main execution function.""" + model_key = args.model + if model_key not in MODEL_GETTERS: + raise ValueError( + f"Unknown model: {model_key}. 
" + f"Available models: {list(MODEL_GETTERS.keys())}" ) - print("-" * 50) + config_getter = MODEL_GETTERS[model_key] + model_config = config_getter() - -def main(): - dtype = "float" - - # Create a BART encoder/decoder model instance + print(f"🚀 Running demo for model: {model_config.model_id}") llm = LLM( - model="facebook/bart-large-cnn", - dtype=dtype, + model=model_config.model_id, + dtype="float", + hf_overrides=model_config.hf_overrides, ) - - # Get BART tokenizer tokenizer = llm.llm_engine.get_tokenizer_group() - - prompts = create_prompts(tokenizer) + prompts = create_all_prompt_types( + encoder_prompts_raw=model_config.encoder_prompts, + decoder_prompts_raw=model_config.decoder_prompts, + tokenizer=tokenizer, + ) sampling_params = create_sampling_params() - - # Generate output tokens from the prompts. The output is a list of - # RequestOutput objects that contain the prompt, generated - # text, and other information. outputs = llm.generate(prompts, sampling_params) - print_outputs(outputs) if __name__ == "__main__": - main() + parser = argparse.ArgumentParser( + description="A flexible demo for vLLM encoder-decoder models." 
+ ) + parser.add_argument( + "--model", + "-m", + type=str, + default="bart", + choices=MODEL_GETTERS.keys(), + help="The short name of the model to run.", + ) + args = parser.parse_args() + main(args) diff --git a/tests/models/language/generation/test_mbart.py b/tests/models/language/generation/test_mbart.py new file mode 100644 index 0000000000000..854a72713943b --- /dev/null +++ b/tests/models/language/generation/test_mbart.py @@ -0,0 +1,123 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + +import pytest +from transformers import AutoModelForSeq2SeqLM + +from vllm.sequence import SampleLogprobs + +from ....conftest import DecoderPromptType, HfRunner, VllmRunner +from ...utils import check_logprobs_close + + +def vllm_to_hf_output( + vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], + decoder_prompt_type: DecoderPromptType, +): + """Sanitize vllm output to be comparable with hf output.""" + output_ids, output_str, out_logprobs = vllm_output + hf_output_str = output_str + "" + return output_ids, hf_output_str, out_logprobs + + +def run_test( + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + prompts: list[dict[str, str]], + decoder_prompt_type: DecoderPromptType, + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +) -> None: + ''' + Test the vLLM mBART model by validating it against HuggingFace (HF). 
+ (Docstring content is omitted for brevity) + ''' + + vllm_prompts = prompts + if decoder_prompt_type == DecoderPromptType.NONE: + vllm_prompts = [{ + "encoder_prompt": p['encoder_prompt'], + "decoder_prompt": "" + } for p in prompts] + + vllm_kwargs = { + "hf_overrides": { + "architectures": ["MBartForConditionalGeneration"] + } + } + + with vllm_runner(model, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True, + **vllm_kwargs) as vllm_model: # type: ignore + vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( + vllm_prompts, max_tokens, num_logprobs) + + hf_kwargs = { + "top_k": None, + "num_beams": 1, + "repetition_penalty": 1.0, + "top_p": 1.0, + "length_penalty": 1.0, + "early_stopping": False, + "no_repeat_ngram_size": None, + "min_length": 0 + } + + with hf_runner(model, dtype=dtype, + auto_cls=AutoModelForSeq2SeqLM) as hf_model: + hf_kwargs["decoder_start_token_id"] = ( + hf_model.tokenizer.lang_code_to_id["ro_RO"]) + + hf_outputs = ( + hf_model.generate_encoder_decoder_greedy_logprobs_limit( + prompts, # HF runner still uses the original prompts + max_tokens, + num_logprobs, + **hf_kwargs, + )) + + hf_skip_tokens = 0 + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, decoder_prompt_type) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + num_outputs_0_skip_tokens=hf_skip_tokens, + ) + + +@pytest.mark.parametrize( + "model", + [pytest.param("facebook/mbart-large-en-ro")], +) +@pytest.mark.parametrize("dtype", ["float", "bfloat16"]) +@pytest.mark.parametrize("max_tokens", [64]) +@pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) +def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model, + dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None: + + run_test( + hf_runner, + 
vllm_runner, + example_encoder_decoder_prompts[decoder_prompt_type], + decoder_prompt_type, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/tests/models/registry.py b/tests/models/registry.py index 10e29e01e8a18..99cf997790fec 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -316,6 +316,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { # [Encoder-decoder] "BartModel": _HfExamplesInfo("facebook/bart-base"), "BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"), + "MBartForConditionalGeneration": _HfExamplesInfo("facebook/mbart-large-en-ro", # noqa: E501 + hf_overrides={"architectures": ["MBartForConditionalGeneration"]}), # noqa: E501 } _EMBEDDING_EXAMPLE_MODELS = { diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index 3d328c88ff6e0..32551d8102f32 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -46,7 +46,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsQuant, SupportsV0Only -from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix +from .utils import (AutoWeightsLoader, WeightsMapper, cast_overflow_tensors, + maybe_prefix) logger = logging.get_logger(__name__) @@ -422,10 +423,7 @@ class BartEncoderLayer(nn.Module): if hidden_states.dtype == torch.float16 and ( torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, - min=-clamp_value, - max=clamp_value) + hidden_states = cast_overflow_tensors(hidden_states) return hidden_states @@ -906,3 +904,439 @@ class BartForConditionalGeneration(nn.Module, SupportsV0Only, SupportsQuant): }) return loaded_params + + +class MBartEncoderLayer(BartEncoderLayer): + + def forward(self, hidden_states: torch.Tensor) -> 
torch.Tensor: + r""" + Args: + hidden_states + torch.Tensor of *encoder* input embeddings. + Returns: + Encoder layer output torch.Tensor + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states = self.self_attn(hidden_states=hidden_states) + + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + fc1_out, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(fc1_out) + + hidden_states, _ = self.fc2(hidden_states) + + hidden_states = residual + hidden_states + + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() + or torch.isnan(hidden_states).any()): + hidden_states = cast_overflow_tensors(hidden_states) + + return hidden_states + + +class MBartDecoderLayer(BartDecoderLayer): + + def forward( + self, + decoder_hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + residual = decoder_hidden_states + hidden_states = self.self_attn_layer_norm(decoder_hidden_states) + + # Self Attention + hidden_states = self.self_attn(hidden_states=hidden_states) + + hidden_states = residual + hidden_states + + # Cross-Attention Block + + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + hidden_states = self.encoder_attn( + decoder_hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + ) + + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + fc1_out, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(fc1_out) + + hidden_states, _ = self.fc2(hidden_states) + + hidden_states = residual + hidden_states + + return hidden_states + + +class MBartEncoder(nn.Module): + """ + Transformer encoder consisting of *config.encoder_layers* + self attention layers. Each layer is a [`MBartEncoderLayer`]. 
+ Args: + config: BartConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, + config: BartConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + embed_tokens: Optional[nn.Embedding] = None, + prefix: str = ""): + super().__init__() + + self.cache_config = cache_config + self.quant_config = quant_config + self.lora_config = lora_config + embed_dim = config.d_model + self.max_source_positions = config.max_position_embeddings + embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + + self.embed_tokens = BartScaledWordEmbedding(config.vocab_size, + embed_dim, + embed_scale=embed_scale) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.embed_positions = BartLearnedPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + ) + self.layers = nn.ModuleList([ + MBartEncoderLayer(config, + cache_config, + quant_config, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(config.encoder_layers) + ]) + + self.layernorm_embedding = nn.LayerNorm(embed_dim) + self.layer_norm = nn.LayerNorm(config.d_model) # modification: final LayerNorm over the encoder output + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + r""" + Args: + input_ids + Indices of *encoder* input sequence tokens in the vocabulary. + Padding will be ignored by default should you + provide it. + positions + Positions of *encoder* input sequence tokens. 
+ Returns: + Encoder output torch.Tensor + """ + # retrieve input_ids and inputs_embeds + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + embed_pos = self.embed_positions(positions) + embed_pos = embed_pos.to(inputs_embeds.device) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + + for encoder_layer in self.layers: + hidden_states = encoder_layer(hidden_states=hidden_states) + + hidden_states = self.layer_norm(hidden_states) + return hidden_states + + +class MBartDecoder(nn.Module): + """ + Transformer decoder consisting of *config.decoder_layers* layers. + Each layer is a [`MBartDecoderLayer`] + Args: + config: BartConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__( + self, + config: BartConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + embed_tokens: Optional[nn.Embedding] = None, + prefix: str = "", + ): + super().__init__() + self.cache_config = cache_config + self.quant_config = quant_config + self.lora_config = lora_config + self.max_target_positions = config.max_position_embeddings + embed_scale = math.sqrt( + config.d_model) if config.scale_embedding else 1.0 + + self.embed_tokens = BartScaledWordEmbedding(config.vocab_size, + config.d_model, + embed_scale=embed_scale) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.embed_positions = BartLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + ) + + self.layers = nn.ModuleList( + [MBartDecoderLayer(config, cache_config, quant_config, + prefix=f"{prefix}.layers.{layer_idx}") \ + for layer_idx in range(config.decoder_layers)]) + + self.layernorm_embedding = nn.LayerNorm(config.d_model) + self.layer_norm = nn.LayerNorm(config.d_model) + + def forward( + self, + decoder_input_ids: torch.Tensor, + decoder_positions: torch.Tensor, + 
encoder_hidden_states: Optional[torch.Tensor], + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + r""" + Args: + decoder_input_ids + Indices of *decoder* input sequence tokens in the vocabulary. + Padding will be ignored by default should you + provide it. + decoder_positions + Positions of *decoder* input sequence tokens. + encoder_hidden_states: + Tensor of encoder output embeddings + Returns: + Decoder output torch.Tensor + """ + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(decoder_input_ids) + else: + decoder_positions = inputs_embeds[:, -1] + + # embed positions + embed_pos = self.embed_positions(decoder_positions) + embed_pos = embed_pos.to(inputs_embeds.device) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + + # decoder layers + + for decoder_layer in self.layers: + hidden_states = decoder_layer( + decoder_hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + ) + + hidden_states = self.layer_norm(hidden_states) + return hidden_states + + +class MBartModel(nn.Module, SupportsQuant): + _tied_weights_keys = [ + "encoder.embed_tokens.weight", "decoder.embed_tokens.weight" + ] + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + + self.config = config + + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + + self.encoder = MBartEncoder(config, + cache_config, + quant_config=quant_config, + prefix=f"{prefix}.encoder") + self.decoder = MBartDecoder(config, + cache_config, + quant_config=quant_config, + prefix=f"{prefix}.decoder") + + def forward(self, input_ids: torch.Tensor, positions: 
torch.Tensor, + encoder_input_ids: torch.Tensor, + encoder_positions: torch.Tensor) -> torch.Tensor: + r""" + Args: + input_ids + Indices of *decoder* input sequence tokens in the vocabulary. + Padding will be ignored by default should you + provide it. + positions + Positions of *decoder* input sequence tokens. + encoder_input_ids + Indices of *encoder* input sequence tokens in the vocabulary. + encoder_positions: + Positions of *encoder* input sequence tokens. + Returns: + Model output torch.Tensor + """ + + encoder_hidden_states = None + + if encoder_input_ids.numel() > 0: + # Run encoder attention if a non-zero number of encoder tokens + # are provided as input + encoder_hidden_states = self.encoder(input_ids=encoder_input_ids, + positions=encoder_positions) + + # decoder outputs consists of + # (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + decoder_input_ids=input_ids, + decoder_positions=positions, + encoder_hidden_states=encoder_hidden_states) + + return decoder_outputs + + +class MBartForConditionalGeneration(nn.Module, SupportsV0Only, SupportsQuant): + base_model_prefix = "model" + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "decoder.": "model.decoder.", + "encoder.": "model.encoder.", + "shared.": "model.shared." 
+ }, + orig_to_new_substr={ + "beta": "bias", + "gamma": "weight", + "LayerNorm": "layernorm", + }, + ) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + lora_config = vllm_config.lora_config + assert config.tie_word_embeddings + self.config = config + self.model = MBartModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + + embed_scale = math.sqrt( + config.d_model) if config.scale_embedding else 1.0 + + self.lm_head = BartParallelLMHead(config.vocab_size, + config.d_model, + embed_scale=embed_scale) + + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + *, + encoder_input_ids: torch.Tensor, + encoder_positions: torch.Tensor, + **kwargs, + ) -> torch.Tensor: + return self.model(input_ids, positions, encoder_input_ids, + encoder_positions) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + model_params_dict = dict(self.named_parameters()) + loaded_params = set() + remaining_weights = [] + shared_embedding_weight = None + + for name, loaded_weight in weights: + if any(skip in name + for skip in ["cls.", "pooler.", "final_logits_bias"]): + continue + if any(embed_name in name for embed_name in [ + 'shared.weight', 'encoder.embed_tokens.weight', + 
'decoder.embed_tokens.weight' + ]): + if shared_embedding_weight is None: + shared_embedding_weight = loaded_weight + continue + is_stacked = False + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + vllm_name = name + for src, dst in self.hf_to_vllm_mapper.orig_to_new_substr.items( + ): + vllm_name = vllm_name.replace(src, dst) + for src, dst in self.hf_to_vllm_mapper.orig_to_new_prefix.items( + ): + if vllm_name.startswith(src): + vllm_name = dst + vllm_name[len(src):] + break + vllm_name = vllm_name.replace(weight_name, param_name) + if vllm_name in model_params_dict: + param = model_params_dict[vllm_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight, shard_id) + loaded_params.add(vllm_name) + is_stacked = True + break + if not is_stacked: + remaining_weights.append((name, loaded_weight)) + loader = AutoWeightsLoader(self, skip_prefixes=["cls.", "pooler."]) + auto_loaded_params = loader.load_weights(remaining_weights, + mapper=self.hf_to_vllm_mapper) + loaded_params.update(auto_loaded_params) + if shared_embedding_weight is not None: + lm_head_param = self.lm_head.weight + weight_loader = getattr(lm_head_param, "weight_loader", + default_weight_loader) + weight_loader(lm_head_param, shared_embedding_weight) + self.model.encoder.embed_tokens.weight = self.lm_head.weight + self.model.decoder.embed_tokens.weight = self.lm_head.weight + loaded_params.update({ + 'model.encoder.embed_tokens.weight', 'lm_head.weight', + 'model.decoder.embed_tokens.weight' + }) + return loaded_params diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index b817615b43564..109bc1fe5c779 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -141,6 +141,7 @@ _TEXT_GENERATION_MODELS = { # [Encoder-decoder] "BartModel": ("bart", "BartForConditionalGeneration"), 
"BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"), + "MBartForConditionalGeneration": ("bart", "MBartForConditionalGeneration"), } _EMBEDDING_MODELS = { From 52ce1420e9f6f52308f49a2898433a52674a4a8b Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Sat, 16 Aug 2025 14:36:30 -0300 Subject: [PATCH 039/361] Fix handling of `max_num_batched_tokens` for pooling tasks (#23004) Signed-off-by: Max de Bayser --- vllm/config/__init__.py | 3 --- vllm/engine/arg_utils.py | 10 +++++----- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 72fec5e205e34..14fc5589a89a4 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3600,9 +3600,6 @@ class VllmConfig: logger.info(reason) self.scheduler_config.chunked_prefill_enabled = False self.scheduler_config.long_prefill_token_threshold = 0 - self.scheduler_config.max_num_batched_tokens = max( - self.scheduler_config.max_model_len, - DEFAULT_MAX_NUM_BATCHED_TOKENS) if self.cache_config is not None: self.cache_config.enable_prefix_caching = False diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f8af6d36e0c06..630fbec4539e7 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1602,9 +1602,6 @@ class EngineArgs: self.enable_prefix_caching = incremental_prefill_supported logger.info("(%s) prefix caching by default", action) - if not self.enable_chunked_prefill: - self.max_num_batched_tokens = model_config.max_model_len - # V1 should use the new scheduler by default. 
# Swap it only if this arg is set to the original V0 default if self.scheduler_cls == EngineArgs.scheduler_cls: @@ -1692,8 +1689,11 @@ class EngineArgs: self.max_num_batched_tokens = \ default_max_num_batched_tokens[usage_context] else: - self.max_num_batched_tokens = default_max_num_batched_tokens[ - usage_context] + if not self.enable_chunked_prefill: + self.max_num_batched_tokens = model_config.max_model_len + else: + self.max_num_batched_tokens = \ + default_max_num_batched_tokens[usage_context] logger.debug( "Setting max_num_batched_tokens to %d for %s usage context.", self.max_num_batched_tokens, use_context_value) From 68373d3126b4d2c49a9983fe0696bbd48fc8aad7 Mon Sep 17 00:00:00 2001 From: Woonggi Min Date: Sun, 17 Aug 2025 02:38:42 +0900 Subject: [PATCH 040/361] [Frontend] Added support for HermesToolParser for models without special tokens (#16890) Signed-off-by: minpeter --- .../tool_parsers/test_hermes_tool_parser.py | 127 ++++++++++++++++++ .../openai/tool_parsers/hermes_tool_parser.py | 81 ++++++++--- 2 files changed, 191 insertions(+), 17 deletions(-) create mode 100644 tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py diff --git a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py new file mode 100644 index 0000000000000..28b1f8358d80b --- /dev/null +++ b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py @@ -0,0 +1,127 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json + +import pytest + +from ....utils import RemoteOpenAIServer + +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" +LORA_MODEL = "minpeter/LoRA-Llama-3.2-1B-tool-vllm-ci" + +SERVER_ARGS = [ + "--enforce-eager", + "--enable-auto-tool-choice", + "--tool-call-parser", + "hermes", + "--enable-lora", + "--lora-modules", + f"{LORA_MODEL}={LORA_MODEL}", +] + +TOOLS = [{ + "type": "function", + "function": { + 
"name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": + "The city and state, e.g. San Francisco, CA", + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"] + }, + }, + "required": ["location"], + }, + }, +}] + +MESSAGES = [{"role": "user", "content": "What's the weather like in Boston?"}] + + +@pytest.mark.asyncio +async def test_non_streaming_tool_call(): + """Test tool call in non-streaming mode.""" + with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server: + client = server.get_async_client() + + response = await client.chat.completions.create( + model=LORA_MODEL, + messages=MESSAGES, + tools=TOOLS, + tool_choice="auto", + temperature=0.0, + ) + + assert response.choices + choice = response.choices[0] + message = choice.message + + assert choice.finish_reason == "tool_calls" + assert message.tool_calls is not None + + tool_call = message.tool_calls[0] + assert tool_call.type == "function" + assert tool_call.function.name == "get_current_weather" + + arguments = json.loads(tool_call.function.arguments) + assert "location" in arguments + assert "Boston" in arguments["location"] + print("\n[Non-Streaming Test Passed]") + print(f"Tool Call: {tool_call.function.name}") + print(f"Arguments: {arguments}") + + +@pytest.mark.asyncio +async def test_streaming_tool_call(): + """Test tool call in streaming mode.""" + with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server: + client = server.get_async_client() + + stream = await client.chat.completions.create( + model=LORA_MODEL, + messages=MESSAGES, + tools=TOOLS, + tool_choice="auto", + temperature=0.0, + stream=True, + ) + + tool_call_chunks = {} + async for chunk in stream: + if not chunk.choices: + continue + + delta = chunk.choices[0].delta + if not delta or not delta.tool_calls: + continue + + for tool_chunk in delta.tool_calls: + index = 
tool_chunk.index + if index not in tool_call_chunks: + tool_call_chunks[index] = {"name": "", "arguments": ""} + + if tool_chunk.function.name: + tool_call_chunks[index]["name"] += tool_chunk.function.name + if tool_chunk.function.arguments: + tool_call_chunks[index][ + "arguments"] += tool_chunk.function.arguments + + assert len(tool_call_chunks) == 1 + reconstructed_tool_call = tool_call_chunks[0] + + assert reconstructed_tool_call["name"] == "get_current_weather" + + arguments = json.loads(reconstructed_tool_call["arguments"]) + assert "location" in arguments + assert "Boston" in arguments["location"] + print("\n[Streaming Test Passed]") + print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}") + print(f"Reconstructed Arguments: {arguments}") diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index c7030d34d453e..d126130ab9bc3 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -52,14 +52,51 @@ class Hermes2ProToolParser(ToolParser): raise ValueError( "The model tokenizer must be passed to the ToolParser " "constructor during construction.") - self.tool_call_start_token_id = self.vocab.get( - self.tool_call_start_token) - self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) - if (self.tool_call_start_token_id is None - or self.tool_call_end_token_id is None): - raise RuntimeError( - "Hermes 2 Pro Tool parser could not locate tool call start/end " - "tokens in the tokenizer!") + self.tool_call_start_token_ids = self.model_tokenizer.encode( + self.tool_call_start_token, add_special_tokens=False) + self.tool_call_end_token_ids = self.model_tokenizer.encode( + self.tool_call_end_token, add_special_tokens=False) + + self.tool_call_start_token_array = [ + self.model_tokenizer.decode([token_id]) + for token_id in self.tool_call_start_token_ids + ] + + 
self.tool_call_end_token_array = [ + self.model_tokenizer.decode([token_id]) + for token_id in self.tool_call_end_token_ids + ] + + self.buffered_delta_text = "" + + # Very simple idea: when encountering tokens like <, tool, _call, >, + # <, /, tool, _call, >, store them in a buffer. + # When the last token is encountered, empty the buffer and return it. + # If a token appears in an incorrect sequence while storing in the buffer, + # return the preceding buffer along with the token. + def tool_call_delta_buffer(self, delta_text: str): + # If the sequence of tool_call_start or tool_call_end tokens is not yet + # complete, fill the buffer with the token and return "". + if (delta_text in self.tool_call_start_token_array + or delta_text in self.tool_call_end_token_array): + # If delta_text is the last token of tool_call_start_token or + # tool_call_end_token, empty the buffer and return + # the buffered text + delta_text. + if (delta_text == self.tool_call_start_token_array[-1] + or delta_text == self.tool_call_end_token_array[-1]): + buffered_text = self.buffered_delta_text + self.buffered_delta_text = "" + return buffered_text + delta_text + else: + self.buffered_delta_text = self.buffered_delta_text + delta_text + return "" + else: + if self.buffered_delta_text: + buffered_text = self.buffered_delta_text + self.buffered_delta_text = "" + return buffered_text + delta_text + else: + return delta_text def extract_tool_calls( self, @@ -124,11 +161,23 @@ class Hermes2ProToolParser(ToolParser): delta_token_ids: Sequence[int], request: ChatCompletionRequest, ) -> Union[DeltaMessage, None]: + # 1. All tokens are parsed based on _text, not token_ids. + # 2. All incoming text data is processed by the tool_call_delta_buffer + # function for buffering before being used for parsing. + + delta_text = self.tool_call_delta_buffer(delta_text) + # If the last characters of previous_text + # match self.buffered_delta_text, remove only the matching part. 
+ if (len(previous_text) >= len(self.buffered_delta_text) + and previous_text[-len(self.buffered_delta_text):] + == self.buffered_delta_text): + previous_text = previous_text[:-len(self.buffered_delta_text)] + current_text = previous_text + delta_text logger.debug("delta_text: %s", delta_text) logger.debug("delta_token_ids: %s", delta_token_ids) # check to see if we should be streaming a tool call - is there a - if self.tool_call_start_token_id not in current_token_ids: + if self.tool_call_start_token not in current_text: logger.debug("No tool call tokens found!") return DeltaMessage(content=delta_text) @@ -136,14 +185,12 @@ class Hermes2ProToolParser(ToolParser): # figure out where we are in the parsing by counting tool call # start & end tags - prev_tool_start_count = previous_token_ids.count( - self.tool_call_start_token_id) - prev_tool_end_count = previous_token_ids.count( - self.tool_call_end_token_id) - cur_tool_start_count = current_token_ids.count( - self.tool_call_start_token_id) - cur_tool_end_count = current_token_ids.count( - self.tool_call_end_token_id) + prev_tool_start_count = previous_text.count( + self.tool_call_start_token) + prev_tool_end_count = previous_text.count(self.tool_call_end_token) + cur_tool_start_count = current_text.count( + self.tool_call_start_token) + cur_tool_end_count = current_text.count(self.tool_call_end_token) tool_call_portion = None text_portion = None From 000cceca8c329d5b5d99e0186fbd444a390384cd Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 16 Aug 2025 14:16:00 -0400 Subject: [PATCH 041/361] [Bugfix gpt-oss] Fix float32 convert for flashinfer sink support (#23016) Signed-off-by: mgoin --- vllm/attention/layer.py | 9 +++++++++ vllm/v1/attention/backends/flashinfer.py | 3 --- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 1a9c0e26b53ca..0e87fa3f23e3a 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -308,6 +308,15 @@ class 
Attention(nn.Module): if hasattr(self.impl, "process_weights_after_loading"): self.impl.process_weights_after_loading(act_dtype) + # FlashInfer requires attention sinks to be float32 + if (self.backend == _Backend.FLASHINFER_VLLM_V1 + and hasattr(self.impl, 'sinks')): + from vllm.v1.attention.backends.flashinfer import FlashInferImpl + assert isinstance(self.impl, FlashInferImpl) + if (self.impl.sinks is not None + and self.impl.sinks.dtype != torch.float32): + self.impl.sinks = self.impl.sinks.to(torch.float32) + def get_attn_backend(self) -> type[AttentionBackend]: return self.attn_backend diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index eac3f33e15096..991904229fd7f 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -642,9 +642,6 @@ class FlashInferImpl(AttentionImpl): f"heads in the layer. Expected {num_heads}, but got " f"{sinks.shape[0]}." ) - # Cast sinks to float32 if needed (FlashInfer requirement) - if sinks.dtype != torch.float32: - sinks = sinks.to(torch.float32) self.sinks = sinks def forward( From 3253ae765ef4dc0604a6f3ed3a1dcd61fdda6bda Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 16 Aug 2025 14:33:08 -0400 Subject: [PATCH 042/361] [Flaky CI] Increase timeout tolerance for test_mp_crash_detection+test_default_mm_lora_chat_completions (#23028) Signed-off-by: mgoin --- tests/entrypoints/openai/test_default_mm_loras.py | 3 ++- tests/mq_llm_engine/test_error_handling.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/openai/test_default_mm_loras.py b/tests/entrypoints/openai/test_default_mm_loras.py index 372e9b1fecd42..b9c466a6fbeb6 100644 --- a/tests/entrypoints/openai/test_default_mm_loras.py +++ b/tests/entrypoints/openai/test_default_mm_loras.py @@ -48,7 +48,8 @@ def multimodal_server(): # noqa: F811 f"{{\"audio\": \"{AUDIO_LORA_PATH}\"}}", ] - with RemoteOpenAIServer(MULTIMODAL_MODEL_NAME, 
args) as remote_server: + with RemoteOpenAIServer(MULTIMODAL_MODEL_NAME, args, + max_wait_seconds=480) as remote_server: yield remote_server diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py index 3feee01dadf73..77e3732cd06c6 100644 --- a/tests/mq_llm_engine/test_error_handling.py +++ b/tests/mq_llm_engine/test_error_handling.py @@ -255,8 +255,8 @@ async def test_mp_crash_detection(monkeypatch: pytest.MonkeyPatch): pass end = time.perf_counter() - assert end - start < 60, ( - "Expected vLLM to gracefully shutdown in <60s " + assert end - start < 100, ( + "Expected vLLM to gracefully shutdown in <100s " "if there is an error in the startup.") From 4fc722eca4f6ad63edf1936989f4d2171aab3ca2 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 16 Aug 2025 15:38:21 -0400 Subject: [PATCH 043/361] [Kernel/Quant] Remove AQLM (#22943) Signed-off-by: mgoin Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- .../scripts/hardware_ci/run-amd-test.sh | 1 - CMakeLists.txt | 1 - benchmarks/kernels/benchmark_aqlm.py | 345 ---------- csrc/ops.h | 9 - csrc/quantization/aqlm/gemm_kernels.cu | 597 ------------------ csrc/torch_bindings.cpp | 15 - .../quantization/supported_hardware.md | 1 - docs/mkdocs/hooks/generate_examples.py | 1 - examples/offline_inference/basic/README.md | 14 - tests/compile/test_full_graph.py | 4 - tests/kernels/quantization/test_aqlm.py | 40 -- tests/models/quantization/test_aqlm.py | 68 -- vllm/_custom_ops.py | 41 -- vllm/model_executor/layers/linear.py | 18 - .../layers/quantization/__init__.py | 3 - .../layers/quantization/aqlm.py | 376 ----------- 16 files changed, 1534 deletions(-) delete mode 100644 benchmarks/kernels/benchmark_aqlm.py delete mode 100644 csrc/quantization/aqlm/gemm_kernels.cu delete mode 100644 tests/kernels/quantization/test_aqlm.py delete mode 100644 tests/models/quantization/test_aqlm.py delete mode 100644 vllm/model_executor/layers/quantization/aqlm.py diff 
--git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 5e5a532cb57d5..df0bae0c9cbff 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -121,7 +121,6 @@ fi if [[ $commands == *" kernels/quantization"* ]]; then commands="${commands} \ --ignore=kernels/quantization/test_int8_quant.py \ - --ignore=kernels/quantization/test_aqlm.py \ --ignore=kernels/quantization/test_machete_mm.py \ --ignore=kernels/quantization/test_block_fp8.py \ --ignore=kernels/quantization/test_block_int8.py \ diff --git a/CMakeLists.txt b/CMakeLists.txt index cda1ffc795d1b..34386d670ac76 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -286,7 +286,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") FetchContent_MakeAvailable(cutlass) list(APPEND VLLM_EXT_SRC - "csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu" "csrc/permute_cols.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py deleted file mode 100644 index 42de062b08e42..0000000000000 --- a/benchmarks/kernels/benchmark_aqlm.py +++ /dev/null @@ -1,345 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os -import sys -from typing import Optional - -import torch -import torch.nn.functional as F - -from vllm import _custom_ops as ops -from vllm.model_executor.layers.quantization.aqlm import ( - dequantize_weight, - generic_dequantize_gemm, - get_int_dtype, - optimized_dequantize_gemm, -) -from vllm.utils import FlexibleArgumentParser - -os.environ["CUDA_VISIBLE_DEVICES"] = "0" - - -def torch_mult( - # [..., in_features] - input: torch.Tensor, - weights: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, -) -> torch.Tensor: - output = F.linear(input, weights) - return output - - -def dequant_out_scale( - # [..., in_features] - 
input: torch.Tensor, - # [num_out_groups, num_in_groups, num_codebooks] - codes: torch.IntTensor, - # [num_codebooks, codebook_size, out_group_size, in_group_size] - codebooks: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, - output_partition_sizes: torch.IntTensor, - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - if bias is None: - output = F.linear(input, weights, bias) - orig_shape = output.shape - flattened_output = output.view(-1, output.size(-1)) - f_scales = scales.view(-1, scales.shape[0]) - b_scales = f_scales.expand(flattened_output.shape[0], -1) - flattened_output *= b_scales - return flattened_output.view(orig_shape) - else: - b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1]) - weights *= b_scales - return F.linear(input, weights, bias) - - -def dequant_weight_scale( - # [..., in_features] - input: torch.Tensor, - # [num_out_groups, num_in_groups, num_codebooks] - codes: torch.IntTensor, - # [num_codebooks, codebook_size, out_group_size, in_group_size] - codebooks: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, - output_partition_sizes: torch.IntTensor, - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1]) - weights *= b_scales - return F.linear(input, weights, bias) - - -def dequant_no_scale( - # [..., in_features] - input: torch.Tensor, - # [num_out_groups, num_in_groups, num_codebooks] - codes: torch.IntTensor, - # [num_codebooks, codebook_size, out_group_size, in_group_size] - codebooks: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, - output_partition_sizes: torch.IntTensor, - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - return F.linear(input, weights, 
bias) - - -# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against -# the generic pytorch version. -# Just visual comparison. -def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: - n = int(parts.sum().item()) - - device = torch.device("cuda:0") - - code_range = (1 << bits) // 2 - ingroups = 8 - - codes = torch.randint( - -code_range, - code_range, - size=(n, k // ingroups, nbooks), - dtype=get_int_dtype(bits), - device=device, - ) - - codebooks = torch.randn( - size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), - dtype=torch.float16, - device=device, - ) - - count = 0 - for index in range(16): - for i in range(8): - for book in range(nbooks): - codebooks[book, index, 0, i] = count * (10**book) - count += 1 - - print("codes shape", codes.shape) - - for i in range(16): - for book in range(nbooks): - codes[0, i, book] = i - codes[0, -i, book] = i - - weights = dequantize_weight(codes, codebooks, None) - weights2 = ops.aqlm_dequant(codes, codebooks, parts) - - print("weights shape:", weights.shape) - print("weights2 shape:", weights2.shape) - - print("weights are:", weights) - print("weights2 are:", weights2) - - print("first 128 weights are", weights[0, 0:128].to(torch.int32)) - print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32)) - - print("last 128 weights are", weights[0, -128:]) - print("last 128 weights2 are:", weights2[0, -128:]) - - -def main(): - parser = FlexibleArgumentParser(description="Benchmark aqlm performance.") - - # Add arguments - parser.add_argument( - "--nbooks", type=int, default=1, help="Number of codebooks (default: 1)" - ) - parser.add_argument( - "--bits", - type=int, - default=16, - help="Number of bits per code element (default: 16)", - ) - parser.add_argument( - "--test", - type=bool, - default=False, - help="Run the decompression/dequant tester rather than benchmarking " - "(default: False)", - ) - - # Parse the arguments - args = parser.parse_args() - - # Extract values - 
nbooks = args.nbooks - bits = args.bits - - if args.test: - dequant_test(4096, torch.tensor((4096,)), nbooks, bits) - return - - # Otherwise, benchmark. - methods = [ - ops.aqlm_gemm, - dequant_out_scale, - generic_dequantize_gemm, - optimized_dequantize_gemm, - dequant_weight_scale, - torch_mult, - dequant_no_scale, - ] - - filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv" - print(f"writing benchmarks to file {filename}") - with open(filename, "w") as f: - sys.stdout = f - - print("m | k | n | n parts", end="") - for method in methods: - print(f" | {method.__name__.replace('_', ' ')} (µs)", end="") - print("") - - # These are reasonable prefill sizes. - ksandpartions = ( - (4096, (4096, 4096, 4096)), - (4096, (4096,)), - (4096, (11008, 11008)), - (11008, (4096,)), - ) - - # reasonable ranges for m. - for m in [ - 1, - 2, - 4, - 8, - 10, - 12, - 14, - 16, - 24, - 32, - 48, - 52, - 56, - 64, - 96, - 112, - 128, - 256, - 512, - 1024, - 1536, - 2048, - 3072, - 4096, - ]: - print(f"{m}", file=sys.__stdout__) - for ksp in ksandpartions: - run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits, methods) - - sys.stdout = sys.__stdout__ - - -def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, methods): - # I didn't see visible improvements from increasing these, but feel free :) - num_warmup_trials = 1 - num_trials = 1 - - num_calls = 100 - - # warmup. 
- for method in methods: - for _ in range(num_warmup_trials): - run_timing( - num_calls=num_calls, - m=m, - k=k, - parts=parts, - nbooks=nbooks, - bits=bits, - method=method, - ) - - n = parts.sum().item() - print(f"{m} | {k} | {n} | {parts.tolist()}", end="") - - for method in methods: - best_time_us = 1e20 - for _ in range(num_trials): - kernel_dur_ms = run_timing( - num_calls=num_calls, - m=m, - k=k, - parts=parts, - nbooks=nbooks, - bits=bits, - method=method, - ) - - kernel_dur_us = 1000 * kernel_dur_ms - - if kernel_dur_us < best_time_us: - best_time_us = kernel_dur_us - - print(f" | {kernel_dur_us:.0f}", end="") - - print("") - - -def run_timing( - num_calls: int, m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, method -) -> float: - n = int(parts.sum().item()) - - device = torch.device("cuda:0") - - input = torch.randn((1, m, k), dtype=torch.float16, device=device) - - code_range = (1 << bits) // 2 - ingroups = 8 - - codes = torch.randint( - -code_range, - code_range, - size=(n, k // ingroups, nbooks), - dtype=get_int_dtype(bits), - device=device, - ) - - codebooks = torch.randn( - size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), - dtype=torch.float16, - device=device, - ) - - scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device) - - # for comparison to just a pytorch mult. 
- weights = torch.randn((n, k), dtype=torch.float16, device=device) - - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - - start_event.record() - - if method is torch_mult: - for i in range(num_calls): - torch_mult(input, weights, scales) - else: - for i in range(num_calls): - method(input, codes, codebooks, scales, parts, None) - - end_event.record() - end_event.synchronize() - - dur_ms = start_event.elapsed_time(end_event) / num_calls - return dur_ms - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/csrc/ops.h b/csrc/ops.h index 3e29f0a973dd6..6e39758f16a1f 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -154,15 +154,6 @@ void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope, torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor); #ifndef USE_ROCM -torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const std::vector& codebook_partition_sizes, - const std::optional& bias); - -torch::Tensor aqlm_dequant( - const torch::Tensor& codes, const torch::Tensor& codebooks, - const std::vector& codebook_partition_sizes); torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel, torch::Tensor _scaling_factors, torch::Tensor _zeros, diff --git a/csrc/quantization/aqlm/gemm_kernels.cu b/csrc/quantization/aqlm/gemm_kernels.cu deleted file mode 100644 index 79cd2c610b3c2..0000000000000 --- a/csrc/quantization/aqlm/gemm_kernels.cu +++ /dev/null @@ -1,597 +0,0 @@ -/* - * Modified by Neural Magic - * Adapted from https://github.com/Vahe1994/AQLM - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace vllm { -namespace aqlm { - -__global__ void Code1x16MatVec( - const int4* __restrict__ A, const int4* __restrict__ B, - int4* __restrict__ C, const int4* __restrict__ codebook, const int prob_m, - const int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each - // codebook, at most 3 long. - const int codebook_stride // as int4. -) { - int a_gl_stride = prob_k / 8 / 8; - int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - bool pred = a_gl_rd < prob_m; - - if (pred) { - // advance to the correct codebook, this easy because we only multiply one - // column of the codebook. 
- auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) { - codebook += codebook_stride; - ++codebook_size; - } - } - - int b_gl_rd = 0; - int c_gl_wr = a_gl_rd; - a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; - int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; - - __shared__ int4 sh_b[32 * 9]; - float res = 0; - - int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32); - while (iters--) { - // We pad shared memory to avoid bank conflicts during reads - __syncthreads(); - for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) { - if (b_gl_rd + i < prob_k / 8) sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; - } - __syncthreads(); - b_gl_rd += 32 * 8; - - int b_sh_rd = 9 * (threadIdx.x % 32); - if (pred && a_gl_rd < a_gl_end) { - const uint16_t* enc = reinterpret_cast(&A[a_gl_rd]); -#pragma unroll - for (int i = 0; i < 8; i++) { - uint32_t dec[4]; - // We bypass the L1 cache to avoid massive amounts of memory streaming - // that doesn't actually help us; this brings > 2x speedup. - asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) - : "l"((void*)&codebook[enc[i]])); - half2* a = reinterpret_cast(&dec); - half2* b = reinterpret_cast(&sh_b[b_sh_rd]); - half2 res2 = {}; -#pragma unroll - for (int j = 0; j < 4; j++) res2 = __hfma2(a[j], b[j], res2); - res += __half2float(res2.x) + __half2float(res2.y); - b_sh_rd++; - } - a_gl_rd += 32; - } - } - - if (pred) { -#pragma unroll - for (int i = 16; i > 0; i /= 2) res += __shfl_down_sync(0xffffffff, res, i); - if (threadIdx.x % 32 == 0) - reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res); - } -} - -__global__ void Code2x8MatVec( - const int4* __restrict__ A, const int4* __restrict__ B, - int4* __restrict__ C, const int4* __restrict__ codebook, int prob_m, - int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each - // codebook, at most 3 long. - const int codebook_stride // as int4. 
- -) { - int a_gl_stride = prob_k / 8 / 8; - int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - bool pred = a_gl_rd < prob_m; - - if (pred) { - // advance to the correct codebook, this easy because we only multiply one - // column of the codebook. - auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) { - codebook += codebook_stride; - ++codebook_size; - } - } - - int b_gl_rd = 0; - int c_gl_wr = a_gl_rd; - a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; - int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; - int lane = threadIdx.x % 8; - - extern __shared__ int4 sh[]; - int4* sh_b = sh; - int4* sh_code = sh_b + 32 * 9; - int4* sh_code0 = sh_code; - int4* sh_code1 = sh_code + 256 * 8; - - for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) { - int4 dec = codebook[i]; -#pragma unroll - for (int j = 0; j < 8; j++) sh_code[8 * i + (j + lane) % 8] = dec; - } - __syncthreads(); - - float res = 0; - - int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32); - while (iters--) { - // We pad shared memory to avoid bank conflicts during reads - __syncthreads(); - for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) { - if (b_gl_rd + i < prob_k / 8) sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; - } - __syncthreads(); - b_gl_rd += 32 * 8; - - int b_sh_rd = 9 * (threadIdx.x % 32); - if (pred && a_gl_rd < a_gl_end) { - const uint8_t* enc = reinterpret_cast(&A[a_gl_rd]); -#pragma unroll - for (int i = 0; i < 8; i++) { - half2* a0 = - reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); - half2* a1 = - reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); - half2* b = reinterpret_cast(&sh_b[b_sh_rd]); - half2 res2 = {}; -#pragma unroll - for (int j = 0; j < 4; j++) - res2 = __hfma2(__hadd2(a0[j], a1[j]), b[j], res2); - res += __half2float(res2.x) + __half2float(res2.y); - b_sh_rd++; - } - a_gl_rd += 32; - } - } - - if (pred) { -#pragma unroll - for (int i = 16; i > 0; i /= 2) res += __shfl_down_sync(0xffffffff, res, i); - if 
(threadIdx.x % 32 == 0) - reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res); - } -} - -__global__ void Code1x16Dequant( - const int4* __restrict__ A, int4* __restrict__ C, - const int4* __restrict__ codebook, int prob_m, int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each - // codebook, at most 3 long, sums to m. - const int codebook_stride // as int4 -) { - int a_gl_stride = prob_k / 8 / 8; - int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - bool pred = a_gl_rd < prob_m; - - if (pred) { - // advance to the correct codebook, this easy because we only multiply one - // column of the codebook. - auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) { - codebook += codebook_stride; - ++codebook_size; - } - } - - a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; - int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; - - int c_gl_stride = prob_k / 8; - int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8; - - int iters = (prob_k / 8 - 1) / (8 * 32) + 1; - while (iters--) { - if (pred && a_gl_rd < a_gl_end) { - const uint16_t* enc = reinterpret_cast(&A[a_gl_rd]); -#pragma unroll - for (int i = 0; i < 8; i++) { - int4 chunk; - auto dec = reinterpret_cast(&chunk); - // We bypass the L1 cache to avoid massive amounts of memory streaming - // that doesn't actually help us; this brings > 2x speedup. - asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) - : "l"((void*)&codebook[enc[i]])); - - C[a_gl_rd * 8 + i] = chunk; - } - } - a_gl_rd += 32; - } -} - -__global__ void Code2x8Dequant( - const int4* __restrict__ A, int4* __restrict__ C, - const int4* __restrict__ codebook, int prob_m, int prob_k, - const int4 - codebook_a_sizes, // cumulative sizes of A spanning each codebook, at - // most 3 long, corresponds to cols. 
- const int codebook_stride // as int4 -) { - int a_gl_stride = prob_k / 8 / 8; - int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - bool pred = a_gl_rd < prob_m; - - if (pred) { - // advance to the correct codebook, this easy because we only multiply one - // column of the codebook. - auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) { - codebook += codebook_stride; - ++codebook_size; - } - } - - a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; - int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; - int lane = threadIdx.x % 8; - - int c_gl_stride = prob_k / 8; - int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8; - - extern __shared__ int4 sh[]; - int4* sh_code = sh; - int4* sh_code0 = sh_code; - int4* sh_code1 = sh_code + 256 * 8; - - for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) { - int4 dec = codebook[i]; -#pragma unroll - for (int j = 0; j < 8; j++) sh_code[8 * i + (j + lane) % 8] = dec; - } - __syncthreads(); - - int iters = (prob_k / 8 - 1) / (8 * 32) + 1; - while (iters--) { - if (pred && a_gl_rd < a_gl_end) { - const uint8_t* enc = reinterpret_cast(&A[a_gl_rd]); -#pragma unroll - for (int i = 0; i < 8; i++) { - int4 chunk; - half2* a0 = - reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); - half2* a1 = - reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); -#pragma unroll - for (int j = 0; j < 4; j++) - reinterpret_cast(&chunk)[j] = __hadd2(a0[j], a1[j]); - C[a_gl_rd * 8 + i] = chunk; - } - } - a_gl_rd += 32; - } -} - -inline int ceildiv(int a, int b) { return (a + b - 1) / b; } - -const int THREAD_M = 16; - -void code1x16_matvec_cuda(const void* __restrict__ A, - const void* __restrict__ B, void* __restrict__ C, - const void* __restrict__ codebook, int prob_m, - int prob_k, const int4 codebook_a_sizes, - const int codebook_stride) { - int sms; - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); - 
int waves = 0; - int thread_m; - do { - waves++; - thread_m = ceildiv(prob_m, waves * sms); - } while (thread_m > THREAD_M); - - int blocks = ceildiv(prob_m, thread_m); - int threads = 32 * thread_m; - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - Code1x16MatVec<<>>( - (const int4*)A, (const int4*)B, (int4*)C, (const int4*)codebook, prob_m, - prob_k, codebook_a_sizes, codebook_stride); -} - -void code2x8_matvec_cuda(const void* __restrict__ A, const void* __restrict__ B, - void* __restrict__ C, - const void* __restrict__ codebook, int prob_m, - int prob_k, const int4 codebook_a_sizes, - const int codebook_stride) { - int sms; - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); - int waves = 0; - int thread_m; - do { - waves++; - thread_m = ceildiv(prob_m, waves * sms); - } while (thread_m > THREAD_M); - - int blocks = ceildiv(prob_m, thread_m); - int threads = 32 * thread_m; - int shared = 16 * (2 * 256 * 8 + 32 * 9); - cudaFuncSetAttribute(Code2x8MatVec, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared); - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - Code2x8MatVec<<>>( - (const int4*)A, (const int4*)B, (int4*)C, (const int4*)codebook, prob_m, - prob_k, codebook_a_sizes, codebook_stride); -} - -void code1x16_dequant_cuda( - const void* __restrict__ A, void* __restrict__ C, - const void* __restrict__ codebook, int prob_m, int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each - // codebook, at most 3 long. - const int codebook_stride // as int4. 
-) { - int sms; - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); - int waves = 0; - int thread_m; - do { - waves++; - thread_m = ceildiv(prob_m, waves * sms); - } while (thread_m > THREAD_M); - - int blocks = ceildiv(prob_m, thread_m); - int threads = 32 * thread_m; - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - Code1x16Dequant<<>>( - (const int4*)A, (int4*)C, (const int4*)codebook, prob_m, prob_k, - codebook_a_sizes, // cumulative sizes of A spanning each codebook, at - // most 3 long. - codebook_stride // as int4. - ); -} - -// Dequantizes the code and codebook into weights. -void code2x8_dequant_cuda( - const void* __restrict__ A, void* __restrict__ C, - const void* __restrict__ codebook, int prob_m, int prob_k, - const int4 - codebook_a_sizes, // cumulative sizes of A spanning each codebook, at - // most 3 long, corresponds to cols. - const int codebook_stride // as int4 -) { - int sms; - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); - int waves = 0; - int thread_m; - do { - waves++; - thread_m = ceildiv(prob_m, waves * sms); - } while (thread_m > THREAD_M); - - int blocks = ceildiv(prob_m, thread_m); - int threads = 32 * thread_m; - int shared = 16 * (2 * 256 * 8 + 32 * 9); - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - - cudaFuncSetAttribute(Code2x8Dequant, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared); - Code2x8Dequant<<>>( - (const int4*)A, (int4*)C, (const int4*)codebook, prob_m, prob_k, - codebook_a_sizes, codebook_stride); -} - -int codebook_stride(const torch::Tensor& codebooks) { - return codebooks.stride(0) * codebooks.element_size() / sizeof(int4); -} - -void code1x16_matvec( - const torch::Tensor& A, const torch::Tensor& B, torch::Tensor& C, - const torch::Tensor& codebook, - const int4 codebook_a_sizes // cumulative sizes of A spanning each - // codebook, at most 3 long. 
-) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); - int prob_m = C.size(0); - int prob_k = B.size(0); - - code1x16_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(), - codebook.data_ptr(), prob_m, prob_k, codebook_a_sizes, - codebook_stride(codebook)); -} - -torch::Tensor code1x16_matmat(const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const int4 codebook_a_sizes, - const std::optional& bias) { - auto input_sizes = input.sizes(); - auto out_features = codes.size(0) * codebooks.size(2); - auto flat_input = input.reshape({-1, input.size(-1)}); - auto flat_output = torch::empty( - {flat_input.size(0), out_features}, - torch::TensorOptions().dtype(input.dtype()).device(input.device())); - - for (int i = 0; i < flat_input.size(0); ++i) { - auto input_vec = flat_input.index({i}); - auto output_vec = flat_output.index({i}); - code1x16_matvec(codes.squeeze(2), input_vec, output_vec, codebooks, - codebook_a_sizes); - } - flat_output *= scales.flatten().unsqueeze(0); - - if (bias.has_value()) { - flat_output += bias->unsqueeze(0); - } - - auto output_sizes = input_sizes.vec(); - output_sizes.pop_back(); - output_sizes.push_back(-1); - auto output = flat_output.reshape(output_sizes); - return output; -} - -void code2x8_matvec(const torch::Tensor& A, const torch::Tensor& B, - torch::Tensor& C, const torch::Tensor& codebook, - const int4 codebook_a_sizes) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); - int prob_m = C.size(0); - int prob_k = B.size(0); - code2x8_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(), - codebook.data_ptr(), prob_m, prob_k, codebook_a_sizes, - 2 * codebook_stride(codebook)); -} - -torch::Tensor code2x8_matmat(const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const int4 codebook_a_sizes, - const std::optional& bias) { - auto input_sizes = input.sizes(); - auto 
out_features = codes.size(0) * codebooks.size(2); - auto flat_input = input.reshape({-1, input.size(-1)}); - auto flat_output = torch::empty( - {flat_input.size(0), out_features}, - torch::TensorOptions().dtype(input.dtype()).device(input.device())); - - for (int i = 0; i < flat_input.size(0); ++i) { - auto input_vec = flat_input.index({i}); - auto output_vec = flat_output.index({i}); - code2x8_matvec(codes.squeeze(2), input_vec, output_vec, codebooks, - codebook_a_sizes); - } - flat_output *= scales.flatten().unsqueeze(0); - if (bias.has_value()) { - flat_output += bias->unsqueeze(0); - } - - auto output_sizes = input_sizes.vec(); - output_sizes.pop_back(); - output_sizes.push_back(-1); - auto output = flat_output.reshape(output_sizes); - return output; -} - -// Accumulate the partition sizes. -int4 accumulate_sizes(const std::vector& codebook_partition_sizes) { - int4 cumulative_sizes; - auto cumulative_size = &cumulative_sizes.x; - size_t i = 0; - int last = 0; - assert(codebook_partition_sizes.size() <= 4); - for (; i < codebook_partition_sizes.size(); ++i, ++cumulative_size) { - *cumulative_size = codebook_partition_sizes[i] + last; - last = *cumulative_size; - } - // fill in the rest with unreachable. 
- for (; i < 4; ++i, ++cumulative_size) { - *cumulative_size = last * 10; - } - return cumulative_sizes; -} - -} // namespace aqlm -} // namespace vllm - -torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const std::vector& codebook_partition_sizes, - const std::optional& bias) { - int4 cumulative_sizes = - vllm::aqlm::accumulate_sizes(codebook_partition_sizes); - - int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(); - int const entries = codebooks.size(1); - - if (nbooks == 1 && entries == (1 << 16)) { - return vllm::aqlm::code1x16_matmat(input, codes, codebooks, scales, - cumulative_sizes, bias); - } - if (nbooks == 2 && entries == (1 << 8)) { - return vllm::aqlm::code2x8_matmat(input, codes, codebooks, scales, - cumulative_sizes, bias); - } - - TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, - " entries is not currently supported.") - return {}; -} - -torch::Tensor aqlm_dequant( - const torch::Tensor& codes, const torch::Tensor& codebooks, - const std::vector& codebook_partition_sizes) { - int4 cumulative_sizes = - vllm::aqlm::accumulate_sizes(codebook_partition_sizes); - - int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(); - int const entries = codebooks.size(1); - - const at::cuda::OptionalCUDAGuard device_guard(device_of(codes)); - int rows = codes.size(1); - int cols = codes.size(0); - - auto in_features = codes.size(1) * 8; - auto out_features = codes.size(0); - - assert(out_features == std::accumulate(codebook_partition_sizes.begin(), - codebook_partition_sizes.end(), 0)); - - auto weights = torch::empty({out_features, in_features}, - torch::TensorOptions() - .dtype(codebooks.dtype()) - .device(codebooks.device())); - - if (nbooks == 1 && entries == (1 << 16)) { - vllm::aqlm::code1x16_dequant_cuda(codes.data_ptr(), weights.data_ptr(), - codebooks.data_ptr(), out_features, - in_features, 
cumulative_sizes, - vllm::aqlm::codebook_stride(codebooks)); - - // if you wanted to flip to scaling the weights, (though it's 30%-ish slower - // and not consistent with gemv implementation.) weights *= - // scales.index({"...", 0, 0}); - - return weights; - } - - if (nbooks == 2 && entries == (1 << 8)) { - vllm::aqlm::code2x8_dequant_cuda(codes.data_ptr(), weights.data_ptr(), - codebooks.data_ptr(), out_features, - in_features, cumulative_sizes, - vllm::aqlm::codebook_stride(codebooks)); - - // if you wanted to flip to scaling the weights, (though it's 30%-ish slower - // and not consistent with gemv implementation) weights *= - // scales.index({"...", 0, 0}); - - return weights; - } - - TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, - " entries is not currently supported.") - return {}; -} diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index a547baec50d6a..5fee106335d3b 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -207,21 +207,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Quantization ops #ifndef USE_ROCM - // Quantized GEMM for AQLM. - ops.def( - "aqlm_gemm(Tensor input, Tensor codes, Tensor codebooks, " - "Tensor scales, int[] codebook_partition_sizes, Tensor? bias) " - "-> Tensor", - {stride_tag}); - ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm); - - // Decompression method for AQLM. - ops.def( - "aqlm_dequant(Tensor codes, Tensor codebooks, " - "int[] codebook_partition_sizes) -> Tensor", - {stride_tag}); - ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant); - // Quantized GEMM for AWQ. 
ops.def( "awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, " diff --git a/docs/features/quantization/supported_hardware.md b/docs/features/quantization/supported_hardware.md index f53e69ecc6115..06264d08b56aa 100644 --- a/docs/features/quantization/supported_hardware.md +++ b/docs/features/quantization/supported_hardware.md @@ -17,7 +17,6 @@ th { | INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | | FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ❌ | | BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| AQLM | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py index 6b4c5b31075f7..1e8b848db46d8 100644 --- a/docs/mkdocs/hooks/generate_examples.py +++ b/docs/mkdocs/hooks/generate_examples.py @@ -24,7 +24,6 @@ def fix_case(text: str) -> str: "llm": "LLM", "mae": "MAE", "tpu": "TPU", - "aqlm": "AQLM", "gguf": "GGUF", "lora": "LoRA", "rlhf": "RLHF", diff --git a/examples/offline_inference/basic/README.md b/examples/offline_inference/basic/README.md index 0a2bd6e2b70b3..cbb3116e97414 100644 --- a/examples/offline_inference/basic/README.md +++ b/examples/offline_inference/basic/README.md @@ -52,20 +52,6 @@ Try it yourself with the following argument: ### Quantization -#### AQLM - -vLLM supports models that are quantized using AQLM. - -Try one yourself by passing one of the following models to the `--model` argument: - -- `ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf` -- `ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf` -- `ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf` -- `ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf` -- `BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf` - -> Some of these models are likely to be too large for a single GPU. 
You can split them across multiple GPUs by setting `--tensor-parallel-size` to the number of required GPUs. - #### GGUF vLLM supports models that are quantized using GGUF. diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 72f962ed7484c..a2fc6ffeb8b26 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -31,10 +31,6 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None): ] if all: - if is_quant_method_supported("aqlm"): - TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { - "quantization": "aqlm" - })) # TODO: figure out why this fails. if False and is_quant_method_supported("gguf"): # noqa: SIM223 diff --git a/tests/kernels/quantization/test_aqlm.py b/tests/kernels/quantization/test_aqlm.py deleted file mode 100644 index 427db3e602921..0000000000000 --- a/tests/kernels/quantization/test_aqlm.py +++ /dev/null @@ -1,40 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from tests.kernels.utils import opcheck -from vllm import _custom_ops as ops # noqa: F401 - - -def test_aqlm_dequant_opcheck(): - codes = torch.randint(-32768, - 32767, (22016, 512, 1), - device='cuda', - dtype=torch.int16) - codebooks = torch.rand((2, 65536, 1, 8), - device='cuda', - dtype=torch.float16) - codebook_partition_sizes = [11008, 11008] - - opcheck(torch.ops._C.aqlm_dequant, - (codes, codebooks, codebook_partition_sizes)) - - -def test_aqlm_gemm_opcheck(): - input = torch.rand((4, 4096), device='cuda', dtype=torch.float16) - codes = torch.randint(-32768, - 32767, (12288, 512, 1), - device='cuda', - dtype=torch.int16) - codebooks = torch.rand((3, 65536, 1, 8), - device='cuda', - dtype=torch.float16) - scales = torch.rand((12288, 1, 1, 1), device='cuda', dtype=torch.float16) - codebook_partition_sizes = [4096, 4096, 4096] - bias = None - - opcheck(torch.ops._C.aqlm_gemm, - (input, codes, codebooks, 
scales, codebook_partition_sizes, None)) - opcheck(torch.ops._C.aqlm_gemm, - (input, codes, codebooks, scales, codebook_partition_sizes, bias)) diff --git a/tests/models/quantization/test_aqlm.py b/tests/models/quantization/test_aqlm.py deleted file mode 100644 index de6851e2fc282..0000000000000 --- a/tests/models/quantization/test_aqlm.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - -from tests.quantization.utils import is_quant_method_supported -from vllm.platforms import current_platform - -# These ground truth generations were generated using `transformers==4.38.1 -# aqlm==1.1.0 torch==2.2.0` -# and the below code: -# ```python -# from transformers import AutoTokenizer, AutoModelForCausalLM -# model_id = "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf" -# quantized_model = AutoModelForCausalLM.from_pretrained(model_id, -# torch_dtype="auto", device_map="cuda").cuda() -# tokenizer = AutoTokenizer.from_pretrained(model_id) -# outputs = [] -# for prompt in example_prompts: -# input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda") -# hf_outputs = quantized_model.generate(input_ids, max_new_tokens=32) -# outputs.append(tokenizer.decode(hf_outputs[0][input_ids.shape[1]:])) -# print(outputs) -# ``` -ground_truth_generations = [ - '\n### Features\n\n- **High-throughput**: v', - 'The major milestones in the development of artificial intelligence from ' - '195', - 'Compare and contrast artificial intelligence with human intelligence in ' - 'terms of processing information. The', - 'Explain the difference between supervised and unsupervised learning.' - '\nExplain', - 'Write a short story about a robot that dreams for the first time. 
The', - 'Analyze the impact of the COVID-19 pandemic on global economic', - 'The Mona Lisa is a painting by Leonardo da Vinci, and it', - 'The early bird catches the worm.\nThe early bird catches the' -] - - -@pytest.mark.skipif(not is_quant_method_supported("aqlm") - or current_platform.is_rocm() - or not current_platform.is_cuda(), - reason="AQLM is not supported on this GPU type.") -@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"]) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [16]) -@pytest.mark.parametrize("num_logprobs", [1]) -def test_models( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: - - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - # loop through the prompts to compare against the ground truth generations - for prompt_idx in range(len(example_prompts)): - vllm_output_ids, vllm_output_str, vllm_logprobs = vllm_outputs[ - prompt_idx] - - print("Prompt: ", repr(example_prompts[prompt_idx])) - print("Reference output:", repr(ground_truth_generations[prompt_idx])) - print("Output output: ", repr(vllm_output_str)) - assert vllm_output_str == ground_truth_generations[prompt_idx] diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index a318637c5aeba..0d556053f8981 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -476,32 +476,6 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"): dtype=input.dtype, device=input.device).sum(0) - @register_fake("_C::aqlm_gemm") - def _aqlm_gemm_fake(input: torch.Tensor, codes: torch.Tensor, - codebooks: torch.Tensor, scales: torch.Tensor, - codebook_partition_sizes: list[int], - bias: Optional[torch.Tensor]) -> torch.Tensor: - out_features = codes.size(0) * codebooks.size(2) - flat_input = input.reshape((-1, input.size(-1))) - flat_output = torch.empty((flat_input.size(0), 
out_features), - dtype=input.dtype, - device=input.device) - - output_sizes = list(input.shape) - output_sizes.pop() - output_sizes.append(-1) - return flat_output.reshape(tuple(output_sizes)) - - @register_fake("_C::aqlm_dequant") - def _aqlm_dequant_fake( - codes: torch.Tensor, codebooks: torch.Tensor, - codebook_partition_sizes: list[int]) -> torch.Tensor: - in_features = codes.size(1) * 8 - out_features = codes.size(0) - return torch.empty((out_features, in_features), - dtype=codebooks.dtype, - device=codebooks.device) - @register_fake("_C::machete_mm") def machete_mm_fake( a: torch.Tensor, @@ -957,21 +931,6 @@ def cutlass_fp4_moe_mm(out_tensors: torch.Tensor, a_tensors: torch.Tensor, sf_offsets) -# aqlm -def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor, - codebooks: torch.Tensor, scales: torch.Tensor, - codebook_partition_sizes: list[int], - bias: Optional[torch.Tensor]) -> torch.Tensor: - return torch.ops._C.aqlm_gemm(input, codes, codebooks, scales, - codebook_partition_sizes, bias) - - -def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor, - codebook_partition_sizes: list[int]) -> torch.Tensor: - return torch.ops._C.aqlm_dequant(codes, codebooks, - codebook_partition_sizes) - - # gptq_marlin def gptq_marlin_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, size_k: int, size_n: int, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 75391c51f7754..671ad9eed234a 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -692,8 +692,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear): param_data = param.data output_dim = getattr(param, "output_dim", None) - # Special case for AQLM codebooks. - is_metadata = getattr(param, "is_metadata", False) # Special case for per-tensor scale to load scalar into fused array. 
needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False) @@ -781,13 +779,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear): if not is_sharded_weight: loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - # Special case for AQLM codebooks. - elif is_metadata: - # metadata indicates fixed size concatenated along dim 0 - shard_size = loaded_weight.shape[0] - shard_offset = loaded_shard_id * shard_size - param_data = param_data.narrow(0, shard_offset, shard_size) - # Special case for per-tensor scales in fused case. elif needs_scalar_to_array: param_data, loaded_weight = adjust_scalar_to_fused_array( @@ -1081,8 +1072,6 @@ class QKVParallelLinear(ColumnParallelLinear): param_data = param.data output_dim = getattr(param, "output_dim", None) - # Special case for AQLM codebooks. - is_metadata = getattr(param, "is_metadata", False) # Special case for per-tensor scales in fused case. needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False) @@ -1204,13 +1193,6 @@ class QKVParallelLinear(ColumnParallelLinear): loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - # Special case for for AQLM codebooks. - elif is_metadata: - # metadata indicates fixed size concatenated along dim 0 - shard_size = loaded_weight.shape[0] - shard_index = ["q", "k", "v"].index(loaded_shard_id) - param_data = param_data.narrow(0, shard_index * shard_size, - shard_size) # Special case for per-tensor scales in fused case. 
elif needs_scalar_to_array: param_data, loaded_weight = adjust_scalar_to_fused_array( diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 8d63027e1863f..a4c2671225f57 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -7,7 +7,6 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) QuantizationMethods = Literal[ - "aqlm", "awq", "deepspeedfp", "tpu_int8", @@ -88,7 +87,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: # lazy import to avoid triggering `torch.compile` too early from vllm.model_executor.layers.quantization.quark.quark import QuarkConfig - from .aqlm import AQLMConfig from .auto_round import AutoRoundConfig from .awq import AWQConfig from .awq_marlin import AWQMarlinConfig @@ -120,7 +118,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: from .tpu_int8 import Int8TpuConfig method_to_config: dict[str, type[QuantizationConfig]] = { - "aqlm": AQLMConfig, "awq": AWQConfig, "deepspeedfp": DeepSpeedFPConfig, "tpu_int8": Int8TpuConfig, diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py deleted file mode 100644 index 2ea8c5dc51132..0000000000000 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ /dev/null @@ -1,376 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Supports AQLM compression, see https://github.com/Vahe1994/AQLM -# and https://arxiv.org/pdf/2401.06118.pdf - -import math -from typing import Any, Optional - -import torch -import torch.nn.functional as F -from torch.nn.parameter import Parameter - -from vllm import _custom_ops as ops -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase -from vllm.model_executor.layers.quantization import 
QuantizationMethods -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) -from vllm.model_executor.utils import set_weight_attrs - - -def get_int_dtype(nbits: int) -> torch.dtype: - if nbits <= 8: - return torch.int8 - if nbits <= 16: - return torch.int16 - if nbits <= 32: - return torch.int32 - if nbits <= 64: - return torch.int64 - raise ValueError(f"No dtype available for {nbits}-bit codebooks") - - -@torch.inference_mode() -def unpack_int_data(data: torch.IntTensor, nbits: int) -> torch.IntTensor: - return data.to(torch.int64) % (2**nbits) - - -def dequantize_weight(codes: torch.Tensor, - codebooks: torch.Tensor, - scales: Optional[torch.Tensor] = None) -> torch.Tensor: - """ - Decode float weights from quantization codes. Differentiable. - :param codes: tensor of integer quantization codes, shape - [*dims, num_out_groups, num_in_groups, num_codebooks] - :param codebooks: tensor of vectors for each quantization code, - [num_codebooks, codebook_size, out_group_size, in_group_size] - :param scales: weight will be multiplied by this factor, must be - broadcastble with - [*dims, out_groups, num_in_groups, out_group_size, in_group_size] - :return: reconstructed weight tensor of shape - [*dims, num_in_groups*group_size] - """ - num_out_groups, num_in_groups, num_codebooks = codes.shape[-3:] - num_codebooks, codebook_size, out_group_size, in_group_size = \ - codebooks.shape - out_features = num_out_groups * out_group_size - in_features = num_in_groups * in_group_size - codebook_offsets = torch.arange( - 0, num_codebooks * codebook_size, codebook_size, - device=codes.device) # shape: [num_codebooks] - reconstructed_weight_flat = F.embedding_bag( - codes.flatten(0, -2) + codebook_offsets, - codebooks.flatten(0, 1).flatten(-2, -1), - mode="sum" - ) # [prod(dims) * num_out_groups * num_in_groups, out_group_size - # * in_group_size] - - reconstructed_weight_groupwise = reconstructed_weight_flat.view( - list(codes.shape[:-3]) + - 
[num_out_groups, num_in_groups, out_group_size, in_group_size]) - if scales is not None: - reconstructed_weight_groupwise = reconstructed_weight_groupwise.mul( - scales) - return reconstructed_weight_groupwise.swapaxes( - -3, -2).reshape(list(codes.shape[:-3]) + [out_features, in_features]) - - -def dequantize_gemm( - input: torch.Tensor, # [..., in_features] - codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] - codebooks: torch. - Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] - scales: torch.Tensor, # [num_out_groups, 1, 1, 1] - bias: Optional[torch.Tensor], -) -> torch.Tensor: - dequantized_weight = dequantize_weight( - unpack_int_data(codes, codebooks.shape[1].bit_length() - 1), - codebooks, - scales, - ) - return F.linear(input, dequantized_weight, bias) - - -# Generic dequantization, slow but flexible. -def generic_dequantize_gemm( - input: torch.Tensor, # [..., in_features] - codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] - codebooks: torch. - Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] - scales: torch.Tensor, # [num_out_groups, 1, 1, 1] - output_partition_sizes: list[int], - bias: Optional[torch.Tensor], -) -> torch.Tensor: - output_shape = input.shape[:-1] + (scales.shape[0], ) - output = torch.empty(output_shape, dtype=input.dtype, device=input.device) - num_outputs = len(output_partition_sizes) - - # break the inputs and codebooks apart then combine the outputs. - # Surprisingly (to me) this is faster than doing 3 de-quants and 1 big - # multiply at the end. 
- num_codebooks = codebooks.shape[0] // num_outputs - assert (scales.shape[0] == codes.shape[0]) - assert (sum(output_partition_sizes) == scales.shape[0]) - output_offset = 0 - codebooks_offset = 0 - for output_size in output_partition_sizes: - shard_output = dequantize_gemm( - input, codes.narrow(0, output_offset, output_size), - codebooks.narrow(0, codebooks_offset, num_codebooks), - scales.narrow(0, output_offset, output_size), None - if bias is None else bias.narrow(0, output_offset, output_size)) - - output_slice = output.narrow(-1, output_offset, output_size) - assert (output_slice.shape == shard_output.shape) - output_slice.copy_(shard_output) - output_offset += output_size - codebooks_offset += num_codebooks - return output - - -# Optimized dequnantize/decompression kernels, supports 1x16 and 2x8 -# at 6 and 9 times faster than the generic version above, respectively. -def optimized_dequantize_gemm( - input: torch.Tensor, # [..., in_features] - codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks] - codebooks: torch. - Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size] - scales: torch.Tensor, # [num_out_groups, 1, 1, 1] - output_partition_sizes: list[int], - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - if bias is None: - # scaling the output is fastest, so we do that when possible. - output = F.linear(input, weights, bias) - orig_shape = output.shape - flattened_output = output.view(-1, output.size(-1)) - f_scales = scales.view(-1, scales.shape[0]) - b_scales = f_scales.expand(flattened_output.shape[0], -1) - flattened_output *= b_scales - return output.view(orig_shape) - else: - b_scales = scales.view(scales.shape[:-3] + (-1, )).expand( - -1, weights.shape[1]) - weights *= b_scales - return F.linear(input, weights, bias) - - -class AQLMConfig(QuantizationConfig): - """Config class for AQLM. 
- - Reference: https://github.com/Vahe1994/AQLM - """ - - def __init__( - self, - in_group_size: int, - nbits_per_codebook: int, - num_codebooks: int, - out_group_size: int, - ) -> None: - super().__init__() - self.in_group_size = in_group_size - self.nbits_per_codebook = nbits_per_codebook - self.num_codebooks = num_codebooks - self.out_group_size = out_group_size - - # out_group_size > 1 is untested, and probably won't work as-is. - assert (self.out_group_size == 1) - self.pack_factor = (self.in_group_size * self.out_group_size) - - def __repr__(self) -> str: - return (f"AQLMConfig(in_group_size={self.in_group_size}, " - f"nbits_per_codebook={self.nbits_per_codebook}, " - f"num_codebooks={self.num_codebooks}, " - f"out_group_size={self.out_group_size})") - - @classmethod - def get_name(cls) -> QuantizationMethods: - return "aqlm" - - @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: - return [torch.half] - - @classmethod - def get_min_capability(cls) -> int: - return 60 - - @classmethod - def get_config_filenames(cls) -> list[str]: - return [] # no extra configs. - - @classmethod - def from_config(cls, config: dict[str, Any]) -> "AQLMConfig": - in_group_size = cls.get_from_keys(config, ["in_group_size"]) - nbits_per_codebook = cls.get_from_keys(config, ["nbits_per_codebook"]) - num_code_books = cls.get_from_keys(config, ["num_codebooks"]) - out_group_size = cls.get_from_keys(config, ["out_group_size"]) - return cls(in_group_size, nbits_per_codebook, num_code_books, - out_group_size) - - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["AQLMLinearMethod"]: - if isinstance(layer, LinearBase): - return AQLMLinearMethod(self) - return None - - -class AQLMLinearMethod(LinearMethodBase): - """Linear method for AQLM. - - Args: - quant_config: The AQLM quantization config. 
- """ - - def __init__(self, quant_config: AQLMConfig): - self.quant_config = quant_config - - def create_weights(self, layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: list[int], input_size: int, - output_size: int, params_dtype: torch.dtype, - **extra_weight_attrs): - del output_size # Unused. - del input_size # Unused. - - if params_dtype != torch.half: - raise ValueError("Only half is currently supported by aqlm") - if input_size_per_partition % self.quant_config.in_group_size != 0: - raise ValueError( - "The input size is not aligned with the quantized " - "weight shape. This can be caused by too large " - "tensor parallel size.") - - output_size_per_partition = sum(output_partition_sizes) - if output_size_per_partition % self.quant_config.out_group_size != 0: - raise ValueError( - "The output size is not aligned with the quantized " - "weight shape. This can be caused by too large " - "tensor parallel size.") - - codes = Parameter( - torch.empty( - # There could actually be two pack factors, one along input and - # one along output, but we don't currently support - # out_group_size, and only the one along output needs to be - # marked with "packed_dim" in order for QKVLinear to work. 
- output_size_per_partition, - input_size_per_partition // self.quant_config.pack_factor, - self.quant_config.num_codebooks, - dtype=get_int_dtype(self.quant_config.nbits_per_codebook), - ), - requires_grad=False, - ) - - set_weight_attrs( - codes, - { - "input_dim": 1, - "output_dim": 0, - "packed_dim": 1, - "pack_factor": self.quant_config.pack_factor, - }, - ) - - codebooks = Parameter( - torch.empty( - self.quant_config.num_codebooks * len(output_partition_sizes), - 2**self.quant_config.nbits_per_codebook, - self.quant_config.out_group_size, - self.quant_config.in_group_size, - dtype=params_dtype, - ), - requires_grad=False, - ) - set_weight_attrs( - codebooks, - { - # metadata indicates fixed size concatenated along dim 0 - "is_metadata": True, - "output_partition_sizes": output_partition_sizes - }, - ) - - scales = Parameter( - torch.empty( - ( - output_size_per_partition // - self.quant_config.out_group_size, - 1, - 1, - 1, - ), - dtype=params_dtype, - ), - requires_grad=False, - ) - set_weight_attrs( - scales, - { - "output_dim": 0, - "packed_dim": 0, - "pack_factor": self.quant_config.out_group_size - }, - ) - - layer.register_parameter("codes", codes) - set_weight_attrs(codes, extra_weight_attrs) - layer.register_parameter("codebooks", codebooks) - set_weight_attrs(codebooks, extra_weight_attrs) - layer.register_parameter("scales", scales) - set_weight_attrs(scales, extra_weight_attrs) - - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - codebooks = layer.codebooks - codes = layer.codes - scales = layer.scales - output_partition_sizes = getattr(codebooks, "output_partition_sizes", - []) - - nbooks = codes.shape[2] - ingroups = codebooks.shape[3] - outgroups = codebooks.shape[2] - bits = codebooks.shape[1] - - # We support these formats with dedicated gemm and decompression - # kernels. 
- if ingroups == 8 and outgroups == 1 and ( - (bits == 256 and nbooks == 2) or (bits == 65536 and nbooks == 1)): - - # thresholds determined by timings on an A6000, one GPU - use_gemv = math.prod(x.shape[:-1]) <= 6 - - return ops.aqlm_gemm( - x, - codes, - codebooks, - scales, - output_partition_sizes, - bias, - ) if use_gemv else optimized_dequantize_gemm( - x, - codes, - codebooks, - scales, - output_partition_sizes, - bias, - ) - - # fall back all unoptimized formats - return generic_dequantize_gemm( - x, - codes, - codebooks, - scales, - output_partition_sizes, - bias, - ) From bf7f470b22e8bf26e1edb30b3bf465ab7dd69f0c Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Sat, 16 Aug 2025 15:59:17 -0400 Subject: [PATCH 044/361] [V1] Logits processors extensibility (#19912) Signed-off-by: Andrew Feldman Signed-off-by: Andrew Feldman Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Nick Hill Co-authored-by: Nick Hill Co-authored-by: Andrew Feldman Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 1 + .../offline_inference/logits_processor.py | 147 +++++++++ tests/utils.py | 79 ++++- tests/v1/logits_processors/__init__.py | 0 .../test_correctness.py} | 24 +- .../logits_processors/test_custom_offline.py | 237 ++++++++++++++ .../logits_processors/test_custom_online.py | 180 +++++++++++ tests/v1/logits_processors/utils.py | 127 ++++++++ tests/v1/sample/test_rejection_sampler.py | 4 +- tests/v1/sample/test_sampler.py | 4 +- tests/v1/worker/test_gpu_input_batch.py | 4 +- vllm/config/__init__.py | 5 + vllm/engine/arg_utils.py | 8 + vllm/entrypoints/llm.py | 4 + vllm/utils/__init__.py | 2 +- vllm/v1/sample/logits_processor/__init__.py | 185 +++++++++++ .../builtin.py} | 294 ++---------------- vllm/v1/sample/logits_processor/interface.py | 86 +++++ vllm/v1/sample/logits_processor/state.py | 149 +++++++++ vllm/v1/sample/metadata.py | 4 
+- vllm/v1/worker/gpu_input_batch.py | 91 ++++-- vllm/v1/worker/gpu_model_runner.py | 11 +- 22 files changed, 1312 insertions(+), 334 deletions(-) create mode 100644 examples/offline_inference/logits_processor.py create mode 100644 tests/v1/logits_processors/__init__.py rename tests/v1/{sample/test_logits_processors.py => logits_processors/test_correctness.py} (97%) create mode 100644 tests/v1/logits_processors/test_custom_offline.py create mode 100644 tests/v1/logits_processors/test_custom_online.py create mode 100644 tests/v1/logits_processors/utils.py create mode 100644 vllm/v1/sample/logits_processor/__init__.py rename vllm/v1/sample/{logits_processor.py => logits_processor/builtin.py} (54%) create mode 100644 vllm/v1/sample/logits_processor/interface.py create mode 100644 vllm/v1/sample/logits_processor/state.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 87296a08e2071..4fc8857854927 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -253,6 +253,7 @@ steps: - pytest -v -s v1/engine - pytest -v -s v1/entrypoints - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors - pytest -v -s v1/worker - pytest -v -s v1/structured_output - pytest -v -s v1/spec_decode diff --git a/examples/offline_inference/logits_processor.py b/examples/offline_inference/logits_processor.py new file mode 100644 index 0000000000000..7ef20efa7d28c --- /dev/null +++ b/examples/offline_inference/logits_processor.py @@ -0,0 +1,147 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""This example demonstrates instantiating vLLM with a custom logits processor +class object. + +For a basic example of implementing a custom logits processor, see +the `DummyLogitsProcessor` implementation in `vllm/test_utils.py`. 
+ +For testing purposes, a dummy logits processor is employed which, if +`target_token` is passed as a keyword argument to `SamplingParams.extra_args`, +will mask out all tokens except `target_token`. + +A batch is constructed with `temperature=0.0` and 50% of requests specifying +`target_token`, and for these requests - and *only* these requests - we +expect the `target_token` to be decoded in each step, yielding an output +similar to that shown below: + +Generated Outputs: +------------------------------------------------------------ +Prompt: 'Hello, my name is' +Output: " ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '" +------------------------------------------------------------ +Prompt: 'The president of the United States is' +Output: " not a racist. He is a racist.\nHe's a racist because he" +------------------------------------------------------------ +Prompt: 'The capital of France is' +Output: ' also also also also also also also also also also also also also + also also also' +------------------------------------------------------------ +Prompt: 'The future of AI is' +Output: ' in the hands of the people.\n\nThe future of AI is in the' +------------------------------------------------------------ +""" + +from typing import Optional + +import torch + +from vllm import LLM, SamplingParams +from vllm.config import VllmConfig +from vllm.v1.sample.logits_processor import ( + BatchUpdate, + LogitsProcessor, + MoveDirectionality, +) + + +# Hypothetical custom logits processor +class DummyLogitsProcessor(LogitsProcessor): + """Fake logit processor to support unit testing and examples""" + + def __init__( + self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool + ): + self.req_info: dict[int, SamplingParams] = {} + + def is_argmax_invariant(self) -> bool: + """Never impacts greedy sampling""" + return False + + def update_state(self, batch_update: Optional[BatchUpdate]): + if not batch_update: + return + + # Process added requests. 
+ for index, params, _, _ in batch_update.added: + assert params is not None + if params.extra_args and ( + target_token := params.extra_args.get("target_token") + ): + self.req_info[index] = target_token + + if self.req_info: + # Process removed requests. + for index in batch_update.removed: + self.req_info.pop(index, None) + + # Process moved requests, unidirectional move (a->b) and swap + # (a<->b) + for adx, bdx, direct in batch_update.moved: + a_val = self.req_info.pop(adx, None) + b_val = self.req_info.pop(bdx, None) + if a_val is not None: + self.req_info[bdx] = a_val + if direct == MoveDirectionality.SWAP and b_val is not None: + self.req_info[adx] = b_val + + def apply(self, logits: torch.Tensor) -> torch.Tensor: + if not self.req_info: + return logits + + # Save target values before modification + rows_list = list(self.req_info.keys()) + cols = torch.tensor( + [self.req_info[i] for i in rows_list], + dtype=torch.long, + device=logits.device, + ) + rows = torch.tensor(rows_list, dtype=torch.long, device=logits.device) + values_to_keep = logits[rows, cols].clone() + + # Mask all but target tokens + logits[rows] = float("-inf") + logits[rows, cols] = values_to_keep + + return logits + + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a mixture of requests which do and don't utilize the dummy logitproc +sampling_params_list = [ + SamplingParams(temperature=0.0, extra_args={"target_token": 128}), + SamplingParams(temperature=0.0), + SamplingParams(temperature=0.0, extra_args={"target_token": 67}), + SamplingParams(temperature=0.0), +] + + +def main(): + # Create an LLM. + llm = LLM( + model="facebook/opt-125m", + logits_processors=[DummyLogitsProcessor], + ) + # Generate texts from the prompts. + # The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. 
+ outputs = llm.generate(prompts, sampling_params_list) + # Print the outputs. + print("\nGenerated Outputs:\n" + "-" * 60) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}") + print(f"Output: {generated_text!r}") + print("-" * 60) + + +if __name__ == "__main__": + main() diff --git a/tests/utils.py b/tests/utils.py index 18fcde949160e..e98707fb44475 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -13,6 +13,7 @@ import tempfile import time import warnings from contextlib import contextmanager, suppress +from multiprocessing import Process from pathlib import Path from typing import Any, Callable, Literal, Optional, Union @@ -76,6 +77,23 @@ VLLM_PATH = Path(__file__).parent.parent class RemoteOpenAIServer: DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key + def _start_server(self, model: str, vllm_serve_args: list[str], + env_dict: Optional[dict[str, str]]) -> None: + """Subclasses override this method to customize server process launch + """ + env = os.environ.copy() + # the current process might initialize cuda, + # to be safe, we should use spawn method + env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' + if env_dict is not None: + env.update(env_dict) + self.proc: subprocess.Popen = subprocess.Popen( + ["vllm", "serve", model, *vllm_serve_args], + env=env, + stdout=sys.stdout, + stderr=sys.stderr, + ) + def __init__(self, model: str, vllm_serve_args: list[str], @@ -128,18 +146,7 @@ class RemoteOpenAIServer: model_loader = get_model_loader(load_config) model_loader.download_model(model_config) - env = os.environ.copy() - # the current process might initialize cuda, - # to be safe, we should use spawn method - env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' - if env_dict is not None: - env.update(env_dict) - self.proc = subprocess.Popen( - ["vllm", "serve", model, *vllm_serve_args], - env=env, - stdout=sys.stdout, - stderr=sys.stderr, - ) + self._start_server(model, 
vllm_serve_args, env_dict) max_wait_seconds = max_wait_seconds or 240 self._wait_for_server(url=self.url_for("health"), timeout=max_wait_seconds) @@ -155,6 +162,10 @@ class RemoteOpenAIServer: # force kill if needed self.proc.kill() + def _poll(self) -> Optional[int]: + """Subclasses override this method to customize process polling""" + return self.proc.poll() + def _wait_for_server(self, *, url: str, timeout: float): # run health check start = time.time() @@ -169,7 +180,7 @@ class RemoteOpenAIServer: # which means the server is not ready yet. # the stack trace is not useful, so we suppress it # by using `raise from None`. - result = self.proc.poll() + result = self._poll() if result is not None and result != 0: raise RuntimeError("Server exited unexpectedly.") from None @@ -205,6 +216,48 @@ class RemoteOpenAIServer: **kwargs) +class RemoteOpenAIServerCustom(RemoteOpenAIServer): + """Launch test server with custom child process""" + + def _start_server(self, model: str, vllm_serve_args: list[str], + env_dict: Optional[dict[str, str]]) -> None: + self.proc: Process = Process( + target=self.child_process_fxn, + args=(env_dict, model, + vllm_serve_args)) # type: ignore[assignment] + self.proc.start() + + def __init__(self, + model: str, + vllm_serve_args: list[str], + child_process_fxn: Callable[ + [Optional[dict[str, str]], str, list[str]], None], + *, + env_dict: Optional[dict[str, str]] = None, + seed: Optional[int] = 0, + auto_port: bool = True, + max_wait_seconds: Optional[float] = None) -> None: + """Store custom child process function then invoke superclass + constructor which will indirectly launch it.""" + self.child_process_fxn = child_process_fxn + super().__init__(model=model, + vllm_serve_args=vllm_serve_args, + env_dict=env_dict, + seed=seed, + auto_port=auto_port, + max_wait_seconds=max_wait_seconds) + + def _poll(self) -> Optional[int]: + return self.proc.exitcode + + def __exit__(self, exc_type, exc_value, traceback): + self.proc.terminate() + 
self.proc.join(8) + if self.proc.is_alive(): + # force kill if needed + self.proc.kill() + + def _test_completion( client: openai.OpenAI, model: str, diff --git a/tests/v1/logits_processors/__init__.py b/tests/v1/logits_processors/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/v1/sample/test_logits_processors.py b/tests/v1/logits_processors/test_correctness.py similarity index 97% rename from tests/v1/sample/test_logits_processors.py rename to tests/v1/logits_processors/test_correctness.py index 84ee3b0392b40..43caef79b02f7 100644 --- a/tests/v1/sample/test_logits_processors.py +++ b/tests/v1/logits_processors/test_correctness.py @@ -9,11 +9,13 @@ import numpy as np import pytest import torch +from tests.utils import create_new_process_for_each_test from tests.v1.sample.utils import (LogitsprocsTestFakes, create_fake_logits, create_penalty_tensor, create_prompt_tokens_tensor, fake_apply_logitsprocs, fake_update_logitsprocs_state) +from vllm.config import VllmConfig from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams from vllm.utils import is_pin_memory_available @@ -24,7 +26,7 @@ from vllm.v1.sample.logits_processor import (BatchUpdate, BatchUpdateBuilder, MinPLogitsProcessor, MinTokensLogitsProcessor, MoveDirectionality, - init_builtin_logitsprocs) + build_logitsprocs) # yapf: enable from vllm.v1.sample.metadata import SamplingMetadata @@ -53,6 +55,7 @@ class LogitsProcsRequestParams: workload_index: int logitproc_type: LogitprocType # Logitproc enabled, specified by str id out_tokens: list[int] # Output tokens required for min tokens test + prompt_tokens: list[int] # Dummy prompt tokens placeholder params: SamplingParams # Settings customized for logitproc def __init__(self, workload_index: int, logitproc_type: LogitprocType): @@ -63,6 +66,7 @@ class LogitsProcsRequestParams: # don't matter *for these tests* so use 0 as a dummy value self.out_tokens = ([0] * (MIN_TOKENS_LEN_THRESHOLD 
* random.randint(0, 2))) + self.prompt_tokens = [] self.params = _sampling_params_from_logitproc(logitproc_type) def __str__(self): @@ -88,11 +92,12 @@ def _generate_fake_sampling_metadata( vocab_size, size=np.random.randint( 1, MAX_NUM_PROMPT_TOKENS)).tolist()) - logitsprocs = init_builtin_logitsprocs( - pin_memory_available=PIN_MEMORY_AVAILABLE, - max_num_reqs=MAX_NUM_REQS + 1, - device=device) - + logitsprocs = build_logitsprocs( + vllm_config=VllmConfig(), + device=device, + is_pin_memory=PIN_MEMORY_AVAILABLE, + is_pooling_model=False, + ) fake_sampling_metadata = SamplingMetadata( temperature=torch.full((batch_size, ), 0.0), all_greedy=True, @@ -462,7 +467,8 @@ def _generate_fake_step_update( # Replace as many removed requests as possible with added requests add_remove_idx = batch_update_builder.pop_removed() batch_update_builder.added.append( - (add_remove_idx, add_req_params.params, add_req_params.out_tokens)) + (add_remove_idx, add_req_params.params, + add_req_params.prompt_tokens, add_req_params.out_tokens)) persistent_batch[add_remove_idx] = add_req_params # Append remaining added requests to end of batch @@ -470,7 +476,8 @@ def _generate_fake_step_update( num_step_add_replace):(wdx + num_step_add)] batch_update_builder.added.extend([ - (adx + batch_size, add_req_params.params, add_req_params.out_tokens) + (adx + batch_size, add_req_params.params, add_req_params.prompt_tokens, + add_req_params.out_tokens) for adx, add_req_params in enumerate(add_reqs_append) ]) persistent_batch.extend(add_reqs_append) @@ -561,6 +568,7 @@ def _assert_valid( step_idx=step_idx) +@create_new_process_for_each_test() @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("reqs_per_logitproc", [REQS_PER_LOGITPROC]) @pytest.mark.parametrize("logitsprocs_under_test", _get_test_cases()) diff --git a/tests/v1/logits_processors/test_custom_offline.py b/tests/v1/logits_processors/test_custom_offline.py new file mode 100644 index 0000000000000..a7fde1990f7ed --- 
/dev/null +++ b/tests/v1/logits_processors/test_custom_offline.py @@ -0,0 +1,237 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import random +import sys +from typing import Union + +import pytest + +from tests.utils import create_new_process_for_each_test +# yapf: disable +from tests.v1.logits_processors.utils import (DUMMY_LOGITPROC_ARG, + DUMMY_LOGITPROC_FQCN, + DUMMY_LOGITPROC_MODULE, + MAX_TOKENS, MODEL_NAME, + POOLING_MODEL_NAME, TEMP_GREEDY, + CustomLogitprocSource, + DummyLogitsProcessor, + dummy_module) +from tests.v1.logits_processors.utils import entry_points as fake_entry_points +from tests.v1.logits_processors.utils import prompts +# yapf: enable +from vllm import LLM, SamplingParams +from vllm.v1.sample.logits_processor import (STR_POOLING_REJECTS_LOGITSPROCS, + LogitsProcessor) + +# Create a mixture of requests which do and don't utilize the dummy logitproc +sampling_params_list = [ + SamplingParams(temperature=TEMP_GREEDY, + max_tokens=MAX_TOKENS, + extra_args={DUMMY_LOGITPROC_ARG: 128}), + SamplingParams(temperature=TEMP_GREEDY, max_tokens=MAX_TOKENS), + SamplingParams(temperature=TEMP_GREEDY, + max_tokens=MAX_TOKENS, + extra_args={DUMMY_LOGITPROC_ARG: 67}), + SamplingParams(temperature=TEMP_GREEDY, max_tokens=MAX_TOKENS), +] + + +def _run_test(kwargs: dict, logitproc_loaded: bool) -> None: + """Compare `LLM` instance initialized with specified `kwargs` against + reference `LLM` instance. + + Two scenarios: + 1. Server has loaded dummy logitproc; test that requests which specify + dummy logitproc arg value behave as if logitproc is operating (output + token value should repeat), while requests that don't specify dummy + logitproc arg value should match reference `LLM` output. + 2. Server has *not* loaded dummy logitproc; test that all requests + behave as if logitproc is *not* operating (output matches reference + `LLM` output.) 
+ + Args: + kwargs: `LLM` constructor kwargs + logitproc_loaded: server has loaded dummy logitproc if True + """ + + # Create a vLLM instance and load custom logitproc + llm_logitproc = LLM( + model=MODEL_NAME, + gpu_memory_utilization=0.1, + **kwargs, + ) + + # Create a reference vLLM instance without custom logitproc + llm_ref = LLM(model=MODEL_NAME, gpu_memory_utilization=0.1) + + # Run inference with logitproc loaded + outputs_logitproc = llm_logitproc.generate(prompts, sampling_params_list) + + # Reference run + outputs_ref = llm_ref.generate(prompts, sampling_params_list) + + # Validate outputs + for bdx, (out_lp, out_ref, params) in enumerate( + zip(outputs_logitproc, outputs_ref, sampling_params_list)): + lp_toks = out_lp.outputs[0].token_ids + if logitproc_loaded and params.extra_args: + # This request exercises custom logitproc; validate that logitproc + # forces `target_token` to be decoded in each step + target_token = params.extra_args[DUMMY_LOGITPROC_ARG] + if not all(x == target_token for x in lp_toks): + raise AssertionError( + f"Request {bdx} generated {lp_toks}, shoud all be " + f"{target_token}") + else: + # This request does not exercise custom logitproc (or custom + # logitproc is not enabled on this server); validate against + # reference result + ref_toks = out_ref.outputs[0].token_ids + if lp_toks != ref_toks: + raise AssertionError( + f"Request {bdx} generated {lp_toks}, should match " + f"{ref_toks}") + + +@create_new_process_for_each_test() +@pytest.mark.parametrize("logitproc_source", list(CustomLogitprocSource)) +def test_custom_logitsprocs(monkeypatch, + logitproc_source: CustomLogitprocSource): + """Test offline Python interface for passing custom logitsprocs + + Construct an `LLM` instance which loads a custom logitproc that has a + well-defined behavior (mask out all tokens except one `target_token`) + + Construct a reference `LLM` instance with no custom logitproc + + Pass in a batch of requests, 50% of which pass a `target_token` 
value + in through `SamplingParams.extra_args`, 50% of which do not. + + Validate that + * Requests which do not activate the custom logitproc, yield the same + results for both `LLM` instances + * Requests which activate the custom logitproc, only output `target_token` + + Test four scenarios, corresponding to `logitproc_source` value + * No logitsprocs loaded - test that generated tokens match reference `LLM` + instance output + * Logitproc passed in via {entrypoint, class object, fully-qualified class + name (FQCN)} - test that dummy logitproc is utilized correctly when + provided via any of these three possible sources + + Args: + monkeypatch: for setting env vars + logitproc_source: what source (entrypoint, fully-qualified class name + (FQCN), class object, or None) the user pulls the + logitproc from + """ + + # Test that logitproc info is passed to workers + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1") + random.seed(40) + + # Choose LLM args based on logitproc source + if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_NONE: + # Scenario: the server does not load any custom logitproc + # Every other scenario is a different way of loading a custom logitproc + _run_test({}, logitproc_loaded=False) + return + + if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT: + # Scenario: vLLM loads a logitproc from a preconfigured entrypoint + # To that end, mock a dummy logitproc entrypoint + import importlib.metadata + importlib.metadata.entry_points = fake_entry_points # type: ignore + + # fork is required for workers to see entrypoint patch + monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "fork") + _run_test({}, logitproc_loaded=True) + return + + kwargs: dict[str, list[Union[str, type[LogitsProcessor]]]] = {} + if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_FQCN: + # Scenario: load logitproc based on fully-qualified class name (FQCN) + # Inject dummy module which defines logitproc + 
sys.modules[DUMMY_LOGITPROC_MODULE] = dummy_module + kwargs["logits_processors"] = [DUMMY_LOGITPROC_FQCN] + elif logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_CLASS: + # Scenario: load logitproc from provided class object + kwargs["logits_processors"] = [DummyLogitsProcessor] + + _run_test(kwargs, logitproc_loaded=True) + + +@create_new_process_for_each_test() +@pytest.mark.parametrize("logitproc_source", [ + CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT, + CustomLogitprocSource.LOGITPROC_SOURCE_FQCN, + CustomLogitprocSource.LOGITPROC_SOURCE_CLASS, +]) +def test_pooling_rejects_custom_logitsprocs( + monkeypatch, logitproc_source: CustomLogitprocSource): + """Validate that vLLM engine initialization properly rejects custom + logitsprocs when the model is a pooling model. + + Use `LLM` entrypoint. We expect `LLM` initialization to fail before the + logitproc is actually loaded. + + Scenario 1: + * Mock a logitproc entrypoint + * Validate that `LLM` does not load the logitproc + + Scenario 2: + * Pass custom logitproc to `LLM` constructor + * Scenario 2a: via FQCN + * Scenario 2b: via class object + * Validate that initialization fails with appropriate exception + + Args: + monkeypatch: used to set environment variables + logitproc_source: what source (entrypoint, fully-qualified class name + (FQCN), or class object) the user pulls the + logitproc from + """ + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") + random.seed(40) + + if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT: + # Scenario: vLLM loads a pooling model and ignores a logitproc that is + # available at a preconfigured entrypoint + + # Patch in dummy logitproc entrypoint + import importlib.metadata + importlib.metadata.entry_points = fake_entry_points # type: ignore + + # fork is required for entrypoint patch to be visible to workers, + # although they should ignore the entrypoint patch anyway + monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "fork") + 
+ llm = LLM( + runner="pooling", + model=POOLING_MODEL_NAME, + gpu_memory_utilization=0.1, + ) + # Require that no logitsprocs have been loaded + assert sum([ + 1 for _ in llm.llm_engine.model_executor.driver_worker.worker. + model_runner.input_batch.logitsprocs.all + ]) == 0 + return + + kwargs: dict[str, list[Union[str, type[LogitsProcessor]]]] = {} + if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_FQCN: + # Scenario: load logitproc based on fully-qualified class name (FQCN) + kwargs["logits_processors"] = [DUMMY_LOGITPROC_FQCN] + elif logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_CLASS: + # Scenario: load logitproc from provided class object + kwargs["logits_processors"] = [DummyLogitsProcessor] + + with pytest.raises(ValueError, match=STR_POOLING_REJECTS_LOGITSPROCS): + # Require that loading a pooling model alongside the logitproc raises + # the appropriate exception. + LLM( + runner="pooling", + model=POOLING_MODEL_NAME, + gpu_memory_utilization=0.1, + **kwargs, + ) diff --git a/tests/v1/logits_processors/test_custom_online.py b/tests/v1/logits_processors/test_custom_online.py new file mode 100644 index 0000000000000..a01a479e5b248 --- /dev/null +++ b/tests/v1/logits_processors/test_custom_online.py @@ -0,0 +1,180 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +import random +import sys +from typing import Any, Optional + +import openai +import pytest +import pytest_asyncio + +from tests.utils import (RemoteOpenAIServerCustom, + create_new_process_for_each_test) +# yapf: disable +from tests.v1.logits_processors.utils import (DUMMY_LOGITPROC_ARG, + DUMMY_LOGITPROC_FQCN, + DUMMY_LOGITPROC_MODULE, + MAX_TOKENS, MODEL_NAME, + TEMP_GREEDY, dummy_module) +from tests.v1.logits_processors.utils import entry_points as fake_entry_points +from tests.v1.logits_processors.utils import prompts + +# yapf: enable + + +def _server_with_logitproc_entrypoint( + env_dict: 
Optional[dict[str, str]], + model: str, + vllm_serve_args: list[str], +) -> None: + """Start vLLM server, inject dummy logitproc entrypoint""" + + # Patch `entry_points` to inject logitproc entrypoint + import importlib.metadata + importlib.metadata.entry_points = fake_entry_points # type: ignore + from vllm.entrypoints.cli import main + + # fork is required for workers to see entrypoint patch + os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = "fork" + if env_dict is not None: + os.environ.update(env_dict) + + # Emulate `vllm serve ` + sys.argv = ["vllm", "serve", model] + vllm_serve_args + main.main() + + +def _server_with_logitproc_module( + env_dict: Optional[dict[str, str]], + model: str, + vllm_serve_args: list[str], +) -> None: + """Start vLLM server, inject module with dummy logitproc""" + + # Patch `modules` to inject dummy logitproc module + from vllm.entrypoints.cli import main + sys.modules[DUMMY_LOGITPROC_MODULE] = dummy_module + + # fork is required for workers to see entrypoint patch + os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = "fork" + if env_dict is not None: + os.environ.update(env_dict) + + # Emulate `vllm serve ` + sys.argv = ["vllm", "serve", model] + vllm_serve_args + main.main() + + +@pytest.fixture(scope="module") +def default_server_args(): + return [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "2048", + "--max-num-seqs", + "128", + ] + + +@pytest.fixture(scope="function", + params=[[], ["--logits-processors", DUMMY_LOGITPROC_FQCN]]) +def server(default_server_args, request, monkeypatch): + """Consider two server configurations: + (1) --logits-processors cli arg specifies dummy logits processor via fully- + qualified class name (FQCN); patch in a dummy logits processor module + (2) No --logits-processors cli arg; patch in a dummy logits processor + entrypoint + """ + + # Test that logitproc info is passed to workers + 
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1") + + if request.param: + # Launch server, append FQCN argument, inject dummy logitproc module + args = default_server_args + request.param + _server_fxn = _server_with_logitproc_module + else: + # Launch server, inject dummy logitproc entrypoint + args = default_server_args + _server_fxn = _server_with_logitproc_entrypoint + + with RemoteOpenAIServerCustom(MODEL_NAME, args, + _server_fxn) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +# General request argument values for these tests +api_keyword_args = { + # Greedy sampling ensures that requests which receive the `target_token` + # arg will decode it in every step + "temperature": TEMP_GREEDY, + # Since EOS will never be decoded (unless `target_token` is EOS) + "max_tokens": MAX_TOKENS, + # Return decoded token logprobs (as a way of getting token id) + "logprobs": 0, +} + + +@create_new_process_for_each_test() +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_custom_logitsprocs(client: openai.AsyncOpenAI, model_name: str): + """Test custom logitsprocs when starting OpenAI server from CLI + + Launch vLLM OpenAI-compatible server, configured to load a custom logitproc + that has a well-defined behavior (mask out all tokens except one + `target_token`). + + Pass in requests, 50% of which pass a `target_token` value + in through `extra_body["vllm_xargs"]`, 50% of which do not. 
+ + Validate that requests which activate the custom logitproc, repeat the same + token + """ + + use_dummy_logitproc = True + for prompt in prompts: + # Build request arguments + request_keyword_args: dict[str, Any] = { + **api_keyword_args, + } + if use_dummy_logitproc: + # 50% of requests pass target_token custom arg + target_token = random.choice([128, 67]) + # For requests which activate the dummy logitproc, choose one of + # two `target_token` values which are known not to be EOS tokens + request_keyword_args["extra_body"] = { + "vllm_xargs": { + DUMMY_LOGITPROC_ARG: target_token + } + } + batch = await client.completions.create( + model=model_name, + prompt=prompt, + **request_keyword_args, + ) + + if use_dummy_logitproc: + # Only for requests which activate dummy logitproc - validate that + # output token is repeated + choices: openai.types.CompletionChoice = batch.choices + toks = choices[0].logprobs.tokens + if not all([x == toks[0] for x in toks]): + raise AssertionError( + f"Generated {toks} should all be {toks[0]}") + + # Alternate whether to activate dummy logitproc for each request + use_dummy_logitproc = not use_dummy_logitproc diff --git a/tests/v1/logits_processors/utils.py b/tests/v1/logits_processors/utils.py new file mode 100644 index 0000000000000..c0bfc1a18feca --- /dev/null +++ b/tests/v1/logits_processors/utils.py @@ -0,0 +1,127 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import types +from enum import Enum, auto +from typing import Optional + +import torch + +from vllm.config import VllmConfig +from vllm.sampling_params import SamplingParams +from vllm.v1.sample.logits_processor import (LOGITSPROCS_GROUP, BatchUpdate, + LogitsProcessor, + MoveDirectionality) + +MODEL_NAME = "facebook/opt-125m" +POOLING_MODEL_NAME = "BAAI/bge-base-en-v1.5" +DUMMY_LOGITPROC_ARG = "target_token" +TEMP_GREEDY = 0.0 +MAX_TOKENS = 20 +DUMMY_LOGITPROC_ENTRYPOINT = "dummy_logitproc" 
+DUMMY_LOGITPROC_MODULE = "DummyModule" +DUMMY_LOGITPROC_FQCN = f"{DUMMY_LOGITPROC_MODULE}:DummyLogitsProcessor" + + +class CustomLogitprocSource(Enum): + """How to source a logitproc for testing purposes""" + LOGITPROC_SOURCE_NONE = auto() # No custom logitproc + LOGITPROC_SOURCE_ENTRYPOINT = auto() # Via entrypoint + LOGITPROC_SOURCE_FQCN = auto() # Via fully-qualified class name (FQCN) + LOGITPROC_SOURCE_CLASS = auto() # Via provided class object + + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + + +class DummyLogitsProcessor(LogitsProcessor): + """Fake logit processor to support unit testing and examples""" + + def __init__(self, vllm_config: "VllmConfig", device: torch.device, + is_pin_memory: bool): + self.req_info: dict[int, SamplingParams] = {} + + def is_argmax_invariant(self) -> bool: + """Never impacts greedy sampling""" + return False + + def update_state(self, batch_update: Optional[BatchUpdate]): + if not batch_update: + return + + # Process added requests. + for index, params, _, _ in batch_update.added: + assert params is not None + if params.extra_args and (target_token := + params.extra_args.get("target_token")): + self.req_info[index] = target_token + + if self.req_info: + # Process removed requests. 
+ for index in batch_update.removed: + self.req_info.pop(index, None) + + # Process moved requests, unidirectional move (a->b) and swap + # (a<->b) + for adx, bdx, direct in batch_update.moved: + a_val = self.req_info.pop(adx, None) + b_val = self.req_info.pop(bdx, None) + if a_val is not None: + self.req_info[bdx] = a_val + if direct == MoveDirectionality.SWAP and b_val is not None: + self.req_info[adx] = b_val + + def apply(self, logits: torch.Tensor) -> torch.Tensor: + if not self.req_info: + return logits + + # Save target values before modification + rows_list = list(self.req_info.keys()) + cols = torch.tensor([self.req_info[i] for i in rows_list], + dtype=torch.long, + device=logits.device) + rows = torch.tensor(rows_list, dtype=torch.long, device=logits.device) + values_to_keep = logits[rows, cols].clone() + + # Mask all but target tokens + logits[rows] = float('-inf') + logits[rows, cols] = values_to_keep + + return logits + + +"""Dummy module with dummy logitproc class""" +dummy_module = types.ModuleType(DUMMY_LOGITPROC_MODULE) +dummy_module.DummyLogitsProcessor = DummyLogitsProcessor # type: ignore + + +class EntryPoint: + """Dummy entrypoint class for logitsprocs testing""" + + def __init__(self): + self.name = DUMMY_LOGITPROC_ENTRYPOINT + self.value = DUMMY_LOGITPROC_FQCN + + def load(self): + return DummyLogitsProcessor + + +class EntryPoints(list): + """Dummy EntryPoints class for logitsprocs testing""" + + def __init__(self, group: str): + # Emulate list-like functionality + eps = [EntryPoint()] if group == LOGITSPROCS_GROUP else [] + super().__init__(eps) + # Extra attributes + self.names = [ep.name for ep in eps] + + +"""Fake version of importlib.metadata.entry_points""" +entry_points = lambda group: EntryPoints(group) diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py index 3a4d48afc9d77..4e912f98f376f 100644 --- a/tests/v1/sample/test_rejection_sampler.py +++ 
b/tests/v1/sample/test_rejection_sampler.py @@ -7,7 +7,7 @@ import torch import torch.nn.functional as F from vllm.platforms import current_platform -from vllm.v1.sample.logits_processor import LogitsProcessorManager +from vllm.v1.sample.logits_processor import LogitsProcessors from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import (PLACEHOLDER_TOKEN_ID, RejectionSampler) @@ -69,7 +69,7 @@ def create_sampling_metadata( output_token_ids=[], allowed_token_ids_mask=None, bad_words_token_ids={}, - logitsprocs=LogitsProcessorManager(), + logitsprocs=LogitsProcessors(), ) diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py index 31c6c881d7b83..53215f88bb27e 100644 --- a/tests/v1/sample/test_sampler.py +++ b/tests/v1/sample/test_sampler.py @@ -9,7 +9,7 @@ import torch from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available, make_tensor_with_pad -from vllm.v1.sample.logits_processor import LogitsProcessorManager +from vllm.v1.sample.logits_processor import LogitsProcessors from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.sampler import Sampler @@ -173,7 +173,7 @@ def _create_default_sampling_metadata( no_penalties=True, allowed_token_ids_mask=None, bad_words_token_ids={}, - logitsprocs=LogitsProcessorManager(), + logitsprocs=LogitsProcessors(), ) return fake_sampling_metadata diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 74ab19a3ce32d..d7b4746562beb 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -13,7 +13,7 @@ from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams from vllm.utils import is_pin_memory_available, make_tensor_with_pad from vllm.v1.pool.metadata import PoolingMetadata -from vllm.v1.sample.logits_processor import LogitsProcessorManager +from vllm.v1.sample.logits_processor import 
LogitsProcessors from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.worker.block_table import BlockTable, MultiGroupBlockTable from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch @@ -169,7 +169,7 @@ def _construct_expected_sampling_metadata( and all(x == 1 for x in repetition_penalties)), allowed_token_ids_mask=allowed_token_ids_mask, bad_words_token_ids=bad_words_token_ids, - logitsprocs=LogitsProcessorManager(), + logitsprocs=LogitsProcessors(), ) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 14fc5589a89a4..51db277f65dc9 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -62,6 +62,7 @@ if TYPE_CHECKING: QuantizationConfig) from vllm.model_executor.model_loader import LoadFormats from vllm.model_executor.model_loader.tensorizer import TensorizerConfig + from vllm.v1.sample.logits_processor import LogitsProcessor HfOverrides = Union[dict, Callable[[type], type]] else: @@ -72,6 +73,7 @@ else: BaseModelLoader = Any LoadFormats = Any TensorizerConfig = Any + LogitsProcessor = Any HfOverrides = Union[dict[str, Any], Callable[[type], type]] me_quant = LazyLoader("model_executor", globals(), @@ -465,6 +467,9 @@ class ModelConfig: - "transformers" will use the Transformers model implementation.""" override_attention_dtype: Optional[str] = None """Override dtype for attention""" + logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None + """One or more logits processors' fully-qualified class names or class + definitions""" def compute_hash(self) -> str: """ diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 630fbec4539e7..6fc894827c4aa 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -43,6 +43,7 @@ from vllm.transformers_utils.config import is_interleaved from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor) +from 
vllm.v1.sample.logits_processor import LogitsProcessor # yapf: enable @@ -435,6 +436,10 @@ class EngineArgs: enable_multimodal_encoder_data_parallel: bool = \ ParallelConfig.enable_multimodal_encoder_data_parallel + logits_processors: Optional[list[Union[ + str, type[LogitsProcessor]]]] = ModelConfig.logits_processors + """Custom logitproc types""" + async_scheduling: bool = SchedulerConfig.async_scheduling # DEPRECATED enable_prompt_adapter: bool = False @@ -549,6 +554,8 @@ class EngineArgs: **model_kwargs["model_impl"]) model_group.add_argument("--override-attention-dtype", **model_kwargs["override_attention_dtype"]) + model_group.add_argument("--logits-processors", + **model_kwargs["logits_processors"]) # Model loading arguments load_kwargs = get_kwargs(LoadConfig) @@ -940,6 +947,7 @@ class EngineArgs: enable_sleep_mode=self.enable_sleep_mode, model_impl=self.model_impl, override_attention_dtype=self.override_attention_dtype, + logits_processors=self.logits_processors, ) def validate_tensorizer_args(self): diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 915f14a29b907..b002f234c043d 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -55,6 +55,7 @@ from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer, get_cached_tokenizer) from vllm.usage.usage_lib import UsageContext from vllm.utils import Counter, Device, deprecate_kwargs, is_list_of +from vllm.v1.sample.logits_processor import LogitsProcessor if TYPE_CHECKING: from vllm.v1.metrics.reader import Metric @@ -198,6 +199,8 @@ class LLM: override_pooler_config: Optional[PoolerConfig] = None, compilation_config: Optional[Union[int, dict[str, Any], CompilationConfig]] = None, + logits_processors: Optional[list[Union[str, + type[LogitsProcessor]]]] = None, **kwargs, ) -> None: """LLM constructor.""" @@ -272,6 +275,7 @@ class LLM: mm_processor_kwargs=mm_processor_kwargs, override_pooler_config=override_pooler_config, 
compilation_config=compilation_config_instance, + logits_processors=logits_processors, **kwargs, ) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 64f7426bd65d3..5cb9f97ae0b08 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2562,7 +2562,7 @@ def direct_register_custom_op( def resolve_obj_by_qualname(qualname: str) -> Any: """ - Resolve an object by its fully qualified name. + Resolve an object by its fully-qualified class name. """ module_name, obj_name = qualname.rsplit(".", 1) module = importlib.import_module(module_name) diff --git a/vllm/v1/sample/logits_processor/__init__.py b/vllm/v1/sample/logits_processor/__init__.py new file mode 100644 index 0000000000000..8220269162951 --- /dev/null +++ b/vllm/v1/sample/logits_processor/__init__.py @@ -0,0 +1,185 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import importlib +import itertools +from collections.abc import Sequence +from typing import TYPE_CHECKING, Optional, Union + +import torch + +from vllm.logger import init_logger +from vllm.v1.sample.logits_processor.builtin import (LogitBiasLogitsProcessor, + MinPLogitsProcessor, + MinTokensLogitsProcessor) +from vllm.v1.sample.logits_processor.interface import (BatchUpdate, + LogitsProcessor, + MoveDirectionality) +from vllm.v1.sample.logits_processor.state import (BatchUpdateBuilder, + LogitsProcessors) + +if TYPE_CHECKING: + from vllm.config import VllmConfig + +logger = init_logger(__name__) + +# Error message when the user tries to initialize vLLM with a pooling model +# and custom logitsproces +STR_POOLING_REJECTS_LOGITSPROCS = ("Pooling models do not support custom" + " logits processors.") + +LOGITSPROCS_GROUP = 'vllm.logits_processors' + +BUILTIN_LOGITS_PROCESSORS: list[type[LogitsProcessor]] = [ + MinTokensLogitsProcessor, + LogitBiasLogitsProcessor, + MinPLogitsProcessor, +] + + +def _load_logitsprocs_plugins() -> list[type[LogitsProcessor]]: + 
"""Load all installed logit processor plugins""" + + import sys + + if sys.version_info < (3, 10): + from importlib_metadata import entry_points + else: + from importlib.metadata import entry_points + + installed_logitsprocs_plugins = entry_points(group=LOGITSPROCS_GROUP) + if len(installed_logitsprocs_plugins) == 0: + logger.debug("No logitsprocs plugins installed (group %s).", + LOGITSPROCS_GROUP) + return [] + + # Load logitsprocs plugins + logger.debug("Loading installed logitsprocs plugins (group %s):", + LOGITSPROCS_GROUP) + classes: list[type[LogitsProcessor]] = [] + for entrypoint in installed_logitsprocs_plugins: + try: + logger.debug("- Loading logitproc plugin entrypoint=%s target=%s", + entrypoint.name, entrypoint.value) + classes.append(entrypoint.load()) + except Exception as e: + raise RuntimeError( + f"Failed to load LogitsProcessor plugin {entrypoint}") from e + return classes + + +def _load_logitsprocs_by_fqcns( + logits_processors: Optional[Sequence[Union[str, type[LogitsProcessor]]]] +) -> list[type[LogitsProcessor]]: + """Load logit processor types, identifying them by fully-qualified class + names (FQCNs). + + Effectively, a mixed list of logitproc types and FQCN strings is converted + into a list of entirely logitproc types, by loading from the FQCNs. + + FQCN syntax is : i.e. 
x.y.z:CustomLogitProc + + Already-loaded logitproc types must be subclasses of LogitsProcessor + + Args: + logits_processors: Potentially mixed list of logitsprocs types and FQCN + strings for logitproc types + + Returns: + List of logitproc types + + """ + if not logits_processors: + return [] + + logger.debug( + "%s additional custom logits processors specified, checking whether " + "they need to be loaded.", len(logits_processors)) + + classes: list[type[LogitsProcessor]] = [] + for ldx, logitproc in enumerate(logits_processors): + if isinstance(logitproc, type): + logger.debug(" - Already-loaded logit processor: %s", + logitproc.__name__) + if not issubclass(logitproc, LogitsProcessor): + raise ValueError( + f"{logitproc.__name__} is not a subclass of LogitsProcessor" + ) + classes.append(logitproc) + continue + + logger.debug("- Loading logits processor %s", logitproc) + module_path, qualname = logitproc.split(":") + + try: + # Load module + module = importlib.import_module(module_path) + except Exception as e: + raise RuntimeError( + f"Failed to load {ldx}th LogitsProcessor plugin {logitproc}" + ) from e + + # Walk down dotted name to get logitproc class + obj = module + for attr in qualname.split("."): + obj = getattr(obj, attr) + if not isinstance(obj, type): + raise ValueError("Loaded logit processor must be a type.") + if not issubclass(obj, LogitsProcessor): + raise ValueError( + f"{obj.__name__} must be a subclass of LogitsProcessor") + classes.append(obj) + + return classes + + +def _load_custom_logitsprocs( + logits_processors: Optional[Sequence[Union[str, type[LogitsProcessor]]]], +) -> list[type[LogitsProcessor]]: + """Load all custom logits processors. 
+ + * First load all installed logitproc plugins + * Second load custom logitsprocs pass by the user at initialization time + + Args: + logits_processors: potentially mixed list of logitproc types and + logitproc type fully-qualified names (FQCNs) + which need to be loaded + + Returns: + A list of all loaded logitproc types + """ + from vllm.platforms import current_platform + if current_platform.is_tpu(): + # No logitsprocs specified by caller + # TODO(andy) - vLLM V1 on TPU does not support custom logitsprocs + return [] + + return (_load_logitsprocs_plugins() + + _load_logitsprocs_by_fqcns(logits_processors)) + + +def build_logitsprocs( + vllm_config: "VllmConfig", + device: torch.device, + is_pin_memory: bool, + is_pooling_model: bool, + custom_logitsprocs: Sequence[Union[str, type[LogitsProcessor]]] = (), +) -> LogitsProcessors: + if is_pooling_model: + if custom_logitsprocs: + raise ValueError(STR_POOLING_REJECTS_LOGITSPROCS) + logger.debug("Skipping logits processor loading because pooling models" + " do not support logits processors.") + return LogitsProcessors() + custom_logitsprocs_classes = _load_custom_logitsprocs(custom_logitsprocs) + return LogitsProcessors( + ctor(vllm_config, device, is_pin_memory) for ctor in itertools.chain( + BUILTIN_LOGITS_PROCESSORS, custom_logitsprocs_classes)) + + +__all__ = [ + "LogitsProcessor", "LogitBiasLogitsProcessor", "MinPLogitsProcessor", + "MinTokensLogitsProcessor", "BatchUpdate", "BatchUpdateBuilder", + "MoveDirectionality", "LogitsProcessors", "build_logitsprocs", + "STR_POOLING_REJECTS_LOGITSPROCS", "LOGITSPROCS_GROUP" +] diff --git a/vllm/v1/sample/logits_processor.py b/vllm/v1/sample/logits_processor/builtin.py similarity index 54% rename from vllm/v1/sample/logits_processor.py rename to vllm/v1/sample/logits_processor/builtin.py index 3a06e71057cdd..24387ab793906 100644 --- a/vllm/v1/sample/logits_processor.py +++ b/vllm/v1/sample/logits_processor/builtin.py @@ -1,241 +1,32 @@ # SPDX-License-Identifier: 
Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import dataclasses -from abc import ABC, abstractmethod -from collections.abc import Iterator, Sequence -from dataclasses import dataclass, field -from enum import Enum -from itertools import chain -from typing import Optional, Union +from collections.abc import Sequence +from typing import TYPE_CHECKING, Optional import torch -from torch._prims_common import DeviceLikeType -from vllm import PoolingParams, SamplingParams -from vllm.logger import init_logger +from vllm.v1.sample.logits_processor.interface import (BatchUpdate, + LogitsProcessor, + MoveDirectionality) -logger = init_logger(__name__) - - -class MoveDirectionality(Enum): - # One-way i1->i2 req move within batch - UNIDIRECTIONAL = 0 - # Two-way i1<->i2 req swap within batch - SWAP = 1 - - -# (index, params, output_tok_ids) tuples for new -# requests added to the batch. -AddedRequest = tuple[int, Union[SamplingParams, PoolingParams], list[int]] -# (index 1, index 2, directionality) tuples representing -# one-way moves or two-way swaps of requests in batch -MovedRequest = tuple[int, int, MoveDirectionality] -# Batch indices of any removed requests. -RemovedRequest = int - - -@dataclasses.dataclass(frozen=True) -class BatchUpdate: - """Persistent batch state change info for logitsprocs""" - batch_size: int # Current num reqs in batch - - # Metadata for requests added to, removed from, and moved - # within the persistent batch. 
- # - # Note: each added request is represented as - # (index, params, output_tok_ids) - # Key assumption: output_tok_ids is a reference to the - # request's running output tokens list; in this way - # the logits processors always see the latest list of - # generated tokens - removed: Sequence[RemovedRequest] - moved: Sequence[MovedRequest] - added: Sequence[AddedRequest] - - -class BatchUpdateBuilder: - """Helps track persistent batch state changes and build - a batch update data structure for logitsprocs - - Assumptions: - * All information about requests removed from persistent batch - during a step is aggregated in self._removed through calls to - self.removed_append() at the beginning of a step. This must happen - before the first time that self.removed, self.pop_removed() - or self.peek_removed() are invoked in a given step - * After the first time that self.removed, self.pop_removed() - or self.peek_removed() are read in a step, no new removals - are registered using self.removed_append() - * Elements of self._removed are never directly modified, added or - removed (i.e. modification is only via self.removed_append() and - self.pop_removed()) - - Guarantees under above assumptions: - * self.removed is always sorted in descending order - * self.pop_removed() and self.peek_removed() both return - the lowest removed request index in the current step - """ - - _removed: list[RemovedRequest] - _is_removed_sorted: bool - moved: list[MovedRequest] - added: list[AddedRequest] - - def __init__( - self, - removed: Optional[list[RemovedRequest]] = None, - moved: Optional[list[MovedRequest]] = None, - added: Optional[list[AddedRequest]] = None, - ) -> None: - self._removed = removed or [] - self.moved = moved or [] - self.added = added or [] - self._is_removed_sorted = False - - def _ensure_removed_sorted(self) -> None: - """Sort removed request indices in - descending order. - - Idempotent after first call in a - given step, until reset. 
- """ - if not self._is_removed_sorted: - self._removed.sort(reverse=True) - self._is_removed_sorted = True - - @property - def removed(self) -> list[RemovedRequest]: - """Removed request indices sorted in - descending order""" - self._ensure_removed_sorted() - return self._removed - - def removed_append(self, index: int) -> None: - """Register the removal of a request from - the persistent batch. - - Must not be called after the first time - self.removed, self.pop_removed() or - self.peek_removed() are invoked. - - Args: - index: request index - """ - if self._is_removed_sorted: - raise RuntimeError("Cannot register new removed request after" - " self.removed has been read.") - self._removed.append(index) - - def has_removed(self) -> bool: - return bool(self._removed) - - def peek_removed(self) -> Optional[int]: - """Return lowest removed request index""" - if self.has_removed(): - self._ensure_removed_sorted() - return self._removed[-1] - return None - - def pop_removed(self) -> Optional[int]: - """Pop lowest removed request index""" - if self.has_removed(): - self._ensure_removed_sorted() - return self._removed.pop() - return None - - def get_and_reset(self, batch_size: int) -> Optional[BatchUpdate]: - """Generate a logitsprocs batch update data structure - and reset internal batch update builder state. 
- - Args: - batch_size: current persistent batch size - - Returns: - Frozen logitsprocs batch update instance; `None` if no updates - """ - # Reset removal-sorting logic - self._is_removed_sorted = False - if not any((self._removed, self.moved, self.added)): - # No update; short-circuit - return None - # Build batch state update - batch_update = BatchUpdate( - batch_size=batch_size, - removed=self._removed, - moved=self.moved, - added=self.added, - ) - # Reset removed/moved/added update lists - self._removed = [] - self.moved = [] - self.added = [] - return batch_update - - -class LogitsProcessor(ABC): - - @abstractmethod - def apply(self, logits: torch.Tensor) -> torch.Tensor: - raise NotImplementedError - - @abstractmethod - def is_argmax_invariant(self) -> bool: - """True if logits processor has no impact on the - argmax computation in greedy sampling. - NOTE: may or may not have the same value for all - instances of a given LogitsProcessor subclass, - depending on subclass implementation. - TODO(andy): won't be utilized until logits - processors are user-extensible - """ - raise NotImplementedError - - @abstractmethod - def update_state( - self, - batch_update: Optional[BatchUpdate], - ) -> None: - """Called when there are new output tokens, prior - to each forward pass. - - Args: - batch_update is non-None iff there have been - changes to the batch makeup. 
- """ - raise NotImplementedError - - -@dataclass -class LogitsProcessorManager: - """Encapsulates initialized logitsproc objects.""" - argmax_invariant: list[LogitsProcessor] = field( - default_factory=list) # argmax-invariant logitsprocs - non_argmax_invariant: list[LogitsProcessor] = field( - default_factory=list) # non-argmax-invariant logitsprocs - - @property - def all(self) -> Iterator[LogitsProcessor]: - """Iterator over all logits processors.""" - return chain(self.argmax_invariant, self.non_argmax_invariant) - - -###### ----- Built-in LogitsProcessor impls below here +if TYPE_CHECKING: + from vllm.config import VllmConfig class MinPLogitsProcessor(LogitsProcessor): - def __init__(self, max_num_reqs: int, pin_memory: bool, - device: DeviceLikeType): - super().__init__() + def __init__(self, vllm_config: "VllmConfig", device: torch.device, + is_pin_memory: bool): + max_num_reqs = vllm_config.scheduler_config.max_num_seqs self.min_p_count: int = 0 self.min_p_cpu_tensor = torch.zeros((max_num_reqs, ), dtype=torch.float32, device="cpu", - pin_memory=pin_memory) + pin_memory=is_pin_memory) self.min_p_cpu = self.min_p_cpu_tensor.numpy() - self.use_double_tensor = torch.device("cpu") != torch.device(device) + self.use_double_tensor = torch.device(device).type != "cpu" if self.use_double_tensor: # Pre-allocated device tensor @@ -260,8 +51,8 @@ class MinPLogitsProcessor(LogitsProcessor): needs_update = False # Process added requests. 
- for index, params, _ in batch_update.added: - min_p = params.min_p if isinstance(params, SamplingParams) else 0.0 + for index, params, _, _ in batch_update.added: + min_p = params.min_p if self.min_p_cpu[index] != min_p: needs_update = True self.min_p_cpu[index] = min_p @@ -316,11 +107,10 @@ class MinPLogitsProcessor(LogitsProcessor): class LogitBiasLogitsProcessor(LogitsProcessor): - def __init__(self, pin_memory: bool, device: torch.device): - super().__init__() - self.biases: dict[int, dict[int, float]] = {} + def __init__(self, _, device: torch.device, is_pin_memory: bool): self.device = device - self.pin_memory = pin_memory + self.pin_memory = is_pin_memory + self.biases: dict[int, dict[int, float]] = {} self.bias_tensor: torch.Tensor = torch.tensor(()) self.logits_slice = (self._device_tensor([], torch.int32), @@ -337,9 +127,8 @@ class LogitBiasLogitsProcessor(LogitsProcessor): needs_update: bool = False # Process added requests. - for index, params, _ in batch_update.added: - if isinstance(params, SamplingParams) and (lb := - params.logit_bias): + for index, params, _, _ in batch_update.added: + if lb := params.logit_bias: self.biases[index] = lb needs_update = True else: @@ -400,12 +189,12 @@ class LogitBiasLogitsProcessor(LogitsProcessor): class MinTokensLogitsProcessor(LogitsProcessor): - def __init__(self, pin_memory: bool, device: torch.device): + def __init__(self, vllm_config: "VllmConfig", device: torch.device, + is_pin_memory: bool): # index -> (min_toks, output_token_ids, stop_token_ids) - super().__init__() - self.min_toks: dict[int, tuple[int, Sequence[int], set[int]]] = {} self.device = device - self.pin_memory = pin_memory + self.pin_memory = is_pin_memory + self.min_toks: dict[int, tuple[int, Sequence[int], set[int]]] = {} # (req_idx_tensor,eos_tok_id_tensor) self.logits_slice: tuple[torch.Tensor, @@ -424,9 +213,8 @@ class MinTokensLogitsProcessor(LogitsProcessor): if batch_update: # Process added requests. 
- for index, params, output_tok_ids in batch_update.added: - if (isinstance(params, SamplingParams) - and (min_tokens := params.min_tokens) + for index, params, _, output_tok_ids in batch_update.added: + if ((min_tokens := params.min_tokens) and len(output_tok_ids) < min_tokens): # Replace request metadata at batch index self.min_toks[index] = (min_tokens, output_tok_ids, @@ -499,35 +287,3 @@ class MinTokensLogitsProcessor(LogitsProcessor): # Inhibit EOS token for requests which have not reached min length logits[self.logits_slice] = -float("inf") return logits - - -def init_builtin_logitsprocs(pin_memory_available: bool, max_num_reqs: int, - device: torch.device) -> LogitsProcessorManager: - """Construct 'builtin' vLLM logitsprocs which the engine - loads by default. - - Args: - pin_memory_available: pinned memory is available for use - for use by logitsproc - max_num_reqs: ceiling on request count in persistent batch - device: inference device - - Returns: - Data structure encapsulating loaded logitsprocs - """ - min_tokens_logitproc = MinTokensLogitsProcessor( - pin_memory=pin_memory_available, device=device) - logit_bias_logitproc = LogitBiasLogitsProcessor( - pin_memory=pin_memory_available, device=device) - min_p_logitproc = MinPLogitsProcessor( - pin_memory=pin_memory_available, - device=device, - # +1 for temporary swap space - max_num_reqs=max_num_reqs + 1) - return LogitsProcessorManager( - non_argmax_invariant=[ - min_tokens_logitproc, - logit_bias_logitproc, - ], - argmax_invariant=[min_p_logitproc], - ) diff --git a/vllm/v1/sample/logits_processor/interface.py b/vllm/v1/sample/logits_processor/interface.py new file mode 100644 index 0000000000000..12b4db24bff88 --- /dev/null +++ b/vllm/v1/sample/logits_processor/interface.py @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod +from collections.abc import Sequence +from dataclasses import 
dataclass +from enum import Enum, auto +from typing import TYPE_CHECKING, Optional + +import torch + +from vllm import SamplingParams + +if TYPE_CHECKING: + from vllm.config import VllmConfig + + +class MoveDirectionality(Enum): + # One-way i1->i2 req move within batch + UNIDIRECTIONAL = auto() + # Two-way i1<->i2 req swap within batch + SWAP = auto() + + +# (index, params, prompt_tok_ids, output_tok_ids) tuples for new +# requests added to the batch. +AddedRequest = tuple[int, SamplingParams, list[int], list[int]] + +# (index 1, index 2, directionality) tuples representing +# one-way moves or two-way swaps of requests in batch +MovedRequest = tuple[int, int, MoveDirectionality] + +# Batch indices of any removed requests. +RemovedRequest = int + + +@dataclass(frozen=True) +class BatchUpdate: + """Persistent batch state change info for logitsprocs""" + batch_size: int # Current num reqs in batch + + # Metadata for requests added to, removed from, and moved + # within the persistent batch. + # + # Key assumption: the `output_tok_ids` list (which is an element of each + # tuple in `added`) is a reference to the request's running output tokens + # list; via this reference, the logits processors always see the latest + # list of generated output tokens + removed: Sequence[RemovedRequest] + moved: Sequence[MovedRequest] + added: Sequence[AddedRequest] + + +class LogitsProcessor(ABC): + + @abstractmethod + def __init__(self, vllm_config: "VllmConfig", device: torch.device, + is_pin_memory: bool) -> None: + raise NotImplementedError + + @abstractmethod + def apply(self, logits: torch.Tensor) -> torch.Tensor: + raise NotImplementedError + + @abstractmethod + def is_argmax_invariant(self) -> bool: + """True if logits processor has no impact on the + argmax computation in greedy sampling. + NOTE: may or may not have the same value for all + instances of a given LogitsProcessor subclass, + depending on subclass implementation. 
+ """ + raise NotImplementedError + + @abstractmethod + def update_state( + self, + batch_update: Optional["BatchUpdate"], + ) -> None: + """Called when there are new output tokens, prior + to each forward pass. + + Args: + batch_update is non-None iff there have been + changes to the batch makeup. + """ + raise NotImplementedError diff --git a/vllm/v1/sample/logits_processor/state.py b/vllm/v1/sample/logits_processor/state.py new file mode 100644 index 0000000000000..0f58b52496956 --- /dev/null +++ b/vllm/v1/sample/logits_processor/state.py @@ -0,0 +1,149 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterator +from itertools import chain +from typing import TYPE_CHECKING, Optional + +from vllm.v1.sample.logits_processor.interface import (AddedRequest, + BatchUpdate, + MovedRequest, + RemovedRequest) + +if TYPE_CHECKING: + from vllm.v1.sample.logits_processor.interface import LogitsProcessor + + +class BatchUpdateBuilder: + """Helps track persistent batch state changes and build + a batch update data structure for logitsprocs + Assumptions: + * All information about requests removed from persistent batch + during a step is aggregated in self._removed through calls to + self.removed_append() at the beginning of a step. This must happen + before the first time that self.removed, self.pop_removed() + or self.peek_removed() are invoked in a given step + * After the first time that self.removed, self.pop_removed() + or self.peek_removed() are read in a step, no new removals + are registered using self.removed_append() + * Elements of self._removed are never directly modified, added or + removed (i.e. 
modification is only via self.removed_append() and + self.pop_removed()) + Guarantees under above assumptions: + * self.removed is always sorted in descending order + * self.pop_removed() and self.peek_removed() both return + the lowest removed request index in the current step + """ + + _removed: list[RemovedRequest] + _is_removed_sorted: bool + moved: list[MovedRequest] + added: list[AddedRequest] + + def __init__( + self, + removed: Optional[list[RemovedRequest]] = None, + moved: Optional[list[MovedRequest]] = None, + added: Optional[list[AddedRequest]] = None, + ) -> None: + self._removed = removed or [] + self.moved = moved or [] + self.added = added or [] + self._is_removed_sorted = False + + def _ensure_removed_sorted(self) -> None: + """Sort removed request indices in + descending order. + Idempotent after first call in a + given step, until reset. + """ + if not self._is_removed_sorted: + self._removed.sort(reverse=True) + self._is_removed_sorted = True + + @property + def removed(self) -> list[RemovedRequest]: + """Removed request indices sorted in + descending order""" + self._ensure_removed_sorted() + return self._removed + + def removed_append(self, index: int) -> None: + """Register the removal of a request from the persistent batch. + + Must not be called after the first time self.removed, + self.pop_removed() or self.peek_removed() are invoked. 
+ + Args: + index: request index + """ + if self._is_removed_sorted: + raise RuntimeError("Cannot register new removed request after" + " self.removed has been read.") + self._removed.append(index) + + def has_removed(self) -> bool: + return bool(self._removed) + + def peek_removed(self) -> Optional[int]: + """Return lowest removed request index""" + if self.has_removed(): + self._ensure_removed_sorted() + return self._removed[-1] + return None + + def pop_removed(self) -> Optional[int]: + """Pop lowest removed request index""" + if self.has_removed(): + self._ensure_removed_sorted() + return self._removed.pop() + return None + + def _is_update(self) -> bool: + """True if there is a batch state change""" + return any((self._removed, self.moved, self.added)) + + def get_and_reset(self, batch_size: int) -> Optional[BatchUpdate]: + """Generate a logitsprocs batch update data structure and reset + internal batch update builder state. + + Args: + batch_size: current persistent batch size + + Returns: + Frozen logitsprocs batch update instance; `None` if no updates + """ + # Reset removal-sorting logic + self._is_removed_sorted = False + if not self._is_update(): + # No update; short-circuit + return None + # Build batch state update + batch_update = BatchUpdate( + batch_size=batch_size, + removed=self._removed, + moved=self.moved, + added=self.added, + ) + self._removed = [] + self.moved = [] + self.added = [] + return batch_update + + +class LogitsProcessors: + """Encapsulates initialized logitsproc objects.""" + + def __init__( + self, + logitsprocs: Optional[Iterator["LogitsProcessor"]] = None) -> None: + self.argmax_invariant: list[LogitsProcessor] = [] + self.non_argmax_invariant: list[LogitsProcessor] = [] + if logitsprocs: + for logitproc in logitsprocs: + (self.argmax_invariant if logitproc.is_argmax_invariant() else + self.non_argmax_invariant).append(logitproc) + + @property + def all(self) -> Iterator["LogitsProcessor"]: + """Iterator over all logits 
processors.""" + return chain(self.argmax_invariant, self.non_argmax_invariant) diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 1189b12f30776..9d6a87cea3d07 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -6,7 +6,7 @@ from typing import Optional import torch -from vllm.v1.sample.logits_processor import LogitsProcessorManager +from vllm.v1.sample.logits_processor import LogitsProcessors @dataclass @@ -40,4 +40,4 @@ class SamplingMetadata: bad_words_token_ids: dict[int, list[list[int]]] # Loaded logits processors - logitsprocs: LogitsProcessorManager + logitsprocs: LogitsProcessors diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 2469e09f8249d..e718d9d5e0fb0 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -18,8 +18,8 @@ from vllm.utils import swap_dict_values from vllm.v1.outputs import LogprobsTensors from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.logits_processor import (BatchUpdateBuilder, - MoveDirectionality, - init_builtin_logitsprocs) + LogitsProcessors, + MoveDirectionality) from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.spec_decode.utils import is_spec_decode_unsupported from vllm.v1.utils import copy_slice @@ -78,8 +78,11 @@ class InputBatch: pin_memory: bool, vocab_size: int, block_sizes: list[int], # The block_size of each kv cache group + logitsprocs: Optional[LogitsProcessors] = None, is_spec_decode: bool = False, + is_pooling_model: bool = False, ): + self.is_pooling_model = is_pooling_model self.is_spec_decode = is_spec_decode self.max_num_reqs = max_num_reqs self.max_model_len = max_model_len @@ -221,14 +224,6 @@ class InputBatch: # updates. Should reset each step. self.batch_update_builder = BatchUpdateBuilder() - # Define logits processors. - # TODO(andy): logits processor list should be extensible via engine - # constructor argument; for now the list is fixed. 
- self.logitsprocs = init_builtin_logitsprocs( - pin_memory_available=pin_memory, - max_num_reqs=max_num_reqs + 1, - device=device) - # TODO convert this to LogitsProcessor self.has_allowed_token_ids: set[str] = set() # NOTE(lufang): In the mask tensor, if the corresponding token allowed, @@ -244,6 +239,10 @@ class InputBatch: self.req_output_token_ids: list[Optional[list[int]]] = [] + # Store provided logitsprocs. If none are provided, initialize empty + # data structure + self.logitsprocs = logitsprocs or LogitsProcessors() + # This is updated each time the batch constituents change. self.sampling_metadata = self._make_sampling_metadata() @@ -255,28 +254,35 @@ class InputBatch: # while performing state updates to the batch. return cast(list[str], self._req_ids) - def _get_next_add_index(self) -> int: - if (req_index := self.batch_update_builder.pop_removed()) is not None: - # Fill the empty index. - return req_index - # Append to end - return self.num_reqs - def _register_add_request(self, request: "CachedRequestState") -> int: - """Track add-request operations""" - req_index = self._get_next_add_index() - assert req_index < self.max_num_reqs - params = (request.sampling_params - if request.sampling_params else request.pooling_params) + """Track add-request operations for logits processors. + Not applicable to pooling models. + """ + + # Detailed added request metadata is only required for non-pooling + # models, to support logitsprocs + assert request.sampling_params + + # Fill the next empty index if there is one. + if (new_req_index := self.batch_update_builder.pop_removed()) is None: + # Append to end otherwise. 
+ new_req_index = self.num_reqs + + assert new_req_index < self.max_num_reqs self.batch_update_builder.added.append( - (req_index, params, request.output_token_ids)) - return req_index + (new_req_index, request.sampling_params, request.prompt_token_ids, + request.output_token_ids)) + return new_req_index def add_request( self, request: "CachedRequestState", ) -> int: - req_index = self._register_add_request(request) + if not self.is_pooling_model: + # New request index bookkeeping for autoregressive models. + req_index = self._register_add_request(request) + else: + req_index = self.num_reqs req_id = request.req_id if req_index == len(self._req_ids): @@ -411,7 +417,10 @@ class InputBatch: req_index = self.req_id_to_index.pop(req_id, None) if req_index is None: return None - self.batch_update_builder.removed_append(req_index) + if not self.is_pooling_model: + # Autoregressive models require bookkeeping of removed requests to + # support logitsprocs. + self.batch_update_builder.removed_append(req_index) self._req_ids[req_index] = None self.req_output_token_ids[req_index] = None @@ -446,6 +455,8 @@ class InputBatch: return req_index def swap_states(self, i1: int, i2: int) -> None: + # For autoregressive models, track detailed request reordering info + # to support logitsprocs self.batch_update_builder.moved.append( (i1, i2, MoveDirectionality.SWAP)) old_id_i1 = self._req_ids[i1] @@ -513,11 +524,18 @@ class InputBatch: swaps: list of (from,to) swap tuples for moved requests empty_req_indices: indices not filled by condensation """ + num_reqs = self.num_reqs + + if self.is_pooling_model: + # Will be contiguous in pooling case, just trim the lists. + del self._req_ids[num_reqs:] + del self.req_output_token_ids[num_reqs:] + return + if not (empty_req_indices := self.batch_update_builder.removed): # All removed requests were replaced by added requests, or else no # requests were removed at all. 
No condense() needed return - num_reqs = self.num_reqs if num_reqs == 0: # The batched states are empty. self._req_ids.clear() @@ -541,6 +559,8 @@ class InputBatch: # Move active request down into empty request # index. self.batch_update_builder.pop_removed() + # Autoregressive models require detailed tracking of condense + # operations to support logitsprocs self.batch_update_builder.moved.append( (last_req_index, empty_index, MoveDirectionality.UNIDIRECTIONAL)) @@ -596,15 +616,20 @@ class InputBatch: last_req_index -= 1 # Trim lists to the batch size. - del self._req_ids[self.num_reqs:] - del self.req_output_token_ids[self.num_reqs:] + del self._req_ids[num_reqs:] + del self.req_output_token_ids[num_reqs:] def refresh_metadata(self): - """Apply batch updates, reset input batch at end of step + """Apply any batch updates to sampling metadata.""" - * Apply batch add/remove/permute to logits procs' states - * If batch state is modified, update sampling metadata - """ + if self.is_pooling_model: + # Batch changes every step for pooling models. + self.sampling_metadata = self._make_sampling_metadata() + return + + # For non-pooling models - generate and apply logitsprocs update; + # reset batch update tracking. + # Update sampling metadata if batch state is changed. 
batch_update = self.batch_update_builder.get_and_reset(self.num_reqs) for logit_proc in self.logitsprocs.all: logit_proc.update_state(batch_update) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5ee44a82574c0..4219d9147ada2 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -68,6 +68,7 @@ from vllm.v1.kv_cache_interface import (AttentionSpec, from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors, ModelRunnerOutput) from vllm.v1.pool.metadata import PoolingMetadata +from vllm.v1.sample.logits_processor import LogitsProcessors, build_logitsprocs from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import RejectionSampler from vllm.v1.sample.sampler import Sampler @@ -80,7 +81,6 @@ from vllm.v1.worker.kv_connector_model_runner_mixin import ( KVConnectorModelRunnerMixin, KVConnectorOutput) from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin -from ..sample.logits_processor import LogitsProcessorManager from .utils import (AttentionGroup, MultiModalBudget, bind_kv_cache, gather_mm_placeholders, initialize_kv_cache_for_kv_sharing, sanity_check_mm_encoder_outputs, scatter_mm_placeholders) @@ -221,6 +221,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): vocab_size=self.model_config.get_vocab_size(), block_sizes=[self.cache_config.block_size], is_spec_decode=bool(self.vllm_config.speculative_config), + logitsprocs=build_logitsprocs( + self.vllm_config, self.device, self.pin_memory, + self.is_pooling_model, + self.vllm_config.model_config.logits_processors), + is_pooling_model=self.is_pooling_model, ) # TODO(woosuk): Provide an option to tune the max cudagraph batch size. 
@@ -2447,7 +2452,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): output_token_ids=[[] for _ in range(num_reqs)], allowed_token_ids_mask=None, bad_words_token_ids={}, - logitsprocs=LogitsProcessorManager(), + logitsprocs=LogitsProcessors(), ) try: sampler_output = self.sampler(logits=logits, @@ -2968,6 +2973,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): vocab_size=self.model_config.get_vocab_size(), block_sizes=block_sizes, is_spec_decode=bool(self.vllm_config.speculative_config), + logitsprocs=self.input_batch.logitsprocs, + is_pooling_model=self.is_pooling_model, ) def _allocate_kv_cache_tensors( From a258ad8bcc0014c04d11a9bc8c6591b379c31b68 Mon Sep 17 00:00:00 2001 From: Jinzhen Lin Date: Sun, 17 Aug 2025 08:41:23 +0800 Subject: [PATCH 045/361] [Bugfix] fix qwen3 moe fp8 accuracy issue (#23031) Signed-off-by: Jinzhen Lin --- vllm/model_executor/layers/quantization/fp8.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index a497449132510..f07be08554921 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -125,6 +125,10 @@ class Fp8Config(QuantizationConfig): ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None) weight_block_size = cls.get_from_keys_or(config, ["weight_block_size"], None) + if not ignored_layers: + ignored_layers = cls.get_from_keys_or(config, + ["modules_to_not_convert"], + None) return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized, activation_scheme=activation_scheme, ignored_layers=ignored_layers, From 94096a47c92c4a53ad44cfffdca918669c0f89e0 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 16 Aug 2025 22:16:42 -0400 Subject: [PATCH 046/361] [UX] Separate marlin moe config logic from triton moe (#23006) --- .../layers/fused_moe/fused_marlin_moe.py | 20 ++++++------------- 
.../layers/fused_moe/fused_moe.py | 9 +-------- 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index a49d41c18438e..3c6ece6737e4d 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -1,14 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Fused MoE utilities for GPTQ.""" -import functools from typing import Optional import torch import vllm._custom_ops as ops -from vllm.model_executor.layers.fused_moe.fused_moe import ( - moe_align_block_size, try_get_optimal_moe_config) +from vllm.model_executor.layers.fused_moe.fused_moe import moe_align_block_size from vllm.model_executor.layers.quantization.utils.marlin_utils import ( marlin_make_workspace_new, maybe_warn_marlin_atomic_add) from vllm.scalar_type import ScalarType, scalar_types @@ -98,17 +96,11 @@ def fused_marlin_moe(hidden_states: torch.Tensor, N = w2.shape[1] * 16 topk = topk_ids.shape[1] - get_config_func = functools.partial( - try_get_optimal_moe_config, - w1.shape, - w2.shape, - topk_ids.shape[1], - None, - is_marlin=True, - ) - config = get_config_func(M) - - block_size_m = config["BLOCK_SIZE_M"] + # M block size selection logic + # TODO: tune this further for specific models + for block_size_m in [8, 16, 32, 48, 64]: + if M * topk / E / block_size_m < 0.9: + break if global_num_experts == -1: global_num_experts = E diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index e58a9e568d4a4..3579ca22bafc7 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -801,7 +801,6 @@ def get_default_config( K: int, topk: int, dtype: Optional[str], - is_marlin: bool, block_shape: Optional[list[int]] = None, ) -> 
dict[str, int]: if dtype == "fp8_w8a8" and block_shape is not None: @@ -832,11 +831,6 @@ def get_default_config( config = {"BLOCK_SIZE_M": 32, "GROUP_SIZE_M": 1} else: config = {"BLOCK_SIZE_M": 64, "GROUP_SIZE_M": 1} - elif is_marlin: - for block_size_m in [8, 16, 32, 48, 64]: - if M * topk / E / block_size_m < 0.9: - break - return {"BLOCK_SIZE_M": block_size_m} elif M <= E: config = { "BLOCK_SIZE_M": 16, @@ -860,7 +854,6 @@ def try_get_optimal_moe_config( top_k: int, dtype: Optional[str], M: int, - is_marlin: bool = False, block_shape: Optional[list[int]] = None, ) -> dict[str, int]: from vllm.model_executor.layers.fused_moe import get_config @@ -883,7 +876,7 @@ def try_get_optimal_moe_config( else: # Else use the default config config = get_default_config(M, E, N, w1_shape[2], top_k, dtype, - is_marlin, block_shape) + block_shape) return config From 5c32143b9db19ae728087019678843fa238afa82 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 17 Aug 2025 12:05:50 +0800 Subject: [PATCH 047/361] [Refactor] Defer tensor data construction in MultiModalKwargs (#23030) Signed-off-by: DarkLight1337 --- tests/multimodal/test_cache.py | 2 +- tests/v1/test_serial_utils.py | 34 +------ vllm/inputs/registry.py | 2 +- .../models/prithvi_geospatial_mae.py | 2 +- vllm/multimodal/base.py | 2 +- vllm/multimodal/cache.py | 2 +- vllm/multimodal/inputs.py | 96 +++++++++++-------- vllm/multimodal/processing.py | 2 +- vllm/multimodal/utils.py | 12 ++- vllm/sequence.py | 4 +- vllm/v1/serial_utils.py | 17 +--- vllm/v1/worker/gpu_input_batch.py | 2 +- 12 files changed, 73 insertions(+), 104 deletions(-) diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py index e07b73bd257d6..2149f05b6af09 100644 --- a/tests/multimodal/test_cache.py +++ b/tests/multimodal/test_cache.py @@ -25,7 +25,7 @@ def _dummy_item(modality: str, size_by_key: dict[str, int]): def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]): - return MultiModalKwargs.from_items([ + return 
MultiModalKwargs([ _dummy_item(modality, size_by_key) for modality, size_by_key in size_by_key_modality.items() ]) diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py index 0ab4e0bf59cf5..586276ee08aef 100644 --- a/tests/v1/test_serial_utils.py +++ b/tests/v1/test_serial_utils.py @@ -100,38 +100,6 @@ class MyRequest(msgspec.Struct): def test_multimodal_kwargs(): - d = { - "foo": - torch.zeros(20000, dtype=torch.float16), - "bar": [torch.zeros(i * 1000, dtype=torch.int8) for i in range(3)], - "baz": [ - torch.rand((256), dtype=torch.float16), - [ - torch.rand((1, 12), dtype=torch.float32), - torch.rand((3, 5, 7), dtype=torch.float64), - ], [torch.rand((4, 4), dtype=torch.float16)] - ], - } - - # pack mm kwargs into a mock request so that it can be decoded properly - req = MyRequest(mm=[MultiModalKwargs(d)]) - - encoder = MsgpackEncoder() - decoder = MsgpackDecoder(MyRequest) - - encoded = encoder.encode(req) - - assert len(encoded) == 6 - - total_len = sum(memoryview(x).cast("B").nbytes for x in encoded) - - # expected total encoding length, should be 44559, +-20 for minor changes - assert 44539 <= total_len <= 44579 - decoded: MultiModalKwargs = decoder.decode(encoded).mm[0] - assert all(nested_equal(d[k], decoded[k]) for k in d) - - -def test_multimodal_items_by_modality(): e1 = MultiModalFieldElem("audio", "a0", torch.zeros(1000, dtype=torch.bfloat16), MultiModalBatchedField()) @@ -151,7 +119,7 @@ def test_multimodal_items_by_modality(): audio = MultiModalKwargsItem.from_elems([e1]) video = MultiModalKwargsItem.from_elems([e2]) image = MultiModalKwargsItem.from_elems([e3, e4]) - mm = MultiModalKwargs.from_items([audio, video, image]) + mm = MultiModalKwargs([audio, video, image]) # pack mm kwargs into a mock request so that it can be decoded properly req = MyRequest([mm]) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index dc3236508348f..ef146fdfbf97c 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ 
-240,6 +240,6 @@ class InputRegistry: return DummyData( seq_data=SequenceData.from_seqs(dec_data.prompt_token_ids), - multi_modal_data=dec_data.multi_modal_data, + multi_modal_data=dec_data.multi_modal_data.get_data(), multi_modal_placeholders=dec_data.multi_modal_placeholders, ) diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index 20f423cc7603d..68488829071fa 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -136,7 +136,7 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor): type="multimodal", prompt=prompt, prompt_token_ids=[1], - mm_kwargs=MultiModalKwargs.from_items(multimodal_kwargs_items), + mm_kwargs=MultiModalKwargs(multimodal_kwargs_items), mm_hashes=None, mm_placeholders=mm_placeholders, ) diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 7188ed14c5735..ef8f1b2e17b47 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -99,7 +99,7 @@ class MultiModalPlaceholderMap: seq_mm_placeholders = seq_group.multi_modal_placeholders if not seq_mm_data or not seq_mm_placeholders: - return MultiModalKwargs({}), {} + return MultiModalKwargs(), {} placeholder_maps = dict[str, MultiModalPlaceholderMap]() diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 6074a4d54f223..8c4136e06f818 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -46,7 +46,7 @@ class MultiModalCache: ) -> int: # MultiModalKwargs is not a subclass of dict if isinstance(leaf, MultiModalKwargs): - return cls.get_item_size(leaf.data, debug=debug) + return cls.get_item_size(leaf.get_data(), debug=debug) # MultiModalKwargsItem is not a subclass of dict if isinstance(leaf, MultiModalKwargsItem): diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index a33ce146995d8..d3f57cf5338d5 100644 --- a/vllm/multimodal/inputs.py +++ 
b/vllm/multimodal/inputs.py @@ -653,7 +653,7 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): def from_elems(elems: Sequence[MultiModalFieldElem]): return MultiModalKwargsItem({elem.key: elem for elem in elems}) - def __init__(self, data: Mapping[str, MultiModalFieldElem]) -> None: + def __init__(self, data: Mapping[str, MultiModalFieldElem] = {}) -> None: super().__init__(data) modalities = {elem.modality for elem in self.data.values()} @@ -668,9 +668,7 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): return {key: elem.data for key, elem in self.items()} -# NOTE: UserDict is for V0 compatibility. -# V1 should access individual items via `get_item`. -class MultiModalKwargs(UserDict[str, NestedTensors]): +class MultiModalKwargs: """ A dictionary that represents the keyword arguments to [`torch.nn.Module.forward`][]. @@ -714,40 +712,16 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): elems = [v[item_idx] for v in elems_in_modality.values()] items.append(MultiModalKwargsItem.from_elems(elems)) - return MultiModalKwargs.from_items(items) + return MultiModalKwargs(items) - @staticmethod - def from_items( - items: Sequence[MultiModalKwargsItem], - *, - pin_memory: bool = False, - ): - """Construct a new - [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs] - from multiple items.""" - elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) - for item in items: - for key, elem in item.items(): - elems_by_key[key].append(elem) + def __init__(self, items: Sequence[MultiModalKwargsItem] = ()) -> None: + super().__init__() - data = { - key: elems[0].field.reduce_data(elems, pin_memory=pin_memory) - for key, elems in elems_by_key.items() if len(elems) > 0 - } - - return MultiModalKwargs(data, items=items) - - def __init__( - self, - data: Mapping[str, NestedTensors], - *, - items: Optional[Sequence[MultiModalKwargsItem]] = None, - ) -> None: - super().__init__(data) - - items_by_modality = full_groupby(items or [], 
key=lambda x: x.modality) + items_by_modality = full_groupby(items, key=lambda x: x.modality) self._items_by_modality = dict(items_by_modality) + self._data: Optional[Mapping[str, NestedTensors]] = None + @property def modalities(self): return self._items_by_modality.keys() @@ -839,22 +813,41 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): return cast(BatchedTensorInputs, json_mapped) - def __delitem__(self, key: str) -> None: - super().__delitem__(key) + def keys(self): + return self.get_data().keys() + + def values(self): + return self.get_data().values() + + def items(self): + return self.get_data().items() + + def get(self, key: str, /, default=None): + return self.get_data().get(key, default) + + def pop(self, key: str, *args, **kwargs): + data = dict(self.get_data()) + res = data.pop(key, *args, **kwargs) for items in self._items_by_modality.values(): for item in items: - item.pop(key, None) + item.pop(key, *args, **kwargs) + + self._data = None + + return res + + def __iter__(self): + return iter(self.get_data()) + + def __getitem__(self, key: str): + return self.get_data()[key] def __eq__(self, other: object) -> bool: if not isinstance(other, self.__class__): return False - if self._items_by_modality != other._items_by_modality: - return False - ks = self.keys() - return (ks == other.keys() - and all(nested_tensors_equal(self[k], other[k]) for k in ks)) + return self._items_by_modality == other._items_by_modality def _validate_modality(self, method_name: str, modality: str) -> None: if not self._items_by_modality: @@ -888,6 +881,25 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): self._validate_modality("get_items", modality) return self._items_by_modality[modality] + def get_data(self, + *, + pin_memory: bool = False) -> Mapping[str, NestedTensors]: + if self._data is not None: + return self._data + + elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) + for items in self._items_by_modality.values(): + for item in items: + for 
key, elem in item.items(): + elems_by_key[key].append(elem) + + data = { + key: elems[0].field.reduce_data(elems, pin_memory=pin_memory) + for key, elems in elems_by_key.items() if len(elems) > 0 + } + self._data = data + return data + MultiModalPlaceholderDict: TypeAlias = Mapping[str, Sequence[PlaceholderRange]] """ diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 38c5d5d99f63e..4684bf6f3d83a 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1480,7 +1480,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): mm_missing_kwargs=mm_missing_kwargs, ) - mm_kwargs = MultiModalKwargs.from_items([ + mm_kwargs = MultiModalKwargs([ item for cache_items in mm_cache_items_merged.values() for item in cache_items ]) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index f914d0dc6c5e7..a80f09bb19272 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -402,12 +402,14 @@ def group_mm_kwargs_by_modality( for modality, items in groupby(mm_kwargs, key=lambda item: item.modality): items_lst = list(items) - # mm_kwargs_group = MultiModalKwargs.from_items(items_lst, - # pin_memory=pin_memory) + # mm_kwargs_group = MultiModalKwargs(items_lst) \ + # .get_data(pin_memory=pin_memory) # if device is not None: - # mm_kwargs_group = json_map_leaves(lambda x: x.to(device=device), - # mm_kwargs_group.data) + # mm_kwargs_group = json_map_leaves( + # lambda x: x.to(device=device), + # mm_kwargs_group, + # ) # TODO: Once V0 is removed, we can use the merging logic above # to avoid creating an extra batch dimension (except for fields @@ -415,7 +417,7 @@ def group_mm_kwargs_by_modality( # We will also need to update each model to remove `flatten_bn`. 
mm_kwargs_group = MultiModalKwargs.as_kwargs( MultiModalKwargs.batch( - [MultiModalKwargs.from_items([item]) for item in items_lst], + [MultiModalKwargs([item]) for item in items_lst], pin_memory=pin_memory, ), device=device, diff --git a/vllm/sequence.py b/vllm/sequence.py index cbe63f8d1d4e4..b3be10b6bb612 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -524,7 +524,7 @@ class Sequence: if self.inputs["type"] == "multimodal": return self.inputs["mm_kwargs"] - return MultiModalKwargs({}) + return MultiModalKwargs() @property def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: @@ -780,7 +780,7 @@ class SequenceGroup: return self.first_seq.multi_modal_data elif self.encoder_seq is not None: return self.encoder_seq.multi_modal_data - return MultiModalKwargs({}) + return MultiModalKwargs() @property def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 3f0fad8a64d0a..2857d8ef42909 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -117,16 +117,9 @@ class MsgpackEncoder: return self._encode_mm_item(obj) if isinstance(obj, MultiModalKwargs): - mm: MultiModalKwargs = obj - if not mm.modalities: - # just return the main dict if there are no modalities. - return dict(mm) - - # ignore the main dict, it will be re-indexed. - # Any tensors *not* indexed by modality will be ignored. 
return [ self._encode_mm_item(item) - for itemlist in mm._items_by_modality.values() + for itemlist in obj._items_by_modality.values() for item in itemlist ] @@ -268,13 +261,7 @@ class MsgpackDecoder: if issubclass(t, MultiModalKwargsItem): return self._decode_mm_item(obj) if issubclass(t, MultiModalKwargs): - if isinstance(obj, list): - return MultiModalKwargs.from_items( - self._decode_mm_items(obj)) - return MultiModalKwargs({ - k: self._decode_nested_tensors(v) - for k, v in obj.items() - }) + return MultiModalKwargs(self._decode_mm_items(obj)) if t is UtilityResult: return self._decode_utility_result(obj) return obj diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index e718d9d5e0fb0..3d4cf27a6ccf3 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -58,7 +58,7 @@ class CachedRequestState: @deprecated("`mm_inputs` is superseded by `mm_kwargs` and will be " "removed in v0.13. Please use `mm_kwargs` instead.") def mm_inputs(self) -> list[MultiModalKwargs]: - return [MultiModalKwargs.from_items([item]) for item in self.mm_kwargs] + return [MultiModalKwargs([item]) for item in self.mm_kwargs] def get_token_id(self, idx: int) -> int: if idx < self.num_prompt_tokens: From 87f48623a537d379284bb3e3d1b23ab0ee2af1c1 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Sun, 17 Aug 2025 12:49:14 +0800 Subject: [PATCH 048/361] [Misc] method name typo fix (#23042) Signed-off-by: Andy Xie --- vllm/v1/worker/cpu_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index 11b96d946365d..a7180afbd64b5 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -29,7 +29,7 @@ class CPUModelRunner(GPUModelRunner): self.use_cuda_graph = False self.cascade_attn_enabled = False - self._postprocess_tenosrs() + self._postprocess_tensors() def _may_reorder_batch(self, scheduler_output: 
"SchedulerOutput") -> None: """ @@ -59,7 +59,7 @@ class CPUModelRunner(GPUModelRunner): self.attn_groups[0][0].metadata_builder.reorder_batch( self.input_batch, scheduler_output) - def _postprocess_tenosrs(self) -> None: + def _postprocess_tensors(self) -> None: # Note: replace device tensors with cpu tensors def replace_tensor(obj: Any, cpu_attr_name: str, device_attr_name) -> None: From 4d4061b6e73d82f7e561fff64c2bd914d66ebaff Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 17 Aug 2025 13:03:24 +0800 Subject: [PATCH 049/361] [Kernel] Add cuda kernel for gpt_oss activation (#22951) Signed-off-by: Jee Jee Li --- csrc/activation_kernels.cu | 59 +++++++++++++++++++ csrc/ops.h | 2 + csrc/torch_bindings.cpp | 6 ++ tests/kernels/core/test_activation.py | 45 ++++++++++++-- vllm/model_executor/layers/activation.py | 41 ++++++++++++- .../layers/fused_moe/fused_marlin_moe.py | 22 ++----- .../layers/fused_moe/fused_moe.py | 18 ++---- .../layers/quantization/utils/mxfp4_utils.py | 4 +- vllm/model_executor/models/gpt_oss.py | 2 +- 9 files changed, 157 insertions(+), 42 deletions(-) diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 55e6596797010..a4a880f13cf7e 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -128,6 +128,45 @@ __global__ void act_and_mul_kernel_with_param( } } +template +__device__ __forceinline__ T swigluoai_and_mul(const T& gate, const T& up, + float alpha, float limit) { + // clamp gate: min=None, max=limit + const float gate_f = (float)gate; + const float clamped_gate = gate_f > limit ? limit : gate_f; + + // clamp up: min=-limit, max=limit + const float up_f = (float)up; + const float clamped_up = + up_f > limit ? limit : (up_f < -limit ? 
-limit : up_f); + + // glu = gate * sigmoid(gate * alpha) + const float sigmoid_val = 1.0f / (1.0f + expf(-clamped_gate * alpha)); + const float glu = clamped_gate * sigmoid_val; + + // (up + 1) * glu + return (T)((clamped_up + 1.0f) * glu); +} + +template +__global__ void swigluoai_and_mul_kernel( + scalar_t* __restrict__ out, // [..., d] + const scalar_t* __restrict__ input, // [..., 2, d] + const int d, const float alpha, const float limit) { + const int64_t token_idx = blockIdx.x; + // TODO: Vectorize loads and stores. + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + // gate = x[..., ::2] (even indices) + const scalar_t gate = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx]); + // up = x[..., 1::2] (odd indices) + const scalar_t up = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx + 1]); + + out[token_idx * d + idx] = ACT_FN(gate, up, alpha, limit); + } +} + } // namespace vllm #define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \ @@ -145,11 +184,31 @@ __global__ void act_and_mul_kernel_with_param( PARAM); \ }); +#define LAUNCH_SIGLUOAI_AND_MUL(KERNEL, ALPHA, LIMIT) \ + int d = input.size(-1) / 2; \ + int64_t num_tokens = input.numel() / input.size(-1); \ + dim3 grid(num_tokens); \ + dim3 block(std::min(d, 1024)); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "clamp_swiglu_kernel_with_params", [&] { \ + vllm::swigluoai_and_mul_kernel> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), d, ALPHA, \ + LIMIT); \ + }); + void fatrelu_and_mul(torch::Tensor& out, // [..., d], torch::Tensor& input, // [..., 2 * d] double threshold) { LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold); } +void swigluoai_and_mul(torch::Tensor& out, // [..., d] + torch::Tensor& input, // [..., 2 * d] + double alpha, double limit) { + LAUNCH_SIGLUOAI_AND_MUL(vllm::swigluoai_and_mul, alpha, limit); +} 
namespace vllm { // Element-wise activation kernel template. diff --git a/csrc/ops.h b/csrc/ops.h index 6e39758f16a1f..64bcec6ca1527 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -138,6 +138,8 @@ void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input); void fatrelu_and_mul(torch::Tensor& out, torch::Tensor& input, double threshold); +void swigluoai_and_mul(torch::Tensor& out, torch::Tensor& input, + double alpha = 1.702, double limit = 7.0); void gelu_new(torch::Tensor& out, torch::Tensor& input); diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 5fee106335d3b..7079671c2eb16 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -130,6 +130,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("fatrelu_and_mul(Tensor! out, Tensor input, float threshold) -> ()"); ops.impl("fatrelu_and_mul", torch::kCUDA, &fatrelu_and_mul); + ops.def( + "swigluoai_and_mul(Tensor! out, Tensor input, float alpha=1.702, float " + "limit=7.0) " + "-> ()"); + ops.impl("swigluoai_and_mul", torch::kCUDA, &swigluoai_and_mul); + // GELU implementation used in GPT-2. ops.def("gelu_new(Tensor! 
out, Tensor input) -> ()"); ops.impl("gelu_new", torch::kCUDA, &gelu_new); diff --git a/tests/kernels/core/test_activation.py b/tests/kernels/core/test_activation.py index 29c5e70a8ba85..ec5c60fd7b0e2 100644 --- a/tests/kernels/core/test_activation.py +++ b/tests/kernels/core/test_activation.py @@ -11,7 +11,7 @@ from tests.kernels.utils import opcheck from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, GeluAndMul, MulAndSilu, NewGELU, QuickGELU, - SiluAndMul) + SiluAndMul, SwigluOAIAndMul) from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] @@ -25,7 +25,15 @@ CUDA_DEVICES = [ @pytest.mark.parametrize( "activation", - ["silu_and_mul", "mul_and_silu", "gelu", "gelu_tanh", "fatrelu"]) + [ + "silu_and_mul", + "mul_and_silu", + "gelu", + "gelu_tanh", + "fatrelu", + "swigluoai_and_mul", + ], +) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @@ -59,18 +67,43 @@ def test_act_and_mul( threshold = random.uniform(0, 1) layer = FatreluAndMul(threshold) fn = torch.ops._C.fatrelu_and_mul + elif activation == "swigluoai_and_mul": + layer = SwigluOAIAndMul() + fn = torch.ops._C.swigluoai_and_mul out = layer(x) ref_out = layer.forward_native(x) - # The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are - # equivalent to the native PyTorch implementations, so we can do exact - # comparison. 
- torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0) + if activation == "swigluoai_and_mul": + + rtol = { + #For fp16, change the relative tolerance from 1e-3 to 2e-3 + torch.float16: + 2e-3, + torch.bfloat16: + 2e-2, + torch.float: + 1.3e-6 + } + + def _get_rtol(output) -> float: + return rtol[output.dtype] + + torch.testing.assert_close(out, + ref_out, + atol=get_default_atol(out), + rtol=_get_rtol(out)) + else: + # The SiluAndMul, MulAndSilu, GELU and FatReLU implementations are + # equivalent to the native PyTorch implementations, so we can do exact + # comparison. + torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0) d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) if activation == "fatrelu": opcheck(fn, (out, x, threshold)) + elif activation == "swigluoai_and_mul": + opcheck(fn, (out, x, layer.alpha, layer.limit)) else: opcheck(fn, (out, x)) diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 7ce44174ead6d..86ab4f546d127 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -239,6 +239,35 @@ class GeluAndMul(CustomOp): return f'approximate={repr(self.approximate)}' +@CustomOp.register("swigluoai_and_mul") +class SwigluOAIAndMul(CustomOp): + # https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/models/gpt_oss/modeling_gpt_oss.py#L106-L110 + def __init__(self, alpha: float = 1.702, limit: float = 7.0): + super().__init__() + self.alpha = alpha + self.limit = limit + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + + gate, up = x[..., ::2], x[..., 1::2] + gate = gate.clamp(min=None, max=self.limit) + up = up.clamp(min=-self.limit, max=self.limit) + glu = gate * torch.sigmoid(gate * self.alpha) + gated_output = (up + 1) * glu + return gated_output + + def forward_cuda(self, 
x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + torch.ops._C.swigluoai_and_mul(out, x, self.alpha, self.limit) + return out + + def extra_repr(self) -> str: + return f"alpha={repr(self.alpha)}, limit={repr(self.limit)}" + + @CustomOp.register("gelu_new") class NewGELU(CustomOp): @@ -330,6 +359,7 @@ class ReLUSquaredActivation(CustomOp): return torch.square(F.relu(x)) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + #TODO : implement cuda kenrels return self.forward_native(x) @@ -406,9 +436,14 @@ def get_act_fn(act_fn_name: str) -> nn.Module: _ACTIVATION_AND_MUL_REGISTRY = LazyDict({ - "gelu": lambda: GeluAndMul(), - "silu": lambda: SiluAndMul(), - "geglu": lambda: GeluAndMul(), + "gelu": + lambda: GeluAndMul(), + "silu": + lambda: SiluAndMul(), + "geglu": + lambda: GeluAndMul(), + "swigluoai": + lambda *args, **kwargs: SwigluOAIAndMul(*args, **kwargs), }) diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 3c6ece6737e4d..1e3ac6cd79f68 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -161,25 +161,13 @@ def fused_marlin_moe(hidden_states: torch.Tensor, if activation == "silu": torch.ops._C.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N)) - elif activation == "swiglu_oai": - # NOTE: in gpt-oss, the gate_proj and up_proj is interleaved - # - interleaved: gate, up = gate_up[..., ::2], gate_up[..., 1::2] - # - origin: gate, up = gate_up[..., :N], gate_up[..., N:] - - @torch.compile(dynamic=True) - def swiglu_oai(gate_up): - alpha = 1.702 - limit = 7.0 - gate, up = gate_up[..., ::2], gate_up[..., 1::2] - gate = gate.clamp(min=None, max=limit) - up = up.clamp(min=-limit, max=limit) - glu = gate * torch.sigmoid(gate * alpha) - return (up + 1) * glu - - 
intermediate_cache2 = swiglu_oai(intermediate_cache1) + elif activation == "swigluoai": + # alpha = 1.702, limit = 7.0 + torch.ops._C.swigluoai_and_mul(intermediate_cache2, + intermediate_cache1.view(-1, 2 * N)) else: raise ValueError(f"Unsupported activation: {activation}. " - "Only silu and swiglu_oai activations are supported.") + "Only silu and swigluoai activations are supported.") if expert_map is not None: intermediate_cache3.zero_() diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 3579ca22bafc7..02b7b65f4a025 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1621,17 +1621,6 @@ def fused_experts_impl( block_shape=block_shape, B_bias=w1_bias) - # TODO fused kernel - def swiglu_oai(gate_up): - alpha = 1.702 - limit = 7.0 - gate, up = gate_up[..., ::2], gate_up[..., 1::2] - gate = gate.clamp(min=None, max=limit) - up = up.clamp(min=-limit, max=limit) - glu = gate * torch.sigmoid(gate * alpha) - gated_output = (up + 1) * glu - return gated_output - # Activation function with multiplication if activation == "silu" and is_act_and_mul: torch.ops._C.silu_and_mul(intermediate_cache2, @@ -1639,13 +1628,16 @@ def fused_experts_impl( elif activation == "gelu" and is_act_and_mul: torch.ops._C.gelu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) + elif activation == "swigluoai" and is_act_and_mul: + # alpha = 1.702, limit = 7.0 + torch.ops._C.swigluoai_and_mul(intermediate_cache2, + intermediate_cache1.view(-1, N)) # Activation function without multiplication elif activation == "silu": intermediate_cache2 = F.silu(intermediate_cache1.view(-1, N)) elif activation == "gelu": intermediate_cache2 = F.gelu(intermediate_cache1.view(-1, N)) - elif activation == "swiglu_oai": - intermediate_cache2 = swiglu_oai(intermediate_cache1.view(-1, N)) + else: raise ValueError(f"Unsupported FusedMoe activation: {activation}, " f"with 
is_act_and_mul={is_act_and_mul}.") diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index deeb69bcad0ec..48f9cc3737e47 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -61,14 +61,14 @@ def _can_support_mxfp4(use_grouped_topk: bool = False, e_score_correction_bias: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, scoring_func: str = "softmax", - activation: str = "swiglu_oai", + activation: str = "swigluoai", expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None): return not (use_grouped_topk or topk_group or num_expert_group or expert_map or custom_routing_function or e_score_correction_bias or apply_router_weight_on_input - or scoring_func != "softmax" or activation != "swiglu_oai" + or scoring_func != "softmax" or activation != "swigluoai" or expert_load_view or logical_to_physical_map or logical_replica_count) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 7c7712dbe106e..2f5d9ddd9054f 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -159,7 +159,7 @@ class MLPBlock(torch.nn.Module): prefix=f"{prefix}.experts", apply_router_weight_on_input=False, has_bias=True, - activation="swiglu_oai") + activation="swigluoai") def forward(self, x: torch.Tensor) -> torch.Tensor: t = self.norm(x) From fe0411fc6fa32cebeacd3a3aef87a591e7309c45 Mon Sep 17 00:00:00 2001 From: 947132885 <947132885@qq.com> Date: Sun, 17 Aug 2025 16:46:36 +0800 Subject: [PATCH 050/361] [Bugfix] should use stack instead of concat (#22972) Signed-off-by: 947132885 <947132885@qq.com> Signed-off-by: Isotr0py Co-authored-by: Isotr0py --- vllm/model_executor/models/transformers.py | 14 ++++++++++++-- 1 
file changed, 12 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 4ec2b683fc33a..f3b7263ca3872 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -694,6 +694,17 @@ class TransformersForCausalLM(TransformersBase): return logits +def flatten_and_concat(x: list[torch.Tensor]) -> torch.Tensor: + """Flatten until a list of tensors can be concatenated then do concat""" + + def _can_concat(x: list[torch.Tensor]): + return len(set(map(lambda _x: _x.shape[1:], x))) == 1 + + if _can_concat(x): + return torch.concat(x) + return flatten_and_concat(flatten_bn(x)) + + @MULTIMODAL_REGISTRY.register_processor( MultiModalProcessor, info=MultiModalProcessingInfo, @@ -766,8 +777,7 @@ class TransformersForMultimodalLM(TransformersForCausalLM, SupportsMultiModal): if isinstance(pixel_values, torch.Tensor): pixel_values = flatten_bn(pixel_values).to(self.dtype) elif is_list_of(pixel_values, torch.Tensor): - pixel_values = flatten_bn(flatten_bn(pixel_values), - concat=True).to(self.dtype) + pixel_values = flatten_and_concat(pixel_values).to(self.dtype) else: raise ValueError( f"Unsupported pixel_values type {type(pixel_values)}. 
" From 16bff144be6739c9f773968ace0b9cd239f67f19 Mon Sep 17 00:00:00 2001 From: Kevinzz Date: Sun, 17 Aug 2025 16:56:20 +0800 Subject: [PATCH 051/361] [Misc] fix typo in the multimodal doc (#23051) --- docs/features/multimodal_inputs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index cdd32924b5668..9d51f9cf52f50 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -216,7 +216,7 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown from vllm import LLM, SamplingParams from qwen_vl_utils import process_vision_info - model_path = "Qwen/Qwen2.5-VL-3B-Instruct/" + model_path = "Qwen/Qwen2.5-VL-3B-Instruct" video_path = "https://content.pexels.com/videos/free-videos.mp4" llm = LLM( From 292084e72ac553dbe14eb897372617a786322a2a Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Sun, 17 Aug 2025 11:52:04 -0400 Subject: [PATCH 052/361] [BugFix] Fix for IMA in FA3 varlen combine (#22967) Signed-off-by: Lucas Wilkinson --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 4e2a0e4533e60..49defccbb1fa4 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 2d3b7508f67ad976f781e2042ace676419dd78dd + GIT_TAG 57b4e68b9f9d94750b46de8f8dbd2bfcc86edd4f GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From c55bc1db26f5e4385c8a2c1b7e6ba8b54ab2e060 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 17 Aug 2025 10:36:46 -0700 Subject: [PATCH 053/361] [Misc] Remove dead return (#23061) 
Signed-off-by: Woosuk Kwon --- vllm/model_executor/models/qwen2_vl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index f2d438b3850b8..9e2f7ca42b4ba 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1225,7 +1225,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, modalities = self._parse_and_validate_multimodal_inputs(**kwargs) if not modalities: return [] - return None # The result multimodal_embeddings is tuple of tensors, with each # tensor correspoending to a multimodal data item (image or video). From 6d243efedab9a03348cbd55fe966b62a08d90676 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 17 Aug 2025 12:41:38 -0700 Subject: [PATCH 054/361] [Misc] Convert use_structured_output property into constant (#23060) Signed-off-by: Woosuk Kwon --- vllm/v1/request.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 8b703b6191fe6..4e99a9ccef46e 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -54,8 +54,7 @@ class Request: time.time() self.status = RequestStatus.WAITING - if sampling_params and sampling_params.guided_decoding is not None: - self.status = RequestStatus.WAITING_FOR_FSM + self.use_structured_output = False self.events: list[EngineCoreEvent] = [] self.stop_reason: Union[int, str, None] = None @@ -63,12 +62,15 @@ class Request: self.kv_transfer_params: Optional[dict[str, Any]] = None if pooling_params is not None: + # Pooling models. self.max_tokens = 1 elif sampling_params is not None: + # Generative models. 
assert sampling_params.max_tokens is not None self.max_tokens = sampling_params.max_tokens if sampling_params.guided_decoding is not None: self.status = RequestStatus.WAITING_FOR_FSM + self.use_structured_output = True if sampling_params.extra_args is not None: self.kv_transfer_params = \ @@ -192,11 +194,6 @@ class Request: num_tokens = self.mm_positions[input_id].length return num_tokens - @property - def use_structured_output(self) -> bool: - return self.sampling_params is not None and \ - self.sampling_params.guided_decoding is not None - def record_event( self, event_type: EngineCoreEventType, From 21e39436c8062ebbf4a160eebf56d7d303896e68 Mon Sep 17 00:00:00 2001 From: Calvin Chen Date: Mon, 18 Aug 2025 05:45:42 +0800 Subject: [PATCH 055/361] [XPU] fix xpu to set cudagraph batch sizes (#23044) Signed-off-by: calvin chen --- vllm/v1/worker/gpu_model_runner.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4219d9147ada2..adaa1306f6ca4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -232,8 +232,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # The convention is different. # self.cudagraph_batch_sizes sorts in ascending order. # The batch sizes in the config are in descending order. - self.cudagraph_batch_sizes = list( - reversed(self.compilation_config.cudagraph_capture_sizes)) + if self.compilation_config.cudagraph_capture_sizes and \ + self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE: + self.cudagraph_batch_sizes = list( + reversed(self.compilation_config.cudagraph_capture_sizes)) # Cache the device properties. 
self._init_device_properties() From 0fc8fa751a4321d6531467537ff77cf3c1c70260 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Sun, 17 Aug 2025 15:56:07 -0700 Subject: [PATCH 056/361] fix: gptq marlin weight loading failure (#23066) --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index bd14ab9ef6c69..c5d1e017014f3 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -56,7 +56,7 @@ def get_moe_quant_method( # Dynamic per module/layer rules may override base config override_config(cloned_config, prefix=prefix) - return moe_method_cls(cloned_config) + return moe_method_cls(cloned_config, layer.moe_config) return None From 8ea0c2753a273e24957ab4587c200a3254ebe970 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 17 Aug 2025 18:16:03 -0700 Subject: [PATCH 057/361] [Misc] Minor code cleanup for _get_prompt_logprobs_dict (#23064) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index adaa1306f6ca4..fc320be1c3bda 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1722,7 +1722,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Compute prompt logprobs if needed. prompt_logprobs_dict = self._get_prompt_logprobs_dict( hidden_states[:num_scheduled_tokens], - scheduler_output, + scheduler_output.num_scheduled_tokens, ) # Get the valid generated tokens. 
@@ -2064,7 +2064,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def _get_prompt_logprobs_dict( self, hidden_states: torch.Tensor, - scheduler_output: "SchedulerOutput", + num_scheduled_tokens: dict[str, int], ) -> dict[str, Optional[LogprobsTensors]]: num_prompt_logprobs_dict = self.input_batch.num_prompt_logprobs if not num_prompt_logprobs_dict: @@ -2077,8 +2077,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # maintainable loop over optimal performance. completed_prefill_reqs = [] for req_id, num_prompt_logprobs in num_prompt_logprobs_dict.items(): - - num_tokens = scheduler_output.num_scheduled_tokens[req_id] + num_tokens = num_scheduled_tokens[req_id] # Get metadata for this request. request = self.requests[req_id] From 7be3a59d8ee7014d6462c258222cbfa8be815831 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Mon, 18 Aug 2025 13:09:08 +0800 Subject: [PATCH 058/361] [Misc] enhance static type hint (#23059) Signed-off-by: Andy Xie --- vllm/v1/worker/lora_model_runner_mixin.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 2fbdee4724e35..84ed46989ea97 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -8,6 +8,7 @@ from contextlib import contextmanager from typing import Union import numpy as np +import torch import torch.nn as nn from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig @@ -31,7 +32,8 @@ class LoRAModelRunnerMixin: def load_lora_model(self, model: nn.Module, model_config: ModelConfig, scheduler_config: SchedulerConfig, - lora_config: LoRAConfig, device: str) -> nn.Module: + lora_config: LoRAConfig, + device: torch.device) -> nn.Module: if not supports_lora(model): raise ValueError( From 9f1c6422549d37eee22bfa4dbadaaa91d95e98ba Mon Sep 17 00:00:00 2001 From: double7 <33449816+DoubleVII@users.noreply.github.com> Date: 
Mon, 18 Aug 2025 13:09:11 +0800 Subject: [PATCH 059/361] [Bugfix] fix Qwen2.5-Omni processor output mapping (#23058) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: double7 <33449816+DoubleVII@users.noreply.github.com> Co-authored-by: 杨森 Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/models/qwen2_5_omni_thinker.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index e95295c31885a..59411eb7503bf 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -88,6 +88,11 @@ def _qwen2_5_omni_thinker_field_config(hf_inputs: Mapping[str, torch.Tensor]): video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3))) video_grid_sizes = video_grid_thw.prod(-1) + # vllm use `second_per_grid_ts` to compute multimodal rotary embedding + video_second_per_grid = hf_inputs.get("video_second_per_grid", None) + if video_second_per_grid is not None: + hf_inputs["second_per_grid_ts"] = video_second_per_grid + return dict( input_audio_features=MultiModalFieldConfig.flat_from_sizes( "audio", audio_feature_lengths, dim=1), From b2fd0b81e065c677ceebecb9a0e1ee6f226b7cec Mon Sep 17 00:00:00 2001 From: Andy Lo Date: Mon, 18 Aug 2025 07:10:26 +0200 Subject: [PATCH 060/361] [Bugfix][CI] Machete kernels: deterministic ordering for more cache hits (#23055) Signed-off-by: Andy Lo --- csrc/quantization/machete/generate.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index 9af7833d09f32..88b3f9c734a30 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -349,9 +349,12 @@ def to_cute_constant(value: list[int]): def 
unique_schedules(impl_configs: list[ImplConfig]): - return list( - set(sch for impl_config in impl_configs - for sch in impl_config.schedules)) + # Use dict over set for deterministic ordering + return list({ + sch: None + for impl_config in impl_configs + for sch in impl_config.schedules + }.keys()) def unsigned_type_with_bitwidth(num_bits): From 08d5f7113a024818b2867782c2539794b7aa162b Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Mon, 18 Aug 2025 13:16:21 +0800 Subject: [PATCH 061/361] [Misc] refactor function name (#23029) Signed-off-by: Andy Xie --- vllm/platforms/cpu.py | 2 +- vllm/v1/worker/cpu_worker.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 0b16a8e1d1d8b..fe258f76b9d7a 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -268,7 +268,7 @@ class CpuPlatform(Platform): DEFAULT_MAX_NUM_BATCHED_TOKENS) @classmethod - def get_allowed_cpu_memory_node_list( + def get_allowed_cpu_core_node_list( cls) -> tuple[list[int], list[LogicalCPUInfo]]: assert platform.system() == "Linux" diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index 2dc28d93049ab..f83d6804840eb 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -132,7 +132,7 @@ class CPUWorker(Worker): """ allowed_numa_nodes, logical_cpu_list = \ - CpuPlatform.get_allowed_cpu_memory_node_list() + CpuPlatform.get_allowed_cpu_core_node_list() assert len(allowed_numa_nodes) >= self.parallel_config.world_size, ( f"No enough allowed NUMA nodes to bind threads of " f"{self.parallel_config.world_size} CPUWorkers. 
" From 89657a557c6831cca9fa5e59822af0cf27d67a98 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 17 Aug 2025 23:33:29 -0700 Subject: [PATCH 062/361] [Misc] Fix backward compatibility from #23030 (#23070) Signed-off-by: Roger Wang Co-authored-by: Roger Wang --- vllm/multimodal/base.py | 9 ++++++--- vllm/multimodal/inputs.py | 6 +++--- vllm/sequence.py | 4 +++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index ef8f1b2e17b47..c4bb8d56ce3eb 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Generic, NamedTuple, TypeVar if TYPE_CHECKING: from vllm.sequence import SequenceGroupMetadata -from .inputs import MultiModalKwargs, PlaceholderRange +from .inputs import MultiModalKwargs, NestedTensors, PlaceholderRange _T = TypeVar("_T") @@ -56,7 +56,8 @@ class MultiModalPlaceholderMap: @classmethod def from_seq_group( cls, seq_group: "SequenceGroupMetadata", positions: range - ) -> tuple[MultiModalKwargs, dict[str, "MultiModalPlaceholderMap"]]: + ) -> tuple[dict[str, NestedTensors], dict[str, + "MultiModalPlaceholderMap"]]: """ Returns the multi-modal items that intersect with the portion of a prompt (``seq_group``) represented by ``positions``, as well as a @@ -99,7 +100,7 @@ class MultiModalPlaceholderMap: seq_mm_placeholders = seq_group.multi_modal_placeholders if not seq_mm_data or not seq_mm_placeholders: - return MultiModalKwargs(), {} + return MultiModalKwargs().get_data(), {} placeholder_maps = dict[str, MultiModalPlaceholderMap]() @@ -116,6 +117,8 @@ class MultiModalPlaceholderMap: placeholder_maps[modality] = placeholder_map + seq_mm_data = seq_mm_data if isinstance( + seq_mm_data, dict) else seq_mm_data.get_data() return seq_mm_data, placeholder_maps def append_items_from_seq_group( diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index d3f57cf5338d5..3e0bfce59c5fe 100644 --- a/vllm/multimodal/inputs.py +++ 
b/vllm/multimodal/inputs.py @@ -664,7 +664,7 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): def modality(self) -> str: return self._modality - def get_data(self) -> Mapping[str, NestedTensors]: + def get_data(self) -> dict[str, NestedTensors]: return {key: elem.data for key, elem in self.items()} @@ -720,7 +720,7 @@ class MultiModalKwargs: items_by_modality = full_groupby(items, key=lambda x: x.modality) self._items_by_modality = dict(items_by_modality) - self._data: Optional[Mapping[str, NestedTensors]] = None + self._data: Optional[dict[str, NestedTensors]] = None @property def modalities(self): @@ -883,7 +883,7 @@ class MultiModalKwargs: def get_data(self, *, - pin_memory: bool = False) -> Mapping[str, NestedTensors]: + pin_memory: bool = False) -> dict[str, NestedTensors]: if self._data is not None: return self._data diff --git a/vllm/sequence.py b/vllm/sequence.py index b3be10b6bb612..2cb254381eff4 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -22,6 +22,7 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import RequestOutputKind, SamplingParams if TYPE_CHECKING: + from vllm.multimodal.inputs import NestedTensors from vllm.v1.worker.kv_connector_model_runner_mixin import ( KVConnectorOutput) @@ -978,7 +979,8 @@ class SequenceGroupMetadata( state: Optional[SequenceGroupState] = msgspec.field( default_factory=lambda: SequenceGroupState()) token_type_ids: Optional[list[int]] = None - multi_modal_data: Optional[MultiModalKwargs] = None + multi_modal_data: Optional[Union[MultiModalKwargs, + dict[str, "NestedTensors"]]] = None multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None encoder_seq_data: Optional[SequenceData] = None cross_block_table: Optional[list[int]] = None From 5f5664b3e4ff8046e26c36165a1294205cb429c5 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Mon, 18 Aug 2025 15:04:08 +0800 Subject: [PATCH 063/361] [XPU] Fix compile size for xpu (#23069) Signed-off-by: Kunshang Ji --- 
vllm/config/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 51db277f65dc9..cd2be212c23db 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3548,7 +3548,7 @@ class VllmConfig: if self.compilation_config.pass_config.enable_sequence_parallelism: self.compilation_config.custom_ops.append("+rms_norm") - if current_platform.is_cuda_alike(): + if current_platform.is_cuda_alike() or current_platform.is_xpu(): # if cudagraph_mode is not explicitly set by users, set default # value if self.compilation_config.cudagraph_mode is None: From 5c79b0d6484d7d4c5fe007c3c7ad04c72d3bc59e Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Mon, 18 Aug 2025 17:47:03 +0800 Subject: [PATCH 064/361] [XPU][CI]add xpu env vars in CI scripts (#22946) Signed-off-by: Kunshang Ji --- .buildkite/scripts/hardware_ci/run-xpu-test.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index deb61a9bafab6..445cd2735c190 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -23,9 +23,13 @@ docker run \ --device /dev/dri \ -v /dev/dri/by-path:/dev/dri/by-path \ --entrypoint="" \ + -e "HF_TOKEN=${HF_TOKEN}" \ + -e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \ --name "${container_name}" \ "${image_name}" \ - sh -c ' + bash -c ' + set -e + echo $ZE_AFFINITY_MASK VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp @@ -35,8 +39,8 @@ 
docker run \ pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/structured_output - pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py - pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py + pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py + pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py pytest -v -s v1/test_serial_utils.py pytest -v -s v1/test_utils.py pytest -v -s v1/test_metrics_reader.py From 27e8d1ea3ea9864f371f639daaa5315bf3250364 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 18 Aug 2025 17:52:00 +0800 Subject: [PATCH 065/361] [Refactor] Define MultiModalKwargsItems separate from MultiModalKwargs (#23053) Signed-off-by: DarkLight1337 --- docs/api/README.md | 1 + docs/contributing/model/multimodal.md | 4 +- .../multimodal/processing/test_common.py | 14 +- .../multimodal/processing/test_glm4_1v.py | 3 +- .../multimodal/processing/test_h2ovl.py | 3 +- .../multimodal/processing/test_internvl.py | 3 +- .../multimodal/processing/test_llama4.py | 10 +- .../multimodal/processing/test_mllama.py | 6 +- .../multimodal/processing/test_mllama4.py | 10 +- .../multimodal/processing/test_nemotron_vl.py | 3 +- .../multimodal/processing/test_qwen2_vl.py | 3 +- tests/models/multimodal/test_tensor_schema.py | 2 +- tests/multimodal/test_cache.py | 11 +- tests/v1/test_serial_utils.py | 22 ++- vllm/executor/msgspec_utils.py | 9 +- vllm/model_executor/models/aria.py | 4 +- vllm/model_executor/models/aya_vision.py | 4 +- 
vllm/model_executor/models/blip2.py | 4 +- vllm/model_executor/models/chameleon.py | 4 +- vllm/model_executor/models/cohere2_vision.py | 4 +- vllm/model_executor/models/deepseek_vl2.py | 7 +- vllm/model_executor/models/florence2.py | 4 +- vllm/model_executor/models/fuyu.py | 4 +- vllm/model_executor/models/gemma3_mm.py | 4 +- vllm/model_executor/models/gemma3n_mm.py | 4 +- vllm/model_executor/models/glm4_1v.py | 10 +- vllm/model_executor/models/glm4v.py | 4 +- vllm/model_executor/models/granite_speech.py | 4 +- vllm/model_executor/models/h2ovl.py | 16 +- .../models/hyperclovax_vision.py | 27 +-- vllm/model_executor/models/idefics3.py | 4 +- vllm/model_executor/models/interns1.py | 13 +- vllm/model_executor/models/internvl.py | 34 ++-- vllm/model_executor/models/keye.py | 7 +- vllm/model_executor/models/kimi_vl.py | 4 +- vllm/model_executor/models/llava.py | 6 +- .../model_executor/models/llava_next_video.py | 4 +- vllm/model_executor/models/llava_onevision.py | 4 +- vllm/model_executor/models/minicpmo.py | 4 +- vllm/model_executor/models/minicpmv.py | 4 +- vllm/model_executor/models/mistral3.py | 4 +- vllm/model_executor/models/mllama.py | 7 +- vllm/model_executor/models/mllama4.py | 12 +- vllm/model_executor/models/molmo.py | 4 +- vllm/model_executor/models/nvlm_d.py | 13 +- vllm/model_executor/models/ovis.py | 9 +- vllm/model_executor/models/paligemma.py | 4 +- vllm/model_executor/models/phi3v.py | 4 +- vllm/model_executor/models/phi4_multimodal.py | 4 +- vllm/model_executor/models/phi4mm.py | 4 +- vllm/model_executor/models/pixtral.py | 7 +- .../models/prithvi_geospatial_mae.py | 7 +- .../models/qwen2_5_omni_thinker.py | 15 +- vllm/model_executor/models/qwen2_audio.py | 7 +- vllm/model_executor/models/qwen2_vl.py | 7 +- vllm/model_executor/models/qwen_vl.py | 4 +- vllm/model_executor/models/skyworkr1v.py | 13 +- vllm/model_executor/models/step3_vl.py | 14 +- vllm/model_executor/models/tarsier.py | 4 +- vllm/model_executor/models/transformers.py | 6 +- 
vllm/model_executor/models/ultravox.py | 9 +- vllm/model_executor/models/voxtral.py | 7 +- vllm/model_executor/models/whisper.py | 4 +- vllm/multimodal/__init__.py | 4 +- vllm/multimodal/base.py | 9 +- vllm/multimodal/cache.py | 21 ++- vllm/multimodal/inputs.py | 172 ++++++++---------- vllm/multimodal/parse.py | 11 +- vllm/multimodal/processing.py | 38 ++-- vllm/multimodal/profiling.py | 4 +- vllm/multimodal/utils.py | 25 ++- vllm/sequence.py | 6 +- vllm/v1/engine/processor.py | 2 +- vllm/v1/serial_utils.py | 41 ++++- vllm/v1/worker/gpu_input_batch.py | 10 +- vllm/v1/worker/gpu_model_runner.py | 5 +- vllm/v1/worker/tpu_model_runner.py | 5 +- 77 files changed, 431 insertions(+), 383 deletions(-) diff --git a/docs/api/README.md b/docs/api/README.md index 327472df1d52c..57142e8f5625d 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -77,6 +77,7 @@ Internal data structures. - [vllm.multimodal.inputs.MultiModalFieldElem][] - [vllm.multimodal.inputs.MultiModalFieldConfig][] - [vllm.multimodal.inputs.MultiModalKwargsItem][] +- [vllm.multimodal.inputs.MultiModalKwargsItems][] - [vllm.multimodal.inputs.MultiModalKwargs][] - [vllm.multimodal.inputs.MultiModalInputs][] diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index 64a48be32645a..76d0f067fd452 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -629,7 +629,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index @@ -778,7 +778,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + 
out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() bos_token_id = hf_config.bos_token_id diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 906966ddd0649..a1744317b3944 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -370,10 +370,16 @@ def _assert_inputs_equal( if ignore_mm_keys is None: ignore_mm_keys = set() - assert "mm_kwargs" in a and "mm_kwargs" in b, msg + a_rest = {k: v for k, v in a.items() if k != "mm_kwargs"} + b_rest = {k: v for k, v in b.items() if k != "mm_kwargs"} + + assert a_rest == b_rest, msg + + a_data = a["mm_kwargs"].get_data() + b_data = b["mm_kwargs"].get_data() for key in ignore_mm_keys: - a["mm_kwargs"].pop(key, None) - b["mm_kwargs"].pop(key, None) + a_data.pop(key, None) + b_data.pop(key, None) - assert a == b, msg + assert a_data == b_data, msg diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py index a6d900ec5d895..a49842e1099c2 100644 --- a/tests/models/multimodal/processing/test_glm4_1v.py +++ b/tests/models/multimodal/processing/test_glm4_1v.py @@ -45,7 +45,8 @@ def test_processor_override( video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token) video_tok_count = processed_inputs["prompt_token_ids"].count( video_token_id) - grid_t, _, _ = processed_inputs["mm_kwargs"]["video_grid_thw"][0] + grid_t, _, _ = processed_inputs["mm_kwargs"].get_data( + )["video_grid_thw"][0] assert grid_t == expected_grid_t assert video_tok_count == expected_toks_per_frame * grid_t diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py index 76e4acc67d4d5..1adfe21352c41 100644 --- a/tests/models/multimodal/processing/test_h2ovl.py +++ b/tests/models/multimodal/processing/test_h2ovl.py @@ -108,7 +108,8 @@ def _run_check( # 
Ensure we have the right number of placeholders per num_crops size image_token_id = tokenizer.convert_tokens_to_ids("") img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) - pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape + pixel_shape = processed_inputs["mm_kwargs"].get_data( + )["pixel_values_flat"].shape assert img_tok_count == 256 * total_expected_num_patches assert pixel_shape[0] == total_expected_num_patches diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py index c3e2841a8f060..e4f25f5ac7123 100644 --- a/tests/models/multimodal/processing/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -68,7 +68,8 @@ def _run_check( # Ensure we have the right number of placeholders per num_crops size image_token_id = tokenizer.convert_tokens_to_ids("") img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) - pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape + pixel_shape = processed_inputs["mm_kwargs"].get_data( + )["pixel_values_flat"].shape assert img_tok_count == 256 * total_expected_num_patches assert pixel_shape[0] == total_expected_num_patches diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py index 5e14f0f9964d6..bea4f43567eee 100644 --- a/tests/models/multimodal/processing/test_llama4.py +++ b/tests/models/multimodal/processing/test_llama4.py @@ -51,14 +51,14 @@ def test_processor_override( prompt = encode_tokens(tokenizer, prompt) processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) - mm_kwargs = processed_inputs["mm_kwargs"] + mm_data = processed_inputs["mm_kwargs"].get_data() # place holder replacements prompt_token_ids = processed_inputs["prompt_token_ids"] assert prompt_token_ids.count(config.boi_token_index) == num_imgs assert prompt_token_ids.count(config.eoi_token_index) == num_imgs assert 
prompt_token_ids.count(vocab[hf_processor.image_token]) == num_imgs - aspect_ratios = mm_kwargs["aspect_ratios"] + aspect_ratios = mm_data["aspect_ratios"] num_x_separators = num_y_separators = 0 for tiles_y, tiles_x in aspect_ratios: if tiles_x * tiles_y > 1: @@ -80,6 +80,6 @@ def test_processor_override( num_patches_per_chunk = processor.info.get_patch_per_chunk( config.vision_config) assert prompt_token_ids.count(config.image_token_index) \ - == mm_kwargs["patches_per_image"].sum() * num_patches_per_chunk - assert mm_kwargs["pixel_values"].shape[0] \ - == mm_kwargs["patches_per_image"].sum() + == sum(mm_data["patches_per_image"]) * num_patches_per_chunk + assert len(mm_data["pixel_values"]) \ + == sum(mm_data["patches_per_image"]) diff --git a/tests/models/multimodal/processing/test_mllama.py b/tests/models/multimodal/processing/test_mllama.py index a6b20a1e3678e..b42d3f89f3cbf 100644 --- a/tests/models/multimodal/processing/test_mllama.py +++ b/tests/models/multimodal/processing/test_mllama.py @@ -49,18 +49,18 @@ def test_profiling( encoder_seq_lens = [len(dummy_encoder_data.prompt_token_ids) ] * max_num_seqs - mm_kwargs = processor.apply( + mm_data = processor.apply( prompt=dummy_mm_data.prompt, mm_data=dummy_mm_data.mm_data, hf_processor_mm_kwargs=dict(), - )["mm_kwargs"] + )["mm_kwargs"].get_data() # Get the actual number of encoder tokens for each sample. # Because attn_metadata.encoder_seq_lens only counts the last # group of images for each sample, which is used to cheat the # block manager to allocate blocks for those images only. # See MllamaMultiModalProcessor for more details. 
- num_tiles = [[t] for t in mm_kwargs.pop("num_tiles")] + num_tiles = [[t] for t in mm_data.pop("num_tiles")] num_tokens_per_tile = calc_token_per_chunk(image_size) actual_encoder_seq_lens = [ sum(num_tile) * num_tokens_per_tile for num_tile in num_tiles diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py index f3871b60c3f64..3be77b5da63f2 100644 --- a/tests/models/multimodal/processing/test_mllama4.py +++ b/tests/models/multimodal/processing/test_mllama4.py @@ -38,21 +38,21 @@ def test_profiling(model_id: str, max_model_len: int): hf_config = ctx.get_hf_config(Llama4Config) - mm_kwargs = processor.apply( + mm_data = processor.apply( prompt=dummy_mm_data.prompt, mm_data=dummy_mm_data.mm_data, hf_processor_mm_kwargs=dict(), - )["mm_kwargs"] + )["mm_kwargs"].get_data() image_size = hf_config.vision_config.image_size patch_size = hf_config.vision_config.patch_size downsample_ratio = int( round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2))) tokens_per_patch = ((image_size // patch_size)**2) // downsample_ratio - chunks_per_image = prod(mm_kwargs["patches_per_image"]) + chunks_per_image = prod(mm_data["patches_per_image"]) total_num_patches = chunks_per_image * tokens_per_patch - num_tiles = mm_kwargs["aspect_ratios"][0][0] * mm_kwargs["aspect_ratios"][ - 0][1] # x-y seperator tokens + num_tiles = mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][ + 1] # x-y seperator tokens total_tokens = total_num_patches.item() + num_tiles.item( ) + 3 # image start, image, image end diff --git a/tests/models/multimodal/processing/test_nemotron_vl.py b/tests/models/multimodal/processing/test_nemotron_vl.py index 6fbbab0d26124..d9f1965a053df 100644 --- a/tests/models/multimodal/processing/test_nemotron_vl.py +++ b/tests/models/multimodal/processing/test_nemotron_vl.py @@ -70,7 +70,8 @@ def _run_check( # Ensure we have the right number of placeholders per num_crops size image_token_id = 
tokenizer.convert_tokens_to_ids("") img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) - pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape + pixel_shape = processed_inputs["mm_kwargs"].get_data( + )["pixel_values_flat"].shape print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape) assert img_tok_count == 256 * total_expected_num_patches assert pixel_shape[0] == total_expected_num_patches diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py index 9d1cd183387bc..985f4188fdb66 100644 --- a/tests/models/multimodal/processing/test_qwen2_vl.py +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -48,7 +48,8 @@ def test_processor_override( hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs) image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token) img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) - pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape + pixel_shape = processed_inputs["mm_kwargs"].get_data( + )["pixel_values"].shape assert img_tok_count == expected_toks_per_img * num_imgs assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs diff --git a/tests/models/multimodal/test_tensor_schema.py b/tests/models/multimodal/test_tensor_schema.py index 036624431c20b..51e5b84b6c083 100644 --- a/tests/models/multimodal/test_tensor_schema.py +++ b/tests/models/multimodal/test_tensor_schema.py @@ -128,7 +128,7 @@ def create_batched_mm_kwargs( )["mm_kwargs"] items = [ item for modality in supported_mm_limits - for item in mm_kwargs.get_items(modality) + for item in mm_kwargs[modality] ] return group_mm_kwargs_by_modality(items) diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py index 2149f05b6af09..088cd00db2e04 100644 --- a/tests/multimodal/test_cache.py +++ b/tests/multimodal/test_cache.py @@ -4,8 +4,8 @@ import pytest import torch from 
vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata -from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs, - MultiModalKwargsItem, +from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem, + MultiModalKwargsItems, MultiModalSharedField) @@ -24,8 +24,8 @@ def _dummy_item(modality: str, size_by_key: dict[str, int]): ]) -def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]): - return MultiModalKwargs([ +def _dummy_items(size_by_key_modality: dict[str, dict[str, int]]): + return MultiModalKwargsItems.from_seq([ _dummy_item(modality, size_by_key) for modality, size_by_key in size_by_key_modality.items() ]) @@ -37,7 +37,8 @@ def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]): [ (_dummy_item("a", {"a1": 100}), 100), (_dummy_item("a", {"a1": 100, "a2": 110}), 210), - (_dummy_kw({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501 + (_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501 + (_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}).get_data(), 460), # noqa: E501 ], ) # yapf: enable diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py index 586276ee08aef..118b40d0ef418 100644 --- a/tests/v1/test_serial_utils.py +++ b/tests/v1/test_serial_utils.py @@ -11,7 +11,8 @@ import torch from vllm.multimodal.inputs import (MultiModalBatchedField, MultiModalFieldElem, MultiModalFlatField, - MultiModalKwargs, MultiModalKwargsItem, + MultiModalKwargsItem, + MultiModalKwargsItems, MultiModalSharedField, NestedTensors) from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder @@ -96,7 +97,7 @@ def test_encode_decode(monkeypatch: pytest.MonkeyPatch): class MyRequest(msgspec.Struct): - mm: Optional[list[MultiModalKwargs]] + mm: Optional[list[MultiModalKwargsItems]] def test_multimodal_kwargs(): @@ -119,7 +120,7 @@ def test_multimodal_kwargs(): audio = 
MultiModalKwargsItem.from_elems([e1]) video = MultiModalKwargsItem.from_elems([e2]) image = MultiModalKwargsItem.from_elems([e3, e4]) - mm = MultiModalKwargs([audio, video, image]) + mm = MultiModalKwargsItems.from_seq([audio, video, image]) # pack mm kwargs into a mock request so that it can be decoded properly req = MyRequest([mm]) @@ -133,19 +134,22 @@ def test_multimodal_kwargs(): total_len = sum(memoryview(x).cast("B").nbytes for x in encoded) - # expected total encoding length, should be 14255, +-20 for minor changes - assert 14250 <= total_len <= 14300 - decoded: MultiModalKwargs = decoder.decode(encoded).mm[0] + # expected total encoding length, should be 14306, +-20 for minor changes + assert 14275 <= total_len <= 14325 + decoded = decoder.decode(encoded).mm[0] + assert isinstance(decoded, MultiModalKwargsItems) # check all modalities were recovered and do some basic sanity checks - assert len(decoded.modalities) == 3 - images = decoded.get_items("image") + assert len(decoded) == 3 + images = decoded["image"] assert len(images) == 1 assert len(images[0].items()) == 2 assert list(images[0].keys()) == ["i0", "i1"] # check the tensor contents and layout in the main dict - assert all(nested_equal(mm[k], decoded[k]) for k in mm) + mm_data = mm.get_data() + decoded_data = decoded.get_data() + assert all(nested_equal(mm_data[k], decoded_data[k]) for k in mm_data) def nested_equal(a: NestedTensors, b: NestedTensors): diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py index 852c8f5cffa0c..4ce6d8dfad2cc 100644 --- a/vllm/executor/msgspec_utils.py +++ b/vllm/executor/msgspec_utils.py @@ -4,11 +4,12 @@ from array import array from typing import Any, Type +from vllm.multimodal.inputs import MultiModalKwargs from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE def encode_hook(obj: Any) -> Any: - """Custom msgspec enc hook that supports array types. + """Custom msgspec enc hook that supports array types and MultiModalKwargs. 
See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder """ @@ -17,10 +18,12 @@ def encode_hook(obj: Any) -> Any: f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. " f"Given array has a type code of {obj.typecode}.") return obj.tobytes() + if isinstance(obj, MultiModalKwargs): + return dict(obj) def decode_hook(type: Type, obj: Any) -> Any: - """Custom msgspec dec hook that supports array types. + """Custom msgspec dec hook that supports array types and MultiModalKwargs. See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder """ @@ -28,3 +31,5 @@ def decode_hook(type: Type, obj: Any) -> Any: deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE) deserialized.frombytes(obj) return deserialized + if type is MultiModalKwargs: + return MultiModalKwargs(obj) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index e1368a3f6478a..1c7960fa3e0a5 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -22,7 +22,7 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, @@ -470,7 +470,7 @@ class AriaMultiModalProcessor(BaseMultiModalProcessor[AriaProcessingInfo]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index 5cd74bbba4827..b02a973d942ce 100644 --- 
a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -18,7 +18,7 @@ from transformers.models.got_ocr2.image_processing_got_ocr2 import ( from vllm.config import VllmConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs +from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -242,7 +242,7 @@ class AyaVisionMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_token = hf_processor.image_token diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 8e3505f872eb2..2f2b880bb0e14 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -15,7 +15,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptIndexTargets, @@ -492,7 +492,7 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: tokenizer = 
self.info.get_tokenizer() vocab = tokenizer.get_vocab() diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 8d705f40ce8ff..e6914ad4c495d 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -31,7 +31,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, @@ -151,7 +151,7 @@ class ChameleonMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) tokenizer = self.info.get_tokenizer() diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index f17583768f795..bc526fd661b6d 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -21,7 +21,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs +from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -241,7 +241,7 @@ class Cohere2VisionMultiModalProcessor( self, mm_items: 
MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_token = hf_processor.image_token diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index e0acca75d9dd6..e881e9c6ddb62 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -21,7 +21,7 @@ from vllm.model_executor.model_loader.utils import set_default_torch_dtype from vllm.model_executor.models.transformers import replace_linear_class from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs, NestedTensors) + MultiModalKwargsItems, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -252,7 +252,7 @@ class DeepseekVL2MultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) @@ -291,7 +291,8 @@ class DeepseekVL2MultiModalProcessor( tokenization_kwargs: Mapping[str, object], *, return_mm_hashes: bool, - ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: + ) -> tuple[list[int], MultiModalKwargsItems, Optional[MultiModalHashes], + bool]: # The processor logic is different for len(images) <= 2 vs > 2 # Since the processing cache assumes that the processor output is # invariant of how many images are passed per prompt, we only diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index 56e456c2f1f2a..4a8cb35a54dc8 100644 --- 
a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -21,7 +21,7 @@ from vllm.model_executor.models.bart import (BartDecoder, BartEncoder, from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseProcessingInfo, EncDecMultiModalProcessor, @@ -860,7 +860,7 @@ class Florence2MultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() pad_token_id = hf_config.pad_token_id diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index b61e0361fe8c3..90af859ab92ec 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -32,7 +32,7 @@ from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -226,7 +226,7 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() bos_token_id = hf_config.bos_token_id diff --git a/vllm/model_executor/models/gemma3_mm.py 
b/vllm/model_executor/models/gemma3_mm.py index 9871b11b37991..bf5ad633b94a5 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -17,7 +17,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) # yapf: disable @@ -311,7 +311,7 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_token = hf_processor.boi_token diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index a0c3bb50070b3..79061fd30c39b 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -24,7 +24,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import (ImageProcessorItems, MultiModalDataItems, MultiModalDataParser) # yapf: disable @@ -209,7 +209,7 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo] self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = 
self.info.get_hf_processor(**hf_processor_mm_kwargs) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 88c53c8363275..015577322ffe3 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -59,7 +59,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs, VideoItem) + MultiModalKwargsItems, VideoItem) from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -1158,7 +1158,7 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_processor = self.info.get_image_processor( @@ -1175,14 +1175,16 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): merge_length = image_processor.merge_size**2 def get_image_replacement_glm4v(item_idx: int): - grid_thw = out_mm_kwargs["image_grid_thw"][item_idx] + out_item = out_mm_kwargs["image"][item_idx] + grid_thw = out_item["image_grid_thw"].data assert isinstance(grid_thw, torch.Tensor) num_tokens = int(grid_thw.prod()) // merge_length return [hf_processor.image_token_id] * num_tokens def get_video_replacement_glm4v(item_idx: int): - grid_thw = out_mm_kwargs["video_grid_thw"][item_idx] + out_item = out_mm_kwargs["video"][item_idx] + grid_thw = out_item["video_grid_thw"].data assert isinstance(grid_thw, torch.Tensor) video, metadata = mm_items["video"][item_idx] diff --git 
a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 1751fccd08b06..bf33575859aea 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -30,7 +30,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, @@ -503,7 +503,7 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index c9e3b74e7c3c4..c3ac3bb78c83d 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -40,7 +40,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -118,7 +118,7 @@ class GraniteSpeechMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> list[PromptUpdate]: 
processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) tokenizer = self.info.get_tokenizer() diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index c3e4f81597adb..9ab3f4d0d9a15 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -17,7 +17,7 @@ from transformers import PretrainedConfig from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalKwargs +from vllm.multimodal.inputs import MultiModalKwargsItems from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, MultiModalDataItems) from vllm.multimodal.processing import (MultiModalHashes, PromptReplacement, @@ -425,18 +425,19 @@ class H2OVLMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - if "image_num_patches" in out_mm_kwargs: - image_num_patches = out_mm_kwargs["image_num_patches"] + out_mm_data = out_mm_kwargs.get_data() + if "image_num_patches" in out_mm_data: + image_num_patches = out_mm_data["image_num_patches"] assert isinstance(image_num_patches, torch.Tensor) image_num_patches = image_num_patches.tolist() - elif "image_embeds" in out_mm_kwargs: + elif "image_embeds" in out_mm_data: # TODO: Use image size information in dictionary embedding inputs # to compute num_patches (similar to Qwen2-VL) - image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) + image_num_patches = [None] * len(out_mm_data["image_embeds"]) else: image_num_patches = [] @@ -479,7 +480,8 @@ class H2OVLMultiModalProcessor( tokenization_kwargs: Mapping[str, object], *, return_mm_hashes: bool, - ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: + ) -> tuple[list[int], 
MultiModalKwargsItems, Optional[MultiModalHashes], + bool]: # The processor logic is different for len(images) <= 1 vs > 1 # Since the processing cache assumes that the processor output is # invariant of how many images are passed per prompt, we only diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index e5c94c7f3a706..d3ddc47ea932f 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -34,7 +34,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import ImageSize, MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, ProcessingCache, @@ -295,7 +295,7 @@ class HCXVisionMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() placeholder = { @@ -306,21 +306,22 @@ class HCXVisionMultiModalProcessor( def get_replacement_hyperclovax( item_idx: int, modality: str, - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ): - num_tokens = None + out_item = out_mm_kwargs[modality][item_idx] + if modality == "image": + lens = out_item["vision_query_lengths_images"].data num_tokens = self.info.get_num_image_tokens( - vision_query_length=out_mm_kwargs[ - "vision_query_lengths_images"][item_idx], ) - if modality == "video": + vision_query_length=lens) + elif modality == "video": + lens = out_item["vision_query_lengths_videos"].data num_tokens = self.info.get_num_video_tokens( - vision_query_length=out_mm_kwargs[ 
- "vision_query_lengths_videos"][item_idx], ) - assert isinstance(num_tokens, int) - return [ - placeholder[modality], - ] * num_tokens + vision_query_length=lens) + else: + raise NotImplementedError(modality) + + return [placeholder[modality]] * num_tokens return [ PromptReplacement( diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 3c01789b90066..63307470d959b 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -34,7 +34,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs) + MultiModalKwargsItems) from vllm.multimodal.parse import ImageProcessorItems, ImageSize # yapf conflicts with isort for this block # yapf: disable @@ -374,7 +374,7 @@ class Idefics3MultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_token, _, _ = self.info._get_image_token(hf_processor) diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index d952ced2fa69f..c739e74b058fa 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -24,7 +24,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs, NestedTensors) + MultiModalKwargsItems, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, 
ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -399,7 +399,7 @@ class InternS1MultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) img_context_token = hf_processor.image_token @@ -407,15 +407,16 @@ class InternS1MultiModalProcessor( end_image_token = hf_processor.end_image_token video_token = hf_processor.video_token - if "video_num_patches" in out_mm_kwargs: - video_num_patches = out_mm_kwargs["video_num_patches"] + out_mm_data = out_mm_kwargs.get_data() + if "video_num_patches" in out_mm_data: + video_num_patches = out_mm_data["video_num_patches"] assert isinstance(video_num_patches, torch.Tensor) video_num_patches = video_num_patches.tolist() else: video_num_patches = [] - if "image_num_patches" in out_mm_kwargs: - image_num_patches = out_mm_kwargs["image_num_patches"] + if "image_num_patches" in out_mm_data: + image_num_patches = out_mm_data["image_num_patches"] assert isinstance(image_num_patches, torch.Tensor) image_num_patches = image_num_patches.tolist() else: diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 8e766dd4c4768..da8ad8396725d 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -28,7 +28,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs, NestedTensors) + MultiModalKwargsItems, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, 
@@ -797,18 +797,19 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - if "image_num_patches" in out_mm_kwargs: - image_num_patches = out_mm_kwargs["image_num_patches"] + out_mm_data = out_mm_kwargs.get_data() + if "image_num_patches" in out_mm_data: + image_num_patches = out_mm_data["image_num_patches"] assert isinstance(image_num_patches, torch.Tensor) image_num_patches = image_num_patches.tolist() - elif "image_embeds" in out_mm_kwargs: + elif "image_embeds" in out_mm_data: # TODO: Use image size information in dictionary embedding inputs # to compute num_patches (similar to Qwen2-VL) - image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) + image_num_patches = [None] * len(out_mm_data["image_embeds"]) else: image_num_patches = [] @@ -966,15 +967,19 @@ class InternVLMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: - prompt_repl: list[PromptUpdate] = super()._get_prompt_updates( - mm_items, hf_processor_mm_kwargs, out_mm_kwargs) + prompt_repl = super()._get_prompt_updates( + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + out_mm_kwargs=out_mm_kwargs, + ) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - if "video_num_patches" in out_mm_kwargs: - video_num_patches = out_mm_kwargs["video_num_patches"] + out_mm_data = out_mm_kwargs.get_data() + if "video_num_patches" in out_mm_data: + video_num_patches = out_mm_data["video_num_patches"] assert isinstance(video_num_patches, torch.Tensor) video_num_patches = video_num_patches.tolist() else: @@ -992,12 +997,15 @@ class 
InternVLMultiModalProcessor( video_context_token=hf_processor.video_token) if self.info.supports_video: - prompt_repl.append( + prompt_repl = [ + *prompt_repl, PromptReplacement( modality="video", target="

Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:
wget http://images.cocodataset.org/zips/train2017.zip + + + ShareGPT4Video (Video) + ✅ + ✅ + + git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video + BurstGPT @@ -231,7 +239,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct ```bash vllm bench serve \ --backend openai-chat \ - --endpoint-type openai-chat \ + --endpoint-type openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ --dataset-name hf \ @@ -246,7 +254,7 @@ vllm bench serve \ ```bash vllm bench serve \ --backend openai-chat \ - --endpoint-type openai-chat \ + --endpoint-type openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ --dataset-name hf \ @@ -612,7 +620,7 @@ vllm bench serve \ --prefix-repetition-prefix-len 512 \ --prefix-repetition-suffix-len 128 \ --prefix-repetition-num-prefixes 5 \ - --prefix-repetition-output-len 128 + --prefix-repetition-output-len 128 ``` @@ -687,4 +695,31 @@ python benchmarks/benchmark_serving.py \ --endpoint /v1/chat/completion ``` +### Videos (ShareGPT4Video) + +Start vLLM: + +```bash +python -m vllm.entrypoints.openai.api_server \ + --model Qwen/Qwen2.5-VL-7B-Instruct \ + --dtype bfloat16 \ + --limit-mm-per-prompt '{"video": 1}' \ + --allowed-local-media-path /path/to/sharegpt4video/videos +``` + +Send requests with videos: + +```bash +python benchmarks/benchmark_serving.py \ + --backend openai-chat \ + --model Qwen/Qwen2.5-VL-7B-Instruct \ + --dataset-name sharegpt \ + --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \ + --num-prompts 100 \ + --save-result \ + --result-dir ~/vllm_benchmark_results \ + --save-detailed \ + --endpoint /v1/chat/completion +``` + diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index c62934ed94cb5..e1a856026c4ae 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -293,6 +293,41 @@ def process_image(image: Any) -> Mapping[str, Any]: ) +def process_video(video: 
Any) -> Mapping[str, Any]: + """ + Process a single video input and return a multimedia content dictionary. + + Supports the following input types: + + 1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key + containing raw video data. + + 2. String input: - Treats the string as a URL or local file path. - + Prepends "file://" if the string doesn't start with "http://" or + "file://". - Returns a dictionary with the image URL. + + Raises: + ValueError: If the input is not a supported type. + """ + if isinstance(video, dict) and "bytes" in video: + video_bytes = video["bytes"] + video_base64 = base64.b64encode(video_bytes).decode("utf-8") + return { + "type": "video_url", + "video_url": {"url": f"data:video/mp4;base64,{video_base64}"}, + } + + if isinstance(video, str): + video_url = ( + video if video.startswith(("http://", "file://")) else f"file://{video}" + ) + return {"type": "video_url", "video_url": {"url": video_url}} + + raise ValueError( + f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`." # noqa: E501 + ) + + # ----------------------------------------------------------------------------- # Random Dataset Implementation (Synthetic Data) # ----------------------------------------------------------------------------- @@ -451,9 +486,10 @@ class ShareGPTDataset(BenchmarkDataset): skip_min_output_len_check=output_len is not None, ): continue - # TODO: Also support ShareGPT4Video. 
if image_path := entry.get("image"): mm_content = process_image(image_path) + elif video_path := entry.get("video"): + mm_content = process_video(video_path) else: mm_content = None if enable_multimodal_chat: diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 3532a083fb4a1..f4fbfad2d1d5d 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -281,7 +281,7 @@ def process_image(image: Any) -> Mapping[str, Any]: """ Process a single image input and return a multimedia content dictionary. - Supports three input types: + Supports the following input types: 1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key containing raw image data. - Loads the bytes as a PIL.Image.Image. @@ -321,6 +321,41 @@ def process_image(image: Any) -> Mapping[str, Any]: " or str or dictionary with raw image bytes.") +def process_video(video: Any) -> Mapping[str, Any]: + """ + Process a single video input and return a multimedia content dictionary. + + Supports the following input types: + + 1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key + containing raw video data. + + 2. String input: - Treats the string as a URL or local file path. - + Prepends "file://" if the string doesn't start with "http://" or + "file://". - Returns a dictionary with the image URL. + + Raises: + ValueError: If the input is not a supported type. + """ + if isinstance(video, dict) and 'bytes' in video: + video_bytes = video['bytes'] + video_base64 = base64.b64encode(video_bytes).decode("utf-8") + return { + "type": "video_url", + "video_url": { + "url": f"data:video/mp4;base64,{video_base64}" + }, + } + + if isinstance(video, str): + video_url = (video if video.startswith( + ("http://", "file://")) else f"file://{video}") + return {"type": "video_url", "video_url": {"url": video_url}} + + raise ValueError( + f"Invalid video input {video}. 
Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`." # noqa: E501 + ) + # ----------------------------------------------------------------------------- # Random Dataset Implementation (Synthetic Data) # ----------------------------------------------------------------------------- @@ -474,9 +509,10 @@ class ShareGPTDataset(BenchmarkDataset): skip_min_output_len_check=output_len is not None): continue - # TODO: Also support ShareGPT4Video. if image_path := entry.get("image"): mm_content = process_image(image_path) + elif video_path := entry.get("video"): + mm_content = process_video(video_path) else: mm_content = None if enable_multimodal_chat: From c32e6ad1f63631fd8033f0cca3a35d5e48ccfc7f Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Tue, 19 Aug 2025 20:39:28 -0400 Subject: [PATCH 114/361] [Quantization] Bump Compressed Tensors Version (#23202) Signed-off-by: Kyle Sayers Co-authored-by: Dipika Sikka Co-authored-by: Michael Goin --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index 3c3ac0abf50f3..365457436faa8 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -39,7 +39,7 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. 
-compressed-tensors == 0.10.2 # required for compressed-tensors +compressed-tensors == 0.11.0 # required for compressed-tensors depyf==0.19.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py watchfiles # required for http server to monitor the updates of TLS files From 0167efe20d3d2280c3da6aea94a6f59afec5099c Mon Sep 17 00:00:00 2001 From: 633WHU Date: Wed, 20 Aug 2025 09:25:59 +0800 Subject: [PATCH 115/361] [Core] Optimize scheduler request removal for single completions (#21917) Signed-off-by: chiliu Signed-off-by: chiliu Co-authored-by: chiliu --- vllm/v1/core/sched/scheduler.py | 14 ++++++-------- vllm/v1/core/sched/utils.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index b3defa443186e..f9a7e21014073 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -25,7 +25,7 @@ from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, SchedulerOutput) from vllm.v1.core.sched.request_queue import (SchedulingPolicy, create_request_queue) -from vllm.v1.core.sched.utils import check_stop +from vllm.v1.core.sched.utils import check_stop, remove_all from vllm.v1.engine import (EngineCoreEventType, EngineCoreOutput, EngineCoreOutputs) from vllm.v1.kv_cache_interface import KVCacheConfig @@ -872,9 +872,7 @@ class Scheduler(SchedulerInterface): # Remove the stopped requests from the running and waiting queues. if stopped_running_reqs: - self.running = [ - req for req in self.running if req not in stopped_running_reqs - ] + self.running = remove_all(self.running, stopped_running_reqs) if stopped_preempted_reqs: # This is a rare case and unlikely to impact performance. 
self.waiting.remove_requests(stopped_preempted_reqs) @@ -1000,7 +998,7 @@ class Scheduler(SchedulerInterface): else: request_ids = set(request_ids) - running_requests_to_remove = [] + running_requests_to_remove = set() waiting_requests_to_remove = [] valid_requests = [] @@ -1013,13 +1011,13 @@ class Scheduler(SchedulerInterface): valid_requests.append(request) if request.status == RequestStatus.RUNNING: - running_requests_to_remove.append(request) + running_requests_to_remove.add(request) else: waiting_requests_to_remove.append(request) # Remove all requests from queues at once for better efficiency - for request in running_requests_to_remove: - self.running.remove(request) + if running_requests_to_remove: + self.running = remove_all(self.running, running_requests_to_remove) if waiting_requests_to_remove: self.waiting.remove_requests(waiting_requests_to_remove) diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py index 42ec95091f962..42d3e5c68b4c8 100644 --- a/vllm/v1/core/sched/utils.py +++ b/vllm/v1/core/sched/utils.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import contextlib from typing import Optional import torch @@ -7,6 +8,38 @@ import torch from vllm.v1.request import Request, RequestStatus +def remove_all(lst: list, items_to_remove: set) -> list: + """Remove all items from a list that are in the items_to_remove set. + + This method optimizes for the common case of removing a single item, + falling back to list comprehension for multiple items. + + Args: + lst: The list to remove items from + items_to_remove: Set of items to remove + + Returns: + Either the modified original list (for single item removal) or + a new list (for multiple item removal). Callers should use the + returned value. + + Note: + For single item removal, this modifies the original list in-place + and returns it. For multiple items, it creates and returns a new list. 
+ """ + if not items_to_remove: + return lst + + if len(items_to_remove) == 1: + # Fast path for single item removal (most common case) + item = next(iter(items_to_remove)) + with contextlib.suppress(ValueError): + lst.remove(item) + return lst + # For multiple items, use list comprehension + return [item for item in lst if item not in items_to_remove] + + def check_stop(request: Request, max_model_len: int, pooler_output: Optional[torch.Tensor] = None) -> bool: From d46d417b5897d7eddb002b61b19e8cba029c3dda Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 19 Aug 2025 22:18:52 -0400 Subject: [PATCH 116/361] [CI Perf] Only test bfloat16 for tests/compile/test_fusion_all_reduce.py (#23132) Signed-off-by: mgoin --- tests/compile/test_fusion_all_reduce.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py index 4c3cf6c2a10cf..dd31e0db1f59f 100644 --- a/tests/compile/test_fusion_all_reduce.py +++ b/tests/compile/test_fusion_all_reduce.py @@ -148,7 +148,7 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module): @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seq_len", [8]) @pytest.mark.parametrize("hidden_size", [16]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA") @pytest.mark.skipif( From e58c5a97688750e7930f13b6fe556d9a28a5b2d9 Mon Sep 17 00:00:00 2001 From: Chenheli Hua Date: Tue, 19 Aug 2025 19:32:47 -0700 Subject: [PATCH 117/361] [Core] Add torch profiler CPU traces for AsyncLLM. 
(#21794) Signed-off-by: Chenheli Hua --- vllm/envs.py | 6 ++++-- vllm/v1/engine/async_llm.py | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 861e4c6a1bbe5..70068cca66f8f 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -667,8 +667,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv("VLLM_LORA_RESOLVER_CACHE_DIR", None), - # Enables torch profiler if set. Path to the directory where torch profiler - # traces are saved. Note that it must be an absolute path. + # Enables torch profiler if set. + # Both AsyncLLM's CPU traces as well as workers' + # traces (CPU & GPU) will be saved under this directory. + # Note that it must be an absolute path. "VLLM_TORCH_PROFILER_DIR": lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os .path.expanduser(os.getenv("VLLM_TORCH_PROFILER_DIR", "."))), diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 664fec31a4da5..342d7b24f8e98 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,12 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +import os +import socket import time from collections.abc import AsyncGenerator, Iterable, Mapping from copy import copy from typing import Any, Optional, Union import numpy as np +import torch import vllm.envs as envs from vllm.config import ModelConfig, VllmConfig @@ -144,6 +147,26 @@ class AsyncLLM(EngineClient): except RuntimeError: pass + if envs.VLLM_TORCH_PROFILER_DIR: + logger.info( + "Torch profiler enabled. 
AsyncLLM CPU traces will be collected under %s", # noqa: E501 + envs.VLLM_TORCH_PROFILER_DIR) + worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm" + self.profiler = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + ], + with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, + on_trace_ready=torch.profiler.tensorboard_trace_handler( + envs.VLLM_TORCH_PROFILER_DIR, + worker_name=worker_name, + use_gzip=True)) + else: + logger.info( + "Torch profiler disabled. AsyncLLM CPU traces will not be collected." # noqa: E501 + ) + self.profiler = None + @classmethod @deprecate_kwargs( "disable_log_requests", @@ -562,10 +585,16 @@ class AsyncLLM(EngineClient): raise self.dead_error async def start_profile(self) -> None: - await self.engine_core.profile_async(True) + coros = [self.engine_core.profile_async(True)] + if self.profiler is not None: + coros.append(asyncio.to_thread(self.profiler.start)) + await asyncio.gather(*coros) async def stop_profile(self) -> None: - await self.engine_core.profile_async(False) + coros = [self.engine_core.profile_async(False)] + if self.profiler is not None: + coros.append(asyncio.to_thread(self.profiler.stop)) + await asyncio.gather(*coros) async def reset_mm_cache(self) -> None: self.processor.mm_registry.reset_processor_cache(self.model_config) From 64ab3c7253afb8cc2008777153812109bf92d7c8 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 20 Aug 2025 10:33:41 +0800 Subject: [PATCH 118/361] [Doc] Update V1 status of various pooling models (#23189) Signed-off-by: DarkLight1337 --- docs/models/supported_models.md | 26 ++++++++++---------- tests/models/language/pooling/test_gritlm.py | 9 ++++--- vllm/model_executor/models/gritlm.py | 6 ++--- vllm/model_executor/models/interfaces.py | 11 ++++++--- 4 files changed, 28 insertions(+), 24 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 1d165fa6f16bd..7908e42387100 100644 --- a/docs/models/supported_models.md +++ 
b/docs/models/supported_models.md @@ -363,7 +363,7 @@ th { | `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | ✅︎ | -| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | | +| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | ✅︎ | | `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ | | `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ | | `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ | @@ -436,17 +436,17 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `BertModel`C | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | | -| `Gemma2Model`C | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | | ✅︎ | -| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | | -| `GteModel`C | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | | -| `GteNewModel`C | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | | -| `ModernBertModel`C | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. 
| | | | -| `NomicBertModel`C | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | | +| `BertModel`C | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | ✅︎ | +| `Gemma2Model`C | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | ✅︎ | +| `GteModel`C | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | ✅︎ | +| `GteNewModel`C | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | ✅︎ | +| `ModernBertModel`C | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | ✅︎ | +| `NomicBertModel`C | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | ✅︎ | | `LlamaModel`C, `LlamaForCausalLM`C, `MistralModel`C, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2Model`C, `Qwen2ForCausalLM`C | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen3Model`C, `Qwen3ForCausalLM`C | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | | +| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | ✅︎ | | `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | C Automatically converted into an embedding model via `--convert embed`. 
([details](./pooling_models.md#model-conversion)) @@ -476,7 +476,7 @@ These models primarily support the [`LLM.classify`](./pooling_models.md#llmclass | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | | +| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ | | `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | @@ -493,12 +493,12 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | | +| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | ✅︎ | | `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | -| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | | -| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. 
| | | | +| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | ✅︎ | +| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | ✅︎ | | `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | C Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion)) diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py index d21987571cbaa..17a55d916b1ff 100644 --- a/tests/models/language/pooling/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -14,6 +14,7 @@ from ....utils import RemoteOpenAIServer MODEL_NAME = "parasail-ai/GritLM-7B-vllm" MAX_MODEL_LEN = 4000 +ATOL = 0.002 def _arr(arr): @@ -97,16 +98,16 @@ def get_test_data(): def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]): cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0]) - assert cosine_sim_q0_d0 == pytest.approx(0.609, abs=0.001) + assert cosine_sim_q0_d0 == pytest.approx(0.609, abs=ATOL) cosine_sim_q0_d1 = 1 - cosine(q_rep[0], d_rep[1]) - assert cosine_sim_q0_d1 == pytest.approx(0.101, abs=0.001) + assert cosine_sim_q0_d1 == pytest.approx(0.101, abs=ATOL) cosine_sim_q1_d0 = 1 - cosine(q_rep[1], d_rep[0]) - assert cosine_sim_q1_d0 == pytest.approx(0.120, abs=0.001) + assert cosine_sim_q1_d0 == pytest.approx(0.120, abs=ATOL) cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1]) - assert cosine_sim_q1_d1 == pytest.approx(0.534, abs=0.001) + assert cosine_sim_q1_d1 == pytest.approx(0.534, abs=ATOL) def test_gritlm_offline_embedding(vllm_runner): diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index 9e7490e3c4f07..3f6790269ae62 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -20,7 +20,7 @@ from vllm.sequence import PoolerOutput from vllm.tasks import PoolingTask from 
vllm.transformers_utils.tokenizer import cached_tokenizer_from_config -from .interfaces import SupportsV0Only +from .interfaces import default_pooling_type logger = init_logger(__name__) @@ -215,7 +215,8 @@ class GritLMPooler(Pooler): return build_output(pooled_data) -class GritLM(LlamaForCausalLM, SupportsV0Only): +@default_pooling_type("MEAN") +class GritLM(LlamaForCausalLM): """This class implements the embedding model for parasail-ai/GritLM-7B-vllm. The class inherits from LlamaForCausalLM and provides a custom pooling @@ -241,7 +242,6 @@ class GritLM(LlamaForCausalLM, SupportsV0Only): prefix: str = "", **kwargs, ) -> None: - # Use full attention for pooling (this is why V1 is not supported yet) if vllm_config.model_config.runner_type == "pooling": hf_config = vllm_config.model_config.hf_config hf_config.is_causal = False diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index c425488f834b5..9415e67924e74 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -3,7 +3,7 @@ from collections.abc import Iterable, Mapping, MutableSequence from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, - Union, overload, runtime_checkable) + TypeVar, Union, overload, runtime_checkable) import numpy as np import torch @@ -641,11 +641,14 @@ def supports_cross_encoding( return is_pooling_model(model) and _supports_cross_encoding(model) -def default_pooling_type(pooling_type: str) -> object: +_T = TypeVar("_T", bound=type[torch.nn.Module]) + + +def default_pooling_type(pooling_type: str): """Set default_pooling_type decorator. 
""" - def func(model: object): - model.default_pooling_type = pooling_type + def func(model: _T) -> _T: + model.default_pooling_type = pooling_type # type: ignore return model return func From a634733f67b39fd9c1da1a861ba39f75efb576f3 Mon Sep 17 00:00:00 2001 From: Zebing Lin Date: Tue, 19 Aug 2025 22:57:47 -0400 Subject: [PATCH 119/361] [Attention] Optimize make_local_attention_virtual_batches for Flash Attention (#23185) Signed-off-by: linzebing --- vllm/v1/attention/backends/utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 5e6bc331835b6..94dd3d2629ebc 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -464,8 +464,9 @@ def make_local_attention_virtual_batches( attn_chunk_size)[arange > 0] # convert from q_seqlens to cu_seqlens_q - cu_seqlens_q_local = np.pad(np.cumsum(seqlens_q_local), (1, 0))\ - .astype(np.int32) + cu_seqlens_q_local = np.empty(virtual_batches + 1, dtype=np.int32) + np.cumsum(seqlens_q_local, out=cu_seqlens_q_local[1:]) + cu_seqlens_q_local[0] = 0 # compute the seqlens_k_local, # basically a full local attention block for all but the last block in each @@ -508,11 +509,10 @@ def make_local_attention_virtual_batches( # [ 22, 23 ], < local-batch 6, (batch 2, starting from k[4]) # [ 24, 25 ], < local-batch 7, (batch 2, starting from k[8]) # ] - block_indices= np.broadcast_to( - np.arange(pages_per_local_batch, dtype=np.int32), - (virtual_batches, pages_per_local_batch)) \ - + np.expand_dims(block_starts, axis=1) - block_indices = block_indices.flatten().clip(max=block_table.shape[1] - 1) + block_indices = (block_starts[:, None] + + np.arange(pages_per_local_batch, dtype=np.int32)) + block_indices = block_indices.reshape(-1).clip(max=block_table.shape[1] - + 1) batch_indices = np.repeat(np.arange(actual_batch_size, dtype=np.int32), local_blocks * pages_per_local_batch) block_table_local = 
block_table[batch_indices, block_indices]\ From 941f56858a48e097391cfcc451c3f6d88f7cf20c Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Tue, 19 Aug 2025 20:14:32 -0700 Subject: [PATCH 120/361] Fix a performance comparison issue in Benchmark Suite (#23047) Signed-off-by: Tsai, Louie Signed-off-by: Louie Tsai Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Li, Jiang --- .../scripts/compare-json-results.py | 144 ++++++++++++++---- 1 file changed, 118 insertions(+), 26 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 12c4ba6aa69a6..50431d0cd4c5e 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -3,44 +3,129 @@ import argparse import json import os +from importlib import util import pandas as pd +plotly_found = util.find_spec("plotly.express") is not None + def compare_data_columns( files, name_column, data_column, info_cols, drop_column, debug=False ): - print("\ncompare_data_column: " + data_column) + """ + Align concatenation by keys derived from info_cols instead of row order. + - Pick one canonical key list: subset of info_cols present in ALL files. + - For each file: set index to those keys, aggregate duplicates + - (mean for metric, first for names). + - Concat along axis=1 (indexes align), then reset_index so callers can + - group by columns. + - If --debug, add a _name column per file. 
+ """ + print("\ncompare_data_column:", data_column) + frames = [] raw_data_cols = [] compare_frames = [] + + # 1) choose a canonical key list from info_cols that exists in ALL files + cols_per_file = [] + for f in files: + try: + df_tmp = pd.read_json(f, orient="records") + except Exception as err: + raise ValueError(f"Failed to read {f}") from err + cols_per_file.append(set(df_tmp.columns)) + + key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)] + if not key_cols: + # soft fallback: use any info_cols present in the first file + key_cols = [c for c in info_cols if c in list(cols_per_file[0])] + if not key_cols: + raise ValueError( + "No common key columns found from info_cols across the input files." + ) + + # 2) build a single "meta" block (keys as columns) once, aligned by the key index + meta_added = False + for file in files: - data_df = pd.read_json(file) - serving_df = data_df.dropna(subset=[drop_column], ignore_index=True) - # Show all info columns in the first couple columns - if not frames: - for col in info_cols: - if col not in serving_df.columns: - print(f"Skipping missing column: {col}") - continue - frames.append(serving_df[col]) - # only show test name under debug mode - if debug is True: - serving_df = serving_df.rename(columns={name_column: file + "_name"}) - frames.append(serving_df[file + "_name"]) + df = pd.read_json(file, orient="records") - file = "/".join(file.split("/")[:-1]) - serving_df = serving_df.rename(columns={data_column: file}) - frames.append(serving_df[file]) - raw_data_cols.append(file) - compare_frames.append(serving_df[file]) + # Keep rows that actually have the compared metric (same as original behavior) + if drop_column in df.columns: + df = df.dropna(subset=[drop_column], ignore_index=True) + + # Stabilize numeric key columns (harmless if missing) + for c in ( + "Input Len", + "Output Len", + "TP Size", + "PP Size", + "# of max concurrency.", + "qps", + ): + if c in df.columns: + df[c] = 
pd.to_numeric(df[c], errors="coerce") + + # Ensure all key columns exist + for c in key_cols: + if c not in df.columns: + df[c] = pd.NA + + # Set index = key_cols and aggregate duplicates → unique MultiIndex + df_idx = df.set_index(key_cols, drop=False) + + # meta (key columns), unique per key + meta = df_idx[key_cols] + if not meta.index.is_unique: + meta = meta.groupby(level=key_cols, dropna=False).first() + + # metric series for this file, aggregated to one row per key + file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file) + s = df_idx[data_column] + if not s.index.is_unique: + s = s.groupby(level=key_cols, dropna=False).mean() + s.name = file_label # column label like original + + # add meta once (from first file) so keys are the leftmost columns + if not meta_added: + frames.append(meta) + meta_added = True + + # (NEW) debug: aligned test-name column per file + if debug and name_column in df_idx.columns: + name_s = df_idx[name_column] + if not name_s.index.is_unique: + name_s = name_s.groupby(level=key_cols, dropna=False).first() + name_s.name = f"{file_label}_name" + frames.append(name_s) + + frames.append(s) + raw_data_cols.append(file_label) + compare_frames.append(s) + + # Generalize ratio: for any file N>=2, add ratio (fileN / file1) if len(compare_frames) >= 2: - # Compare numbers among two files - ratio_df = compare_frames[1] / compare_frames[0] - frames.append(ratio_df) - compare_frames.pop(1) + base = compare_frames[0] + current = compare_frames[-1] + ratio = current / base + ratio = ratio.mask(base == 0) # avoid inf when baseline is 0 + ratio.name = f"Ratio 1 vs {len(compare_frames)}" + frames.append(ratio) + # 4) concat on columns with aligned MultiIndex; + # then reset_index to return keys as columns concat_df = pd.concat(frames, axis=1) + concat_df = concat_df.reset_index(drop=True).reset_index() + if "index" in concat_df.columns: + concat_df = concat_df.drop(columns=["index"]) + + # Ensure key/info columns appear first (in your 
info_cols order) + front = [c for c in info_cols if c in concat_df.columns] + rest = [c for c in concat_df.columns if c not in front] + concat_df = concat_df[front + rest] + print(raw_data_cols) return concat_df, raw_data_cols @@ -67,6 +152,15 @@ def split_json_by_tp_pp( df = pd.DataFrame(data) + # Keep only "serving" tests + name_col = next( + (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None + ) + if name_col: + df = df[ + df[name_col].astype(str).str.contains(r"serving", case=False, na=False) + ].copy() + # Handle alias column names rename_map = { "tp_size": "TP Size", @@ -181,7 +275,6 @@ if __name__ == "__main__": f"Expected subset: {filtered_info_cols}, " f"but DataFrame has: {list(output_df.columns)}" ) - output_df_sorted = output_df.sort_values(by=existing_group_cols) output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) for name, group in output_groups: @@ -189,8 +282,7 @@ if __name__ == "__main__": text_file.write(html_msgs_for_data_cols[i]) text_file.write(html) - if plot is True: - import pandas as pd + if plot and plotly_found: import plotly.express as px df = group[raw_data_cols] From 1a3079a15e5c8ae2790a1897f82e5af0d68a6921 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B8=B8=EC=9E=AC=EC=9D=80?= Date: Wed, 20 Aug 2025 13:02:50 +0900 Subject: [PATCH 121/361] chore: support pytorch format in lora (#22790) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: jaeeun.kil Signed-off-by: 길재은 --- vllm/lora/models.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index e6b19d4748f44..3072047a2606c 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -207,6 +207,7 @@ class LoRAModel(AdapterModel): """ lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors") lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin") + lora_pt_file_path = os.path.join(lora_dir, 
"adapter_model.pt") new_embeddings_tensor_path = os.path.join( lora_dir, "new_embeddings.safetensors") new_embeddings_bin_file_path = os.path.join(lora_dir, @@ -255,9 +256,10 @@ class LoRAModel(AdapterModel): check_unexpected_modules(f) for module in f.keys(): # noqa tensors[module] = f.get_tensor(module) - elif os.path.isfile(lora_bin_file_path): - # When a bin file is provided, we rely on config to find unexpected - # modules. + elif os.path.isfile(lora_bin_file_path) or os.path.isfile( + lora_pt_file_path): + # When a bin/pt file is provided, we rely on config to find + # unexpected modules. unexpected_modules = [] target_modules = peft_helper.target_modules if not isinstance(target_modules, list): @@ -279,7 +281,10 @@ class LoRAModel(AdapterModel): f" target modules in {expected_lora_modules}" f" but received {unexpected_modules}." f" Please verify that the loaded LoRA module is correct") - tensors = torch.load(lora_bin_file_path, + lora_file_path = (lora_bin_file_path + if os.path.isfile(lora_bin_file_path) else + lora_pt_file_path) + tensors = torch.load(lora_file_path, map_location=device, weights_only=True) else: From f72902327246bc68ff0d196a89cc81262f46de1b Mon Sep 17 00:00:00 2001 From: Zhewen Li Date: Tue, 19 Aug 2025 21:09:27 -0700 Subject: [PATCH 122/361] [CI/Build] Also check DP in benchmarks throughput script (#23038) Co-authored-by: Simon Mo --- benchmarks/benchmark_throughput.py | 4 ++-- vllm/benchmarks/throughput.py | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index c51b579686529..c7f290e1eb88e 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -597,8 +597,8 @@ def validate_args(args): # https://github.com/vllm-project/vllm/issues/16222 if args.data_parallel_size > 1: raise ValueError( - "Data parallel is not supported in offline benchmark, \ - please use benchmark serving instead" + "Data parallel is not 
supported in offline benchmark, " + "please use benchmark serving instead" ) diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index 0c19fa6dcfdd2..f022a55e625f5 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -434,6 +434,14 @@ def validate_args(args): if args.backend == "mii" and args.tokenizer != args.model: raise ValueError( "Tokenizer must be the same as the model for MII backend.") + + # --data-parallel is not supported currently. + # https://github.com/vllm-project/vllm/issues/16222 + if args.data_parallel_size > 1: + raise ValueError( + "Data parallel is not supported in offline benchmark, " + "please use benchmark serving instead" + ) def add_cli_args(parser: argparse.ArgumentParser): From de7b67a0232e35ae8e8ecd944aeddfc8cbc02631 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 20 Aug 2025 13:06:42 +0800 Subject: [PATCH 123/361] [CI/Build] Sync multimodal tests (#23181) Signed-off-by: DarkLight1337 --- .../multimodal/processing/test_common.py | 10 +++++--- tests/models/registry.py | 24 +++++++++---------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 0fdc182b9ee91..8aa0dc7e8e348 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -275,16 +275,17 @@ def _test_processing_correctness_one( "google/gemma-3n-E2B-it", "zai-org/glm-4v-9b", "zai-org/GLM-4.1V-9B-Thinking", + "zai-org/GLM-4.5V", "ibm-granite/granite-speech-3.3-2b", "h2oai/h2ovl-mississippi-800m", + "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", + "HuggingFaceM4/Idefics3-8B-Llama3", "internlm/Intern-S1", "OpenGVLab/InternVL2-1B", "OpenGVLab/InternVL3-1B", - "HuggingFaceM4/Idefics3-8B-Llama3", - "HuggingFaceTB/SmolVLM2-2.2B-Instruct", + "Kwai-Keye/Keye-VL-8B-Preview", "moonshotai/Kimi-VL-A3B-Instruct", 
"meta-llama/Llama-4-Scout-17B-16E-Instruct", - "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", "llava-hf/llava-1.5-7b-hf", "llava-hf/llava-v1.6-mistral-7b-hf", "llava-hf/LLaVA-NeXT-Video-7B-hf", @@ -315,10 +316,13 @@ def _test_processing_correctness_one( "Qwen/Qwen2-Audio-7B-Instruct", "Qwen/Qwen2.5-Omni-3B", "Skywork/Skywork-R1V-38B", + "HuggingFaceTB/SmolVLM2-2.2B-Instruct", + "stepfun-ai/step3", "fixie-ai/ultravox-v0_5-llama-3_2-1b", "openai/whisper-large-v3", "omni-research/Tarsier-7b", "omni-research/Tarsier2-Recap-7b", + "mistralai/Voxtral-Mini-3B-2507", ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) diff --git a/tests/models/registry.py b/tests/models/registry.py index cbdc9edbbc9d4..28fe9063169e0 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -215,9 +215,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124", trust_remote_code=True, is_available_online=False), - "HCXVisionForCausalLM": _HfExamplesInfo( - "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", - trust_remote_code=True), "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b", trust_remote_code=True), "InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b", @@ -298,8 +295,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"), "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), "Step3TextForCausalLM": _HfExamplesInfo("stepfun-ai/step3", - trust_remote_code=True, - is_available_online=False), + trust_remote_code=True), "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct", trust_remote_code=True), "TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B", @@ -405,22 +401,24 @@ _MULTIMODAL_EXAMPLE_MODELS = { hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501 "Glm4vForConditionalGeneration": 
_HfExamplesInfo("zai-org/GLM-4.1V-9B-Thinking"), # noqa: E501 "Glm4vMoeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V", - is_available_online=False), # noqa: E501 + min_transformers_version="4.56"), # noqa: E501 "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m", trust_remote_code=True, extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501 max_transformers_version="4.48", # noqa: E501 transformers_version_reason="HF model is not compatible."), # noqa: E501 + "HCXVisionForCausalLM": _HfExamplesInfo("naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", # noqa: E501 + trust_remote_code=True), "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501 {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}, # noqa: E501 min_transformers_version="4.55.1", transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501 + "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1", + trust_remote_code=True), # noqa: E501 "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", extras={"2B": "OpenGVLab/InternVL2-2B", "3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501 trust_remote_code=True), - "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1", - trust_remote_code=True), "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501 trust_remote_code=True), "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501 @@ -464,9 +462,10 @@ _MULTIMODAL_EXAMPLE_MODELS = { transformers_version_reason="HF model is not compatible", # noqa: E501 extras={"1.6-llama": "AIDC-AI/Ovis1.6-Llama3.2-3B", "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}), # noqa: E501 - "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True, - max_transformers_version="4.53", - transformers_version_reason="HF model is not compatible"), # noqa: E501 + "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", + 
trust_remote_code=True, + max_transformers_version="4.53", + transformers_version_reason="HF model is not compatible"), # noqa: E501 "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224", # noqa: E501 extras={"v2": "google/paligemma2-3b-ft-docci-448"}), # noqa: E501 "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct", @@ -496,8 +495,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { min_transformers_version="4.55.1", transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501 "Step3VLForConditionalGeneration": _HfExamplesInfo("stepfun-ai/step3", - trust_remote_code=True, - is_available_online=False), + trust_remote_code=True), "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501 trust_remote_code=True), "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"), # noqa: E501 From 8fd920924c8c13fb757c324f9e73c70d2d5f3029 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 19 Aug 2025 22:50:29 -0700 Subject: [PATCH 124/361] [BugFix] Fix stuck stats/metrics after requests are aborted (#22995) Signed-off-by: Nick Hill --- tests/entrypoints/openai/test_metrics.py | 95 +++++++++++++++++++++++- vllm/v1/core/block_pool.py | 7 +- vllm/v1/core/sched/scheduler.py | 9 ++- 3 files changed, 106 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 9107d089834bb..ff2e7004ff9f8 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - +import asyncio import subprocess import sys import tempfile @@ -294,6 +294,99 @@ async def test_metrics_exist(server: RemoteOpenAIServer, assert metric in response.text +@pytest.mark.asyncio +async def test_abort_metrics_reset(server: RemoteOpenAIServer, + client: openai.AsyncClient, use_v1: bool): + + 
running_requests, waiting_requests, kv_cache_usage = ( + _get_running_metrics_from_api(server)) + + # Expect no running requests or kvcache usage + assert running_requests == 0 + assert waiting_requests == 0 + assert kv_cache_usage == 0.0 + + # Start some long-running requests that we can abort + tasks = [] + for _ in range(3): + task = asyncio.create_task( + client.completions.create( + model=MODEL_NAME, + prompt=_TOKENIZED_PROMPT, + max_tokens=100, # Long generation to give time to abort + temperature=0.0)) + tasks.append(task) + + # Wait a bit for requests to start processing + await asyncio.sleep(0.5) + + # Check that we have running requests + running_requests, waiting_requests, kv_cache_usage = ( + _get_running_metrics_from_api(server)) + + # Expect running requests and kvcache usage + assert running_requests > 0 + assert kv_cache_usage > 0 + + # Cancel all tasks to abort the requests + for task in tasks: + task.cancel() + + # Wait for cancellations to be processed + await asyncio.sleep(1.0) + + # Check that metrics have reset to zero + response = requests.get(server.url_for("metrics")) + assert response.status_code == HTTPStatus.OK + + # Verify running and waiting requests counts and KV cache usage are zero + running_requests_after, waiting_requests_after, kv_cache_usage_after = ( + _get_running_metrics_from_api(server)) + + assert running_requests_after == 0,\ + (f"Expected 0 running requests after abort, got " + f"{running_requests_after}") + assert waiting_requests_after == 0,\ + (f"Expected 0 waiting requests after abort, got " + f"{waiting_requests_after}") + assert kv_cache_usage_after == 0,\ + (f"Expected 0% KV cache usage after abort, got " + f"{kv_cache_usage_after}") + + +def _get_running_metrics_from_api(server: RemoteOpenAIServer): + """Return (running_count, waiting_count, kv_cache_usage)""" + + response = requests.get(server.url_for("metrics")) + assert response.status_code == HTTPStatus.OK + + # Verify running and waiting requests counts and 
KV cache usage are zero + running_requests, waiting_requests, kv_cache_usage = None, None, None + + for family in text_string_to_metric_families(response.text): + if family.name == "vllm:num_requests_running": + for sample in family.samples: + if sample.name == "vllm:num_requests_running": + running_requests = sample.value + break + elif family.name == "vllm:num_requests_waiting": + for sample in family.samples: + if sample.name == "vllm:num_requests_waiting": + waiting_requests = sample.value + break + elif family.name == "vllm:gpu_cache_usage_perc": + for sample in family.samples: + if sample.name == "vllm:gpu_cache_usage_perc": + kv_cache_usage = sample.value + break + + assert running_requests is not None + assert waiting_requests is not None + assert kv_cache_usage is not None + + return running_requests, waiting_requests, kv_cache_usage + + def test_metrics_exist_run_batch(use_v1: bool): input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}""" # noqa: E501 diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index 839297135fe0a..fdd96c3e9557d 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -298,7 +298,12 @@ class BlockPool: Returns: The KV cache usage (between 0.0 and 1.0). """ - return 1.0 - (self.get_num_free_blocks() / self.num_gpu_blocks) + + # Subtract 1 to account for null block. + total_gpu_blocks = self.num_gpu_blocks - 1 + if not total_gpu_blocks: + return 0 + return 1.0 - (self.get_num_free_blocks() / total_gpu_blocks) def take_events(self) -> list[KVCacheEvent]: """Atomically takes all events and clears the queue. 
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index f9a7e21014073..4b167da5c8f81 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -902,10 +902,13 @@ class Scheduler(SchedulerInterface): finished_requests=finished_set) finished_req_ids.clear() - if engine_core_outputs: + if (stats := self.make_stats(spec_decoding_stats)) is not None: # Return stats to only one of the front-ends. - next(iter(engine_core_outputs.values())).scheduler_stats = ( - self.make_stats(spec_decoding_stats)) + if (eco := next(iter(engine_core_outputs.values()), None)) is None: + # We must return the stats even if there are no request + # outputs this step. + engine_core_outputs[0] = eco = EngineCoreOutputs() + eco.scheduler_stats = stats return engine_core_outputs From d983769c41db224e0897fac2e9aefc5f57ad1122 Mon Sep 17 00:00:00 2001 From: who who who Date: Wed, 20 Aug 2025 14:24:37 +0800 Subject: [PATCH 125/361] fix cuda graph (#22721) Signed-off-by: fsx950223 --- vllm/v1/attention/backends/rocm_aiter_fa.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 7d09ac0a4a3a1..36b5853bfdcbb 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with AiterFlashAttention.""" from dataclasses import dataclass -from typing import ClassVar, Optional +from typing import Optional import torch @@ -11,7 +11,8 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, +from vllm.v1.attention.backends.utils import (AttentionCGSupport, + AttentionMetadataBuilder, 
CommonAttentionMetadata) from vllm.v1.kv_cache_interface import AttentionSpec @@ -231,7 +232,7 @@ class AiterFlashAttentionMetadata: class AiterFlashAttentionMetadataBuilder( AttentionMetadataBuilder[AiterFlashAttentionMetadata]): - full_cudagraph_supported: ClassVar[bool] = True + cudagraph_support = AttentionCGSupport.ALWAYS def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): From 103f1ec8d348a5f336f11d972d6285c4fb4736d4 Mon Sep 17 00:00:00 2001 From: Calvin Chen Date: Wed, 20 Aug 2025 18:16:27 +0800 Subject: [PATCH 126/361] [Model] use autoWeightsLoader for gptoss (#22446) Signed-off-by: calvin chen --- vllm/model_executor/models/gpt_oss.py | 724 +++++++++++++------------- 1 file changed, 370 insertions(+), 354 deletions(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 2f5d9ddd9054f..cd93f0ef1e310 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -27,7 +27,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.utils import cdiv -from .utils import extract_layer_index, maybe_prefix +from .utils import (AutoWeightsLoader, WeightsMapper, extract_layer_index, + maybe_prefix) class OAIAttention(nn.Module): @@ -203,6 +204,7 @@ class GptOssModel(nn.Module): super().__init__() self.config = vllm_config.model_config.hf_config self.quant_config = vllm_config.quant_config + self.parallel_config = vllm_config.parallel_config self.config.hidden_size = self.config.hidden_size self.embedding = VocabParallelEmbedding( self.config.vocab_size, @@ -225,8 +227,364 @@ class GptOssModel(nn.Module): x = self.norm(x) return x + def _load_weights_mxfp4( + self, + ep_rank_end: int, + ep_rank_start: int, + heads_per_rank: int, + head_start: int, + weights: Iterable[tuple[str, torch.Tensor]], + stacked_params_mapping: list[tuple[str, ...]], + ) -> 
set[str]: + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + + mxfp4_block = 32 + use_ep = self.parallel_config.enable_expert_parallel + num_experts = self.config.num_local_experts + + tp_rank = get_tensor_model_parallel_rank() + tp_size = get_tensor_model_parallel_world_size() + + intermediate_size = self.config.intermediate_size + intermediate_size_block = intermediate_size // mxfp4_block + per_rank_intermediate_size_block = cdiv(intermediate_size_block, + tp_size) + per_rank_intermediate_size = (per_rank_intermediate_size_block * + mxfp4_block) + + # Calculate common slicing bounds for current rank + tp_rank_start = tp_rank * per_rank_intermediate_size + tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, + intermediate_size) + + for name, weight in weights: + # FIXME(woosuk): Remove this after testing. + weight = weight.cuda() + + if ".w13_weight_scale" in name: + # Handle MLP gate and up projection weights scale + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end, + ...] + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=name, + shard_id=None, + expert_id=None) + loaded_params.add(name) + continue + elif ".w2_weight_scale" in name: + # Handle MLP down projection weights + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] 
+ else: + narrow_weight = weight[..., tp_rank_start // + mxfp4_block:tp_rank_end // + mxfp4_block] + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=name, + shard_id=None, + expert_id=None) + loaded_params.add(name) + continue + elif ".w13_weight" in name: + # Handle MLP gate and up projection weights + # flat weight from (E, 2 * N, block_size, entry_per_block) + # to (E, 2 * N, -1), shouldn't trigger copy for contiguous + weight = weight.view(num_experts, 2 * intermediate_size, + -1).contiguous() + + # Extract gate and up projection parts + # since the weight is shuffled, we can slice directly + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end, + ...] + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=name, + shard_id=None, + expert_id=None) + loaded_params.add(name) + continue + elif ".w2_weight" in name: + # Handle MLP down projection weights + # same flatten here, but since 2 mx4 value are packed in 1 + # uint8, divide by 2 + weight = weight.view(num_experts, -1, + intermediate_size // 2).contiguous() + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[..., + tp_rank_start // 2:tp_rank_end // 2] + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=name, + shard_id=None, + expert_id=None) + loaded_params.add(name) + continue + elif ".w13_bias" in name: + # Handle MLP gate and up projection biases + # Extract gate and up projection bias parts + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] 
+ else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end] + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, + narrow_weight, + weight_name=name, + shard_id=None, + expert_id=None) + loaded_params.add(name) + continue + elif ".w2_bias" in name: + # Handle MLP down projection bias + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if use_ep: + weight = weight[ep_rank_start:ep_rank_end, ...] + else: + # (only load on rank 0 to avoid duplication) + if tp_rank != 0: + weight.zero_() + weight_loader(param, + weight, + weight_name=name, + shard_id=None, + expert_id=None) + loaded_params.add(name) + continue + elif "sinks" in name: + # Handle attention sinks (distributed across ranks) + param = params_dict[name] + narrow_weight = weight.narrow(0, head_start, heads_per_rank) + param.data.copy_(narrow_weight) + loaded_params.add(name) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, weight) + else: + weight_loader(param, weight, shard_id) + break + else: + # Handle all other weights with potential renaming + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, weight) + loaded_params.add(name) + return loaded_params + + def _load_weights_other( + self, + ep_rank_start: int, + ep_rank_end: int, + heads_per_rank: int, + head_start: int, + weights: Iterable[tuple[str, torch.Tensor]], + stacked_params_mapping: list[tuple[str, ...]], + ) -> set[str]: + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + + use_ep = 
self.parallel_config.enable_expert_parallel + + tp_rank = get_tensor_model_parallel_rank() + tp_size = get_tensor_model_parallel_world_size() + + intermediate_size = self.config.intermediate_size + per_rank_intermediate_size = cdiv(intermediate_size, tp_size) + # Calculate common slicing bounds for current rank + tp_rank_start = tp_rank * per_rank_intermediate_size + tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, + intermediate_size) + + for name, weight in weights: + if ".w13_weight" in name: + # Handle MLP gate and up projection weights + # Extract gate and up projection parts + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, :, + 2 * tp_rank_start:2 * tp_rank_end] + + narrow_weight = narrow_weight.permute(0, 2, 1).contiguous() + param = params_dict[name] + + param.copy_(narrow_weight) + loaded_params.add(name) + continue + elif ".w2_weight" in name: + # Handle MLP down projection weights + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, tp_rank_start:tp_rank_end, :] + narrow_weight = narrow_weight.permute(0, 2, 1).contiguous() + param = params_dict[name] + + param.copy_(narrow_weight) + loaded_params.add(name) + continue + elif ".w13_bias" in name: + # Handle MLP gate and up projection biases + # Extract gate and up projection bias parts + if use_ep: + narrow_weight = weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = weight[:, + 2 * tp_rank_start:2 * tp_rank_end] + + param = params_dict[name] + param.copy_(narrow_weight) + loaded_params.add(name) + continue + elif ".w2_bias" in name: + # Handle MLP down projection bias + if use_ep: + weight = weight[ep_rank_start:ep_rank_end, ...] 
+ else: + # (only load on rank 0 to avoid duplication) + if tp_rank != 0: + weight.zero_() + param = params_dict[name] + param.copy_(weight) + loaded_params.add(name) + continue + elif "sinks" in name: + # Handle attention sinks (distributed across ranks) + param = params_dict[name] + narrow_weight = weight.narrow(0, head_start, heads_per_rank) + param.data.copy_(narrow_weight) + loaded_params.add(name) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, weight) + else: + weight_loader(param, weight, shard_id) + break + else: + # Handle all other weights with potential renaming + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, weight) + loaded_params.add(name) + return loaded_params + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv", ".q_proj", "q"), + (".qkv", ".k_proj", "k"), + (".qkv", ".v_proj", "v"), + ] + + tp_rank = get_tensor_model_parallel_rank() + tp_size = get_tensor_model_parallel_world_size() + + # Attention heads per rank + heads_per_rank = self.config.num_attention_heads // tp_size + head_start = tp_rank * heads_per_rank + + ep_size = get_ep_group().world_size + ep_rank = get_ep_group().rank + num_experts = self.config.num_local_experts + experts_per_rank = num_experts // ep_size + ep_rank_start = ep_rank * experts_per_rank + ep_rank_end = (ep_rank + 1) * experts_per_rank + + quant_method = (self.config.quantization_config['quant_method'] if + hasattr(self.config, "quantization_config") else None) + if quant_method == "mxfp4": + return 
self._load_weights_mxfp4(ep_rank_end, ep_rank_start, + heads_per_rank, head_start, + weights, stacked_params_mapping) + else: + return self._load_weights_other(ep_rank_end, ep_rank_start, + heads_per_rank, head_start, + weights, stacked_params_mapping) + class GptOssForCausalLM(nn.Module): + packed_modules_mapping = {"qkv": ["q_proj", "k_proj", "v_proj"]} + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + ".self_attn.": ".attn.", + ".post_attention_layernorm.": ".mlp.norm.", + }, + orig_to_new_suffix={ + ".embed_tokens.weight": ".embedding.weight", + ".input_layernorm.weight": ".attn.norm.weight", + ".post_attention_layernorm.weight": ".mlp.norm.weight", + + # MoE MXFP4 weights + ".gate_up_proj_blocks": ".w13_weight", + ".down_proj_blocks": ".w2_weight", + ".gate_up_proj_scales": ".w13_weight_scale", + ".down_proj_scales": ".w2_weight_scale", + + # MoE other weights + ".gate_up_proj": ".w13_weight", + ".down_proj": ".w2_weight", + + # MoE Bias + ".gate_up_proj_bias": ".w13_bias", + ".down_proj_bias": ".w2_bias", + }, + ) def __init__( self, @@ -235,16 +593,17 @@ class GptOssForCausalLM(nn.Module): ): super().__init__() self.vllm_config = vllm_config - self.model_config = vllm_config.model_config.hf_config + self.config = vllm_config.model_config.hf_config + self.model = GptOssModel( vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model"), ) self.lm_head = ParallelLMHead( - self.model_config.vocab_size, - self.model_config.hidden_size, + self.config.vocab_size, + self.config.hidden_size, ) - self.logits_processor = LogitsProcessor(self.model_config.vocab_size) + self.logits_processor = LogitsProcessor(self.config.vocab_size) def forward(self, input_ids: torch.Tensor, @@ -261,354 +620,11 @@ class GptOssForCausalLM(nn.Module): sampling_metadata) return logits - def _load_weights_mxfp4( - self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - rename_mapping = { - "self_attn": "attn", - "input_layernorm.weight": "attn.norm.weight", - 
"post_attention_layernorm.weight": "mlp.norm.weight", - "embed_tokens": "embedding", - } - - def maybe_rename(name: str) -> str: - for remap_name, new_name in rename_mapping.items(): - if remap_name in name: - return name.replace(remap_name, new_name) - return name - - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - mxfp4_block = 32 - - tp_rank = get_tensor_model_parallel_rank() - tp_size = get_tensor_model_parallel_world_size() - intermediate_size = self.model_config.intermediate_size - intermediate_size_block = intermediate_size // mxfp4_block - per_rank_intermediate_size_block = cdiv(intermediate_size_block, - tp_size) - per_rank_intermediate_size = (per_rank_intermediate_size_block * - mxfp4_block) - - # Calculate common slicing bounds for current rank - tp_rank_start = tp_rank * per_rank_intermediate_size - tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, - intermediate_size) - - # Attention heads per rank - heads_per_rank = self.model_config.num_attention_heads // tp_size - head_start = tp_rank * heads_per_rank - - use_ep = self.vllm_config.parallel_config.enable_expert_parallel - ep_size = get_ep_group().world_size - ep_rank = get_ep_group().rank - num_experts = self.model_config.num_local_experts - experts_per_rank = num_experts // ep_size - ep_rank_start = ep_rank * experts_per_rank - ep_rank_end = (ep_rank + 1) * experts_per_rank - - for name, weight in weights: - # FIXME(woosuk): Remove this after testing. 
- weight = weight.cuda() - - if "gate_up_proj_blocks" in name: - # Handle MLP gate and up projection weights - new_name = name.replace("gate_up_proj_blocks", "w13_weight") - - # flat weight from (E, 2 * N, block_size, entry_per_block) - # to (E, 2 * N, -1), shouldn't trigger copy for contiguous - weight = weight.view(num_experts, 2 * intermediate_size, - -1).contiguous() - - # Extract gate and up projection parts - # since the weight is shuffled, we can slice directly - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] - else: - narrow_weight = weight[:, - 2 * tp_rank_start:2 * tp_rank_end, - ...] - - param = params_dict[new_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, - narrow_weight, - weight_name=new_name, - shard_id=None, - expert_id=None) - loaded_params.add(new_name) - - elif "down_proj_blocks" in name: - # Handle MLP down projection weights - new_name = name.replace("down_proj_blocks", "w2_weight") - # same flatten here, but since 2 mx4 value are packed in 1 - # uint8, divide by 2 - weight = weight.view(num_experts, -1, - intermediate_size // 2).contiguous() - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] - else: - narrow_weight = weight[..., - tp_rank_start // 2:tp_rank_end // 2] - - param = params_dict[new_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, - narrow_weight, - weight_name=new_name, - shard_id=None, - expert_id=None) - loaded_params.add(new_name) - - elif "gate_up_proj_scales" in name: - # Handle MLP gate and up projection weights scale - new_name = name.replace("gate_up_proj_scales", - "w13_weight_scale") - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] - else: - narrow_weight = weight[:, - 2 * tp_rank_start:2 * tp_rank_end, - ...] 
- - param = params_dict[new_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, - narrow_weight, - weight_name=new_name, - shard_id=None, - expert_id=None) - loaded_params.add(new_name) - - elif "down_proj_scales" in name: - # Handle MLP down projection weights - new_name = name.replace("down_proj_scales", "w2_weight_scale") - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] - else: - narrow_weight = weight[..., tp_rank_start // - mxfp4_block:tp_rank_end // - mxfp4_block] - - param = params_dict[new_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, - narrow_weight, - weight_name=new_name, - shard_id=None, - expert_id=None) - loaded_params.add(new_name) - elif "gate_up_proj_bias" in name: - # Handle MLP gate and up projection biases - new_name = name.replace("gate_up_proj_bias", "w13_bias") - - # Extract gate and up projection bias parts - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] - else: - narrow_weight = weight[:, - 2 * tp_rank_start:2 * tp_rank_end] - - param = params_dict[new_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, - narrow_weight, - weight_name=new_name, - shard_id=None, - expert_id=None) - loaded_params.add(new_name) - - elif "down_proj_bias" in name: - # Handle MLP down projection bias - new_name = name.replace("down_proj_bias", "w2_bias") - param = params_dict[new_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - if use_ep: - weight = weight[ep_rank_start:ep_rank_end, ...] 
- else: - # (only load on rank 0 to avoid duplication) - if tp_rank != 0: - weight.zero_() - weight_loader(param, - weight, - weight_name=new_name, - shard_id=None, - expert_id=None) - loaded_params.add(new_name) - elif "sinks" in name: - # Handle attention sinks (distributed across ranks) - name = name.replace("self_attn", "attn") - param = params_dict[name] - narrow_weight = weight.narrow(0, head_start, heads_per_rank) - param.data.copy_(narrow_weight) - loaded_params.add(name) - elif "q_proj" in name or "k_proj" in name or "v_proj" in name: - shard_id = ("q" if "q_proj" in name else - "k" if "k_proj" in name else "v") - name = name.replace("self_attn", "attn") - param_name = name.replace(f"{shard_id}_proj", "qkv") - param = params_dict[param_name] - weight_loader = param.weight_loader - weight_loader(param, weight, loaded_shard_id=shard_id) - loaded_params.add(param_name) - else: - # Handle all other weights with potential renaming - renamed_name = maybe_rename(name) - if renamed_name not in params_dict: - continue - param = params_dict[renamed_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, weight) - loaded_params.add(renamed_name) - - return loaded_params - - def _load_weights_other( - self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - rename_mapping = { - "self_attn": "attn", - "input_layernorm.weight": "attn.norm.weight", - "post_attention_layernorm.weight": "mlp.norm.weight", - "embed_tokens": "embedding", - } - - def maybe_rename(name: str) -> str: - for remap_name, new_name in rename_mapping.items(): - if remap_name in name: - return name.replace(remap_name, new_name) - return name - - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - - tp_rank = get_tensor_model_parallel_rank() - tp_size = get_tensor_model_parallel_world_size() - intermediate_size = self.model_config.intermediate_size - - per_rank_intermediate_size = cdiv(intermediate_size, tp_size) - # 
Calculate common slicing bounds for current rank - tp_rank_start = tp_rank * per_rank_intermediate_size - tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, - intermediate_size) - - # Attention heads per rank - heads_per_rank = self.model_config.num_attention_heads // tp_size - head_start = tp_rank * heads_per_rank - - use_ep = self.vllm_config.parallel_config.enable_expert_parallel - ep_size = get_ep_group().world_size - ep_rank = get_ep_group().rank - num_experts = self.model_config.num_local_experts - experts_per_rank = num_experts // ep_size - ep_rank_start = ep_rank * experts_per_rank - ep_rank_end = (ep_rank + 1) * experts_per_rank - - for name, weight in weights: - if ".experts.gate_up_proj" in name and "bias" not in name: - # Handle MLP gate and up projection weights - new_name = name.replace(".experts.gate_up_proj", - ".experts.w13_weight") - - # Extract gate and up projection parts - # since the weight is shuffled, we can slice directly - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] - else: - narrow_weight = weight[:, :, - 2 * tp_rank_start:2 * tp_rank_end] - - narrow_weight = narrow_weight.permute(0, 2, 1).contiguous() - param = params_dict[new_name] - - param.copy_(narrow_weight) - loaded_params.add(new_name) - - elif ".experts.down_proj" in name and "bias" not in name: - # Handle MLP down projection weights - new_name = name.replace(".experts.down_proj", - ".experts.w2_weight") - - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] 
- else: - narrow_weight = weight[:, tp_rank_start:tp_rank_end, :] - narrow_weight = narrow_weight.permute(0, 2, 1).contiguous() - param = params_dict[new_name] - - param.copy_(narrow_weight) - loaded_params.add(new_name) - - elif "gate_up_proj_bias" in name: - # Handle MLP gate and up projection biases - new_name = name.replace("gate_up_proj_bias", "w13_bias") - - # Extract gate and up projection bias parts - if use_ep: - narrow_weight = weight[ep_rank_start:ep_rank_end, ...] - else: - narrow_weight = weight[:, - 2 * tp_rank_start:2 * tp_rank_end] - - param = params_dict[new_name] - - param.copy_(narrow_weight) - loaded_params.add(new_name) - - elif "down_proj_bias" in name: - # Handle MLP down projection bias - new_name = name.replace("down_proj_bias", "w2_bias") - - if use_ep: - weight = weight[ep_rank_start:ep_rank_end, ...] - else: - # (only load on rank 0 to avoid duplication) - if tp_rank != 0: - weight.zero_() - param = params_dict[new_name] - param.copy_(weight) - loaded_params.add(new_name) - elif "sinks" in name: - # Handle attention sinks (distributed across ranks) - name = name.replace("self_attn", "attn") - param = params_dict[name] - narrow_weight = weight.narrow(0, head_start, heads_per_rank) - param.data.copy_(narrow_weight) - loaded_params.add(name) - elif "q_proj" in name or "k_proj" in name or "v_proj" in name: - shard_id = ("q" if "q_proj" in name else - "k" if "k_proj" in name else "v") - name = name.replace("self_attn", "attn") - param_name = name.replace(f"{shard_id}_proj", "qkv") - param = params_dict[param_name] - weight_loader = param.weight_loader - weight_loader(param, weight, loaded_shard_id=shard_id) - loaded_params.add(param_name) - else: - # Handle all other weights with potential renaming - - renamed_name = maybe_rename(name) - if renamed_name not in params_dict: - continue - param = params_dict[renamed_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, weight) - 
loaded_params.add(renamed_name) - - return loaded_params - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - quant_method = (self.model_config.quantization_config['quant_method'] - if hasattr(self.model_config, "quantization_config") - else None) - if quant_method == "mxfp4": - return self._load_weights_mxfp4(weights) - else: - return self._load_weights_other(weights) + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) From 3aa8c100381a1c6a99a259d9da5dac70fd3a6c0b Mon Sep 17 00:00:00 2001 From: Shiming Zhang Date: Wed, 20 Aug 2025 18:46:59 +0800 Subject: [PATCH 127/361] Fix missing quotes (#23242) Signed-off-by: Shiming Zhang --- docs/deployment/frameworks/dstack.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/deployment/frameworks/dstack.md b/docs/deployment/frameworks/dstack.md index 23dc58c974ed8..fe4d87f78f2aa 100644 --- a/docs/deployment/frameworks/dstack.md +++ b/docs/deployment/frameworks/dstack.md @@ -9,7 +9,7 @@ vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), To install dstack client, run: ```bash -pip install "dstack[all] +pip install dstack[all] dstack server ``` From 83e69a09d6c1a5e88ae00060e79ec7b7a9465462 Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Wed, 20 Aug 2025 04:01:31 -0700 Subject: [PATCH 128/361] [Model] Support deepseek with eagle (#21086) Signed-off-by: Xin Yang --- tests/models/registry.py | 3 + tests/v1/e2e/test_spec_decode.py | 6 +- vllm/model_executor/models/deepseek_eagle.py | 246 +++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 4 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 vllm/model_executor/models/deepseek_eagle.py diff --git a/tests/models/registry.py b/tests/models/registry.py index 28fe9063169e0..739d962279714 100644 --- 
a/tests/models/registry.py +++ b/tests/models/registry.py @@ -530,6 +530,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { "DeepSeekMTPModel": _HfExamplesInfo("luccafong/deepseek_mtp_main_random", speculative_model="luccafong/deepseek_mtp_draft_random", # noqa: E501 trust_remote_code=True), + "EagleDeepSeekMTPModel": _HfExamplesInfo("eagle618/deepseek-v3-random", + speculative_model="eagle618/eagle-deepseek-v3-random", # noqa: E501 + trust_remote_code=True), "EagleLlamaForCausalLM": _HfExamplesInfo("yuhuili/EAGLE-LLaMA3-Instruct-8B", trust_remote_code=True, speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B", diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 7b3f458312792..bd0fa6b80781a 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -144,6 +144,8 @@ def test_ngram_correctness( "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), True, marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), + (("eagle", "eagle618/deepseek-v3-random", + "eagle618/eagle-deepseek-v3-random", 1), False), ], ids=[ # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 @@ -151,7 +153,8 @@ def test_ngram_correctness( "llama3_eagle", "llama3_eagle3", "llama4_eagle", - "llama4_eagle_mm" + "llama4_eagle_mm", + "deepseek_eagle" ]) @pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform()) @@ -177,6 +180,7 @@ def test_eagle_correctness( ''' with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") + m.setenv("VLLM_MLA_DISABLE", "1") m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) if (attn_backend == "TRITON_ATTN_VLLM_V1" diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py new file mode 100644 index 0000000000000..0c9c83cf61000 --- /dev/null +++ b/vllm/model_executor/models/deepseek_eagle.py @@ -0,0 +1,246 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project + +from collections.abc import Iterable +from typing import Optional + +import torch +import torch.nn as nn + +from vllm.compilation.decorators import support_torch_compile +from vllm.config import VllmConfig +from vllm.distributed.parallel_state import get_pp_group +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.models.deepseek_v2 import (DeepseekV2DecoderLayer, + DeepseekV3ForCausalLM) +from vllm.model_executor.sampling_metadata import SamplingMetadata + +from .utils import AutoWeightsLoader, maybe_prefix + + +@support_torch_compile +class DeepseekV2Model(nn.Module): + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + start_layer_id: int = 0, + ) -> None: + super().__init__() + self.config = vllm_config. 
\ + speculative_config.draft_model_config.hf_config + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.vocab_size = self.config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + self.config.vocab_size, + self.config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "embed_tokens"), + ) + + self.layers = nn.ModuleList([ + DeepseekV2DecoderLayer( + self.config, + prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"), + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + ) for i in range(self.config.num_hidden_layers) + ]) + + self.fc = nn.Linear( + self.config.model.hidden_size * 2, + self.config.model.hidden_size, + bias=False, + ) + + self.enorm = RMSNorm(self.config.hidden_size, + eps=self.config.rms_norm_eps) + self.hnorm = RMSNorm(self.config.hidden_size, + eps=self.config.rms_norm_eps) + self.norm = RMSNorm(self.config.hidden_size, + eps=self.config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + input_embeds = self.embed_tokens(input_ids) + + inputs = torch.cat( + [self.enorm(input_embeds), + self.hnorm(hidden_states)], dim=-1) + hidden_states = self.fc(inputs) + residual = None + for layer in self.layers: + hidden_states, residual = layer( + positions, + hidden_states, + residual, + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states, hidden_states + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ("fused_qkv_a_proj", "q_a_proj", 0), + ("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, 
expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.n_routed_experts) + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + for param_name, weight_name, shard_id in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if ("mlp.experts." in name) and name not in params_dict: + continue + name_mapped = name.replace(weight_name, param_name) + + # QKV fusion is optional, fall back to normal + # weight loading if it's not enabled + # if go with fusion option, then update name + if ((param_name == "fused_qkv_a_proj") + and name_mapped not in params_dict): + continue + else: + name = name_mapped + + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader( + param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id, + ) + break + else: + # if PP disabled then draft will share embed with target + if get_pp_group().world_size == 1 and \ + "embed_tokens." in name: + continue + + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class EagleDeepseekV3ForCausalLM(DeepseekV3ForCausalLM): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + nn.Module.__init__(self) + self.config = vllm_config. 
\ + speculative_config.draft_model_config.hf_config + quant_config = vllm_config.quant_config + target_layer_num = vllm_config.model_config.get_num_layers( + vllm_config.parallel_config) + self.model = DeepseekV2Model(vllm_config=vllm_config, + prefix="model", + start_layer_id=target_layer_num) + + self.lm_head = ParallelLMHead(self.config.vocab_size, + self.config.hidden_size, + quant_config=quant_config) + + logit_scale = getattr(self.config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.config.vocab_size, + scale=logit_scale) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + if inputs_embeds is not None: + raise NotImplementedError( + f"{type(self).__name__} does not support multimodal inputs yet." + ) + return self.model(input_ids, positions, hidden_states) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader( + self, + skip_prefixes=None, + ) + + model_weights = {} + for name, loaded_weight in weights: + if "lm_head" not in name: + name = "model." 
+ name + model_weights[name] = loaded_weight + loader.load_weights(model_weights.items()) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 8728684d8e68d..a94231b0f8461 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -264,6 +264,7 @@ _SPECULATIVE_DECODING_MODELS = { "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 # "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"), + "EagleDeepSeekMTPModel": ("deepseek_eagle", "EagleDeepseekV3ForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"), "MedusaModel": ("medusa", "Medusa"), From 68fcd3fa7313d00240f766f42affe931f1f379a7 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 20 Aug 2025 19:09:18 +0800 Subject: [PATCH 129/361] [Bugfix] Ensure correctness of Cohere2Vision processing (#23245) Signed-off-by: DarkLight1337 --- .../multimodal/processing/test_common.py | 1 + vllm/model_executor/models/aya_vision.py | 3 +- vllm/model_executor/models/cohere2_vision.py | 71 ++++++++++++++----- 3 files changed, 56 insertions(+), 19 deletions(-) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 8aa0dc7e8e348..d5b1de834a618 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -268,6 +268,7 @@ def _test_processing_correctness_one( "CohereForAI/aya-vision-8b", "Salesforce/blip2-opt-2.7b", "facebook/chameleon-7b", + "CohereLabs/command-a-vision-07-2025", "deepseek-ai/deepseek-vl2-tiny", "microsoft/Florence-2-base", "adept/fuyu-8b", diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index b02a973d942ce..687c82ded9d0a 100644 --- 
a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -250,8 +250,7 @@ class AyaVisionMultiModalProcessor( image_processor = hf_processor.image_processor def get_replacement(item_idx: int): - images: ImageProcessorItems = mm_items.get("image", - ImageProcessorItems) + images = mm_items.get_items("image", ImageProcessorItems) image_size: ImageSize = images.get_image_size(item_idx) num_patches = self.info.get_num_patches( image_width=image_size.width, diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index bc526fd661b6d..4682a8a428a03 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -10,6 +10,8 @@ import torch from torch import nn from transformers import BatchFeature, PretrainedConfig from transformers.models.cohere2_vision import Cohere2VisionConfig +from transformers.models.cohere2_vision.image_processing_cohere2_vision_fast import ( # noqa: E501 + get_optimal_tiled_canvas) from transformers.models.cohere2_vision.processing_cohere2_vision import ( Cohere2VisionProcessor) @@ -150,14 +152,46 @@ class Cohere2VisionProcessingInfo(BaseProcessingInfo): max_patches = image_processor.max_patches return ImageSize(height=height * max_patches, width=width) - def get_num_patches(self, image_width: int, image_height: int) -> int: + def get_num_patches( + self, + *, + image_width: int, + image_height: int, + processor: Optional[Cohere2VisionProcessor], + ) -> int: """ Calculate the number of image patches for a given image. Uses the HF processor to determine the actual number of patches. """ - return self.get_hf_processor( - ).image_processor.get_number_of_image_patches(image_height, - image_width, {}) + if processor is None: + processor = self.get_hf_processor() + + image_processor = processor.image_processor + + # The current implementation of get_number_of_image_patches + # is incorrect, so we patch it here. 
+ # return image_processor.get_number_of_image_patches(image_height, + # image_width, {}) + + min_patches = image_processor.min_patches + max_patches = image_processor.max_patches + patch_size = image_processor.size + crop_to_patches = image_processor.crop_to_patches + + if not crop_to_patches: + return 1 + + num_columns, num_rows = get_optimal_tiled_canvas( + (image_height, image_width), + (patch_size["height"], patch_size["width"]), + min_patches, + max_patches, + ) + num_patches = num_columns * num_rows + if num_patches > 1: + num_patches += 1 # Thumbnail image + + return num_patches class Cohere2VisionDummyInputsBuilder( @@ -208,6 +242,8 @@ class Cohere2VisionMultiModalProcessor( # Ensure num_patches is available for proper tensor splitting if "num_patches" not in processed_outputs and ( images := mm_data.get("images")) is not None: + hf_processor = self.info.get_hf_processor(**mm_kwargs) + # Fallback calculation if HF processor didn't provide num_patches parsed_images = self._get_data_parser().parse_mm_data({ "image": @@ -217,8 +253,9 @@ class Cohere2VisionMultiModalProcessor( num_patches = [ self.info.get_num_patches( image_width=parsed_images.get_image_size(i).width, - image_height=parsed_images.get_image_size(i).height) - for i in range(len(parsed_images)) + image_height=parsed_images.get_image_size(i).height, + processor=hf_processor, + ) for i in range(len(parsed_images)) ] processed_outputs["num_patches"] = torch.tensor(num_patches) @@ -245,25 +282,25 @@ class Cohere2VisionMultiModalProcessor( ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_token = hf_processor.image_token + img_tokens_per_tile = int(hf_processor.patch_size**2) img_line_break_token = hf_processor.img_line_break_token boi_token = hf_processor.boi_token eoi_token = hf_processor.eoi_token def get_replacement(item_idx: int): - images: ImageProcessorItems = mm_items.get("image", - ImageProcessorItems) + images = 
mm_items.get_items("image", ImageProcessorItems) image_size: ImageSize = images.get_image_size(item_idx) - num_patches = self.info.get_num_patches(image_size.height, - image_size.width) - img_tokens_per_tile = int(hf_processor.patch_size**2) - single_tile_tokens = image_token * img_tokens_per_tile + \ - img_line_break_token - img_string = f"{boi_token}\ - {single_tile_tokens * num_patches}\ - {eoi_token}" + num_patches = self.info.get_num_patches( + image_width=image_size.width, + image_height=image_size.height, + processor=hf_processor, + ) + patch_tokens = (image_token * img_tokens_per_tile + + img_line_break_token) + repl = f"{boi_token}{patch_tokens * num_patches}{eoi_token}" - return PromptUpdateDetails.select_text(img_string, image_token) + return PromptUpdateDetails.select_text(repl, image_token) return [ PromptReplacement( From 50df09fe13c93b520c64c581de4f0b469995f7b9 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 20 Aug 2025 08:05:54 -0400 Subject: [PATCH 130/361] Update to flashinfer-python==0.2.12 and disable AOT compile for non-release image (#23129) Signed-off-by: mgoin --- .buildkite/release-pipeline.yaml | 2 +- docker/Dockerfile | 52 ++++++++++++++++++++------------ setup.py | 2 +- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 85d3e56387421..e20ce54ca795a 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -68,7 +68,7 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." 
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - label: "Annotate release workflow" diff --git a/docker/Dockerfile b/docker/Dockerfile index 74938917781ac..cfaa59868215c 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -372,31 +372,45 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist # Install FlashInfer from source ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" -# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt -# We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel. -ARG FLASHINFER_GIT_REF="v0.2.11" +# Keep this in sync with "flashinfer" extra in setup.py +ARG FLASHINFER_GIT_REF="v0.2.12" +# Flag to control whether to compile FlashInfer AOT kernels +# Set to "true" to enable AOT compilation: +# docker build --build-arg FLASHINFER_AOT_COMPILE=true ... +ARG FLASHINFER_AOT_COMPILE=false RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' . /etc/environment git clone --depth 1 --recursive --shallow-submodules \ --branch ${FLASHINFER_GIT_REF} \ ${FLASHINFER_GIT_REPO} flashinfer - # Exclude CUDA arches for older versions (11.x and 12.0-12.7) - # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg. 
- if [[ "${CUDA_VERSION}" == 11.* ]]; then - FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9" - elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then - FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a" - else - # CUDA 12.8+ supports 10.0a and 12.0 - FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0" - fi - echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}" - # Needed to build AOT kernels pushd flashinfer - TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ - python3 -m flashinfer.aot - TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ - uv pip install --system --no-build-isolation --force-reinstall --no-deps . + if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then + # Exclude CUDA arches for older versions (11.x and 12.0-12.7) + # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg. + if [[ "${CUDA_VERSION}" == 11.* ]]; then + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9" + elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a" + else + # CUDA 12.8+ supports 10.0a and 12.0 + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0" + fi + echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}" + # Build AOT kernels + TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ + python3 -m flashinfer.aot + # Install with no-build-isolation since we already built AOT kernels + TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ + uv pip install --system --no-build-isolation . \ + --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') + # Download pre-compiled cubins + TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ + python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins." + else + echo "🏗️ Installing FlashInfer without AOT compilation in JIT mode" + uv pip install --system . \ + --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. 
-f1,2 | tr -d '.') + fi popd rm -rf flashinfer BASH diff --git a/setup.py b/setup.py index cc3037ebb72cb..6a3013de7937c 100644 --- a/setup.py +++ b/setup.py @@ -685,7 +685,7 @@ setup( "mistral_common[audio]"], # Required for audio processing "video": [], # Kept for backwards compatibility # FlashInfer should be updated together with the Dockerfile - "flashinfer": ["flashinfer-python==0.2.11"], + "flashinfer": ["flashinfer-python==0.2.12"], }, cmdclass=cmdclass, package_data=package_data, From 7cd17e22d76473919c55aa75ac1897e4d3fbe277 Mon Sep 17 00:00:00 2001 From: xyxinyang <43821961+xyxinyang@users.noreply.github.com> Date: Wed, 20 Aug 2025 20:41:55 +0800 Subject: [PATCH 131/361] [Model][V1] Support Ernie MTP (#22169) Signed-off-by: zhouchong Co-authored-by: zhouchong --- tests/models/registry.py | 3 + vllm/config/__init__.py | 31 ++- vllm/model_executor/models/ernie_mtp.py | 287 ++++++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/v1/spec_decode/eagle.py | 2 +- vllm/worker/worker.py | 3 +- 6 files changed, 320 insertions(+), 7 deletions(-) create mode 100644 vllm/model_executor/models/ernie_mtp.py diff --git a/tests/models/registry.py b/tests/models/registry.py index 739d962279714..6e6acfb8cd228 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -556,6 +556,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { is_available_online=False, speculative_model="openbmb/MiniCPM-2B-sft-bf16", tokenizer="openbmb/MiniCPM-2B-sft-bf16"), + "ErnieMTPModel": _HfExamplesInfo("baidu/ERNIE-4.5-21B-A3B-PT", + trust_remote_code=True, + speculative_model="baidu/ERNIE-4.5-21B-A3B-PT"), "Glm4MoeMTPModel": _HfExamplesInfo("zai-org/GLM-4.5", speculative_model="zai-org/GLM-4.5", min_transformers_version="4.54", diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 56a749789b6a4..801fa97fe5daf 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1463,7 +1463,8 @@ class ModelConfig: from vllm.distributed.utils import 
get_pp_indices if (self.hf_text_config.model_type == "deepseek_mtp" or self.hf_config.model_type == "mimo_mtp" - or self.hf_config.model_type == "glm4_moe_mtp"): + or self.hf_config.model_type == "glm4_moe_mtp" + or self.hf_config.model_type == "ernie_mtp"): total_num_hidden_layers = getattr(self.hf_text_config, "num_nextn_predict_layers", 0) else: @@ -1911,7 +1912,8 @@ class DeviceConfig: SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa", - "mlp_speculator", "draft_model", "deepseek_mtp"] + "mlp_speculator", "draft_model", "deepseek_mtp", + "ernie_mtp"] @config @@ -2044,6 +2046,16 @@ class SpeculativeConfig: "architectures": ["Glm4MoeMTPModel"] }) + if hf_config.model_type == "ernie4_5_moe": + hf_config.model_type = "ernie_mtp" + if hf_config.model_type == "ernie_mtp": + n_predict = getattr(hf_config, "num_nextn_predict_layers", None) + hf_config.update({ + "n_predict": n_predict, + "architectures": ["ErnieMTPModel"] + }) + return hf_config + return hf_config def __post_init__(self): @@ -2062,8 +2074,8 @@ class SpeculativeConfig: if self.target_model_config and \ (self.target_model_config.hf_text_config.model_type \ == "deepseek_v3" or - self.target_model_config.hf_text_config.model_type \ - == "mimo"): + self.target_model_config.hf_text_config.model_type in + ("mimo","ernie4_5_moe")): # use the draft model from the same model: self.model = self.target_model_config.model elif self.method in ("ngram", "[ngram]"): @@ -2161,6 +2173,15 @@ class SpeculativeConfig: "one layer. Might need some code changes " \ "to support multiple layers." ) + elif (self.draft_model_config.hf_config.model_type == + "ernie_mtp"): + self.method = "ernie_mtp" + if self.num_speculative_tokens > 1: + logger.warning( + "All Ernie MTP models only have " \ + "one layer. Might need some code changes " \ + "to support multiple layers." 
+ ) else: self.method = "draft_model" raise NotImplementedError( @@ -2376,7 +2397,7 @@ class SpeculativeConfig: return self.num_speculative_tokens def use_eagle(self) -> bool: - return self.method in ("eagle", "eagle3", "deepseek_mtp") + return self.method in ("eagle", "eagle3", "deepseek_mtp", "ernie_mtp") def __repr__(self) -> str: method = self.method diff --git a/vllm/model_executor/models/ernie_mtp.py b/vllm/model_executor/models/ernie_mtp.py new file mode 100644 index 0000000000000..90a1267b28f0a --- /dev/null +++ b/vllm/model_executor/models/ernie_mtp.py @@ -0,0 +1,287 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The Baidu team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Ernie-MTP model.""" +from collections.abc import Iterable +from typing import Optional + +import torch +import torch.nn as nn +from transformers import PretrainedConfig + +from vllm.config import CacheConfig, ModelConfig, VllmConfig +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from .interfaces import SupportsPP +from .llama import LlamaDecoderLayer +from .utils import is_pp_missing_parameter, maybe_prefix + + +class ErnieMultiTokenPredictorLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + prefix: str, + model_config: ModelConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + + self.mtp_emb_norm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.mtp_hidden_norm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.mtp_linear_proj = nn.Linear(config.hidden_size * 2, + config.hidden_size, + bias=False) + self.mtp_block = LlamaDecoderLayer(config, cache_config, quant_config, + prefix) + + def forward( + self, + inputs_embeds: torch.Tensor, + positions: torch.Tensor, + previous_hidden_states: torch.Tensor, + spec_step_index: int = 0, + ) -> torch.Tensor: + assert inputs_embeds is not None + # masking inputs at position 0, as not needed by MTP + inputs_embeds[positions == 0] = 0 + + inputs_embeds = self.mtp_emb_norm(inputs_embeds) + previous_hidden_states = 
self.mtp_hidden_norm(previous_hidden_states) + + hidden_states = self.mtp_linear_proj( + torch.cat([inputs_embeds, previous_hidden_states], dim=-1)) + + hidden_states, residual = self.mtp_block(positions=positions, + hidden_states=hidden_states, + residual=None) + hidden_states = residual + hidden_states + + return hidden_states + + +class ErnieMultiTokenPredictor(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + self.mtp_start_layer_idx = config.num_hidden_layers + self.num_mtp_layers = config.num_nextn_predict_layers + # to map the exact layer index from weights + self.layers = torch.nn.ModuleDict({ + str(idx): + ErnieMultiTokenPredictorLayer( + config, + f"{prefix}.layers.{idx}", + model_config=vllm_config.model_config, + cache_config=vllm_config.cache_config, + ) + for idx in range(self.mtp_start_layer_idx, + self.mtp_start_layer_idx + self.num_mtp_layers) + }) + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.logits_processor = LogitsProcessor(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + previous_hidden_states: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, + spec_step_idx: int = 0, + ) -> torch.Tensor: + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + return self.layers[str(self.mtp_start_layer_idx + spec_step_idx)]( + inputs_embeds, + positions, + previous_hidden_states, + spec_step_idx, + ) + + def compute_logits( + self, + hidden_states: torch.Tensor, + lm_head: ParallelLMHead, + sampling_metadata: SamplingMetadata, + spec_step_idx: int = 0, + ) -> torch.Tensor: + self.layers[str(self.mtp_start_layer_idx + spec_step_idx)] + logits = self.logits_processor(lm_head, hidden_states, + sampling_metadata) + return logits + + +class ErnieMTP(nn.Module, SupportsPP): + + def __init__(self, *, vllm_config: VllmConfig, 
prefix: str = ""): + super().__init__() + + self.config = vllm_config.model_config.hf_config + self.model = ErnieMultiTokenPredictor(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "model")) + self.lm_head = ParallelLMHead(self.config.vocab_size, + self.config.hidden_size) + self.sampler = get_sampler() + + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + spec_step_idx: int = 0, + ) -> torch.Tensor: + assert spec_step_idx == 0, "ernie_mtp only support predict one token" + hidden_states = self.model(input_ids, positions, hidden_states, + inputs_embeds, spec_step_idx) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + spec_step_idx: int = 0, + ) -> Optional[torch.Tensor]: + return self.model.compute_logits(hidden_states, self.lm_head, + sampling_metadata, spec_step_idx) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + + if self.config.tie_word_embeddings and name.endswith( + "lm_head.weight"): + continue + if "rotary_emb.inv_freq" in name: + continue + if "mtp" in name: + name = self._rewrite_spec_layer_name(self.config, name) + + for (param_name, weight_name, shard_id) in 
stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + if "mtp" not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if (("mlp.experts." in name) and name not in params_dict): + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + + # According to DeepSeek-V3 Technical Report, MTP modules + # shares embedding layer. We only load the first weights. + if "mtp_" not in name and ("embed_tokens" not in name + and "lm_head" not in name): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + def _rewrite_spec_layer_name(self, config: PretrainedConfig, + name: str) -> str: + """ + Rewrite the weight name to match the format of the original model. 
+ """ + spec_layer_weight_names = [ + "embed_tokens", "mtp_emb_norm", "mtp_hidden_norm", + "mtp_linear_proj" + ] + layer_idx = config.num_hidden_layers + for weight_name in spec_layer_weight_names: + if weight_name in name: + name = name.replace( + f"model.{weight_name}.0.", + f"model.layers.{layer_idx}.{weight_name}.") + return name + name = name.replace("model.mtp_block.0.", + f"model.layers.{layer_idx}.mtp_block.") + return name diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index a94231b0f8461..78ef270598b8e 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -266,6 +266,7 @@ _SPECULATIVE_DECODING_MODELS = { # "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "EagleDeepSeekMTPModel": ("deepseek_eagle", "EagleDeepseekV3ForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), + "ErnieMTPModel": ("ernie_mtp", "ErnieMTP"), "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"), "MedusaModel": ("medusa", "Medusa"), # Temporarily disabled. 
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index a8a160a0f9953..8cd2ad12cfa30 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -194,7 +194,7 @@ class EagleProposer: hidden_states=self.hidden_states[:num_input_tokens], inputs_embeds=inputs_embeds, ) - if self.method == "deepseek_mtp": + if self.method in ("deepseek_mtp", "ernie_mtp"): last_hidden_states = ret_hidden_states else: last_hidden_states, hidden_states = ret_hidden_states diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 9dfea947568d4..7a01e585ba6d0 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -77,7 +77,8 @@ class Worker(LocalOrDistributedWorkerBase): "eagle", "deepseek_mtp", "glm4_moe_mtp", - "mimo_mtp")) \ + "mimo_mtp", + "ernie_mtp")) \ else {"return_hidden_states": True} ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner From c6d80a7a9620637ba5016dd3c0d6061e79eed73c Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 20 Aug 2025 20:47:05 +0800 Subject: [PATCH 132/361] [Model] Improve olmo and olmo2 (#23228) Signed-off-by: Jee Jee Li --- docs/models/supported_models.md | 4 ++-- vllm/model_executor/models/olmo.py | 22 +++++++++++++++++++--- vllm/model_executor/models/olmo2.py | 17 +++++++++++++++-- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 7908e42387100..7308d0010690a 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -384,8 +384,8 @@ th { | `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | ✅︎ | | `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | | ✅︎ | ✅︎ | -| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | | ✅︎ | ✅︎ | +| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | ✅︎ | | `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | | ✅︎ | ✅︎ | | `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | ✅︎ | diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 1dc4df85c1bc4..01639d398126f 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -47,7 +47,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsPP +from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -91,6 +91,7 @@ class OlmoAttention(nn.Module): self.total_num_heads, bias=config.attention_bias, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) # Rotary embeddings. 
@@ -114,6 +115,7 @@ class OlmoAttention(nn.Module): self.hidden_size, bias=config.attention_bias, quant_config=quant_config, + prefix=f"{prefix}.o_proj", ) def forward( @@ -142,6 +144,7 @@ class OlmoMLP(nn.Module): self, config: OlmoConfig, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.config = config @@ -154,6 +157,7 @@ class OlmoMLP(nn.Module): [self.intermediate_size] * 2, bias=False, quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", ) # Activation function. @@ -165,6 +169,7 @@ class OlmoMLP(nn.Module): self.hidden_size, bias=False, quant_config=quant_config, + prefix=f"{prefix}.down_proj", ) def forward( @@ -197,7 +202,7 @@ class OlmoDecoderLayer(nn.Module): prefix=f"{prefix}.self_attn") # MLP block. - self.mlp = OlmoMLP(config, quant_config) + self.mlp = OlmoMLP(config, quant_config, prefix=f"{prefix}.mlp") # LayerNorm self.input_layernorm = nn.LayerNorm(config.hidden_size, @@ -326,10 +331,21 @@ class OlmoModel(nn.Module): return loaded_params -class OlmoForCausalLM(nn.Module, SupportsPP): +class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA): """ Extremely barebones HF model wrapper. 
""" + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 499e6d30ed6b0..66a0f9115585a 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -33,6 +33,7 @@ from torch import nn from transformers import Olmo2Config from vllm.attention import Attention +from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.distributed.communication_op import tensor_model_parallel_all_gather @@ -48,7 +49,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.interfaces import SupportsPP +from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP from vllm.model_executor.models.utils import ( AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -253,6 +254,7 @@ class Olmo2DecoderLayer(nn.Module): return hidden_states +@support_torch_compile class Olmo2Model(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -354,10 +356,21 @@ class Olmo2Model(nn.Module): return loaded_params -class Olmo2ForCausalLM(nn.Module, SupportsPP): +class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): """ Extremely barebones HF model wrapper. 
""" + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() From 38217877aa70041c0115ee367b75197af9cbc5ad Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Wed, 20 Aug 2025 21:34:49 +0800 Subject: [PATCH 133/361] [Fix] fix offline env use local mode path (#22526) Signed-off-by: rongfu.leng --- .../offline_mode/test_offline_mode.py | 35 +++++++++++++++++++ vllm/engine/arg_utils.py | 10 +++++- vllm/transformers_utils/config.py | 23 ++++++++++-- 3 files changed, 65 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index a606eeab5887e..dd8d63ad319ac 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for HF_HUB_OFFLINE mode""" +import dataclasses import importlib import sys @@ -9,6 +10,7 @@ import urllib3 from vllm import LLM from vllm.distributed import cleanup_dist_env_and_memory +from vllm.engine.arg_utils import EngineArgs MODEL_CONFIGS = [ { @@ -108,3 +110,36 @@ def _re_import_modules(): # Error this test if reloading a module failed if reload_exception is not None: raise reload_exception + + +@pytest.mark.skip_global_cleanup +@pytest.mark.usefixtures("cache_models") +def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch): + # Set HF to offline mode and ensure we can still construct an LLM + with monkeypatch.context() as m: + try: + m.setenv("HF_HUB_OFFLINE", "1") + m.setenv("VLLM_NO_USAGE_STATS", "1") + + def disable_connect(*args, **kwargs): + raise RuntimeError("No http calls allowed") + + m.setattr( + urllib3.connection.HTTPConnection, + "connect", + disable_connect, + ) + 
m.setattr( + urllib3.connection.HTTPSConnection, + "connect", + disable_connect, + ) + # Need to re-import huggingface_hub + # and friends to setup offline mode + _re_import_modules() + engine_args = EngineArgs(model="facebook/opt-125m") + LLM(**dataclasses.asdict(engine_args)) + finally: + # Reset the environment after the test + # NB: Assuming tests are run in online mode + _re_import_modules() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 679905aed9ec8..48d9cd08af030 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -15,6 +15,7 @@ from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List, Literal, Optional, Type, TypeVar, Union, cast, get_args, get_origin) +import huggingface_hub import regex as re import torch from pydantic import TypeAdapter, ValidationError @@ -39,7 +40,7 @@ from vllm.plugins import load_general_plugins from vllm.ray.lazy_utils import is_ray_initialized from vllm.reasoning import ReasoningParserManager from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 -from vllm.transformers_utils.config import is_interleaved +from vllm.transformers_utils.config import get_model_path, is_interleaved from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser, GiB_bytes, get_ip, is_in_ray_actor) @@ -457,6 +458,13 @@ class EngineArgs: # Setup plugins from vllm.plugins import load_general_plugins load_general_plugins() + # when use hf offline,replace model id to local model path + if huggingface_hub.constants.HF_HUB_OFFLINE: + model_id = self.model + self.model = get_model_path(self.model, self.revision) + logger.info( + "HF_HUB_OFFLINE is True, replace model_id [%s] " \ + "to model_path [%s]",model_id, self.model) @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 
d8c964fb2a4a4..fe345bd8f0a2e 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -14,7 +14,7 @@ from huggingface_hub import get_safetensors_metadata, hf_hub_download from huggingface_hub import list_repo_files as hf_list_repo_files from huggingface_hub import try_to_load_from_cache from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError, - HFValidationError, LocalEntryNotFoundError, + LocalEntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError) from transformers import GenerationConfig, PretrainedConfig @@ -335,6 +335,7 @@ def maybe_override_with_speculators_target_model( gguf_model_repo = Path(model).parent else: gguf_model_repo = None + kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE config_dict, _ = PretrainedConfig.get_config_dict( model if gguf_model_repo is None else gguf_model_repo, revision=revision, @@ -400,6 +401,7 @@ def get_config( raise ValueError(error_message) from e if config_format == ConfigFormat.HF: + kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE config_dict, _ = PretrainedConfig.get_config_dict( model, revision=revision, @@ -532,7 +534,7 @@ def try_get_local_file(model: Union[str, Path], revision=revision) if isinstance(cached_filepath, str): return Path(cached_filepath) - except HFValidationError: + except ValueError: ... 
return None @@ -908,3 +910,20 @@ def _maybe_retrieve_max_pos_from_hf(model, revision, **kwargs) -> int: exc_info=e) return max_position_embeddings + + +def get_model_path(model: Union[str, Path], revision: Optional[str] = None): + if os.path.exists(model): + return model + assert huggingface_hub.constants.HF_HUB_OFFLINE + common_kwargs = { + "local_files_only": huggingface_hub.constants.HF_HUB_OFFLINE, + "revision": revision, + } + + if envs.VLLM_USE_MODELSCOPE: + from modelscope.hub.snapshot_download import snapshot_download + return snapshot_download(model_id=model, **common_kwargs) + + from huggingface_hub import snapshot_download + return snapshot_download(repo_id=model, **common_kwargs) From 44492358439f612b3934ccd902dbd90fcfa19866 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 20 Aug 2025 22:19:30 +0800 Subject: [PATCH 134/361] [Bugfix] Ensure correctness of HCXVision processing (#23254) Signed-off-by: DarkLight1337 --- .../multimodal/processing/test_common.py | 2 +- .../models/hyperclovax_vision.py | 116 ++++++++---------- 2 files changed, 55 insertions(+), 63 deletions(-) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index d5b1de834a618..02aecfad8281d 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -102,7 +102,7 @@ def _test_processing_correctness( partial(random_video, rng, min_frames=2, - max_frames=8, + max_frames=16, min_wh=128, max_wh=256), "audio": diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index d3ddc47ea932f..f8b30d8d98e5c 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -53,6 +53,21 @@ IMAGE_TOKEN: str = "<|dummy3|>" VIDEO_TOKEN: str = "<|_unuse_missing_100270|>" +# Based on combine_frames_into_images in +# 
https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B/blob/main/processing_hyperclovax.py +def get_num_combined_frames( + num_frames: int, + max_grid_shape: tuple[int, int] = (3, 3), +) -> int: + max_num_grids = max_grid_shape[0] * max_grid_shape[1] + + # Calculate the number of canvases needed. + num_canvases = num_frames // max_num_grids + leftover_frames = num_frames % max_num_grids + + return num_canvases + (leftover_frames > 0) + + class HCXVisionMultimodalPixelInputs(TypedDict): type: Literal["pixel_values"] pixel_values_images: list[torch.Tensor] @@ -172,23 +187,20 @@ class HCXVisionMultiModalProcessor( def replace_multimodal_token( token_ids: torch.Tensor, target_token: int, - repeats: list, + repeats: list[int], ): - output = list() + output = list[int]() _repeats_idx = 0 for token_id in token_ids: if token_id == target_token: - output += [ - token_id.item(), - ] * repeats[_repeats_idx] + output += [token_id.item()] * repeats[_repeats_idx] _repeats_idx += 1 else: - output += [ - token_id.item(), - ] + output += [token_id.item()] + return torch.tensor(output, device=token_ids.device) - for video_idx, video_arr in enumerate(mm_data.get("videos", list())): + for video_idx, video_arr in enumerate(mm_data.get("videos", [])): if video_arr.dtype == np.uint8: continue mm_data["videos"][video_idx] = video_arr.astype(np.uint8) @@ -205,88 +217,68 @@ class HCXVisionMultiModalProcessor( if len(mm_data) > 0: # batchify input as a single item images = mm_data.get("images", None) - num_images = 0 - if images is not None: - num_images = len(images) - images = [ - images, - ] # batchify + batched_images = None if images is None else [images] - videos = mm_data.get("videos", - None) # list of video in single conversation - num_videos = 0 - if videos is not None: - num_videos = len(videos) - videos = [ - videos, - ] # batchify + # list of video in single conversation + videos = mm_data.get("videos", None) + batched_videos = None if videos is None else 
[videos] _processed_outputs = self.info.ctx.call_hf_processor( hf_processor=self.info.get_hf_processor(**mm_kwargs), data=dict( text=None, - images=images, - videos=videos, + images=batched_images, + videos=batched_videos, ), ) # mm-only for k, v in _processed_outputs.items(): - if len(v) < 1: - continue - elif k.endswith("_images"): - # list of list of 4D tensor -> list of 4D tensor + if isinstance(v, list) and len(v) > 0: + assert len(v) == 1 _processed_outputs[k] = v[0] - elif k.endswith("_videos"): - # list of list of 4D tensor -> list of 4D tensor - v = v[0] - if k == "pixel_values_videos": - v = torch.cat(v, dim=0) - _c, _w, _h = v.shape[-3:] - v = v.reshape(num_videos, -1, _c, _w, _h) - v = list(torch.unbind(v, dim=0)) - _processed_outputs[k] = v - if num_images > 0: + if images: tokenizer = self.info.get_tokenizer() + image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN) processed_outputs["input_ids"] = torch.stack([ replace_multimodal_token( token_ids=_input_ids, - target_token=tokenizer.convert_tokens_to_ids( - IMAGE_TOKEN), + target_token=image_token_id, repeats=_processed_outputs[ "vision_query_lengths_images"], ) for _input_ids in processed_outputs["input_ids"] ], dim=0) - if num_videos > 0: - tokenizer = self.info.get_tokenizer() - processed_outputs["input_ids"] = torch.stack([ - replace_multimodal_token( - token_ids=_input_ids, - target_token=tokenizer.convert_tokens_to_ids( - VIDEO_TOKEN), - repeats=_processed_outputs[ - "vision_query_lengths_videos"], - ) for _input_ids in processed_outputs["input_ids"] - ], - dim=0) - - _ratios = [ - len(_pixel_values) for _pixel_values in - _processed_outputs["pixel_values_videos"] - ] + if videos: _num_per_videos = [ - int(_e / sum(_ratios) * - len(_processed_outputs["vision_query_lengths_videos"])) - for _e in _ratios + get_num_combined_frames(len(video)) for video in videos + ] + _processed_outputs["pixel_values_videos"] = [ + _processed_outputs["pixel_values_videos"] + 
[sum(_num_per_videos[:_i]):sum(_num_per_videos[:_i + 1])] + for _i in range(len(videos)) ] _processed_outputs["vision_query_lengths_videos"] = [ _processed_outputs["vision_query_lengths_videos"] [sum(_num_per_videos[:_i]):sum(_num_per_videos[:_i + 1])] - for _i in range(0, num_videos) + for _i in range(len(videos)) ] + tokenizer = self.info.get_tokenizer() + video_token_id = tokenizer.convert_tokens_to_ids(VIDEO_TOKEN) + processed_outputs["input_ids"] = torch.stack([ + replace_multimodal_token( + token_ids=_input_ids, + target_token=video_token_id, + repeats=[ + sum(lens) for lens in + _processed_outputs["vision_query_lengths_videos"] + ], + ) for _input_ids in processed_outputs["input_ids"] + ], + dim=0) + processed_outputs.update(_processed_outputs) return processed_outputs From b17109beeafbf9577c319ab61530810943a7fc4b Mon Sep 17 00:00:00 2001 From: shixianc <49539556+shixianc@users.noreply.github.com> Date: Wed, 20 Aug 2025 07:35:26 -0700 Subject: [PATCH 135/361] [Kernel] CUTLASS MoE FP8: Integrate cuda moe permute/unpermute (#23045) Signed-off-by: Shixian Cui --- .../kernels/benchmark_grouped_gemm_cutlass.py | 35 +++- csrc/moe/moe_permute_unpermute_op.cu | 33 ++-- csrc/ops.h | 5 + .../cutlass_w8a8/moe/get_group_starts.cuh | 6 +- .../quantization/cutlass_w8a8/moe/moe_data.cu | 65 +++++-- .../cutlass_w8a8/scaled_mm_entry.cu | 24 +++ csrc/torch_bindings.cpp | 13 ++ tests/kernels/moe/test_cutlass_moe.py | 18 +- .../kernels/moe/test_moe_permute_unpermute.py | 6 +- tests/kernels/moe/test_pplx_cutlass_moe.py | 22 ++- .../quantization/test_cutlass_scaled_mm.py | 2 +- vllm/_custom_ops.py | 22 +++ .../layers/fused_moe/cutlass_moe.py | 179 +++++++++++------- .../layers/fused_moe/moe_permute_unpermute.py | 29 ++- .../compressed_tensors_moe.py | 31 +++ 15 files changed, 369 insertions(+), 121 deletions(-) diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index 1d4e730f99ae9..a6b42406b5cb0 100644 --- 
a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -80,6 +80,11 @@ def bench_run( a, score, topk, renormalize=False ) + ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64) + c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) + def run_triton_moe( a: torch.Tensor, w1: torch.Tensor, @@ -111,6 +116,10 @@ def bench_run( w2: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, per_act_token: bool, @@ -125,6 +134,10 @@ def bench_run( topk_ids, w1_scale, w2_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, per_act_token, a1_scale=None, ) @@ -136,6 +149,10 @@ def bench_run( w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, ): @@ -150,6 +167,10 @@ def bench_run( topk_ids, w1_scale, w2_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, per_act_token, a1_scale=None, ) @@ -194,6 +215,10 @@ def bench_run( w2_q, w1_scale, w2_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, topk_weights, topk_ids, ) @@ -231,6 +256,10 @@ def bench_run( "w1_scale": w1_scale, "w2_scale": w2_scale, "per_act_token": per_act_token, + "ab_strides1": ab_strides1, + "ab_strides2": ab_strides2, + "c_strides1": c_strides1, + "c_strides2": c_strides2, # cuda graph params "cutlass_graph": cutlass_graph, "triton_graph": triton_graph, @@ -289,6 +318,10 @@ def bench_run( w2_q, w1_scale, w2_scale, + ab_strides1, + ab_strides2, + c_strides1, 
+ c_strides2, topk_weights, topk_ids, per_act_token, @@ -297,7 +330,7 @@ def bench_run( results.append( benchmark.Timer( - stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501 + stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu index 2922352a3f7cc..ca0c873f49d9f 100644 --- a/csrc/moe/moe_permute_unpermute_op.cu +++ b/csrc/moe/moe_permute_unpermute_op.cu @@ -45,8 +45,6 @@ void moe_permute( auto copy_topk_ids = topk_ids.clone(); // copy topk_ids for preprocess auto permuted_experts_id = torch::empty_like(topk_ids); auto sorted_row_idx = torch::empty_like(inv_permuted_idx); - auto align_expert_first_token_offset = - torch::zeros_like(expert_first_token_offset); CubKeyValueSorter sorter{}; int64_t* valid_num_ptr = nullptr; @@ -85,12 +83,14 @@ void moe_permute( }); // get m_indices and update expert_first_token_offset with align block - getMIndices(get_ptr(expert_first_token_offset), - get_ptr(align_expert_first_token_offset), - get_ptr(m_indices), n_local_expert, align_block_size_value, - stream); + // this is only required for DeepGemm and not required for CUTLASS group gemm if (align_block_size.has_value()) { - // update align_expert_first_token_offset + auto align_expert_first_token_offset = + torch::zeros_like(expert_first_token_offset); + getMIndices(get_ptr(expert_first_token_offset), + get_ptr(align_expert_first_token_offset), + get_ptr(m_indices), n_local_expert, align_block_size_value, + stream); expert_first_token_offset.copy_(align_expert_first_token_offset); } } @@ -195,19 +195,14 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights, torch::Tensor& expert_first_token_offset, 
torch::Tensor& src_row_id2dst_row_id_map, torch::Tensor& m_indices) { - TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0"); + TORCH_CHECK(false, "moe_permute is not supported on CUDA < 12.0"); } -void moe_unpermute(const torch::Tensor& input, - const torch::Tensor& topk_weights, torch::Tensor& topk_ids, - const torch::Tensor& token_expert_indices, - const std::optional& expert_map, - int64_t n_expert, int64_t n_local_expert, int64_t topk, - const std::optional& align_block_size, - torch::Tensor& permuted_input, - torch::Tensor& expert_first_token_offset, - torch::Tensor& src_row_id2dst_row_id_map, - torch::Tensor& m_indices) { +void moe_unpermute( + const torch::Tensor& permuted_hidden_states, + const torch::Tensor& topk_weights, const torch::Tensor& inv_permuted_idx, + const std::optional& expert_first_token_offset, int64_t topk, + torch::Tensor& hidden_states) { TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0"); } @@ -224,4 +219,4 @@ bool moe_permute_unpermute_supported() { TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("moe_permute", &moe_permute); m.impl("moe_unpermute", &moe_unpermute); -} +} \ No newline at end of file diff --git a/csrc/ops.h b/csrc/ops.h index 64bcec6ca1527..86fe848e2fd5a 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -229,6 +229,11 @@ void get_cutlass_moe_mm_data( const int64_t num_experts, const int64_t n, const int64_t k, const std::optional& blockscale_offsets); +void get_cutlass_moe_mm_problem_sizes( + const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, + const int64_t k, const std::optional& blockscale_offsets); + void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, diff --git a/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh b/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh index 6c6e89790847f..15bb2c300543c 
100644 --- a/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh +++ b/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh @@ -10,7 +10,7 @@ template __global__ void get_group_gemm_starts( - int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets, + int64_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets, ElementC** out_offsets, ElementAccumulator** a_scales_offsets, ElementAccumulator** b_scales_offsets, ElementAB* a_base_as_int, ElementAB* b_base_as_int, ElementC* out_base_as_int, @@ -34,7 +34,7 @@ __global__ void get_group_gemm_starts( else if (out_tensors.dtype() == TENSOR_C_TYPE) { \ get_group_gemm_starts \ <<<1, num_experts, 0, stream>>>( \ - static_cast(expert_offsets.data_ptr()), \ + static_cast(expert_offsets.data_ptr()), \ static_cast(a_ptrs.data_ptr()), \ static_cast(b_ptrs.data_ptr()), \ static_cast(out_ptrs.data_ptr()), \ @@ -61,6 +61,8 @@ void run_get_group_gemm_starts( TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn); TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); + // expect int64_t to avoid overflow during offset calculations + TORCH_CHECK(expert_offsets.dtype() == torch::kInt64); int num_experts = static_cast(expert_offsets.size(0)); bool per_act_token = a_scales.numel() != 1; diff --git a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu index 100f485084444..49cafcc32adc6 100644 --- a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu +++ b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu @@ -104,6 +104,53 @@ __global__ void compute_arg_sorts(const int32_t* __restrict__ topk_ids, } } +namespace { +inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids, + torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, + torch::Tensor& atomic_buffer, + int64_t num_experts, int64_t n, + int64_t k, cudaStream_t stream, + const bool swap_ab) { + int num_threads = min(THREADS_PER_EXPERT, 
topk_ids.numel()); + + const int32_t* topk_ptr = static_cast(topk_ids.data_ptr()); + int32_t* ps1_ptr = static_cast(problem_sizes1.data_ptr()); + int32_t* ps2_ptr = static_cast(problem_sizes2.data_ptr()); + int32_t* atomic_ptr = static_cast(atomic_buffer.data_ptr()); + + if (swap_ab) { + compute_problem_sizes<<>>( + topk_ptr, ps1_ptr, ps2_ptr, atomic_ptr, + static_cast(topk_ids.numel()), static_cast(n), + static_cast(k)); + } else { + compute_problem_sizes<<>>( + topk_ptr, ps1_ptr, ps2_ptr, atomic_ptr, + static_cast(topk_ids.numel()), static_cast(n), + static_cast(k)); + } +} +} // namespace + +void get_cutlass_moe_mm_problem_sizes_caller( + const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, + const int64_t k, const std::optional& blockscale_offsets) { + auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index()); + auto options_int32 = + torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device()); + torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32); + + // Swap-AB should be disabled for FP4 path + bool may_swap_ab = (!blockscale_offsets.has_value()) && + (topk_ids.numel() <= SWAP_AB_THRESHOLD); + + launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2, + atomic_buffer, num_experts, n, k, stream, + may_swap_ab); +} + void get_cutlass_moe_mm_data_caller( const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, @@ -121,21 +168,9 @@ void get_cutlass_moe_mm_data_caller( bool may_swap_ab = (!blockscale_offsets.has_value()) && (topk_ids.numel() <= SWAP_AB_THRESHOLD); - if (may_swap_ab) { - compute_problem_sizes<<>>( - static_cast(topk_ids.data_ptr()), - static_cast(problem_sizes1.data_ptr()), - static_cast(problem_sizes2.data_ptr()), - static_cast(atomic_buffer.data_ptr()), topk_ids.numel(), n, - k); - } else { - compute_problem_sizes<<>>( - 
static_cast(topk_ids.data_ptr()), - static_cast(problem_sizes1.data_ptr()), - static_cast(problem_sizes2.data_ptr()), - static_cast(atomic_buffer.data_ptr()), topk_ids.numel(), n, - k); - } + launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2, + atomic_buffer, num_experts, n, k, stream, + may_swap_ab); if (blockscale_offsets.has_value()) { // fp4 path diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 106bacb4883cb..84843ee6e0949 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -76,6 +76,11 @@ void get_cutlass_moe_mm_data_caller( const int64_t num_experts, const int64_t n, const int64_t k, const std::optional& blockscale_offsets); +void get_cutlass_moe_mm_problem_sizes_caller( + const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, + const int64_t k, const std::optional& blockscale_offsets); + void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, @@ -293,6 +298,25 @@ void get_cutlass_moe_mm_data( version_num, ". 
Required capability: 90 or 100"); } +void get_cutlass_moe_mm_problem_sizes( + const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, + const int64_t k, const std::optional& blockscale_offsets) { + int32_t version_num = get_sm_version_num(); +#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ + (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) + get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1, + problem_sizes2, num_experts, n, k, + blockscale_offsets); + return; +#endif + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm " + "kernel for CUDA device capability: ", + version_num, ". Required capability: 90 or 100"); +} + void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 7079671c2eb16..3a0ff6eaa7904 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -440,6 +440,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { {stride_tag}); ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data); + // A function that computes problem sizes for each expert's multiplication + // used by the two mms called from fused MoE operation. It takes topk_ids as + // an input, and computes problem_sizes1 and problem_sizes2 only. + ops.def( + "get_cutlass_moe_mm_problem_sizes(Tensor topk_ids, " + " Tensor! problem_sizes1, " + " Tensor! problem_sizes2, " + " int num_experts, int n, int k, " + " Tensor? blockscale_offsets) -> ()", + {stride_tag}); + ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA, + &get_cutlass_moe_mm_problem_sizes); + // A function that computes data required to run fused MoE with w8a8 grouped // GEMM and PPLX. 
It takes expert_num_tokens and non_zero_expert_idxs // as an input, and computes expert_offsets (token start indices of each diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index 81fb3ec1de188..c84f66383b902 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -207,6 +207,10 @@ def run_8_bit(moe_tensors: MOETensors8Bit, 'topk_ids': topk_ids, 'w1_scale': moe_tensors.w1_scale, 'w2_scale': moe_tensors.w2_scale, + 'ab_strides1': moe_tensors.ab_strides1, + 'ab_strides2': moe_tensors.ab_strides2, + 'c_strides1': moe_tensors.c_strides1, + 'c_strides2': moe_tensors.c_strides2, 'per_act_token': per_act_token, 'a1_scale': None #moe_tensors.a_scale } @@ -424,8 +428,8 @@ def test_run_cutlass_moe_fp8( topk_ids[0][1] = 1 workspace13_shape = (m * topk, max(2 * n, k)) - workspace2_shape = (m * topk, n) - output_shape = (m * topk, k) + workspace2_shape = (m * topk, max(n, k)) + output_shape = (m, k) workspace13 = torch.empty(prod(workspace13_shape), device="cuda", @@ -440,6 +444,11 @@ def test_run_cutlass_moe_fp8( expert_map[start:end] = list(range(num_local_experts)) expert_map = torch.tensor(expert_map, dtype=torch.int32, device="cuda") + ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64) + c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + activation = lambda o, i: torch.ops._C.silu_and_mul(o, i) a1q, a1q_scale = moe_kernel_quantize_input(mt.a, mt.a_scale, torch.float8_e4m3fn, @@ -448,8 +457,9 @@ def test_run_cutlass_moe_fp8( func = lambda output: run_cutlass_moe_fp8( output, a1q, mt.w1_q, mt.w2_q, topk_ids, activation, global_num_experts, expert_map, mt.w1_scale, mt.w2_scale, - a1q_scale, None, workspace13, workspace2, None, mt.a.dtype, - per_act_token, per_out_channel, False) + a1q_scale, None, ab_strides1, 
ab_strides2, c_strides1, c_strides2, + workspace13, workspace2, None, mt.a.dtype, per_act_token, + per_out_channel, False, topk_weights) workspace13.random_() output_random_workspace = torch.empty(output_shape, diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py index 6ca01f9271bba..d71664d94b9c8 100644 --- a/tests/kernels/moe/test_moe_permute_unpermute.py +++ b/tests/kernels/moe/test_moe_permute_unpermute.py @@ -238,7 +238,11 @@ def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int, atol=0, rtol=0) # check mindice - torch.testing.assert_close(gold_m_indices, m_indices, atol=0, rtol=0) + # current kernel usage assumes deepgemm requires align_block_size + # when it's not provided then we don't compute m_indices (for cutlass) + if align_block_size is not None: + torch.testing.assert_close(gold_m_indices, m_indices, atol=0, rtol=0) + # check permuted_hidden_states, only valid token torch.testing.assert_close(gold_permuted_hidden_states[valid_row_idx], permuted_hidden_states[valid_row_idx], diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index f98937ee6c527..98908f2714707 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -76,6 +76,7 @@ def pplx_cutlass_moe( assert torch.cuda.current_device() == pgi.local_rank num_tokens, hidden_dim = a.shape + intermediate_dim = w2.shape[2] num_experts = w1.shape[0] block_size = hidden_dim # TODO support more cases device = pgi.device @@ -124,8 +125,27 @@ def pplx_cutlass_moe( num_local_experts=num_local_experts, num_dispatchers=num_dispatchers) + ab_strides1 = torch.full((num_local_experts, ), + hidden_dim, + device="cuda", + dtype=torch.int64) + ab_strides2 = torch.full((num_local_experts, ), + intermediate_dim, + device="cuda", + dtype=torch.int64) + c_strides1 = torch.full((num_local_experts, ), + 2 * intermediate_dim, + device="cuda", + 
dtype=torch.int64) + c_strides2 = torch.full((num_local_experts, ), + hidden_dim, + device="cuda", + dtype=torch.int64) + experts = CutlassBatchedExpertsFp8(num_local_experts, num_dispatchers, - out_dtype, per_act_token, per_out_ch) + out_dtype, per_act_token, per_out_ch, + ab_strides1, ab_strides2, c_strides1, + c_strides2) fused_cutlass_experts = FusedMoEModularKernel( prepare_finalize, diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py index 8730eeaaa761c..a15decdf6f827 100644 --- a/tests/kernels/quantization/test_cutlass_scaled_mm.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -535,7 +535,7 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool, expert_offsets = torch.zeros((num_experts + 1), device=device, - dtype=torch.int32) + dtype=torch.int64) problem_sizes = torch.zeros((num_experts, 3), device=device, diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 0d556053f8981..39da08847b2e7 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -844,6 +844,28 @@ def get_cutlass_moe_mm_data(topk_ids: torch.Tensor, blockscale_offsets) +def get_cutlass_moe_mm_problem_sizes( + topk_ids: torch.Tensor, + problem_sizes1: torch.Tensor, + problem_sizes2: torch.Tensor, + num_experts: int, + n: int, + k: int, + blockscale_offsets: Optional[torch.Tensor] = None): + """ + Compute only the per-expert problem sizes needed by the two grouped matrix + multiplications used in CUTLASS-based fused MoE. + + The function takes in topk_ids (token→expert mapping) and computes: + - problem_sizes1, problem_sizes2: M×N×K sizes of each expert's + multiplication for the two grouped MMs + used in the fused MoE operation. 
+ """ + return torch.ops._C.get_cutlass_moe_mm_problem_sizes( + topk_ids, problem_sizes1, problem_sizes2, num_experts, n, k, + blockscale_offsets) + + def shuffle_rows(input_tensor: torch.Tensor, dst2src_map: torch.Tensor): """ Shuffle and expand the input tensor according to the dst2src_map and store the result in output_tensor. diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 0a02b558d09e5..95d23ec0346c1 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -9,12 +9,13 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( + moe_permute, moe_unpermute) from vllm.model_executor.layers.fused_moe.prepare_finalize import ( MoEPrepareAndFinalizeNoEP) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate, TopKWeightAndReduceNoOP) -from vllm.model_executor.layers.fused_moe.utils import (_fp8_perm, - _fp8_quantize, +from vllm.model_executor.layers.fused_moe.utils import (_fp8_quantize, _resize_cache) from vllm.scalar_type import scalar_types @@ -34,6 +35,10 @@ def run_cutlass_moe_fp8( w2_scale: Optional[torch.Tensor], a1q_scale: Optional[torch.Tensor], a2_scale: Optional[torch.Tensor], + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, workspace13: torch.Tensor, workspace2: torch.Tensor, expert_num_tokens: Optional[torch.Tensor], @@ -41,6 +46,7 @@ def run_cutlass_moe_fp8( per_act_token: bool, per_out_ch: bool, use_batched_format: bool, + topk_weights: Optional[torch.Tensor], ): a1q = hidden_states @@ -99,6 +105,22 @@ def run_cutlass_moe_fp8( topk = local_topk_ids.size(1) local_E = 
w1.size(0) + if use_batched_format: + mm1_out = _resize_cache(workspace13, (local_E * padded_M, N * 2)) + act_out = _resize_cache(workspace2, (local_E * padded_M, N)) + quant_out = _resize_cache(workspace13.view(dtype=torch.float8_e4m3fn), + (local_E * padded_M, N)) + mm2_out = _resize_cache(workspace2, (local_E * padded_M, K)) + else: + a1q_perm = _resize_cache(workspace2.view(dtype=torch.float8_e4m3fn), + (M * topk, K)) + mm1_out = _resize_cache(workspace13, (M * topk, N * 2)) + act_out = _resize_cache(workspace2, (M * topk, N)) + # original workspace are based on input hidden_states dtype (bf16) + quant_out = _resize_cache(workspace13.view(dtype=torch.float8_e4m3fn), + (M * topk, N)) + mm2_out = _resize_cache(workspace2, (M * topk, K)) + if use_batched_format: assert expert_num_tokens is not None @@ -120,11 +142,10 @@ def run_cutlass_moe_fp8( w2_scale = w2_scale.reshape(w2_scale.size(0), -1) a1q = a1q.reshape(-1, a1q.size(2)) a1q_scale = a1q_scale.reshape(-1, a1q_scale.size(2)).contiguous() - + # c3x get_group_gemm_starts expects int64 to avoid overflow + # during offset calculations + expert_offsets = expert_offsets.to(torch.int64) else: - expert_offsets = torch.empty((global_num_experts + 1), - dtype=torch.int32, - device=device) problem_sizes1 = torch.empty((global_num_experts, 3), dtype=torch.int32, device=device) @@ -132,84 +153,57 @@ def run_cutlass_moe_fp8( dtype=torch.int32, device=device) - # With expert_map each Rank processes only a subset of experts. As - # a result not all of a_map and c2 tensors are filled. We fill it - # zeros for correctness. 
- if expert_map is not None: - a_map = torch.zeros((local_topk_ids.numel()), - dtype=torch.int32, - device=device) - else: - a_map = torch.empty((local_topk_ids.numel()), - dtype=torch.int32, - device=device) - - c_map = torch.empty((local_topk_ids.numel()), - dtype=torch.int32, - device=device) - - ops.get_cutlass_moe_mm_data(local_topk_ids, expert_offsets, - problem_sizes1, problem_sizes2, a_map, - c_map, global_num_experts, N, K) - - a1q = _fp8_perm(a1q, a_map) - a1q_scale = a1q_scale[a_map] if per_act_token else a1q_scale + num_expert = global_num_experts if expert_map is None \ + else expert_map.size(0) + # permuted a1q reuses workspace2 + a1q, a1q_scale, expert_offsets, inv_perm, _ = moe_permute( + a1q, + a1q_scale, + topk_ids, + num_expert, + local_E, + expert_map, + permuted_hidden_states=a1q_perm) expert_offsets = expert_offsets[:-1] - ab_strides1 = torch.full((w1.size(0), ), - K, - device=device, - dtype=torch.int64) - c_strides1 = torch.full((w1.size(0), ), - 2 * N, - device=device, - dtype=torch.int64) - ab_strides2 = torch.full((w1.size(0), ), - N, - device=device, - dtype=torch.int64) - c_strides2 = torch.full((w1.size(0), ), - K, - device=device, - dtype=torch.int64) - - if use_batched_format: - c1 = _resize_cache(workspace13, (local_E * padded_M, N * 2)) - c2 = _resize_cache(workspace2, (local_E * padded_M, N)) - c3 = _resize_cache(workspace13, (local_E * padded_M, K)) - else: - c1 = _resize_cache(workspace13, (M * topk, N * 2)) - c2 = _resize_cache(workspace2, (M * topk, N)) - c3 = _resize_cache(workspace13, (M * topk, K)) + ops.get_cutlass_moe_mm_problem_sizes(local_topk_ids, problem_sizes1, + problem_sizes2, + global_num_experts, N, K) if not per_act_token and (expert_map is not None or use_batched_format): # this is necessary to avoid imprecise scale calculation caused by # random data in the unused workspace. The workspace is unused when # this rank handles only partial tokens, or when it is batched . 
- c1.fill_(0) + mm1_out.fill_(0) - ops.cutlass_moe_mm(c1, a1q, w1, a1q_scale, w1_scale, expert_offsets, + ops.cutlass_moe_mm(mm1_out, a1q, w1, a1q_scale, w1_scale, expert_offsets, problem_sizes1, ab_strides1, ab_strides1, c_strides1, per_act_token, per_out_ch) - activation_callable(c2, c1) + activation_callable(act_out, mm1_out) a2q, a2q_scale = ops.scaled_fp8_quant( - c2, a2_scale, use_per_token_if_dynamic=per_act_token) + act_out, + a2_scale, + use_per_token_if_dynamic=per_act_token, + output=quant_out) if expert_map is not None: - c3.fill_(0) + mm2_out.fill_(0) - ops.cutlass_moe_mm(c3, a2q, w2, a2q_scale, w2_scale, expert_offsets, + ops.cutlass_moe_mm(mm2_out, a2q, w2, a2q_scale, w2_scale, expert_offsets, problem_sizes2, ab_strides2, ab_strides2, c_strides2, per_act_token, per_out_ch) if use_batched_format: - output.copy_(c3.reshape(local_E, padded_M, K), non_blocking=True) + output.copy_(mm2_out.reshape(local_E, padded_M, K), non_blocking=True) else: - # We can't do this inplace because output may point to the same tensor - # as c3. - output.copy_(c3[c_map].view(M * topk, K), non_blocking=True) + # for non-chunking mode the output is resized from workspace13 + # so we need to make sure mm2_out uses workspace2. 
+ moe_unpermute(out=output, + permuted_hidden_states=mm2_out, + topk_weights=topk_weights, + inv_permuted_idx=inv_perm) class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): @@ -219,6 +213,10 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): out_dtype: Optional[torch.dtype], per_act_token_quant: bool, per_out_ch_quant: bool, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, block_shape: Optional[list[int]] = None, ): super().__init__( @@ -229,6 +227,10 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): block_shape=block_shape, )) self.out_dtype = out_dtype + self.ab_strides1 = ab_strides1 + self.ab_strides2 = ab_strides2 + self.c_strides1 = c_strides1 + self.c_strides2 = c_strides2 def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: # Let PrepareAndFinalize::finalize() decide the impl. @@ -272,10 +274,11 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): run_cutlass_moe_fp8( output, hidden_states, w1, w2, topk_ids, activation_callable, global_num_experts, expert_map, w1_scale, w2_scale, a1q_scale, - a2_scale, workspace13, workspace2, expert_num_tokens, + a2_scale, self.ab_strides1, self.ab_strides2, self.c_strides1, + self.c_strides2, workspace13, workspace2, expert_num_tokens, self.out_dtype if self.out_dtype is not None else in_dtype, self.per_act_token_quant, self.per_out_ch_quant, - use_batched_format) + use_batched_format, topk_weights) class CutlassExpertsFp8(CutlassExpertsFp8Base): @@ -285,12 +288,20 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base): out_dtype: Optional[torch.dtype], per_act_token_quant: bool, per_out_ch_quant: bool, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, block_shape: Optional[list[int]] = None, ): super().__init__( out_dtype, per_act_token_quant, per_out_ch_quant, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, 
block_shape, ) @@ -307,6 +318,10 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base): def supports_expert_map(self) -> bool: return True + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + # topk weights and reduction are fused in moe_unpermute cuda kernel + return TopKWeightAndReduceNoOP() + def workspace_shapes( self, a: torch.Tensor, @@ -320,8 +335,8 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base): expert_tokens_meta: Optional[mk.ExpertTokensMetadata], ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: workspace1 = (M * topk, max(N, K)) - workspace2 = (M * topk, N // 2) - output = (M * topk, K) + workspace2 = (M * topk, max(N // 2, K)) + output = (M, K) return (workspace1, workspace2, output, self.out_dtype if self.out_dtype is not None else a.dtype) @@ -335,12 +350,20 @@ class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base): out_dtype: Optional[torch.dtype], per_act_token_quant: bool, per_out_ch_quant: bool, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, block_shape: Optional[list[int]] = None, ): super().__init__( out_dtype, per_act_token_quant, per_out_ch_quant, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, block_shape, ) assert max_experts_per_worker > 0 @@ -378,7 +401,8 @@ class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base): assert num_dp is not None workspace1 = (self.max_experts_per_worker, padded_M * num_dp, max(N, K)) - workspace2 = (self.max_experts_per_worker, padded_M * num_dp, (N // 2)) + workspace2 = (self.max_experts_per_worker, padded_M * num_dp, + max(N // 2, K)) output = (self.max_experts_per_worker, padded_M, K) return (workspace1, workspace2, output, self.out_dtype if self.out_dtype is not None else a.dtype) @@ -392,6 +416,10 @@ def cutlass_moe_fp8( topk_ids: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: 
torch.Tensor, per_act_token: Optional[bool] = None, activation: str = "silu", a1_scale: Optional[torch.Tensor] = None, @@ -419,6 +447,17 @@ def cutlass_moe_fp8( Shape: [num_experts] or [num_experts, 2N] - w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q. Shape: [num_experts] or [num_experts, K] + - ab_strides1 (torch.Tensor): The input/weight strides for the first gemm. + Shape: [num_experts] + - ab_strides2 (torch.Tensor): The input/weight strides for the second gemm. + Shape: [num_experts] + - c_strides1 (torch.Tensor): The output strides for the first gemm. + Shape: [num_experts] + - c_strides2 (torch.Tensor): The output strides for the second gemm. + Shape: [num_experts] + - per_act_token (Optional[bool]): Whether the scale is per-token or + per-tensor. + - activation (str): The activation function to use. - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a. Shape: scalar or [M] - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to @@ -450,6 +489,10 @@ def cutlass_moe_fp8( out_dtype=a.dtype, per_act_token_quant=per_act_token, per_out_ch_quant=per_out_ch, + ab_strides1=ab_strides1, + ab_strides2=ab_strides2, + c_strides1=c_strides1, + c_strides2=c_strides2, ), ) diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py index d9059f50b4459..16a155e718478 100644 --- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py @@ -82,7 +82,8 @@ def moe_permute( n_local_expert: int = -1, expert_map: Optional[torch.Tensor] = None, align_block_size: Optional[int] = None, - fill_invalid_expert: int = -1 + fill_invalid_expert: int = -1, + permuted_hidden_states: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor]: """ @@ -95,14 +96,17 @@ def moe_permute( - n_expert (int): The number of expert. 
- n_local_expert (int): The number of expert in current EP rank. - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices - from the global expert space to the local expert space of the expert + from the global expert space to the local expert space of the expert parallel shard. - align_block_size (Optional[int]): align group gemm block size for deepgemm - fill_invalid_expert(int): fill expert id in m_indices for invalid expert to workaround DeepGemm unsupported -1 in m_indices + - permuted_hidden_states (Optional[torch.Tensor]): Optional output tensor. + If None, the output tensor will be created in this function. Returns: - permuted_hidden_states (torch.Tensor): permuted activation. - - a1q_scale (Optional[torch.Tensor]): quant scale for hidden_states + - a1q_scale (Optional[torch.Tensor]): permuted quant scale for hidden_states + if original scale not per-tensor scaling - expert_first_token_offset (torch.Tensor): offset of the first token of each expert for standard grouped gemm. if enable 'align_block_size' expert_first_token_offset will align up to 'align_block_size'. 
@@ -122,11 +126,16 @@ def moe_permute( 1) // align_block_size * align_block_size if n_local_expert == -1: n_local_expert = n_expert - permuted_hidden_states = torch.empty( - (permuted_row_size, n_hidden), - dtype=hidden_states.dtype, - device=hidden_states.device, - ) + if permuted_hidden_states is None: + permuted_hidden_states = torch.empty( + (permuted_row_size, n_hidden), + dtype=hidden_states.dtype, + device=hidden_states.device, + ) + assert permuted_hidden_states.size() == (permuted_row_size, n_hidden), ( + f"Expected permuted hidden states to be {(permuted_row_size, n_hidden)}" + f" but got {permuted_hidden_states.size()}") + token_expert_indices = torch.arange(0, n_token * topk, dtype=torch.int32, @@ -153,7 +162,8 @@ def moe_permute( align_block_size, permuted_hidden_states, expert_first_token_offset, inv_permuted_idx, permuted_idx, m_indices) - if a1q_scale is not None: + + if a1q_scale is not None and a1q_scale.dim() > 1: a1q_scale = a1q_scale[permuted_idx.clamp(max=n_token * topk - 1) // topk] return (permuted_hidden_states, a1q_scale, expert_first_token_offset, @@ -185,6 +195,7 @@ def moe_unpermute( n_hidden = permuted_hidden_states.size(-1) assert (n_hidden * permuted_hidden_states.element_size() ) % 16 == 0, "unpermue kernel need hidden dim align to 16B" + torch.ops._moe_C.moe_unpermute(permuted_hidden_states, topk_weights, inv_permuted_idx, expert_first_token_offset, topk, out) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 8ca8249e694ea..7bc35cd81ac3f 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -669,6 +669,25 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): from vllm.model_executor.layers.fused_moe import fused_experts 
self.fused_experts_func = fused_experts + if self.use_cutlass: + device = layer.w13_weight.device + # ab_strides1 and c_strides2 are the same + self.ab_strides1_c_strides2 = torch.full( + (layer.local_num_experts, ), + layer.hidden_size, + device=device, + dtype=torch.int64) + self.ab_strides2 = torch.full( + (layer.local_num_experts, ), + layer.intermediate_size_per_partition, + device=device, + dtype=torch.int64) + self.c_strides1 = torch.full( + (layer.local_num_experts, ), + 2 * layer.intermediate_size_per_partition, + device=device, + dtype=torch.int64) + def select_gemm_impl( self, prepare_finalize: FusedMoEPrepareAndFinalize, @@ -693,6 +712,10 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): moe.in_dtype, self.input_quant.strategy == QuantizationStrategy.TOKEN, self.weight_quant.strategy == QuantizationStrategy.CHANNEL, + ab_strides1=self.ab_strides1_c_strides2, + ab_strides2=self.ab_strides2, + c_strides1=self.c_strides1, + c_strides2=self.ab_strides1_c_strides2, ) else: logger.debug("CutlassExpertsFp8(%s)", self.__class__.__name__) @@ -700,6 +723,10 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): moe.in_dtype, self.input_quant.strategy == QuantizationStrategy.TOKEN, self.weight_quant.strategy == QuantizationStrategy.CHANNEL, + ab_strides1=self.ab_strides1_c_strides2, + ab_strides2=self.ab_strides2, + c_strides1=self.c_strides1, + c_strides2=self.ab_strides1_c_strides2, ) self.disable_expert_map = (num_dispatchers > 1 @@ -822,6 +849,10 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): expert_map=None if self.disable_expert_map else expert_map, w1_scale=layer.w13_weight_scale, w2_scale=layer.w2_weight_scale, + ab_strides1=self.ab_strides1_c_strides2, + ab_strides2=self.ab_strides2, + c_strides1=self.c_strides1, + c_strides2=self.ab_strides1_c_strides2, a1_scale=layer.w13_input_scale, a2_scale=layer.w2_input_scale, ) From 5efd6905bc8469a30664de83bdafaad56aa92903 Mon Sep 17 00:00:00 2001 
From: Cyrus Leung Date: Wed, 20 Aug 2025 23:42:28 +0800 Subject: [PATCH 136/361] [CLI][Doc] Formalize `--mm-encoder-tp-mode` (#23190) Signed-off-by: DarkLight1337 --- docs/configuration/optimization.md | 45 ++++++++++++++++++++++++ vllm/config/__init__.py | 34 +++++++++++++++++- vllm/config/parallel.py | 4 --- vllm/engine/arg_utils.py | 35 +++++++++++------- vllm/model_executor/models/mllama4.py | 4 +-- vllm/model_executor/models/qwen2_5_vl.py | 3 +- vllm/model_executor/models/step3_vl.py | 3 +- 7 files changed, 104 insertions(+), 24 deletions(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index c7f50497d6ffa..db9dfb313fb87 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -129,6 +129,51 @@ Data parallelism replicates the entire model across multiple GPU sets and proces Data parallelism can be combined with the other parallelism strategies and is set by `data_parallel_size=N`. Note that MoE layers will be sharded according to the product of the tensor parallel size and data parallel size. +### Batch-level DP for Multi-Modal Encoders + +By default, TP is used to shard the weights of multi-modal encoders just like for language decoders, +in order to reduce the memory and compute load on each GPU. + +However, since the size of multi-modal encoders is very small compared to language decoders, +there is relatively little gain from TP. On the other hand, TP incurs significant communication +overhead because of all-reduce being performed after every layer. + +Given this, it may be advantageous to instead shard the batched input data using TP, essentially +performing batch-level DP. This has been shown to improve the throughput by around 10% for +`tensor_parallel_size=8`. For vision encoders that use hardware-unoptimized Conv3D operations, +batch-level DP can provide another 40% increase to throughput compared to regular TP. 
+ +Nevertheless, since the weights of the multi-modal encoder are replicated across each TP rank, +there will be a minor increase in memory consumption, which may cause OOM if you can barely fit the model already. + +You can enable batch-level DP by setting `mm_encoder_tp_mode="data"`, for example: + +```python +from vllm import LLM + +llm = LLM( + model="Qwen/Qwen2.5-VL-72B-Instruct", + # Create two EngineCore instances, one per DP rank + data_parallel_size=2, + # Within each EngineCore instance: + # The vision encoder uses TP=4 (not DP=2) to shard the input data + # The language decoder uses TP=4 to shard the weights as usual + tensor_parallel_size=4, + mm_encoder_tp_mode="data", +) +``` + +!!! important + Batch-level DP is not to be confused with API request-level DP + (which is instead controlled by `data_parallel_size`). + +The availability of batch-level DP is based on model implementation. +Currently, the following models support `mm_encoder_tp_mode="data"`: + +- Llama4 () +- Qwen2.5-VL () +- Step3 () + ## Input Processing ### Parallel Processing diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 801fa97fe5daf..5b5d477ef066b 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -258,6 +258,7 @@ TokenizerMode = Literal["auto", "slow", "mistral", "custom"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] LogprobsMode = Literal["raw_logprobs", "raw_logits", "processed_logprobs", "processed_logits"] +MMEncoderTPMode = Literal["weights", "data"] @config @@ -438,6 +439,19 @@ class ModelConfig: `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. Set to `0` to disable this cache completely (not recommended).""" + mm_encoder_tp_mode: MMEncoderTPMode = "weights" + """Indicates how to optimize multi-modal encoder inference using + tensor parallelism (TP). + + - `"weights"`: Within the same vLLM engine, split the weights of + each layer across TP ranks. 
(default TP behavior) + - `"data"`: Within the same vLLM engine, split the batched input data + across TP ranks to process the data in parallel, while hosting + the full weights on each TP rank. + This batch-level DP is not to be confused with API request-level + DP (which is controlled by `--data-parallel-size`). + This is only supported on a per-model basis and falls back to + `"weights"` if the encoder does not support DP.""" override_neuron_config: dict[str, Any] = field(default_factory=dict) """Initialize non-default neuron config or override default neuron config that are specific to Neuron devices, this argument will be used to @@ -856,8 +870,10 @@ class ModelConfig: media_io_kwargs=self.media_io_kwargs, mm_processor_kwargs=self.mm_processor_kwargs, mm_processor_cache_gb=self.mm_processor_cache_gb, + mm_encoder_tp_mode=self.mm_encoder_tp_mode, interleave_mm_strings=self.interleave_mm_strings, - skip_mm_profiling=self.skip_mm_profiling) + skip_mm_profiling=self.skip_mm_profiling, + ) return None @@ -2547,6 +2563,22 @@ class MultiModalConfig: Set to `0` to disable this cache completely (not recommended). """ + mm_encoder_tp_mode: MMEncoderTPMode = "weights" + """ + Indicates how to optimize multi-modal encoder inference using + tensor parallelism (TP). + + - `"weights"`: Within the same vLLM engine, split the weights of + each layer across TP ranks. (default TP behavior) + - `"data"`: Within the same vLLM engine, split the batched input data + across TP ranks to process the data in parallel, while hosting + the full weights on each TP rank. + This batch-level DP is not to be confused with API request-level + DP (which is controlled by `--data-parallel-size`). + This is only supported on a per-model basis and falls back to + `"weights"` if the encoder does not support DP. + """ + interleave_mm_strings: bool = False """ Enable fully interleaved support for multimodal prompts. 
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index bac1e63800d7b..7a9e68f0ea332 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -137,10 +137,6 @@ class ParallelConfig: rank: int = 0 """Global rank in distributed setup.""" - enable_multimodal_encoder_data_parallel: bool = False - """ Use data parallelism instead of tensor parallelism for vision encoder. - Only support LLama4 for now""" - @property def world_size_across_dp(self) -> int: """world_size_across_dp is TPxPPxDP, it is the size of the world diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 48d9cd08af030..6869c3f23f315 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -28,12 +28,12 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, DeviceConfig, DistributedExecutorBackend, GuidedDecodingBackend, HfOverrides, KVEventsConfig, KVTransferConfig, LoadConfig, LogprobsMode, - LoRAConfig, MambaDType, ModelConfig, ModelDType, - ModelImpl, MultiModalConfig, ObservabilityConfig, - ParallelConfig, PoolerConfig, PrefixCachingHashAlgo, - RunnerOption, SchedulerConfig, SchedulerPolicy, - SpeculativeConfig, TaskOption, TokenizerMode, - VllmConfig, get_attr_docs, get_field) + LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig, + ModelDType, ModelImpl, MultiModalConfig, + ObservabilityConfig, ParallelConfig, PoolerConfig, + PrefixCachingHashAlgo, RunnerOption, SchedulerConfig, + SchedulerPolicy, SpeculativeConfig, TaskOption, + TokenizerMode, VllmConfig, get_attr_docs, get_field) from vllm.logger import init_logger from vllm.platforms import CpuArchEnum, current_platform from vllm.plugins import load_general_plugins @@ -352,6 +352,7 @@ class EngineArgs: MultiModalConfig.mm_processor_kwargs disable_mm_preprocessor_cache: bool = False # DEPRECATED mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb + mm_encoder_tp_mode: MMEncoderTPMode = MultiModalConfig.mm_encoder_tp_mode skip_mm_profiling: bool 
= MultiModalConfig.skip_mm_profiling # LoRA fields enable_lora: bool = False @@ -434,16 +435,14 @@ class EngineArgs: use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load pt_load_map_location: str = LoadConfig.pt_load_map_location - enable_multimodal_encoder_data_parallel: bool = \ - ParallelConfig.enable_multimodal_encoder_data_parallel + # DEPRECATED + enable_multimodal_encoder_data_parallel: bool = False logits_processors: Optional[list[Union[ str, type[LogitsProcessor]]]] = ModelConfig.logits_processors """Custom logitproc types""" async_scheduling: bool = SchedulerConfig.async_scheduling - # DEPRECATED - enable_prompt_adapter: bool = False kv_sharing_fast_prefill: bool = \ CacheConfig.kv_sharing_fast_prefill @@ -685,7 +684,8 @@ class EngineArgs: **parallel_kwargs["worker_extension_cls"]) parallel_group.add_argument( "--enable-multimodal-encoder-data-parallel", - **parallel_kwargs["enable_multimodal_encoder_data_parallel"]) + action="store_true", + deprecated=True) # KV cache arguments cache_kwargs = get_kwargs(CacheConfig) @@ -735,6 +735,8 @@ class EngineArgs: multimodal_group.add_argument("--disable-mm-preprocessor-cache", action="store_true", deprecated=True) + multimodal_group.add_argument( + "--mm-encoder-tp-mode", **multimodal_kwargs["mm_encoder_tp_mode"]) multimodal_group.add_argument( "--interleave-mm-strings", **multimodal_kwargs["interleave_mm_strings"]) @@ -909,6 +911,14 @@ class EngineArgs: self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB + if self.enable_multimodal_encoder_data_parallel: + logger.warning( + "--enable-multimodal-encoder-data-parallel` is deprecated " + "and will be removed in v0.13. 
" + "Please use `--mm-encoder-tp-mode data` instead.") + + self.mm_encoder_tp_mode = "data" + return ModelConfig( model=self.model, hf_config_path=self.hf_config_path, @@ -947,6 +957,7 @@ class EngineArgs: config_format=self.config_format, mm_processor_kwargs=self.mm_processor_kwargs, mm_processor_cache_gb=self.mm_processor_cache_gb, + mm_encoder_tp_mode=self.mm_encoder_tp_mode, override_neuron_config=self.override_neuron_config, override_pooler_config=self.override_pooler_config, logits_processor_pattern=self.logits_processor_pattern, @@ -1258,8 +1269,6 @@ class EngineArgs: distributed_executor_backend=self.distributed_executor_backend, worker_cls=self.worker_cls, worker_extension_cls=self.worker_extension_cls, - enable_multimodal_encoder_data_parallel=self. - enable_multimodal_encoder_data_parallel, ) if model_config.is_multimodal_model: diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 35103eac8fb56..595bdd17cf2c2 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -728,8 +728,8 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config - self.use_data_parallel = (vllm_config.parallel_config. 
- enable_multimodal_encoder_data_parallel) + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" + self.config = config self.quant_config = quant_config self.multimodal_config = multimodal_config diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 34eec10296b50..811ecffcc1e49 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -877,8 +877,7 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config multimodal_config = vllm_config.model_config.multimodal_config - self.use_data_parallel = (vllm_config.parallel_config. - enable_multimodal_encoder_data_parallel) + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" self.config = config self.multimodal_config = multimodal_config diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 5d41a9e569f53..f8877b584b198 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -882,8 +882,7 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, self.config = config self.multimodal_config = multimodal_config - self.use_data_parallel = (vllm_config.parallel_config. 
- enable_multimodal_encoder_data_parallel) + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" if multimodal_config.get_limit_per_prompt("image"): self.vision_model = Step3VisionTransformer( From d6d13bd49ed7fda56ac6a1b0aa53621490c975ac Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 20 Aug 2025 09:05:29 -0700 Subject: [PATCH 137/361] [Misc] Add max_seq_len to CommonAttentionMetadata (#23216) Signed-off-by: Woosuk Kwon --- tests/v1/attention/utils.py | 2 ++ tests/v1/spec_decode/test_tree_attention.py | 2 ++ vllm/v1/attention/backends/flash_attn.py | 2 +- vllm/v1/attention/backends/flashinfer.py | 2 +- vllm/v1/attention/backends/flex_attention.py | 2 +- vllm/v1/attention/backends/rocm_aiter_fa.py | 2 +- vllm/v1/attention/backends/tree_attn.py | 2 +- vllm/v1/attention/backends/triton_attn.py | 2 +- vllm/v1/attention/backends/utils.py | 6 ++++++ vllm/v1/attention/backends/xformers.py | 2 +- vllm/v1/spec_decode/eagle.py | 1 + vllm/v1/worker/gpu_model_runner.py | 4 ++++ 12 files changed, 22 insertions(+), 7 deletions(-) diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index a4e38eb32f6a1..e547e71e0cdb7 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -58,6 +58,7 @@ def create_common_attn_metadata( dtype=torch.int32, device=device) seq_lens_cpu = seq_lens.cpu() + max_seq_len = int(seq_lens_cpu.max()) # Create computed tokens (context length for each sequence) context_lens = [ @@ -101,6 +102,7 @@ def create_common_attn_metadata( num_reqs=batch_spec.batch_size, num_actual_tokens=num_tokens, max_query_len=max_query_len, + max_seq_len=max_seq_len, block_table_tensor=block_table_tensor, slot_mapping=slot_mapping, causal=True, diff --git a/tests/v1/spec_decode/test_tree_attention.py b/tests/v1/spec_decode/test_tree_attention.py index 456ce712d36e4..6317817408661 100644 --- a/tests/v1/spec_decode/test_tree_attention.py +++ b/tests/v1/spec_decode/test_tree_attention.py @@ -50,6 +50,7 @@ def 
forward_attention( dtype=torch.int32, ) context_lens = seq_lens - query_lens + max_seq_len = int(seq_lens.max()) max_query_len = q_len num_actual_tokens = query_start_loc[-1] @@ -81,6 +82,7 @@ def forward_attention( num_reqs=batch_size, num_actual_tokens=num_actual_tokens, max_query_len=max_query_len, + max_seq_len=max_seq_len, block_table_tensor=block_table, slot_mapping=slot_mapping, ) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index ab7a71a399b34..eed3cba9a2ca7 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -233,7 +233,7 @@ class FlashAttentionMetadataBuilder( num_reqs = common_attn_metadata.num_reqs num_actual_tokens = common_attn_metadata.num_actual_tokens max_query_len = common_attn_metadata.max_query_len - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens seq_lens_cpu = common_attn_metadata.seq_lens_cpu diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 53fafbc4af91d..8a25088848a44 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -463,7 +463,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): page_size = self.page_size max_q_len = common_attn_metadata.max_query_len - max_seq_len = common_attn_metadata.seq_lens_cpu.max().item() + max_seq_len = common_attn_metadata.max_seq_len seq_lens = common_attn_metadata.seq_lens seq_lens_cpu = common_attn_metadata.seq_lens_cpu block_table_tensor = common_attn_metadata.block_table_tensor diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index e599411b2d7e8..abca981035d9e 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ 
b/vllm/v1/attention/backends/flex_attention.py @@ -305,7 +305,7 @@ class FlexAttentionMetadataBuilder( num_actual_tokens = common_attn_metadata.num_actual_tokens max_query_len = common_attn_metadata.max_query_len - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens block_table_tensor = common_attn_metadata.block_table_tensor diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 36b5853bfdcbb..b9ff113573a12 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -270,7 +270,7 @@ class AiterFlashAttentionMetadataBuilder( num_actual_tokens = common_attn_metadata.num_actual_tokens max_query_len = common_attn_metadata.max_query_len - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens block_table_tensor = common_attn_metadata.block_table_tensor diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py index 5d10e9e26082d..2a0c52377cc7f 100644 --- a/vllm/v1/attention/backends/tree_attn.py +++ b/vllm/v1/attention/backends/tree_attn.py @@ -205,7 +205,7 @@ class TreeAttentionMetadataBuilder( q_start_loc = common_attn_metadata.query_start_loc max_query_len = common_attn_metadata.max_query_len kv_seqlens = common_attn_metadata.seq_lens - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len block_table = common_attn_metadata.block_table_tensor slot_mapping = common_attn_metadata.slot_mapping diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 48a9af3decac0..c69dd8415f922 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ 
b/vllm/v1/attention/backends/triton_attn.py @@ -90,7 +90,7 @@ class TritonAttentionMetadataBuilder( num_actual_tokens = common_attn_metadata.num_actual_tokens max_query_len = common_attn_metadata.max_query_len - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens block_table_tensor = common_attn_metadata.block_table_tensor diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 94dd3d2629ebc..57c4d436c5b6b 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -58,6 +58,8 @@ class CommonAttentionMetadata: """Total number of tokens in batch""" max_query_len: int """Longest query in batch""" + max_seq_len: int + """Longest context length in batch""" block_table_tensor: torch.Tensor slot_mapping: torch.Tensor @@ -107,6 +109,7 @@ def _make_metadata_with_slice( seq_lens = attn_metadata.seq_lens[request_slice] seq_lens_cpu = attn_metadata.seq_lens_cpu[request_slice] + max_seq_len = int(seq_lens_cpu.max()) num_computed_tokens_cpu = attn_metadata.num_computed_tokens_cpu[ request_slice] @@ -128,6 +131,7 @@ def _make_metadata_with_slice( num_reqs=num_requests, num_actual_tokens=num_actual_tokens, max_query_len=max_query_len, + max_seq_len=max_seq_len, block_table_tensor=block_table_tensor, slot_mapping=slot_mapping, ) @@ -520,6 +524,7 @@ def make_local_attention_virtual_batches( query_start_loc_cpu = torch.from_numpy(cu_seqlens_q_local) seq_lens_cpu = torch.from_numpy(seqlens_k_local) + max_seq_len = int(seq_lens_cpu.max()) return CommonAttentionMetadata( query_start_loc_cpu=query_start_loc_cpu, @@ -531,6 +536,7 @@ def make_local_attention_virtual_batches( num_reqs=len(seq_lens_cpu), num_actual_tokens=common_attn_metadata.num_actual_tokens, max_query_len=seqlens_q_local.max(), + max_seq_len=max_seq_len, block_table_tensor=block_table_local, 
slot_mapping=common_attn_metadata.slot_mapping, causal=True, diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py index fe732c6017702..b305bc1539081 100644 --- a/vllm/v1/attention/backends/xformers.py +++ b/vllm/v1/attention/backends/xformers.py @@ -231,7 +231,7 @@ class XFormersAttentionMetadataBuilder( q_seqlens = torch.diff(q_start_loc) max_query_len = common_attn_metadata.max_query_len kv_seqlens = common_attn_metadata.seq_lens - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + max_seq_len = common_attn_metadata.max_seq_len block_table = common_attn_metadata.block_table_tensor slot_mapping = common_attn_metadata.slot_mapping diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 8cd2ad12cfa30..cc2b2a139d5e9 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -582,6 +582,7 @@ class EagleProposer: num_reqs=common_attn_metadata.num_reqs, num_actual_tokens=total_num_tokens, max_query_len=new_query_len_per_req.max().item(), + max_seq_len=new_seq_lens_cpu.max().item(), block_table_tensor=common_attn_metadata.block_table_tensor, slot_mapping=common_attn_metadata.slot_mapping[token_indices], causal=True, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e0bab3367cafe..d9770226b14ee 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -774,6 +774,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.seq_lens_np[num_reqs:].fill(0) self.seq_lens.copy_(self.seq_lens_cpu, non_blocking=True) seq_lens = self.seq_lens[:num_reqs] + max_seq_len = self.seq_lens_np[:num_reqs].max().item() # Copy the tensors to the GPU. 
self.input_ids[:total_num_scheduled_tokens].copy_( @@ -886,6 +887,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_reqs=num_reqs, num_actual_tokens=total_num_scheduled_tokens, max_query_len=max_num_scheduled_tokens, + max_seq_len=max_seq_len, block_table_tensor=blk_table_tensor, slot_mapping=slot_mapping, causal=True, @@ -2338,6 +2340,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_reqs=num_reqs, num_actual_tokens=num_tokens, max_query_len=max_query_len, + max_seq_len=self.max_model_len, block_table_tensor=self.input_batch.block_table[ kv_cache_group_id].get_device_tensor()[:num_reqs], slot_mapping=self.input_batch. @@ -3343,6 +3346,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_reqs=num_reqs, num_actual_tokens=total_num_scheduled_tokens, max_query_len=max_num_scheduled_tokens, + max_seq_len=self.seq_lens_cpu[:num_reqs].max().item(), block_table_tensor=dummy_block_table, slot_mapping=dummy_slot_mapping, causal=False, From 3b11b26b5069718a6bde11b9041681bc17369f96 Mon Sep 17 00:00:00 2001 From: JartX Date: Wed, 20 Aug 2025 18:08:29 +0200 Subject: [PATCH 138/361] [FIXBUG ] Allow disabling rocm_aiter_fa backend for ROCm GPUs not compatible with AITER (#22795) Signed-off-by: JartX Signed-off-by: tjtanaa Co-authored-by: tjtanaa --- vllm/v1/spec_decode/eagle.py | 80 ++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index cc2b2a139d5e9..0a0e9fed725cb 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -2,7 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import ast from dataclasses import replace -from typing import Optional +from importlib.util import find_spec +from typing import Optional, Protocol import numpy as np import torch @@ -20,8 +21,6 @@ from vllm.model_executor.models.llama_eagle3 import 
Eagle3LlamaForCausalLM from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata -from vllm.v1.attention.backends.rocm_aiter_fa import ( - AiterFlashAttentionMetadata) from vllm.v1.attention.backends.tree_attn import (TreeAttentionMetadata, TreeAttentionMetadataBuilder) from vllm.v1.attention.backends.triton_attn import TritonAttentionMetadata @@ -34,6 +33,17 @@ logger = init_logger(__name__) PADDING_SLOT_ID = -1 +class EagleAttentionMetadata(Protocol): + # Required attributes + num_actual_tokens: int + max_query_len: int + query_start_loc: torch.Tensor + max_seq_len: int + seq_lens: torch.Tensor + block_table: torch.Tensor + slot_mapping: torch.Tensor + + class EagleProposer: def __init__( @@ -97,6 +107,20 @@ class EagleProposer: dtype=self.dtype, device=device) + # Determine allowed attention backends once during initialization. + self.allowed_attn_types: tuple[type[EagleAttentionMetadata], ...] + if current_platform.is_rocm(): + rocm_types = [TritonAttentionMetadata, FlashAttentionMetadata] + # vllm.v1.attention.backends.rocm_aiter_fa is an optional backend + if find_spec("vllm.v1.attention.backends.rocm_aiter_fa"): + from vllm.v1.attention.backends.rocm_aiter_fa import ( + AiterFlashAttentionMetadata) + rocm_types.append(AiterFlashAttentionMetadata) + self.allowed_attn_types = tuple(rocm_types) + else: + self.allowed_attn_types = (FlashAttentionMetadata, + TreeAttentionMetadata) + # Parse the speculative token tree. 
spec_token_tree = self.speculative_config.speculative_token_tree self.tree_choices: list[tuple[int, @@ -165,7 +189,7 @@ class EagleProposer: for layer_name in self.attn_layer_names: per_layer_attn_metadata[layer_name] = attn_metadata if self.use_cuda_graph and \ - num_tokens <= self.cudagraph_batch_sizes[-1]: + num_tokens <= self.cudagraph_batch_sizes[-1]: num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) else: num_input_tokens = num_tokens @@ -225,25 +249,13 @@ class EagleProposer: # TODO: Currently, MTP module released by deepseek only has # one layer. Adapt this code to support multiple layers once # there's a multi-layer MTP module. - - # On ROCm, both AiterFlashAttention and TritonAttention - # support multi-token eagle spec decode. - if current_platform.is_rocm(): - assert isinstance( - attn_metadata, - (TritonAttentionMetadata, AiterFlashAttentionMetadata, - FlashAttentionMetadata)) - else: - # Currently, only FlashAttention supports multi-token eagle spec - # decode. This is because the code below makes assumptions about - # attn_metadata attributes available. - assert isinstance(attn_metadata, FlashAttentionMetadata) + assert isinstance(attn_metadata, self.allowed_attn_types) # Generate the remaining draft tokens. draft_token_ids_list = [draft_token_ids] if self.use_cuda_graph and \ - batch_size <= self.cudagraph_batch_sizes[-1]: + batch_size <= self.cudagraph_batch_sizes[-1]: input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size) else: input_batch_size = batch_size @@ -449,7 +461,7 @@ class EagleProposer: num_tokens, -1) if self.use_cuda_graph and \ - num_tokens <= self.cudagraph_batch_sizes[-1]: + num_tokens <= self.cudagraph_batch_sizes[-1]: num_input_tokens = self.vllm_config.pad_for_cudagraph( num_tokens) else: @@ -508,19 +520,19 @@ class EagleProposer: """ # E.g. 
# common_attn_metadata.query_start_loc{_cpu}: - # [0, q1, q1 + q2, q1 + q2 + q3] + # [0, q1, q1 + q2, q1 + q2 + q3] # common_attn_metadata.seq_lens{_cpu}: [s1, s2, s3] # num_rejected_tokens: [n1, n2, n3] # This function computes the intermediate values: # num_tokens_per_req: [q1 - n1, q2 - n2, q3 - n3] # And returns: # common_attn_metadata.query_start_loc{_cpu}: - # [0, q1 - n1, q1 + q2 - n1 - n2, q1 + q2 + q3 - n1 - n2 - n3] + # [0, q1 - n1, q1 + q2 - n1 - n2, q1 + q2 + q3 - n1 - n2 - n3] # common_attn_metadata.seq_lens{_cpu}: - # [s1 - n1 + 1, s2 - n2 + 1, s3 - n3 + 1] + # [s1 - n1 + 1, s2 - n2 + 1, s3 - n3 + 1] # token_indices: [0, 1, ..., q1 - n1 - 1, - # q1, q1 + 1, ..., q1 + q2 - n2 - 1, - # q1 + q2, q1 + q2 + 1, ..., q1 + q2 + q3 - n3 - 1] + # q1, q1 + 1, ..., q1 + q2 - n2 - 1, + # q1 + q2, q1 + q2 + 1, ..., q1 + q2 + q3 - n3 - 1] device = common_attn_metadata.query_start_loc.device query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu @@ -564,9 +576,9 @@ class EagleProposer: old_query_start_locs_expanded = np.repeat( query_start_loc_cpu[:-1].numpy(), new_num_tokens_per_req_np) # Final token indices are: - # [0, 1, // req 1 - # q1 + 0, q1 + 1, q1 + 2, q1 + 3, // req 2 - # q1 + q2 + 0, q1 + q2 + 1, q1 + q2 + 2] // req 3 + # [0, 1, // req 1 + # q1 + 0, q1 + 1, q1 + 2, q1 + 3, // req 2 + # q1 + q2 + 0, q1 + q2 + 1, q1 + q2 + 2] // req 3 token_indices_np = token_offests + old_query_start_locs_expanded token_indices = torch.from_numpy(token_indices_np).to( device, non_blocking=True) @@ -616,20 +628,18 @@ class EagleProposer: target_language_model = target_model # share embed_tokens with the target model if needed if get_pp_group().world_size == 1 \ - and self.model.model.embed_tokens.weight.shape \ - == target_language_model.model.embed_tokens.weight.shape: + and self.model.model.embed_tokens.weight.shape \ + == target_language_model.model.embed_tokens.weight.shape: logger.info( - "Assuming the EAGLE head shares the same vocab embedding" \ - " with the 
target model." - ) + "Assuming the EAGLE head shares the same vocab embedding" + " with the target model.") del self.model.model.embed_tokens self.model.model.embed_tokens = ( target_language_model.model.embed_tokens) else: logger.info( - "The EAGLE head's vocab embedding will be loaded separately" \ - " from the target model." - ) + "The EAGLE head's vocab embedding will be loaded separately" + " from the target model.") # share lm_head with the target model if needed # some model definition do not define lm_head explicitly From dfd2382039c38be80d6c2c9b56e441b5bd7cd0ad Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Wed, 20 Aug 2025 09:52:59 -0700 Subject: [PATCH 139/361] [torch.compile] Support conditional torch.compile per module (#22269) Signed-off-by: Yong Hoon Shin --- .buildkite/test-pipeline.yaml | 2 + .../compile/piecewise/test_multiple_graphs.py | 135 +++------- tests/compile/test_decorator.py | 251 ++++++++++++++++++ vllm/compilation/decorators.py | 21 +- 4 files changed, 307 insertions(+), 102 deletions(-) create mode 100644 tests/compile/test_decorator.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 2f7f1db75bfb9..745420664010a 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -328,6 +328,7 @@ steps: - pytest -v -s compile/test_sequence_parallelism.py - pytest -v -s compile/test_async_tp.py - pytest -v -s compile/test_fusion_all_reduce.py + - pytest -v -s compile/test_decorator.py - label: PyTorch Fullgraph Smoke Test # 9min mirror_hardwares: [amdexperimental] @@ -341,6 +342,7 @@ steps: - pytest -v -s compile/piecewise/test_simple.py - pytest -v -s compile/piecewise/test_toy_llama.py - pytest -v -s compile/piecewise/test_full_cudagraph.py + - pytest -v -s compile/piecewise/test_multiple_graphs.py - label: PyTorch Fullgraph Test # 18min mirror_hardwares: [amdexperimental] diff --git a/tests/compile/piecewise/test_multiple_graphs.py 
b/tests/compile/piecewise/test_multiple_graphs.py index e460d70951786..f5e2d9ddb7528 100644 --- a/tests/compile/piecewise/test_multiple_graphs.py +++ b/tests/compile/piecewise/test_multiple_graphs.py @@ -12,10 +12,9 @@ from vllm.compilation.backends import set_model_tag from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import (ignore_torch_compile, support_torch_compile) -from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, - set_current_vllm_config) -from vllm.envs import VLLM_USE_V1 -from vllm.forward_context import set_forward_context +from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode, + VllmConfig, set_current_vllm_config) +from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.utils import direct_register_custom_op # create a library to hold the custom op @@ -164,104 +163,34 @@ class SimpleModelWithTwoGraphs(ParentModel): return x -def test_ignore_torch_compile_decorator(): - assert VLLM_USE_V1 - - # piecewise - vllm_config = VllmConfig(compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, - use_cudagraph=True, - splitting_ops=["silly.attention"], - cudagraph_capture_sizes=[1, 2], - )) - - @support_torch_compile - class A(nn.Module): - - def __init__(self, - *, - vllm_config: VllmConfig, - prefix: str = '', - **kwargs) -> None: - super().__init__() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = x + x - attn_output = torch.empty_like(x) - torch.ops.silly.attention(x, x, x, attn_output) - x = attn_output - x = x * 3 - return x - - @ignore_torch_compile - class B(A): - ... - - @support_torch_compile - class C(B): - ... 
- - with set_current_vllm_config(vllm_config): - mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda() - - # A has support_torch_compile - with compilation_counter.expect( - num_graphs_seen=1, - num_piecewise_graphs_seen=3, - num_piecewise_capturable_graphs_seen=2, - num_backend_compilations=2, - num_cudagraph_captured=4, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - ), set_forward_context({}, vllm_config=vllm_config): - # first run is for compile - mod_A(torch.randn(BATCH_SIZE, MLP_SIZE).cuda()) - # run cudagraph captured sizes - mod_A(torch.randn(2, MLP_SIZE).cuda()) - mod_A(torch.randn(1, MLP_SIZE).cuda()) - - with set_current_vllm_config(vllm_config): - mod_B = B(vllm_config=vllm_config, prefix='').eval().cuda() - - # B's ignore_torch_compile should override A's support_torch_compile - with compilation_counter.expect( - num_graphs_seen=0, - num_piecewise_graphs_seen=0, - num_piecewise_capturable_graphs_seen=0, - num_backend_compilations=0, - num_cudagraph_captured=0, - ), set_forward_context({}, vllm_config=vllm_config): - mod_B(torch.randn(BATCH_SIZE, MLP_SIZE).cuda()) - mod_B(torch.randn(2, MLP_SIZE).cuda()) - mod_B(torch.randn(1, MLP_SIZE).cuda()) - - with set_current_vllm_config(vllm_config): - mod_C = C(vllm_config=vllm_config, prefix='').eval().cuda() - - # C's support_torch_compile should override B's ignore_torch_compile - with compilation_counter.expect( - num_graphs_seen=1, - num_piecewise_graphs_seen=3, - num_piecewise_capturable_graphs_seen=2, - num_backend_compilations=2, - num_cudagraph_captured=4, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - ), set_forward_context({}, vllm_config=vllm_config): - mod_C(torch.randn(BATCH_SIZE, MLP_SIZE).cuda()) - mod_C(torch.randn(2, MLP_SIZE).cuda()) - mod_C(torch.randn(1, MLP_SIZE).cuda()) - - @torch.inference_mode -def run_model(vllm_config, model: nn.Module, inputs: torch.Tensor): +def run_model(vllm_config: VllmConfig, model: nn.Module, inputs: torch.Tensor, + 
cudagraph_runtime_mode: CUDAGraphMode): with set_forward_context({}, vllm_config=vllm_config): - # First run is for compile + # warmup for the model with cudagraph_mode NONE model(inputs) - # Run CUDAGraph captured sizes - model(inputs[:2]) - model(inputs[:1]) + # simulate cudagraphs capturing + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=2, )): + model(inputs[:2]) + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=1, )): + model(inputs[:1]) - output = model(inputs[:2]) + # simulate cudagraphs replay + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=2, )): + output = model(inputs[:2]) output = output.cpu() return output.cpu() @@ -277,6 +206,7 @@ def test_multi_graph_piecewise_compile_outputs_equal(): splitting_ops=["silly.attention"], cudagraph_capture_sizes=[1, 2], )) + cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE with set_current_vllm_config(vllm_config): model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE, @@ -299,11 +229,13 @@ def test_multi_graph_piecewise_compile_outputs_equal(): num_cudagraph_captured=8, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): - outputs.append(run_model(vllm_config, model, inputs)) + outputs.append( + run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) # no compile or cudagraph vllm_config = VllmConfig(compilation_config=CompilationConfig( level=CompilationLevel.NO_COMPILATION, )) + cudagraph_runtime_mode = CUDAGraphMode.NONE with set_current_vllm_config(vllm_config): model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE, @@ -318,7 +250,8 @@ def test_multi_graph_piecewise_compile_outputs_equal(): num_backend_compilations=0, num_cudagraph_captured=0, ): - outputs.append(run_model(vllm_config, 
model, inputs)) + outputs.append( + run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) # piecewise compile without CUDA graph vllm_config = VllmConfig(compilation_config=CompilationConfig( @@ -326,6 +259,7 @@ def test_multi_graph_piecewise_compile_outputs_equal(): use_cudagraph=False, splitting_ops=["silly.attention"], )) + cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE with set_current_vllm_config(vllm_config): model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE, @@ -340,7 +274,8 @@ def test_multi_graph_piecewise_compile_outputs_equal(): num_backend_compilations=4, num_cudagraph_captured=0, # no cudagraph captured ): - outputs.append(run_model(vllm_config, model, inputs)) + outputs.append( + run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) # Generally don't expect outputs with and without inductor # to be bitwise equivalent diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py new file mode 100644 index 0000000000000..51f8ddd566d56 --- /dev/null +++ b/tests/compile/test_decorator.py @@ -0,0 +1,251 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch +from torch import nn +from torch.library import Library + +from vllm.compilation.counter import compilation_counter +from vllm.compilation.decorators import (ignore_torch_compile, + support_torch_compile) +from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel, + CUDAGraphMode, VllmConfig, set_current_vllm_config) +from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.utils import direct_register_custom_op + +# create a library to hold the custom op +silly_lib = Library("silly", "FRAGMENT") # noqa + +BATCH_SIZE = 32 +MLP_SIZE = 128 + + +def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + out.copy_(q) + out += k + out += v + + +def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: 
torch.Tensor, + out: torch.Tensor) -> None: + return + + +direct_register_custom_op( + op_name="attention", + op_func=silly_attention, + mutates_args=["out"], + fake_impl=silly_attention_fake, + target_lib=silly_lib, +) + + +@torch.inference_mode +def run_model(vllm_config: VllmConfig, model: nn.Module, + cudagraph_runtime_mode: CUDAGraphMode): + with set_forward_context({}, vllm_config=vllm_config): + # warmup for the model with cudagraph_mode NONE + model(torch.randn(BATCH_SIZE, MLP_SIZE).cuda()) + + # simulate cudagraphs capturing + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=2, )): + model(torch.randn(2, MLP_SIZE).cuda()) + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=1, )): + model(torch.randn(1, MLP_SIZE).cuda()) + + # simulate cudagraphs replay + with set_forward_context({}, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=BatchDescriptor( + num_tokens=2, )): + output = model(torch.randn(2, MLP_SIZE).cuda()) + + output = output.cpu() + return output.cpu() + + +def test_ignore_torch_compile_decorator(): + # piecewise + vllm_config = VllmConfig(compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + splitting_ops=["silly.attention"], + cudagraph_capture_sizes=[1, 2], + )) + cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE + + @support_torch_compile + class A(nn.Module): + + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = '', + **kwargs) -> None: + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + x + attn_output = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, attn_output) + x = attn_output + x = x * 3 + return x + + @ignore_torch_compile + class B(A): + ... 
+ + @support_torch_compile + class C(B): + ... + + with set_current_vllm_config(vllm_config): + mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda() + + # A has support_torch_compile + with compilation_counter.expect( + num_graphs_seen=1, + num_piecewise_graphs_seen=3, + num_piecewise_capturable_graphs_seen=2, + num_backend_compilations=2, + num_cudagraph_captured=4, + # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + run_model(vllm_config, mod_A, cudagraph_runtime_mode) + + with set_current_vllm_config(vllm_config): + mod_B = B(vllm_config=vllm_config, prefix='').eval().cuda() + + # B's ignore_torch_compile should override A's support_torch_compile + with compilation_counter.expect( + num_graphs_seen=0, + num_piecewise_graphs_seen=0, + num_piecewise_capturable_graphs_seen=0, + num_backend_compilations=0, + num_cudagraph_captured=0, + ): + run_model(vllm_config, mod_B, cudagraph_runtime_mode) + + with set_current_vllm_config(vllm_config): + mod_C = C(vllm_config=vllm_config, prefix='').eval().cuda() + + # C's support_torch_compile should override B's ignore_torch_compile + with compilation_counter.expect( + num_graphs_seen=1, + num_piecewise_graphs_seen=3, + num_piecewise_capturable_graphs_seen=2, + num_backend_compilations=2, + num_cudagraph_captured=4, + # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + run_model(vllm_config, mod_C, cudagraph_runtime_mode) + + +# Only enable torch.compile if +# vllm_config.cache_config.kv_sharing_fast_prefill=True +@support_torch_compile(enable_if=lambda vllm_config: vllm_config.cache_config. 
+ kv_sharing_fast_prefill) +class B(nn.Module): + + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = '', + **kwargs) -> None: + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + x + attn_output = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, attn_output) + x = attn_output + x = x + x + return x + + +# Only enable torch.compile if +# vllm_config.cache_config.kv_sharing_fast_prefill=False +@support_torch_compile(enable_if=lambda vllm_config: not vllm_config. + cache_config.kv_sharing_fast_prefill) +class A(nn.Module): + + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = '', + **kwargs) -> None: + super().__init__() + self.mod1 = B(vllm_config=vllm_config, prefix=prefix, **kwargs) + self.mod2 = B(vllm_config=vllm_config, prefix=prefix, **kwargs) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.mod1(x) + attn_output = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, attn_output) + x = attn_output + x = self.mod2(x) + return x + + +def test_conditional_compile_enable_if(): + vllm_config = VllmConfig(cache_config=CacheConfig( + kv_sharing_fast_prefill=True, ), + compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + splitting_ops=["silly.attention"], + cudagraph_capture_sizes=[1, 2], + )) + cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE + + with set_current_vllm_config(vllm_config): + mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda() + + # A has support_torch_compile but enable_if fn returns False + # enable_if will be True for B, so we expect mod1 and mod2 + # to be compiled + with compilation_counter.expect( + num_graphs_seen=2, + num_piecewise_graphs_seen=6, + # 3 piecewise graphs per instance of B() + num_piecewise_capturable_graphs_seen=4, + num_backend_compilations=4, + num_cudagraph_captured=8, + # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + run_model(vllm_config, mod_A,
cudagraph_runtime_mode) + + # Set kv_sharing_fast_prefill=False + # which will cause A to be compiled and B to not be compiled + vllm_config = VllmConfig(cache_config=CacheConfig( + kv_sharing_fast_prefill=False, ), + compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + splitting_ops=["silly.attention"], + cudagraph_capture_sizes=[1, 2], + )) + + with set_current_vllm_config(vllm_config): + mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda() + + with compilation_counter.expect( + num_graphs_seen=1, + num_piecewise_graphs_seen=7, + # 3 attn ops and 4 non-attn ops + num_piecewise_capturable_graphs_seen=4, + num_backend_compilations=4, + num_cudagraph_captured=8, + # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + run_model(vllm_config, mod_A, cudagraph_runtime_mode) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 58f70ef9ef0aa..41d9fcb824b01 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -52,6 +52,14 @@ def _should_ignore_torch_compile(cls) -> bool: return getattr(cls, IGNORE_COMPILE_KEY, False) +@overload +def support_torch_compile( + *, + enable_if: Optional[Callable[[VllmConfig], bool]] = None, +) -> Callable[[_T], _T]: + ... + + @overload def support_torch_compile( *, @@ -69,6 +77,7 @@ def support_torch_compile( cls: Optional[_T] = None, *, dynamic_arg_dims: Optional[dict[str, Union[int, list[int]]]] = None, + enable_if: Optional[Callable[[VllmConfig], bool]] = None, ) -> Union[Callable[[_T], _T], _T]: """ A decorator to add support for compiling the forward method of a class. @@ -118,6 +127,11 @@ def support_torch_compile( NOTE: if an argument is `None`, it should always be passed as `None` during the lifetime of the model, otherwise, it cannot be captured as a single computation graph. 
+ + `enable_if` is a function that takes a `VllmConfig` object as input and + returns a boolean value indicating whether to compile the model or not. + This is useful if you want to compile the model only when certain + conditions are met. """ def cls_decorator_helper(cls: _T) -> _T: @@ -149,7 +163,8 @@ def support_torch_compile( if k not in sig.parameters: raise ValueError( f"Argument {k} not found in the forward method of {cls}") - return _support_torch_compile(cls, inferred_dynamic_arg_dims) + return _support_torch_compile(cls, inferred_dynamic_arg_dims, + enable_if) if cls is not None: # use `support_torch_compile` as a decorator without arguments @@ -162,6 +177,7 @@ def support_torch_compile( def _support_torch_compile( cls: _T, dynamic_arg_dims: dict[str, Union[int, list[int]]], + enable_if: Optional[Callable[[VllmConfig], bool]] = None, ) -> _T: """ A decorator to add support for compiling the forward method of a class. @@ -182,13 +198,14 @@ def _support_torch_compile( def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs): old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) self.vllm_config = vllm_config + enable_compile = enable_if is None or enable_if(vllm_config) # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner # will handle the compilation, so we don't need to do anything here. 
self.do_not_compile = \ vllm_config.compilation_config.level in [ CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS ] or not supports_dynamo() or _should_ignore_torch_compile( - self.__class__) + self.__class__) or not enable_compile if self.do_not_compile: return From c4477f55e581e5ef5f52bbe39cba6e0de1956444 Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Wed, 20 Aug 2025 10:37:29 -0700 Subject: [PATCH 140/361] Migrate Mistral3ImagePixelInputs to TensorSchema (#21945) Signed-off-by: Benji Beck Co-authored-by: Cyrus Leung --- vllm/model_executor/models/mistral3.py | 38 ++++++++++++-------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index a647292d3a68b..438513433d3b2 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -3,7 +3,7 @@ from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, +from typing import (Annotated, Final, Literal, Optional, Protocol, TypeVar, Union) import torch @@ -32,6 +32,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) @@ -42,15 +43,23 @@ from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, from .vision import get_vision_encoder_info -class Mistral3ImagePixelInputs(TypedDict): - type: Literal["pixel_values_pixtral"] - pixel_values: Union[torch.Tensor, list[torch.Tensor]] +class Mistral3ImagePixelInputs(TensorSchema): + """ + Dimensions: + - bn: Batch size * number of images + - c: Number of channels (3) + - h: Height of each image + - w: Width of each image """ 
- Shape: `(batch_size * num_images, num_channels, height, width)` - Note that `height` or `width` may be different per batch and image, - in which case the data is passed as a list instead of a batched tensor. - """ + type: Literal["pixel_values_pixtral"] = "pixel_values_pixtral" + + # Note that `height` or `width` may be different per batch and image, + # in which case the data is passed as a list instead of a batched tensor. + pixel_values: Annotated[ + Union[torch.Tensor, list[torch.Tensor]], + TensorShape("bn", 3, "h", "w", dynamic_dims={"h", "w"}), + ] class Mistral3PatchMerger(nn.Module): @@ -456,19 +465,6 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: - h = w = self.config.vision_config.image_size - expected_dims = (3, h, w) - actual_dims = tuple(data.shape[1:]) - - if actual_dims != expected_dims: - expected_expr = ("batch_size", *map(str, expected_dims)) - raise ValueError( - f"The expected shape of pixel values is {expected_expr}. 
" - f"You supplied {tuple(data.shape)}.") - - return data - def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[Mistral3ImagePixelInputs]: pixel_values = kwargs.pop("pixel_values", None) From f77a0802b758a32c5b9f7bc04e9498d77e8d99e0 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 20 Aug 2025 13:57:37 -0400 Subject: [PATCH 141/361] Limit HTTP header count and size (#23267) Signed-off-by: Taneem Ibrahim Signed-off-by: Russell Bryant Co-authored-by: Taneem Ibrahim --- vllm/entrypoints/constants.py | 10 ++++++++++ vllm/entrypoints/launcher.py | 21 +++++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 2 ++ vllm/entrypoints/openai/cli_args.py | 8 ++++++++ 4 files changed, 41 insertions(+) create mode 100644 vllm/entrypoints/constants.py diff --git a/vllm/entrypoints/constants.py b/vllm/entrypoints/constants.py new file mode 100644 index 0000000000000..b5bcccc35d6c8 --- /dev/null +++ b/vllm/entrypoints/constants.py @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Shared constants for vLLM entrypoints. 
+""" + +# HTTP header limits for h11 parser +# These constants help mitigate header abuse attacks +H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT = 4194304 # 4 MB +H11_MAX_HEADER_COUNT_DEFAULT = 256 diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index 9f4dc19fb4ab7..4e852ba594930 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -14,6 +14,8 @@ from vllm import envs from vllm.engine.async_llm_engine import AsyncEngineDeadError from vllm.engine.multiprocessing import MQEngineDeadError from vllm.engine.protocol import EngineClient +from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT, + H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT) from vllm.entrypoints.ssl import SSLCertRefresher from vllm.logger import init_logger from vllm.utils import find_process_using_port @@ -26,6 +28,11 @@ async def serve_http(app: FastAPI, sock: Optional[socket.socket], enable_ssl_refresh: bool = False, **uvicorn_kwargs: Any): + """ + Start a FastAPI app using Uvicorn, with support for custom Uvicorn config + options. Supports http header limits via h11_max_incomplete_event_size and + h11_max_header_count. 
+ """ logger.info("Available routes are:") for route in app.routes: methods = getattr(route, "methods", None) @@ -36,7 +43,21 @@ async def serve_http(app: FastAPI, logger.info("Route: %s, Methods: %s", path, ', '.join(methods)) + # Extract header limit options if present + h11_max_incomplete_event_size = uvicorn_kwargs.pop( + "h11_max_incomplete_event_size", None) + h11_max_header_count = uvicorn_kwargs.pop("h11_max_header_count", None) + + # Set safe defaults if not provided + if h11_max_incomplete_event_size is None: + h11_max_incomplete_event_size = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT + if h11_max_header_count is None: + h11_max_header_count = H11_MAX_HEADER_COUNT_DEFAULT + config = uvicorn.Config(app, **uvicorn_kwargs) + # Set header limits + config.h11_max_incomplete_event_size = h11_max_incomplete_event_size + config.h11_max_header_count = h11_max_header_count config.load() server = uvicorn.Server(config) _add_shutdown_handlers(app, server) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 765327da3b306..24148bcef2353 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1922,6 +1922,8 @@ async def run_server_worker(listen_address, ssl_certfile=args.ssl_certfile, ssl_ca_certs=args.ssl_ca_certs, ssl_cert_reqs=args.ssl_cert_reqs, + h11_max_incomplete_event_size=args.h11_max_incomplete_event_size, + h11_max_header_count=args.h11_max_header_count, **uvicorn_kwargs, ) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index e15f65b43082c..6e4eff5c80243 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -20,6 +20,8 @@ from vllm.config import config from vllm.engine.arg_utils import AsyncEngineArgs, optional_type from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, validate_chat_template) +from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT, + 
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT) from vllm.entrypoints.openai.serving_models import LoRAModulePath from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.logger import init_logger @@ -172,6 +174,12 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`""" enable_log_outputs: bool = False """If set to True, enable logging of model outputs (generations) in addition to the input logging that is enabled by default.""" + h11_max_incomplete_event_size: int = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT + """Maximum size (bytes) of an incomplete HTTP event (header or body) for + h11 parser. Helps mitigate header abuse. Default: 4194304 (4 MB).""" + h11_max_header_count: int = H11_MAX_HEADER_COUNT_DEFAULT + """Maximum number of HTTP headers allowed in a request for h11 parser. + Helps mitigate header abuse. Default: 256.""" @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: From ebe56a0064f7a72a5c51d4cd6bcca165590c5bed Mon Sep 17 00:00:00 2001 From: dongluw <108290936+dongluw@users.noreply.github.com> Date: Wed, 20 Aug 2025 14:15:18 -0400 Subject: [PATCH 142/361] Small fix for Command-A-Vision (#23268) Signed-off-by: donglu --- vllm/model_executor/models/cohere2_vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index 4682a8a428a03..fca1aee835b89 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -348,7 +348,7 @@ class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, vllm_config=vllm_config, hf_config=config.text_config, prefix=maybe_prefix(prefix, "language_model"), - architectures=["Cohere2ForCausalLM"]) + architectures=config.text_config.architectures) @property def dtype(self): From 0cdbf5e61ce3fd97d33b31b775d2faaadc99fbc5 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 20 Aug 2025 15:13:36 -0400 
Subject: [PATCH 143/361] [Kernel/Quant] Remove the original marlin format and qqq (#23204) Signed-off-by: mgoin --- .../configs/Meta-Llama-3-8B-QQQ.yaml | 12 - .../lm-eval-harness/configs/models-large.txt | 1 - CMakeLists.txt | 2 - benchmarks/kernels/benchmark_machete.py | 23 +- csrc/quantization/machete/generate.py | 139 +- csrc/quantization/marlin/dense/LICENSE | 209 --- csrc/quantization/marlin/dense/common/base.h | 32 - csrc/quantization/marlin/dense/common/mem.h | 89 -- .../marlin/dense/marlin_cuda_kernel.cu | 1073 -------------- .../marlin/qqq/marlin_qqq_gemm_kernel.cu | 1248 ----------------- csrc/torch_bindings.cpp | 17 - tests/compile/test_full_graph.py | 6 - tests/kernels/quantization/test_machete_mm.py | 34 +- .../kernels/quantization/test_marlin_gemm.py | 83 -- tests/quantization/test_configs.py | 10 - tests/quantization/test_lm_head.py | 6 +- tests/weight_loading/models.txt | 4 - vllm/_custom_ops.py | 36 - vllm/config/__init__.py | 7 +- vllm/lora/layers.py | 3 - vllm/model_executor/layers/linear.py | 1 - .../layers/quantization/__init__.py | 6 - .../layers/quantization/marlin.py | 263 ---- .../model_executor/layers/quantization/qqq.py | 275 ---- .../utils/marlin_utils_test_qqq.py | 126 -- .../layers/quantization/utils/quant_utils.py | 85 -- 26 files changed, 92 insertions(+), 3698 deletions(-) delete mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml delete mode 100644 csrc/quantization/marlin/dense/LICENSE delete mode 100644 csrc/quantization/marlin/dense/common/base.h delete mode 100644 csrc/quantization/marlin/dense/common/mem.h delete mode 100644 csrc/quantization/marlin/dense/marlin_cuda_kernel.cu delete mode 100644 csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu delete mode 100644 vllm/model_executor/layers/quantization/marlin.py delete mode 100644 vllm/model_executor/layers/quantization/qqq.py delete mode 100644 vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py diff --git 
a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml deleted file mode 100644 index 56ec933c9cc0e..0000000000000 --- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# For vllm script, with -t option (tensor parallel size). -# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1 -model_name: "HandH1998/QQQ-Llama-3-8b-g128" -tasks: -- name: "gsm8k" - metrics: - - name: "exact_match,strict-match" - value: 0.419 - - name: "exact_match,flexible-extract" - value: 0.416 -limit: 1000 -num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt index 27a1a9a82bd35..37eeac85c933b 100644 --- a/.buildkite/lm-eval-harness/configs/models-large.txt +++ b/.buildkite/lm-eval-harness/configs/models-large.txt @@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml Mixtral-8x7B-Instruct-v0.1.yaml Qwen2-57B-A14-Instruct.yaml DeepSeek-V2-Lite-Chat.yaml -Meta-Llama-3-8B-QQQ.yaml diff --git a/CMakeLists.txt b/CMakeLists.txt index bcbd1b52a06c6..a1deefb07f09c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -357,9 +357,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC}) set(MARLIN_SRCS - "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" - "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu" "csrc/quantization/gptq_marlin/gptq_marlin.cu" "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" "csrc/quantization/gptq_marlin/awq_marlin_repack.cu") diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index 975d10f2e92ec..a9c4d30d9b189 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -253,28 +253,7 @@ def marlin_create_bench_fn(bt: 
BenchmarkTensors) -> Callable: else: assert bt.a.dtype == torch.int8 assert bt.wtype == scalar_types.uint4b8 - - if bt.w_ch_s is not None: - s_ch = bt.w_ch_s.to(torch.float32) - else: - s_ch = torch.ones(bt.w_ref.shape[1], dtype=torch.float32, device=device) - - if bt.w_tok_s is not None: - s_tok = bt.w_tok_s.to(torch.float32) - else: - s_tok = torch.ones(bt.a.shape[0], dtype=torch.float32, device=device) - - fn = lambda: ops.marlin_qqq_gemm( - a=bt.a, - b_q_weight=w_q, - s_group=w_s, - s_tok=s_tok, - s_ch=s_ch, - workspace=workspace.scratch, - size_m=bt.a.shape[0], - size_n=bt.w_ref.shape[1], - size_k=bt.w_ref.shape[0], - ) + raise NotImplementedError("QQQ is not supported anymore") return fn diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index 88b3f9c734a30..0d14ba15937c6 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -571,78 +571,79 @@ def generate(): itertools.repeat(default_heuristic)) ] - # Stored as "condition": ((tile_shape_mn), (cluster_shape_mnk)) - # TODO (LucasWilkinson): Further tuning required - qqq_tile_heuristic_config = { - #### M = 257+ - # ((128, 256), (2, 1, 1)) Broken for QQQ types - # TODO (LucasWilkinson): Investigate further - # "M > 256 && K <= 16384 && N <= 4096": ((128, 128), (2, 1, 1)), - # "M > 256": ((128, 256), (2, 1, 1)), - "M > 256": ((128, 128), (2, 1, 1)), - #### M = 129-256 - "M > 128 && K <= 4096 && N <= 4096": ((128, 64), (2, 1, 1)), - "M > 128 && K <= 8192 && N <= 8192": ((128, 128), (2, 1, 1)), - # ((128, 256), (2, 1, 1)) Broken for QQQ types - # TODO (LucasWilkinson): Investigate further - # "M > 128": ((128, 256), (2, 1, 1)), - "M > 128": ((128, 128), (2, 1, 1)), - #### M = 65-128 - "M > 64 && K <= 4069 && N <= 4069": ((128, 32), (2, 1, 1)), - "M > 64 && K <= 4069 && N <= 8192": ((128, 64), (2, 1, 1)), - "M > 64 && K >= 8192 && N >= 12288": ((256, 128), (2, 1, 1)), - "M > 64": ((128, 128), (2, 1, 1)), - #### M = 33-64 - "M > 32 && 
K <= 6144 && N <= 6144": ((128, 16), (1, 1, 1)), - # Broken for QQQ types - # TODO (LucasWilkinson): Investigate further - #"M > 32 && K >= 16384 && N >= 12288": ((256, 64), (2, 1, 1)), - "M > 32": ((128, 64), (2, 1, 1)), - #### M = 17-32 - "M > 16 && K <= 12288 && N <= 8192": ((128, 32), (2, 1, 1)), - "M > 16": ((256, 32), (2, 1, 1)), - #### M = 1-16 - "N >= 26624": ((256, 16), (1, 1, 1)), - None: ((128, 16), (1, 1, 1)), - } + # TODO: Support W4A8 when ready + # # Stored as "condition": ((tile_shape_mn), (cluster_shape_mnk)) + # # TODO (LucasWilkinson): Further tuning required + # qqq_tile_heuristic_config = { + # #### M = 257+ + # # ((128, 256), (2, 1, 1)) Broken for QQQ types + # # TODO (LucasWilkinson): Investigate further + # # "M > 256 && K <= 16384 && N <= 4096": ((128, 128), (2, 1, 1)), + # # "M > 256": ((128, 256), (2, 1, 1)), + # "M > 256": ((128, 128), (2, 1, 1)), + # #### M = 129-256 + # "M > 128 && K <= 4096 && N <= 4096": ((128, 64), (2, 1, 1)), + # "M > 128 && K <= 8192 && N <= 8192": ((128, 128), (2, 1, 1)), + # # ((128, 256), (2, 1, 1)) Broken for QQQ types + # # TODO (LucasWilkinson): Investigate further + # # "M > 128": ((128, 256), (2, 1, 1)), + # "M > 128": ((128, 128), (2, 1, 1)), + # #### M = 65-128 + # "M > 64 && K <= 4069 && N <= 4069": ((128, 32), (2, 1, 1)), + # "M > 64 && K <= 4069 && N <= 8192": ((128, 64), (2, 1, 1)), + # "M > 64 && K >= 8192 && N >= 12288": ((256, 128), (2, 1, 1)), + # "M > 64": ((128, 128), (2, 1, 1)), + # #### M = 33-64 + # "M > 32 && K <= 6144 && N <= 6144": ((128, 16), (1, 1, 1)), + # # Broken for QQQ types + # # TODO (LucasWilkinson): Investigate further + # #"M > 32 && K >= 16384 && N >= 12288": ((256, 64), (2, 1, 1)), + # "M > 32": ((128, 64), (2, 1, 1)), + # #### M = 17-32 + # "M > 16 && K <= 12288 && N <= 8192": ((128, 32), (2, 1, 1)), + # "M > 16": ((256, 32), (2, 1, 1)), + # #### M = 1-16 + # "N >= 26624": ((256, 16), (1, 1, 1)), + # None: ((128, 16), (1, 1, 1)), + # } - # For now we use the same heuristic 
for all types - # Heuristic is currently tuned for H100s - qqq_heuristic = [ - (cond, ScheduleConfig(*tile_config, - **sch_common_params)) # type: ignore - for cond, tile_config in qqq_tile_heuristic_config.items() - ] + # # For now we use the same heuristic for all types + # # Heuristic is currently tuned for H100s + # qqq_heuristic = [ + # (cond, ScheduleConfig(*tile_config, + # **sch_common_params)) # type: ignore + # for cond, tile_config in qqq_tile_heuristic_config.items() + # ] - QQQ_kernel_types = [ - *(TypeConfig( - a=DataType.s8, - b=VLLMDataType.u4b8, - b_group_scale=b_group_scale, - b_group_zeropoint=DataType.void, - b_channel_scale=DataType.f32, - a_token_scale=DataType.f32, - out=DataType.f16, - accumulator=DataType.s32, - ) for b_group_scale in (DataType.f16, DataType.void)), - *(TypeConfig( - a=DataType.e4m3, - b=VLLMDataType.u4b8, - b_group_scale=b_group_scale, - b_group_zeropoint=DataType.void, - b_channel_scale=DataType.f32, - a_token_scale=DataType.f32, - out=DataType.f16, - accumulator=DataType.f32, - ) for b_group_scale in (DataType.f16, DataType.void)), - ] + # QQQ_kernel_types = [ + # *(TypeConfig( + # a=DataType.s8, + # b=VLLMDataType.u4b8, + # b_group_scale=b_group_scale, + # b_group_zeropoint=DataType.void, + # b_channel_scale=DataType.f32, + # a_token_scale=DataType.f32, + # out=DataType.f16, + # accumulator=DataType.s32, + # ) for b_group_scale in (DataType.f16, DataType.void)), + # *(TypeConfig( + # a=DataType.e4m3, + # b=VLLMDataType.u4b8, + # b_group_scale=b_group_scale, + # b_group_zeropoint=DataType.void, + # b_channel_scale=DataType.f32, + # a_token_scale=DataType.f32, + # out=DataType.f16, + # accumulator=DataType.f32, + # ) for b_group_scale in (DataType.f16, DataType.void)), + # ] - impl_configs += [ - ImplConfig(x[0], x[1], x[2]) - for x in zip(QQQ_kernel_types, - itertools.repeat(get_unique_schedules(qqq_heuristic)), - itertools.repeat(qqq_heuristic)) - ] + # impl_configs += [ + # ImplConfig(x[0], x[1], x[2]) + # for x in 
zip(QQQ_kernel_types, + # itertools.repeat(get_unique_schedules(qqq_heuristic)), + # itertools.repeat(qqq_heuristic)) + # ] output_dir = os.path.join(SCRIPT_DIR, "generated") diff --git a/csrc/quantization/marlin/dense/LICENSE b/csrc/quantization/marlin/dense/LICENSE deleted file mode 100644 index 1d1e4cf9c8233..0000000000000 --- a/csrc/quantization/marlin/dense/LICENSE +++ /dev/null @@ -1,209 +0,0 @@ -Contains code from https://github.com/IST-DASLab/marlin - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. 
- - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "{}" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright {yyyy} {name of copyright owner} - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - ------------------------------------------------------------------------------------- - -This product bundles various third-party components under other open source licenses. -This section summarizes those components and their licenses. See licenses/ -for text of these licenses. diff --git a/csrc/quantization/marlin/dense/common/base.h b/csrc/quantization/marlin/dense/common/base.h deleted file mode 100644 index 68c83d5478cf8..0000000000000 --- a/csrc/quantization/marlin/dense/common/base.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Modified by HandH1998 - * Modified by Neural Magic - * Copyright (C) Marlin.2024 Elias Frantar - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; } - -// Instances of `Vec` are used to organize groups of >>registers<<, as needed -// for instance as inputs to tensor core operations. Consequently, all -// corresponding index accesses must be compile-time constants, which is why we -// extensively use `#pragma unroll` throughout the kernel code to guarantee -// this. -template -struct Vec { - T elems[n]; - __device__ T& operator[](int i) { return elems[i]; } -}; diff --git a/csrc/quantization/marlin/dense/common/mem.h b/csrc/quantization/marlin/dense/common/mem.h deleted file mode 100644 index 64f9c393d77ce..0000000000000 --- a/csrc/quantization/marlin/dense/common/mem.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Modified by HandH1998 - * Modified by Neural Magic - * Copyright (C) Marlin.2024 Elias Frantar - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -// Predicated asynchronous global->shared copy; used for inputs A where we apply -// predication to handle batchsizes that are not multiples of 16. 
-__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, - bool pred = true) { - const int BYTES = 16; - uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile( - "{\n" - " .reg .pred p;\n" - " setp.ne.b32 p, %0, 0;\n" - " @p cp.async.cg.shared.global [%1], [%2], %3;\n" - "}\n" ::"r"((int)pred), - "r"(smem), "l"(glob_ptr), "n"(BYTES)); -} - -// Asynchronous global->shared copy -__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) { - const int BYTES = 16; - uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile( - "{\n" - " cp.async.cg.shared.global [%0], [%1], %2;\n" - "}\n" ::"r"(smem), - "l"(glob_ptr), "n"(BYTES)); -} - -// Async copy fence. -__device__ inline void cp_async_fence() { - asm volatile("cp.async.commit_group;\n" ::); -} - -// Wait until at most `n` async copy stages are still pending. -template -__device__ inline void cp_async_wait() { - asm volatile("cp.async.wait_group %0;\n" ::"n"(n)); -} - -// Wait until barrier reaches `count`, then lock for current threadblock. -__device__ inline void barrier_acquire(int* lock, int count) { - if (threadIdx.x == 0) { - int state = -1; - do - // Guarantee that subsequent writes by this threadblock will be visible - // globally. - asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" - : "=r"(state) - : "l"(lock)); - while (state != count); - } - __syncthreads(); -} - -// Release barrier and increment visitation count. -__device__ inline void barrier_release(int* lock, bool reset = false) { - __syncthreads(); - if (threadIdx.x == 0) { - if (reset) { - lock[0] = 0; - return; - } - int val = 1; - // Make sure that all writes since acquiring this barrier are visible - // globally, while releasing the barrier. 
- asm volatile("fence.acq_rel.gpu;\n"); - asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n" - : - : "l"(lock), "r"(val)); - } -} diff --git a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu deleted file mode 100644 index ea96326ed7e61..0000000000000 --- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu +++ /dev/null @@ -1,1073 +0,0 @@ -/* - * Modified by Neural Magic - * Copyright (C) Marlin.2024 Elias Frantar - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include - -#include - -#include "common/base.h" -#include "core/registration.h" - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - #include "common/mem.h" -#endif - -template -inline std::string str(T x) { - return std::to_string(x); -} - -namespace marlin_dense { - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - -using I4 = Vec; -// Matrix fragments for tensor core instructions; their precise layout is -// documented here: -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type -using FragA = Vec; -using FragB = Vec; -using FragC = Vec; -using FragS = Vec; // quantization scales - -// m16n8k16 tensor core mma instruction with fp16 inputs and fp32 -// output/accumulation. 
-__device__ inline void mma(const FragA& a_frag, const FragB& frag_b, - FragC& frag_c) { - const uint32_t* a = reinterpret_cast(&a_frag); - const uint32_t* b = reinterpret_cast(&frag_b); - float* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), - "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); -} - -// Instruction for loading a full 16x16 matrix fragment of operand A from shared -// memory, directly in tensor core layout. -__device__ inline void ldsm4(FragA& frag_a, const void* smem_ptr) { - uint32_t* a = reinterpret_cast(&frag_a); - uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" - : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3]) - : "r"(smem)); -} - -// Lookup-table based 3-input logical operation; explicitly used for -// dequantization as the compiler does not seem to automatically recognize it in -// all cases. -template -__device__ inline int lop3(int a, int b, int c) { - int res; - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(res) - : "r"(a), "r"(b), "r"(c), "n"(lut)); - return res; -} - -// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16 -// values. We mostly follow the strategy in the link below, with some small -// changes: -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h -__device__ inline FragB dequant(int q) { - const int LO = 0x000f000f; - const int HI = 0x00f000f0; - const int EX = 0x64006400; - // Guarantee that the `(a & b) | c` operations are LOP3s. 
- int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); - // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point - // directly into `SUB` and `ADD`. - const int SUB = 0x64086408; - const int MUL = 0x2c002c00; - const int ADD = 0xd480d480; - FragB frag_b; - frag_b[0] = __hsub2(*reinterpret_cast(&lo), - *reinterpret_cast(&SUB)); - frag_b[1] = __hfma2(*reinterpret_cast(&hi), - *reinterpret_cast(&MUL), - *reinterpret_cast(&ADD)); - return frag_b; -} - -// Multiply dequantized values by the corresponding quantization scale; used -// only for grouped quantization. -__device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) { - half2 s = __half2half2(reinterpret_cast<__half*>(&frag_s)[i]); - frag_b[0] = __hmul2(frag_b[0], s); - frag_b[1] = __hmul2(frag_b[1], s); -} - -template shared - // fetch pipeline - const int group_blocks = -1 // number of consecutive 16x16 blocks - // with a separate quantization scale - > -__global__ void Marlin( - const int4* __restrict__ A, // fp16 input matrix of shape mxk - const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn - int4* __restrict__ C, // fp16 output buffer of shape mxn - const int4* __restrict__ s, // fp16 quantization scales of shape - // (k/groupsize)xn - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization -) { - // Each threadblock processes one "stripe" of the B matrix with (roughly) the - // same size, which might involve multiple column "slices" (of width 16 * - // `thread_n_blocks`). 
Stripes are defined as shown in the 3x3 matrix 5 SM - // example: - // 0 1 3 - // 0 2 3 - // 1 2 4 - // While this kind of partitioning makes things somewhat more complicated, it - // ensures good utilization of all SMs for many kinds of shape and GPU - // configurations, while requiring as few slow global cross-threadblock - // reductions as possible. - - // For larger GEMMs we run multiple batchsize 64 versions in parallel for a - // better partitioning with less reductions - int parallel = 1; - if (prob_m > 16 * thread_m_blocks) { - parallel = prob_m / (16 * thread_m_blocks); - prob_m = 16 * thread_m_blocks; - } - - int k_tiles = prob_k / 16 / thread_k_blocks; - int n_tiles = prob_n / 16 / thread_n_blocks; - int iters = ceildiv(k_tiles * n_tiles * parallel, gridDim.x); - // Ensure that the number of tiles in each stripe is a multiple of the - // groupsize; this avoids an annoying special case where a stripe starts in - // the middle of group. - if (group_blocks != -1) - iters = (group_blocks / thread_k_blocks) * - ceildiv(iters, (group_blocks / thread_k_blocks)); - - int slice_row = (iters * blockIdx.x) % k_tiles; - int slice_col_par = (iters * blockIdx.x) / k_tiles; - int slice_col = slice_col_par; - int slice_iters; // number of threadblock tiles in the current slice - int slice_count = - 0; // total number of active threadblocks in the current slice - int slice_idx; // index of threadblock in current slice; numbered bottom to - // top - - // We can easily implement parallel problem execution by just remapping - // indices and advancing global pointers - if (slice_col_par >= n_tiles) { - A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8; - C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8; - locks += (slice_col_par / n_tiles) * n_tiles; - slice_col = slice_col_par % n_tiles; - } - - // Compute all information about the current slice which is required for - // synchronization. 
- auto init_slice = [&]() { - slice_iters = - iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); - if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; - if (slice_iters == 0) return; - if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; - slice_count = 1; - slice_idx = 0; - int col_first = iters * ceildiv(k_tiles * slice_col_par, iters); - if (col_first <= k_tiles * (slice_col_par + 1)) { - int col_off = col_first - k_tiles * slice_col_par; - slice_count = ceildiv(k_tiles - col_off, iters); - if (col_off > 0) slice_count++; - int delta_first = iters * blockIdx.x - col_first; - if (delta_first < 0 || (col_off == 0 && delta_first == 0)) - slice_idx = slice_count - 1; - else { - slice_idx = slice_count - 1 - delta_first / iters; - if (col_off > 0) slice_idx--; - } - } - if (slice_col == n_tiles) { - A += 16 * thread_m_blocks * prob_k / 8; - C += 16 * thread_m_blocks * prob_n / 8; - locks += n_tiles; - slice_col = 0; - } - }; - init_slice(); - - int a_gl_stride = prob_k / 8; // stride of the A matrix in global memory - // We typically use `constexpr` to indicate that this value is a compile-time - // constant - constexpr int a_sh_stride = - 16 * thread_k_blocks / 8; // stride of an A matrix tile in shared memory - constexpr int a_gl_rd_delta_o = - 16 * thread_k_blocks / - 8; // delta between subsequent A tiles in global memory - int a_gl_rd_delta_i = - a_gl_stride * - (threads / a_gl_rd_delta_o); // between subsequent accesses within a tile - constexpr int a_sh_wr_delta = - a_sh_stride * - (threads / a_gl_rd_delta_o); // between shared memory writes - constexpr int a_sh_rd_delta_o = - 2 * ((threads / 32) / - (thread_n_blocks / 4)); // between shared memory tile reads - constexpr int a_sh_rd_delta_i = - a_sh_stride * 16; // within a shared memory tile - constexpr int a_sh_stage = - a_sh_stride * (16 * thread_m_blocks); // overall size of a tile - constexpr int a_sh_wr_iters = - ceildiv(a_sh_stage, - 
a_sh_wr_delta); // number of shared write iterations for a tile - - int b_gl_stride = 16 * prob_n / 32; - constexpr int b_sh_stride = 32 * thread_n_blocks / 4; - int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; - int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride); - constexpr int b_sh_wr_delta = threads; - constexpr int b_sh_rd_delta = threads; - constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; - constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; - - int s_gl_stride = prob_n / 8; - constexpr int s_sh_stride = 16 * thread_n_blocks / 8; - constexpr int s_sh_stage = s_sh_stride; - int s_gl_rd_delta = s_gl_stride; - - // Global A read index of current thread. - int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - a_gl_rd += a_gl_rd_delta_o * slice_row; - // Shared write index of current thread. - int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - // Shared read index. - int a_sh_rd = - a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16; - a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); - - int b_gl_rd = - b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride); - b_gl_rd += b_sh_stride * slice_col; - b_gl_rd += b_gl_rd_delta_o * slice_row; - auto b_sh_wr = threadIdx.x; - auto b_sh_rd = threadIdx.x; - - int s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + - s_sh_stride * slice_col + threadIdx.x; - auto s_sh_wr = threadIdx.x; - int s_sh_rd; - // We use a different scale layout for grouped and column-wise quantization as - // we scale a `half2` tile in column-major layout in the former and in - // row-major in the latter case. 
- if (group_blocks != -1) - s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - (threadIdx.x % 32) / 4; - else - s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - (threadIdx.x % 32) % 4; - - // Precompute which thread should not read memory in which iterations; this is - // needed if there are more threads than required for a certain tilesize or - // when the batchsize is not a multiple of 16. - bool a_sh_wr_pred[a_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) - a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; - bool s_sh_wr_pred = threadIdx.x < s_sh_stride; - - // To ensure that writing and reading A tiles to/from shared memory, the - // latter in fragment format, is fully bank conflict free, we need to use a - // rather fancy XOR-based layout. The key here is that neither reads nor - // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the - // same shared memory banks. Further, it seems (based on NSight-Compute) that - // each warp must also write a consecutive memory segment? - auto transform_a = [&](int i) { - int row = i / a_gl_rd_delta_o; - return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row; - }; - // Since the computation of this remapping is non-trivial and, due to our main - // loop unrolls, all shared memory accesses are static, we simply precompute - // both transformed reads and writes. 
- int a_sh_wr_trans[a_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) - a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); - int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) { - #pragma unroll - for (int j = 0; j < thread_m_blocks; j++) - a_sh_rd_trans[i][j] = - transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); - } - - // Since B-accesses have non-constant stride they have to be computed at - // runtime; we break dependencies between subsequent accesses with a tile by - // maintining multiple pointers (we have enough registers), a tiny - // optimization. - const int4* B_ptr[b_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; - - extern __shared__ int4 sh[]; - // Shared memory storage for global fetch pipelines. - int4* sh_a = sh; - int4* sh_b = sh_a + (stages * a_sh_stage); - int4* sh_s = sh_b + (stages * b_sh_stage); - // Register storage for double buffer of shared memory reads. - FragA frag_a[2][thread_m_blocks]; - I4 frag_b_quant[2]; - FragC frag_c[thread_m_blocks][4][2]; - FragS frag_s[2][4]; - - // Zero accumulators. - auto zero_accums = [&]() { - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) - reinterpret_cast(frag_c)[i] = 0; - }; - - // Asynchronously fetch the next A, B and s tile from global to the next - // shared memory pipeline location. 
- auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { - if (pred) { - int4* sh_a_stage = sh_a + a_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) { - cp_async4_pred( - &sh_a_stage[a_sh_wr_trans[i]], - &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off], - a_sh_wr_pred[i]); - } - int4* sh_b_stage = sh_b + b_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) { - cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]); - B_ptr[i] += b_gl_rd_delta_o; - } - // Only fetch scales if this tile starts a new group - if constexpr (group_blocks != -1) { - // This assumes group_blocks >= thread_k_blocks - // and would need to be modified to support smaller groups. - static_assert(group_blocks >= thread_k_blocks); - if (pipe % (group_blocks / thread_k_blocks) == 0) { - int4* sh_s_stage = sh_s + s_sh_stage * pipe; - if (s_sh_wr_pred) cp_async4(&sh_s_stage[s_sh_wr], &s[s_gl_rd]); - s_gl_rd += s_gl_rd_delta; - } - } - } - // Insert a fence even when we are winding down the pipeline to ensure that - // waiting is also correct at this point. - cp_async_fence(); - }; - - // Wait until the next thread tile has been loaded to shared memory. - auto wait_for_stage = [&]() { - // We only have `stages - 2` active fetches since we are double buffering - // and can only issue the next fetch when it is guaranteed that the previous - // shared memory load is fully complete (as it may otherwise be - // overwritten). - cp_async_wait(); - __syncthreads(); - }; - - // Load the next sub-tile from the current location in the shared memory pipe - // into the current register buffer. 
- auto fetch_to_registers = [&](int k, int pipe) { - // It may seem inefficient that we reload the groups for every sub-tile; - // however, this does not seem to be a significant bottleneck, while some - // theoretically better attempts have lead to bad instruction ordering by - // the compiler and correspondingly a noticeable drop in performance. - if constexpr (group_blocks != -1) { - // This assumes group_blocks >= thread_k_blocks - // and would need to be modified to support smaller groups. - static_assert(group_blocks >= thread_k_blocks); - int4* sh_s_stage = - sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * - (pipe / (group_blocks / thread_k_blocks))); - reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; - } - int4* sh_a_stage = sh_a + a_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) - ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); - int4* sh_b_stage = sh_b + b_sh_stage * pipe; - frag_b_quant[k % 2] = *reinterpret_cast( - &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]); - }; - - // Execute the actual tensor core matmul of a sub-tile. - auto matmul = [&](int k) { - // We have the m dimension as the inner loop in order to encourage overlapping - // dequantization and matmul operations. - #pragma unroll - for (int j = 0; j < 4; j++) { - int b_quant = frag_b_quant[k % 2][j]; - int b_quant_shift = b_quant >> 8; - FragB frag_b0 = dequant(b_quant); - // If there are no groups, we can just scale the final output once and can - // avoid doing so for each weight. 
- if (group_blocks != -1) scale(frag_b0, frag_s[k % 2][j], 0); - FragB frag_b1 = dequant(b_quant_shift); - if (group_blocks != -1) scale(frag_b1, frag_s[k % 2][j], 1); - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]); - mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]); - } - } - }; - - // Since we slice across the k dimension of a tile in order to increase the - // number of warps while keeping the n dimension of a tile reasonable, we have - // multiple warps that accumulate their partial sums of the same output - // location; which we have to reduce over in the end. We do in shared memory. - auto thread_block_reduce = [&]() { - constexpr int red_off = threads / b_sh_stride / 2; - if (red_off >= 1) { - auto red_idx = threadIdx.x / b_sh_stride; - constexpr int red_sh_stride = b_sh_stride * 4 * 2; - constexpr int red_sh_delta = b_sh_stride; - int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) + - (threadIdx.x % b_sh_stride); - - // Parallel logarithmic shared memory reduction. We make sure to avoid any - // unnecessary read or write iterations, e.g., for two warps we write only - // once by warp 1 and read only once by warp 0. 
- - #pragma unroll - for (int m_block = 0; m_block < thread_m_blocks; m_block++) { - #pragma unroll - for (int i = red_off; i > 0; i /= 2) { - if (i <= red_idx && red_idx < 2 * i) { - #pragma unroll - for (int j = 0; j < 4 * 2; j++) { - int red_sh_wr = - red_sh_delta * j + (red_sh_rd - red_sh_stride * i); - if (i < red_off) { - float* c_rd = - reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); - float* c_wr = reinterpret_cast(&sh[red_sh_wr]); - #pragma unroll - for (int k = 0; k < 4; k++) - reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += - c_rd[k] + c_wr[k]; - } - sh[red_sh_wr] = - reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; - } - } - __syncthreads(); - } - if (red_idx == 0) { - #pragma unroll - for (int i = 0; i < 4 * 2; i++) { - float* c_rd = - reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); - #pragma unroll - for (int j = 0; j < 4; j++) - reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += - c_rd[j]; - } - } - __syncthreads(); - } - } - }; - - // Since multiple threadblocks may process parts of the same column slice, we - // finally have to globally reduce over the results. As the striped - // partitioning minimizes the number of such reductions and our outputs are - // usually rather small, we perform this reduction serially in L2 cache. - auto global_reduce = [&](bool first = false, bool last = false) { - // We are very careful here to reduce directly in the output buffer to - // maximize L2 cache utilization in this step. To do this, we write out - // results in FP16 (but still reduce with FP32 compute). 
- constexpr int active_threads = 32 * thread_n_blocks / 4; - if (threadIdx.x < active_threads) { - int c_gl_stride = prob_n / 8; - int c_gl_wr_delta_o = 8 * c_gl_stride; - int c_gl_wr_delta_i = 4 * (active_threads / 32); - int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + - 4 * (threadIdx.x / 32) + threadIdx.x % 4; - c_gl_wr += (2 * thread_n_blocks) * slice_col; - constexpr int c_sh_wr_delta = active_threads; - auto c_sh_wr = threadIdx.x; - - int row = (threadIdx.x % 32) / 4; - - if (!first) { - // Interestingly, doing direct global accesses here really seems to mess up - // the compiler and lead to slowdowns, hence we also use async-copies even - // though these fetches are not actually asynchronous. - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4; i++) { - cp_async4_pred( - &sh[c_sh_wr + c_sh_wr_delta * i], - &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + - c_gl_wr_delta_i * (i % 2)], - i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); - } - cp_async_fence(); - cp_async_wait<0>(); - } - - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4; i++) { - if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) { - if (!first) { - int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta]; - #pragma unroll - for (int j = 0; j < 2 * 4; j++) { - reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] += - __half2float(reinterpret_cast<__half*>(&c_red)[j]); - } - } - if (!last) { - int4 c; - #pragma unroll - for (int j = 0; j < 2 * 4; j++) { - reinterpret_cast<__half*>(&c)[j] = - __float2half(reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]); - } - C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] = - c; - } - } - } - } - }; - - // Write out the reduce final result in the correct layout. We only actually - // reshuffle matrix fragments in this step, the reduction above is performed - // in fragment layout. 
- auto write_result = [&]() { - int c_gl_stride = prob_n / 8; - constexpr int c_sh_stride = 2 * thread_n_blocks + 1; - int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); - constexpr int c_sh_rd_delta = - c_sh_stride * (threads / (2 * thread_n_blocks)); - - int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + - (threadIdx.x % (2 * thread_n_blocks)); - c_gl_wr += (2 * thread_n_blocks) * slice_col; - int c_sh_wr = - (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; - c_sh_wr += 32 * (threadIdx.x / 32); - int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + - (threadIdx.x % (2 * thread_n_blocks)); - - int c_gl_wr_end = c_gl_stride * prob_m; - - // We first reorder in shared memory to guarantee the most efficient final - // global write patterns - auto write = [&](int idx, float c0, float c1, FragS& s) { - half2 res = __halves2half2(__float2half(c0), __float2half(c1)); - if (group_blocks == - -1) // for per-column quantization we finally apply the scale here - res = __hmul2(res, s[0]); - ((half2*)sh)[idx] = res; - }; - if (threadIdx.x / 32 < thread_n_blocks / 4) { - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - #pragma unroll - for (int j = 0; j < 4; j++) { - int wr = c_sh_wr + 8 * j; - write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], - frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); - write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], - frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]); - write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], - frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]); - write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], - frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]); - } - c_sh_wr += 16 * (4 * c_sh_stride); - } - } - __syncthreads(); - - #pragma unroll - for (int i = 0; - i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); - i++) { - if (c_gl_wr < c_gl_wr_end) { - C[c_gl_wr] = sh[c_sh_rd]; - c_gl_wr 
+= c_gl_wr_delta; - c_sh_rd += c_sh_rd_delta; - } - } - }; - - // Start global fetch and register load pipelines. - auto start_pipes = [&]() { - #pragma unroll - for (int i = 0; i < stages - 1; i++) fetch_to_shared(i, i, i < slice_iters); - zero_accums(); - wait_for_stage(); - fetch_to_registers(0, 0); - a_gl_rd += a_gl_rd_delta_o * (stages - 1); - }; - start_pipes(); - - // Main loop. - while (slice_iters) { - // We unroll over both the global fetch and the register load pipeline to - // ensure all shared memory accesses are static. Note that both pipelines have - // even length meaning that the next iteration will always start at index 0. - #pragma unroll - for (int pipe = 0; pipe < stages;) { - #pragma unroll - for (int k = 0; k < b_sh_wr_iters; k++) { - fetch_to_registers(k + 1, pipe % stages); - if (k == b_sh_wr_iters - 2) { - fetch_to_shared((pipe + stages - 1) % stages, pipe, - slice_iters >= stages); - pipe++; - wait_for_stage(); - } - matmul(k); - } - slice_iters--; - if (slice_iters == 0) break; - } - a_gl_rd += a_gl_rd_delta_o * stages; - - // Process results and, if necessary, proceed to the next column slice. - // While this pattern may not be the most readable, other ways of writing - // the loop seemed to noticeably worse performance after compilation. 
- if (slice_iters == 0) { - cp_async_wait<0>(); - bool last = slice_idx == slice_count - 1; - // For per-column scales, we only fetch them here in the final step before - // write-out - if (group_blocks == -1 && last) { - if (s_sh_wr_pred) cp_async4(&sh_s[s_sh_wr], &s[s_gl_rd]); - cp_async_fence(); - } - thread_block_reduce(); - if (group_blocks == -1 && last) { - cp_async_wait<0>(); - __syncthreads(); - if (threadIdx.x / 32 < thread_n_blocks / 4) { - reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; - reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; - } - } - if (slice_count > 1) { // only globally reduce if there is more than one - // block in a slice - barrier_acquire(&locks[slice_col], slice_idx); - global_reduce(slice_idx == 0, last); - barrier_release(&locks[slice_col], last); - } - if (last) // only the last block in a slice actually writes the result - write_result(); - slice_row = 0; - slice_col_par++; - slice_col++; - init_slice(); - if (slice_iters) { - a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; - if (slice_col == 0) { - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; - } - s_gl_rd = s_sh_stride * slice_col + threadIdx.x; - start_pipes(); - } - } - } -} - -#else - -template shared - // fetch pipeline - const int group_blocks = -1 // number of consecutive 16x16 blocks - // with a separate quantization scale - > -__global__ void Marlin( - const int4* __restrict__ A, // fp16 input matrix of shape mxk - const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn - int4* __restrict__ C, // fp16 output buffer of shape mxn - const int4* __restrict__ s, // fp16 quantization scales of shape - // (k/groupsize)xn - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage 
for barrier synchronization -) { - // Marlin is not implemented yet for SM < 8.0 - assert(false); - return; -} - -#endif - -// 8 warps are a good choice since every SM has 4 schedulers and having more -// than 1 warp per schedule allows some more latency hiding. At the same time, -// we want relatively few warps to have many registers per warp and small tiles. -const int USER_THREADS = - 256; // Note: This is only used with user-provided thread_k/n -const int STAGES = 4; // 4 pipeline stages fit into shared memory -const int SHARED_MEM = - 96 * 1024; // max shared memory on compute capability 8.6 (< 8.0) - -static constexpr int min_thread_n = 64; -static constexpr int min_thread_k = 64; - -static constexpr int tile_size = 16; -static constexpr int max_par = 16; - -static constexpr int pack_factor_4bit = - 8; // We have 8 4-bit vals inside a 32 bit - -#define __CALL_IF(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ - GROUP_BLOCKS, NUM_THREADS) \ - else if (thread_m_blocks == THREAD_M_BLOCKS && \ - thread_n_blocks == THREAD_N_BLOCKS && \ - thread_k_blocks == THREAD_K_BLOCKS && \ - group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) { \ - cudaFuncSetAttribute(Marlin, \ - cudaFuncAttributeMaxDynamicSharedMemorySize, \ - SHARED_MEM); \ - Marlin<<>>( \ - A_ptr, B_ptr, C_ptr, s_ptr, prob_m, prob_n, prob_k, locks); \ - } - -typedef struct { - int thread_k; - int thread_n; - int num_threads; -} thread_config_t; - -thread_config_t small_batch_thread_configs[] = { - // Ordered by priority - - // thread_k, thread_n, num_threads - {128, 128, 256}, // Default - {128, 64, 128}, // Reduce N 2X, same K - {64, 256, 256}, // Reduce K 2X, increase N 2X - {64, 128, 128}, // Reduce K 2X, same N -}; - -thread_config_t large_batch_thread_configs[] = { - // Ordered by priority - - // thread_k, thread_n, num_threads - {64, 256, 256}, // Default - {128, 128, 256}, // Reduce N 2X, increase K 2X - {64, 128, 128}, // Reduce N 2X, same K - {128, 64, 128}, // Reduce N 4X, increase K 
2X -}; - -bool is_valid_config(thread_config_t const& th_config, int prob_m, int prob_n, - int prob_k) { - // Sanity - if (th_config.thread_k == -1 || th_config.thread_n == -1 || - th_config.num_threads == -1) { - return false; - } - - // Verify K/N are divisible by thread K/N - if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) { - return false; - } - - // thread_k can be only 128 or 64 (because it must be less than groupsize - // which is 128) - if (th_config.thread_k != 128 && th_config.thread_k != 64) { - return false; - } - - // Verify min for thread K/N - if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) { - return false; - } - - // num_threads must be at least 128 (= 4 warps) - if (th_config.num_threads < 128) { - return false; - } - - return true; -} - -thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) { - if (prob_m <= 16) { - for (auto th_config : small_batch_thread_configs) { - if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { - return th_config; - } - } - - } else { - for (auto th_config : large_batch_thread_configs) { - if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { - return th_config; - } - } - } - - return thread_config_t{-1, -1, -1}; -} - -#define CALL_IF(N_BLOCKS, K_BLOCKS, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(2, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(2, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(3, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(3, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(4, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(4, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) - -void marlin_cuda(const void* A, const void* B, void* C, void* s, int prob_m, - int prob_n, int prob_k, void* workspace, int groupsize = -1, - 
int dev = 0, cudaStream_t stream = 0, int thread_k = -1, - int thread_n = -1, int sms = -1, int max_par = 16) { - int tot_m = prob_m; - int tot_m_blocks = ceildiv(tot_m, 16); - int pad = 16 * tot_m_blocks - tot_m; - - if (sms == -1) - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev); - - // Set thread config - thread_config_t th_config; - if (thread_k != -1 && thread_n != -1) { - // User-defined config - th_config = thread_config_t{thread_k, thread_n, USER_THREADS}; - } else { - // Auto config - th_config = determine_thread_config(prob_m, prob_n, prob_k); - } - - if (!is_valid_config(th_config, prob_m, prob_n, prob_k)) { - throw std::runtime_error( - "Invalid thread config: thread_k = " + str(th_config.thread_k) + - ", thread_n = " + str(th_config.thread_n) + - ", num_threads = " + str(th_config.num_threads) + " for MKN = [" + - str(prob_m) + ", " + str(prob_k) + ", " + str(prob_n) + "]"); - } - - // Uncomment for debug - // std::cout << "Using thread_config: thread_k = " + str(th_config.thread_k) + - // ", thread_n = " + str(th_config.thread_n) + - // ", num_threads = " + str(th_config.num_threads) + " for - // MKN = [" + str(prob_m) + - // ", " + str(prob_k) + ", " + str(prob_n) + "]\n"; - - int num_threads = th_config.num_threads; - thread_k = th_config.thread_k; - thread_n = th_config.thread_n; - - int thread_k_blocks = thread_k / 16; - int thread_n_blocks = thread_n / 16; - int group_blocks = (groupsize == -1) ? 
-1 : groupsize / 16; - int blocks = sms; - - if (prob_m == 0 || prob_n == 0 || prob_k == 0) { - return; - } - - TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, - " is not divisible by thread_n = ", thread_n); - TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, - " is not divisible by thread_k = ", thread_k); - if (group_blocks != -1) { - TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, - " is not divisible by group_blocks = ", group_blocks); - } - - const int4* A_ptr = (const int4*)A; - const int4* B_ptr = (const int4*)B; - int4* C_ptr = (int4*)C; - const int4* s_ptr = (const int4*)s; - - int* locks = (int*)workspace; - - for (int i = 0; i < tot_m_blocks; i += 4) { - int thread_m_blocks = tot_m_blocks - i; - prob_m = tot_m - 16 * i; - int par = 1; - if (thread_m_blocks > 4) { - // Note that parallel > 1 currently only works for inputs without any - // padding - par = (16 * thread_m_blocks - pad) / 64; - if (par > max_par) par = max_par; - prob_m = 64 * par; - i += 4 * (par - 1); - thread_m_blocks = 4; - } - - // For compilation speed, we only define the kernel configurations that have - // seemed useful (in terms of performance) in our testing, however many more - // are, in principle, possible. 
- if (false) { - } - CALL_IF(8, 8, 256) - CALL_IF(16, 4, 256) - CALL_IF(8, 4, 128) - CALL_IF(4, 8, 128) - else { - throw std::runtime_error("Unsupported shapes: MKN = [" + str(prob_m) + - ", " + str(prob_k) + ", " + str(prob_n) + "]" + - ", groupsize = " + str(groupsize) + - ", thread_m_blocks = " + str(thread_m_blocks) + - ", thread_n_blocks = " + str(thread_n_blocks) + - ", thread_k_blocks = " + str(thread_k_blocks)); - } - - A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par; - C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par; - } -} - -} // namespace marlin_dense - -torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, - torch::Tensor& b_scales, torch::Tensor& workspace, - int64_t size_m, int64_t size_n, int64_t size_k) { - // Verify M - TORCH_CHECK(size_m == a.size(0), - "Shape mismatch: a.size(0) = " + str(a.size(0)) + - ", size_m = " + str(size_m)); - - // Verify K - TORCH_CHECK(size_k == a.size(1), - "Shape mismatch: a.size(1) = " + str(a.size(1)) + - ", size_k = " + str(size_k)); - TORCH_CHECK(size_k % marlin_dense::tile_size == 0, - "size_k = " + str(size_k) + " is not divisible by tile_size = " + - str(marlin_dense::tile_size)); - TORCH_CHECK((size_k / marlin_dense::tile_size) == b_q_weight.size(0), - "Shape mismatch: b_q_weight.size(0) = " + - str(b_q_weight.size(0)) + ", size_k = " + str(size_k) + - ", tile_size = " + str(marlin_dense::tile_size)); - - // Verify N - TORCH_CHECK(b_scales.size(1) == size_n, - "b_scales.size(1) = " + str(b_scales.size(1)) + - ", size_n = " + str(size_n)); - TORCH_CHECK( - b_q_weight.size(1) % marlin_dense::tile_size == 0, - "b_q_weight.size(1) = " + str(b_q_weight.size(1)) + - " is not divisible by tile_size = " + str(marlin_dense::tile_size)); - - int actual_size_n = (b_q_weight.size(1) / marlin_dense::tile_size) * - marlin_dense::pack_factor_4bit; - TORCH_CHECK( - size_n == actual_size_n, - "size_n = " + str(size_n) + ", actual_size_n = " + str(actual_size_n)); - - // Verify A device and strides - 
TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); - TORCH_CHECK(a.is_contiguous(), "A is not contiguous"); - - // Verify B device and strides - TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); - TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous"); - - // Verify scales device and strides - TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); - TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); - - // Alloc C matrix - const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); - auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device()); - torch::Tensor c = torch::empty({size_m, size_n}, options); - - // thread_k: `k` size of a thread_tile in `weights` (can usually be left as - // auto -1) - int thread_k = -1; - // thread_n: `n` size of a thread_tile in `weights` (can usually be left as - // auto -1) - int thread_n = -1; - // sms: number of SMs to use for the kernel (can usually be left as auto -1) - int sms = -1; - - // Detect groupsize - if (b_scales.size(0) != 1) { - TORCH_CHECK(size_k % b_scales.size(0) == 0, - "size_k = " + str(size_k) + - ", is not divisible by b_scales.size(0) = " + - str(b_scales.size(0))); - } - int groupsize = b_scales.size(0) == 1 ? 
-1 : size_k / b_scales.size(0); - - // Verify groupsize - TORCH_CHECK(groupsize == -1 || groupsize == 128, - "Unexpected groupsize = " + str(groupsize)); - - // Verify workspace size - TORCH_CHECK(size_n % marlin_dense::min_thread_n == 0, - "size_n = " + str(size_n) + - ", is not divisible by min_thread_n = " + - str(marlin_dense::min_thread_n)); - int min_workspace_size = - (size_n / marlin_dense::min_thread_n) * marlin_dense::max_par; - TORCH_CHECK(workspace.numel() >= min_workspace_size, - "workspace.numel = " + str(workspace.numel()) + - " is below min_workspace_size = " + str(min_workspace_size)); - - int dev = a.get_device(); - marlin_dense::marlin_cuda(a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), - b_scales.data_ptr(), size_m, size_n, size_k, - workspace.data_ptr(), groupsize, dev, - at::cuda::getCurrentCUDAStream(dev), thread_k, - thread_n, sms, marlin_dense::max_par); - - return c; -} - -TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { - m.impl("marlin_gemm", &marlin_gemm); -} diff --git a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu deleted file mode 100644 index c96d68d9b29aa..0000000000000 --- a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu +++ /dev/null @@ -1,1248 +0,0 @@ -/* - * Adapted from - * https://github.com/IST-DASLab/marlin/blob/master/marlin/marlin_cuda_kernel.cu - * https://github.com/IST-DASLab/marlin/blob/master/marlin/marlin_cuda.cpp - * Modified by HandH1998 - * Copyright (C) 2024 HandH1998 - * Copyright (C) Marlin.2024 Elias Frantar - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include - -#include - -#include "../dense/common/base.h" -#include "core/registration.h" - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - #include "../dense/common/mem.h" -#endif - -template -inline std::string str(T x) { - return std::to_string(x); -} - -namespace { - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - -using I4 = Vec; -// Matrix fragments for tensor core instructions; their precise layout is -// documented here: -// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-integer-type -using FragA = Vec; -using FragB = Vec; -using FragC = Vec; -using FragS_GROUP = Vec; // weight per-group quantization scales -using FragS_CHANNEL = - Vec; // weight per-channel quantization scales or activaton - // per-token quantization scales - -// NOTE(HandH1998): cp.async.cg only support BYTES = 16, however, -// cp.async.ca can support BYTES = 4, 8, 16; -// as s_tok's shape is equal to prob_m, we need set s_tok to float type, -// and cp_size = 1 float, i.e., 4 BYTES -// Asynchronous global->shared copy for activation quantizaton scales s_tok -__device__ inline void cp_async1(void* smem_ptr, const void* glob_ptr) { - const int BYTES = 4; - uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile( - "{\n" - " cp.async.ca.shared.global [%0], [%1], %2;\n" - "}\n" ::"r"(smem), - "l"(glob_ptr), "n"(BYTES)); -} - -// m16n8k16 tensor core mma instruction with int8 inputs and int32 -// output/accumulation. 
-__device__ inline void mma(const FragA& a_frag, const FragB& frag_b, - FragC& frag_c) { - const uint32_t* a = reinterpret_cast(&a_frag); - const uint32_t* b = reinterpret_cast(&frag_b); - int* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.satfinite.s32.s8.s8.s32 " - "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" - : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3]) - : "r"(a[0]), "r"(a[1]), "r"(b[0]), "r"(c[0]), "r"(c[1]), "r"(c[2]), - "r"(c[3])); -} - -// Instruction for loading a full 16x16 matrix fragment of operand A from shared -// memory, directly in int8 tensor core layout. -__device__ inline void ldsm4(FragA& frag_a, const void* smem_ptr) { - uint32_t* a = reinterpret_cast(&frag_a); - uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); - asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];\n" - : "=r"(a[0]), "=r"(a[1]) - : "r"(smem)); -} - -inline __device__ half2 float2_to_half2(float2 f) { - uint32_t res; - // NOTE(HandH1998): h0,h1 should be uint16_t, not half - uint16_t h0, h1; - asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(h0) : "f"(f.x)); - asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(h1) : "f"(f.y)); - asm volatile("mov.b32 %0, {%1, %2};\n" : "=r"(res) : "h"(h0), "h"(h1)); - return reinterpret_cast(res); -} - -inline __device__ float int32_to_float(int h) { - float res; - asm volatile("cvt.rn.f32.s32 %0, %1;\n" : "=f"(res) : "r"(h)); - return res; -} - -// Lookup-table based 3-input logical operation; explicitly used for -// dequantization as the compiler does not seem to automatically recognize it in -// all cases. -template -__device__ inline int lop3(int a, int b, int c) { - int res; - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(res) - : "r"(a), "r"(b), "r"(c), "n"(lut)); - return res; -} - -// Efficiently dequantize an int32 value into a full B-fragment of 4 int8 values -// for weight per channel dequant. 
-__device__ inline FragB dequant_per_channel(int q) { - static constexpr int MASK = 0xf0f0f0f0; - FragB frag_b; - frag_b[0] = (q & MASK); - return frag_b; -} - -// Efficiently dequantize an int32 value into a full B-fragment of 4 int8 values -// for weight per group dequant. -__device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) { - static constexpr uint32_t LO = 0x000f000f; - static constexpr uint32_t HI = 0x00f000f0; - static constexpr uint32_t EX = 0x64006400; - // Guarantee that the `(a & b) | c` operations are LOP3s. - uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); - // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point - // directly into `SUB` and `ADD`. - static constexpr uint32_t SUB = 0x64086408; - static constexpr uint32_t MUL = 0x2c002c00; - static constexpr uint32_t ADD = 0xd480d480; - *reinterpret_cast(&t0) = __hsub2( - *reinterpret_cast(&t0), *reinterpret_cast(&SUB)); - *reinterpret_cast(&t1) = __hfma2( - *reinterpret_cast(&t1), *reinterpret_cast(&MUL), - *reinterpret_cast(&ADD)); - - uint16_t s = reinterpret_cast(&frag_s)[i]; - uint32_t double_s; - // pack 2xfp16 to half2 - asm volatile("mov.b32 %0, {%1, %2};\n" : "=r"(double_s) : "h"(s), "h"(s)); - // dequant and convert 4 half to 4 uint8 (be placed at the low 8 bits of 4 - // half, respectively) - static constexpr uint32_t MAGIC_NUM = 0x64806480; - *reinterpret_cast(&t0) = __hfma2( - *reinterpret_cast(&t0), *reinterpret_cast(&double_s), - *reinterpret_cast(&MAGIC_NUM)); - *reinterpret_cast(&t1) = __hfma2( - *reinterpret_cast(&t1), *reinterpret_cast(&double_s), - *reinterpret_cast(&MAGIC_NUM)); - // take out the 4 uint8 from 4 half, then convert them to 4 int8 and pack 4 - // int8 into 1 uint32 - FragB frag_b; - uint32_t uint8s; - static constexpr uint32_t MASK_0246 = 0x6420; - static constexpr uint32_t UINT8s_TO_INT8s_MASK = 0x80808080; - asm volatile("prmt.b32 %0,%1,%2,%3;\n" - : "=r"(uint8s) 
- : "r"(t0), "r"(t1), "n"(MASK_0246)); - frag_b[0] = (uint8s ^ UINT8s_TO_INT8s_MASK); - return frag_b; -} - -template shared - // fetch pipeline - const int group_blocks = -1 // number of consecutive 16x16 blocks - // with a separate quantization scale - > -__global__ void Marlin( - const int4* __restrict__ A, // int8 input matrix of shape mxk - const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn - int4* __restrict__ C, // int32 global_reduce buffer of shape - // (max_par*16*4)xn, as int8 tensor core's output is - // int32 dtype - int4* __restrict__ D, // fp16 output buffer of shape mxn - const float* __restrict__ s_tok, // fp32 activation per-token quantization - // scales of shape mx1 - const int4* __restrict__ s_ch, // fp32 weight per-channel quantization - // scales of shape 1xn - const int4* __restrict__ s_group, // fp16 weight per-group quantization - // scales of shape (k/groupsize)xn, when - // group_blocks=-1, it should be nullptr - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization -) { - // Each threadblock processes one "stripe" of the B matrix with (roughly) the - // same size, which might involve multiple column "slices" (of width 16 * - // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM - // example: - // 0 1 3 - // 0 2 3 - // 1 2 4 - // While this kind of partitioning makes things somewhat more complicated, it - // ensures good utilization of all SMs for many kinds of shape and GPU - // configurations, while requiring as few slow global cross-threadblock - // reductions as possible. 
- - // For larger GEMMs we run multiple batchsize 64 versions in parallel for a - // better partitioning with less reductions - int parallel = 1; - if (prob_m > 16 * thread_m_blocks) { - parallel = prob_m / (16 * thread_m_blocks); - prob_m = 16 * thread_m_blocks; - } - - int k_tiles = prob_k / 16 / thread_k_blocks; - int n_tiles = prob_n / 16 / thread_n_blocks; - int iters = ceildiv(k_tiles * n_tiles * parallel, gridDim.x); - // Ensure that the number of tiles in each stripe is a multiple of the - // groupsize; this avoids an annoying special case where a stripe starts in - // the middle of group. - if constexpr (group_blocks != -1) - iters = (group_blocks / thread_k_blocks) * - ceildiv(iters, (group_blocks / thread_k_blocks)); - - int slice_row = (iters * blockIdx.x) % k_tiles; - int slice_col_par = (iters * blockIdx.x) / k_tiles; - int slice_col = slice_col_par; - int slice_iters; // number of threadblock tiles in the current slice - int slice_count = - 0; // total number of active threadblocks in the current slice - int slice_idx; // index of threadblock in current slice; numbered bottom to - // top - - // We can easily implement parallel problem execution by just remapping - // indices and advancing global pointers - if (slice_col_par >= n_tiles) { - A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 16; - C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 4; - D += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8; - s_tok += (slice_col_par / n_tiles) * 16 * thread_m_blocks; - locks += (slice_col_par / n_tiles) * n_tiles; - slice_col = slice_col_par % n_tiles; - } - - // Compute all information about the current slice which is required for - // synchronization. 
- auto init_slice = [&]() { - slice_iters = - iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); - if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; - if (slice_iters == 0) return; - if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; - slice_count = 1; - slice_idx = 0; - int col_first = iters * ceildiv(k_tiles * slice_col_par, iters); - if (col_first <= k_tiles * (slice_col_par + 1)) { - int col_off = col_first - k_tiles * slice_col_par; - slice_count = ceildiv(k_tiles - col_off, iters); - if (col_off > 0) slice_count++; - int delta_first = iters * blockIdx.x - col_first; - if (delta_first < 0 || (col_off == 0 && delta_first == 0)) - slice_idx = slice_count - 1; - else { - slice_idx = slice_count - 1 - delta_first / iters; - if (col_off > 0) slice_idx--; - } - } - if (slice_col == n_tiles) { - A += 16 * thread_m_blocks * prob_k / 16; - C += 16 * thread_m_blocks * prob_n / 4; - D += 16 * thread_m_blocks * prob_n / 8; - s_tok += 16 * thread_m_blocks; - locks += n_tiles; - slice_col = 0; - } - }; - init_slice(); - - int a_gl_stride = prob_k / 16; // stride of the A matrix in global memory - // We typically use `constexpr` to indicate that this value is a compile-time - // constant - constexpr int a_sh_stride = - 16 * thread_k_blocks / 16; // stride of an A matrix tile in shared memory - constexpr int a_gl_rd_delta_o = - 16 * thread_k_blocks / - 16; // delta between subsequent A tiles in global memory - int a_gl_rd_delta_i = - a_gl_stride * - (threads / a_gl_rd_delta_o); // between subsequent accesses within a tile - constexpr int a_sh_wr_delta = - a_sh_stride * - (threads / a_gl_rd_delta_o); // between shared memory writes - constexpr int a_sh_rd_delta_o = - 1 * ((threads / 32) / - (thread_n_blocks / 4)); // between shared memory tile reads - constexpr int a_sh_rd_delta_i = - a_sh_stride * 16; // within a shared memory tile - constexpr int a_sh_stage = - a_sh_stride * (16 * thread_m_blocks); // overall 
size of a tile - constexpr int a_sh_wr_iters = - ceildiv(a_sh_stage, - a_sh_wr_delta); // number of shared write iterations for a tile - - int b_gl_stride = 16 * prob_n / 32; - constexpr int b_sh_stride = 32 * thread_n_blocks / 4; - int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; - int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride); - constexpr int b_sh_wr_delta = threads; - constexpr int b_sh_rd_delta = threads; - constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; - constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; - - constexpr int s_tok_sh_stride = 16 * thread_m_blocks; - - constexpr int s_ch_sh_stride = 16 * thread_n_blocks / 4; - - int s_group_gl_stride = prob_n / 8; - constexpr int s_group_sh_stride = 16 * thread_n_blocks / 8; - constexpr int s_group_sh_stage = s_group_sh_stride; - int s_group_gl_rd_delta = s_group_gl_stride; - - // Global A read index of current thread. - int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - a_gl_rd += a_gl_rd_delta_o * slice_row; - // Shared write index of current thread. - int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - // Shared read index. - // NOTE(HandH1998): int8 input a only need 16 threads to load 16x16 matrix - int a_sh_rd = a_sh_stride * ((threadIdx.x % 32) % 16); - a_sh_rd += 1 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); - - int b_gl_rd = - b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride); - b_gl_rd += b_sh_stride * slice_col; - b_gl_rd += b_gl_rd_delta_o * slice_row; - auto b_sh_wr = threadIdx.x; - auto b_sh_rd = threadIdx.x; - - auto s_tok_gl_rd = threadIdx.x; - // NOTE(HandH1998): activation scale s_tok need shuffle to [0, 8, 1, 9, 2, 10, - // 3, 11, 4, 12, 5, 13, 6, 14, 7, 15] for example, 0, 8 row scales serve for - // thread 0, 1, 2, 3. 
For more details, refer to mma operand A layout as - // s_tok's size is not fixed, we can not shuffle before inference we shuffle - // it when fetching s_tok from global memory to shared memory, that's why - // s_tok_sh_wr is like this - int s_tok_sh_wr = - (threadIdx.x / 16) * 16 + (threadIdx.x % 8) * 2 + (threadIdx.x % 16) / 8; - int s_tok_sh_rd = (threadIdx.x % 32) / 4; - bool s_tok_sh_wr_pred = threadIdx.x < prob_m; - - auto s_ch_gl_rd = s_ch_sh_stride * slice_col + threadIdx.x; - auto s_ch_sh_wr = threadIdx.x; - int s_ch_sh_rd = 16 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - 2 * ((threadIdx.x % 32) % 4); - bool s_ch_sh_wr_pred = threadIdx.x < s_ch_sh_stride; - - int s_group_gl_rd, s_group_sh_wr, s_group_sh_rd; - bool s_group_sh_wr_pred; - if constexpr (group_blocks != -1) { - s_group_gl_rd = - s_group_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + - s_group_sh_stride * slice_col + threadIdx.x; - s_group_sh_wr = threadIdx.x; - // NOTE(HandH1998): s_group_sh_rd is related to mma output C - s_group_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - (threadIdx.x % 32) / 4; - s_group_sh_wr_pred = threadIdx.x < s_group_sh_stride; - } - - // Precompute which thread should not read memory in which iterations; this is - // needed if there are more threads than required for a certain tilesize or - // when the batchsize is not a multiple of 16. - bool a_sh_wr_pred[a_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) - a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; - - // To ensure that writing and reading A tiles to/from shared memory, the - // latter in fragment format, is fully bank conflict free, we need to use a - // rather fancy XOR-based layout. The key here is that neither reads nor - // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the - // same shared memory banks. 
Further, it seems (based on NSight-Compute) that - // each warp must also write a consecutive memory segment? - auto transform_a = [&](int i) { - int row = i / a_gl_rd_delta_o; - return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row; - }; - // Since the computation of this remapping is non-trivial and, due to our main - // loop unrolls, all shared memory accesses are static, we simply precompute - // both transformed reads and writes. - int a_sh_wr_trans[a_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) - a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); - int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) { - #pragma unroll - for (int j = 0; j < thread_m_blocks; j++) - a_sh_rd_trans[i][j] = - transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); - } - - // Since B-accesses have non-constant stride they have to be computed at - // runtime; we break dependencies between subsequent accesses with a tile by - // maintining multiple pointers (we have enough registers), a tiny - // optimization. - const int4* B_ptr[b_sh_wr_iters]; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; - - extern __shared__ int4 sh[]; - // Shared memory storage for global fetch pipelines. - // NOTE(HandH1998): stages need >= 4, otherwise, sh_s_tok = sh + max(stages * - // a_sh_stage + stages * b_sh_stage, 4 * stages * a_sh_stage) - int4* sh_a = sh; - int4* sh_b = sh_a + (stages * a_sh_stage); - int4* sh_s_tok = sh_b + (stages * b_sh_stage); - int4* sh_s_ch = sh_s_tok + s_tok_sh_stride; - int4* sh_s_group = sh_s_ch + s_ch_sh_stride; - - // Register storage for double buffer of shared memory reads. - FragA frag_a[2][thread_m_blocks]; - I4 frag_b_quant[2]; - FragC frag_c[thread_m_blocks][4][2]; - FragS_GROUP frag_s_group[2][4]; - FragS_CHANNEL frag_s_tok[thread_m_blocks]; - FragS_CHANNEL frag_s_ch[2][4]; - - // Zero accumulators. 
- auto zero_accums = [&]() { - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) - reinterpret_cast(frag_c)[i] = 0; - }; - - // Asynchronously fetch the next A, B and s tile from global to the next - // shared memory pipeline location. - auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { - if (pred) { - int4* sh_a_stage = sh_a + a_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < a_sh_wr_iters; i++) { - cp_async4_pred( - &sh_a_stage[a_sh_wr_trans[i]], - &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off], - a_sh_wr_pred[i]); - } - int4* sh_b_stage = sh_b + b_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) { - cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]); - B_ptr[i] += b_gl_rd_delta_o; - } - // Only fetch scales if this tile starts a new group - if constexpr (group_blocks != -1) { - if (pipe % (group_blocks / thread_k_blocks) == 0) { - int4* sh_s_group_stage = sh_s_group + s_group_sh_stage * pipe; - if (s_group_sh_wr_pred) - cp_async4(&sh_s_group_stage[s_group_sh_wr], - &s_group[s_group_gl_rd]); - s_group_gl_rd += s_group_gl_rd_delta; - } - } - } - // Insert a fence even when we are winding down the pipeline to ensure that - // waiting is also correct at this point. - cp_async_fence(); - }; - - // Wait until the next thread tile has been loaded to shared memory. - auto wait_for_stage = [&]() { - // We only have `stages - 2` active fetches since we are double buffering - // and can only issue the next fetch when it is guaranteed that the previous - // shared memory load is fully complete (as it may otherwise be - // overwritten). - cp_async_wait(); - __syncthreads(); - }; - - // Load the next sub-tile from the current location in the shared memory pipe - // into the current register buffer. 
- auto fetch_to_registers = [&](int k, int pipe) { - // It may seem inefficient that we reload the groups for every sub-tile; - // however, this does not seem to be a significant bottleneck, while some - // theoretically better attempts have lead to bad instruction ordering by - // the compiler and correspondingly a noticeable drop in performance. - if constexpr (group_blocks != -1) { - int4* sh_s_group_stage = - sh_s_group + - s_group_sh_stage * ((group_blocks / thread_k_blocks) * - (pipe / (group_blocks / thread_k_blocks))); - reinterpret_cast(&frag_s_group[k % 2])[0] = - sh_s_group_stage[s_group_sh_rd]; - } - int4* sh_a_stage = sh_a + a_sh_stage * pipe; - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) - ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); - int4* sh_b_stage = sh_b + b_sh_stage * pipe; - frag_b_quant[k % 2] = *reinterpret_cast( - &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]); - }; - - // Execute the actual tensor core matmul of a sub-tile. - auto matmul = [&](int k) { - // We have the m dimension as the inner loop in order to encourage overlapping - // dequantization and matmul operations. - #pragma unroll - for (int j = 0; j < 4; j++) { - int b_quant = frag_b_quant[k % 2][j]; - // int b_quant_shift = b_quant << 4; - FragB frag_b0, frag_b1; - // If there are no groups, we can just scale the final output once and can - // avoid doing so for each weight. 
- if constexpr (group_blocks != -1) { - int b_quant_shift = b_quant >> 8; - frag_b0 = dequant_per_group(b_quant, frag_s_group[k % 2][j], 0); - frag_b1 = dequant_per_group(b_quant_shift, frag_s_group[k % 2][j], 1); - } else { - int b_quant_shift = b_quant << 4; - frag_b0 = dequant_per_channel(b_quant); - frag_b1 = dequant_per_channel(b_quant_shift); - } - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]); - mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]); - } - } - }; - - // Since we slice across the k dimension of a tile in order to increase the - // number of warps while keeping the n dimension of a tile reasonable, we have - // multiple warps that accumulate their partial sums of the same output - // location; which we have to reduce over in the end. We do in shared memory. - auto thread_block_reduce = [&]() { - constexpr int red_off = threads / b_sh_stride / 2; - if (red_off >= 1) { - auto red_idx = threadIdx.x / b_sh_stride; - constexpr int red_sh_stride = b_sh_stride * 4 * 2; - constexpr int red_sh_delta = b_sh_stride; - int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) + - (threadIdx.x % b_sh_stride); - - // Parallel logarithmic shared memory reduction. We make sure to avoid any - // unnecessary read or write iterations, e.g., for two warps we write only - // once by warp 1 and read only once by warp 0. 
- - #pragma unroll - for (int m_block = 0; m_block < thread_m_blocks; m_block++) { - #pragma unroll - for (int i = red_off; i > 0; i /= 2) { - if (i <= red_idx && red_idx < 2 * i) { - #pragma unroll - for (int j = 0; j < 4 * 2; j++) { - int red_sh_wr = - red_sh_delta * j + (red_sh_rd - red_sh_stride * i); - if (i < red_off) { - int* c_rd = - reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); - int* c_wr = reinterpret_cast(&sh[red_sh_wr]); - #pragma unroll - for (int k = 0; k < 4; k++) - reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += - c_rd[k] + c_wr[k]; - } - sh[red_sh_wr] = - reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; - } - } - __syncthreads(); - } - if (red_idx == 0) { - #pragma unroll - for (int i = 0; i < 4 * 2; i++) { - int* c_rd = - reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); - #pragma unroll - for (int j = 0; j < 4; j++) - reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += - c_rd[j]; - } - } - __syncthreads(); - } - } - }; - - // Since multiple threadblocks may process parts of the same column slice, we - // finally have to globally reduce over the results. As the striped - // partitioning minimizes the number of such reductions and our outputs are - // usually rather small, we perform this reduction serially in L2 cache. - // global_reduce works on INT32 elements, which are the results of INT8 GEMM. - // This is why we need another INT32 maxtrix `C` to reduce instead of the - // original half matrix `D`. - auto global_reduce = [&](bool first = false, bool last = false) { - // We are very careful here to reduce directly in the output buffer to - // maximize L2 cache utilization in this step. To do this, we write out - // results in FP16 (but still reduce with FP32 compute). 
- constexpr int active_threads = 32 * thread_n_blocks / 4; - if (threadIdx.x < active_threads) { - int c_gl_stride = prob_n / 4; - int c_gl_wr_delta_o = 8 * c_gl_stride; - int c_gl_wr_delta_i = 8 * (active_threads / 32); - int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + - 8 * (threadIdx.x / 32) + (threadIdx.x % 4) * 2; - c_gl_wr += (4 * thread_n_blocks) * slice_col; - constexpr int c_sh_wr_delta = active_threads * 2; - auto c_sh_wr = 2 * threadIdx.x; - - int row = (threadIdx.x % 32) / 4; - - if (!first) { - // Interestingly, doing direct global accesses here really seems to mess up - // the compiler and lead to slowdowns, hence we also use async-copies even - // though these fetches are not actually asynchronous. - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4; i++) { - cp_async4_pred( - &sh[c_sh_wr + c_sh_wr_delta * i], - &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + - c_gl_wr_delta_i * (i % 2)], - i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); - cp_async4_pred( - &sh[c_sh_wr + c_sh_wr_delta * i + 1], - &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + - c_gl_wr_delta_i * (i % 2) + 1], - i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); - } - cp_async_fence(); - cp_async_wait<0>(); - } - - #pragma unroll - for (int i = 0; i < thread_m_blocks * 4; i++) { - if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) { - if (!first) { - int4 d_red1 = sh[c_sh_wr + i * c_sh_wr_delta]; - int4 d_red2 = sh[c_sh_wr + i * c_sh_wr_delta + 1]; - #pragma unroll - for (int j = 0; j < 4; j++) { - reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] += - reinterpret_cast(&d_red1)[j]; - } - #pragma unroll - for (int j = 0; j < 4; j++) { - reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * (j + 4) + (i % 4)] += - reinterpret_cast(&d_red2)[j]; - } - } - if (!last) { - int4 d1, d2; - #pragma unroll - for (int j = 0; j < 4; j++) { - reinterpret_cast(&d1)[j] = reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]; - } 
- #pragma unroll - for (int j = 0; j < 4; j++) { - reinterpret_cast(&d2)[j] = reinterpret_cast( - &frag_c)[4 * 2 * 4 * (i / 4) + 4 * (j + 4) + (i % 4)]; - } - C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] = - d1; - C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2) + - 1] = d2; - } - } - } - } - }; - - // Write out the reduce final result in the correct layout. We only actually - // reshuffle matrix fragments in this step, the reduction above is performed - // in fragment layout. - auto write_result = [&]() { - int d_gl_stride = prob_n / 8; - constexpr int d_sh_stride = 2 * thread_n_blocks + 1; - int d_gl_wr_delta = d_gl_stride * (threads / (2 * thread_n_blocks)); - constexpr int d_sh_rd_delta = - d_sh_stride * (threads / (2 * thread_n_blocks)); - - int d_gl_wr = d_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + - (threadIdx.x % (2 * thread_n_blocks)); - d_gl_wr += (2 * thread_n_blocks) * slice_col; - int d_sh_wr = - (4 * d_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; - d_sh_wr += 32 * (threadIdx.x / 32); - int d_sh_rd = d_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + - (threadIdx.x % (2 * thread_n_blocks)); - - int d_gl_wr_end = d_gl_stride * prob_m; - - // We first reorder in shared memory to guarantee the most efficient final - // global write patterns - auto write = [&](int idx, int c0, int c1, float a_s, FragS_CHANNEL& w_s) { - float2 deq_res; - deq_res.x = int32_to_float(c0) * w_s[0] * a_s; - deq_res.y = int32_to_float(c1) * w_s[1] * a_s; - ((half2*)sh)[idx] = float2_to_half2(deq_res); - }; - - if (threadIdx.x / 32 < thread_n_blocks / 4) { - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - #pragma unroll - for (int j = 0; j < 4; j++) { - int wr = d_sh_wr + 8 * j; - write(wr + (4 * d_sh_stride) * 0 + 0, frag_c[i][j][0][0], - frag_c[i][j][0][1], frag_s_tok[i][0], - frag_s_ch[j / 2][2 * (j % 2) + 0]); - write(wr + (4 * d_sh_stride) * 8 + 0, frag_c[i][j][0][2], - frag_c[i][j][0][3], 
frag_s_tok[i][1], - frag_s_ch[j / 2][2 * (j % 2) + 0]); - write(wr + (4 * d_sh_stride) * 0 + 4, frag_c[i][j][1][0], - frag_c[i][j][1][1], frag_s_tok[i][0], - frag_s_ch[j / 2][2 * (j % 2) + 1]); - write(wr + (4 * d_sh_stride) * 8 + 4, frag_c[i][j][1][2], - frag_c[i][j][1][3], frag_s_tok[i][1], - frag_s_ch[j / 2][2 * (j % 2) + 1]); - } - d_sh_wr += 16 * (4 * d_sh_stride); - } - } - __syncthreads(); - - #pragma unroll - for (int i = 0; - i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); - i++) { - if (d_gl_wr < d_gl_wr_end) { - D[d_gl_wr] = sh[d_sh_rd]; - d_gl_wr += d_gl_wr_delta; - d_sh_rd += d_sh_rd_delta; - } - } - }; - - // Start global fetch and register load pipelines. - auto start_pipes = [&]() { - #pragma unroll - for (int i = 0; i < stages - 1; i++) fetch_to_shared(i, i, i < slice_iters); - zero_accums(); - wait_for_stage(); - fetch_to_registers(0, 0); - a_gl_rd += a_gl_rd_delta_o * (stages - 1); - }; - start_pipes(); - - // Main loop. - while (slice_iters) { - // We unroll over both the global fetch and the register load pipeline to - // ensure all shared memory accesses are static. Note that both pipelines have - // even length meaning that the next iteration will always start at index 0. - #pragma unroll - for (int pipe = 0; pipe < stages;) { - #pragma unroll - for (int k = 0; k < b_sh_wr_iters; k++) { - fetch_to_registers(k + 1, pipe % stages); - if (k == b_sh_wr_iters - 2) { - fetch_to_shared((pipe + stages - 1) % stages, pipe, - slice_iters >= stages); - pipe++; - wait_for_stage(); - } - matmul(k); - } - slice_iters--; - if (slice_iters == 0) break; - } - a_gl_rd += a_gl_rd_delta_o * stages; - - // Process results and, if necessary, proceed to the next column slice. - // While this pattern may not be the most readable, other ways of writing - // the loop seemed to noticeably worse performance after compilation. 
- if (slice_iters == 0) { - cp_async_wait<0>(); - bool last = slice_idx == slice_count - 1; - // For per-column scales, we only fetch them here in the final step before - // write-out - if (last) { - if (s_tok_sh_wr_pred) { - cp_async1(&sh_s_tok[s_tok_sh_wr], &s_tok[s_tok_gl_rd]); - } - if (s_ch_sh_wr_pred) { - cp_async4(&sh_s_ch[s_ch_sh_wr], &s_ch[s_ch_gl_rd]); - } - cp_async_fence(); - } - thread_block_reduce(); - if (last) { - cp_async_wait<0>(); - __syncthreads(); - if (threadIdx.x / 32 < thread_n_blocks / 4) { - #pragma unroll - for (int i = 0; i < thread_m_blocks; i++) { - frag_s_tok[i][0] = - *reinterpret_cast(&sh_s_tok[16 * i + 2 * s_tok_sh_rd]); - frag_s_tok[i][1] = *reinterpret_cast( - &sh_s_tok[16 * i + 2 * s_tok_sh_rd + 1]); - } - reinterpret_cast(&frag_s_ch)[0] = sh_s_ch[s_ch_sh_rd + 0]; - reinterpret_cast(&frag_s_ch)[1] = sh_s_ch[s_ch_sh_rd + 1]; - reinterpret_cast(&frag_s_ch)[2] = sh_s_ch[s_ch_sh_rd + 8]; - reinterpret_cast(&frag_s_ch)[3] = sh_s_ch[s_ch_sh_rd + 9]; - } - } - if (slice_count > 1) { // only globally reduce if there is more than one - // block in a slice - barrier_acquire(&locks[slice_col], slice_idx); - global_reduce(slice_idx == 0, last); - barrier_release(&locks[slice_col], last); - } - if (last) // only the last block in a slice actually writes the result - write_result(); - slice_row = 0; - slice_col_par++; - slice_col++; - init_slice(); - if (slice_iters) { - a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + - (threadIdx.x % a_gl_rd_delta_o); - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) - B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; - if (slice_col == 0) { - #pragma unroll - for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; - } - s_group_gl_rd = s_group_sh_stride * slice_col + threadIdx.x; - s_ch_gl_rd = s_ch_sh_stride * slice_col + threadIdx.x; - start_pipes(); - } - } - } -} - -#else - -template shared - // fetch pipeline - const int group_blocks = -1 // number of consecutive 16x16 
blocks - // with a separate quantization scale - > -__global__ void Marlin( - const int4* __restrict__ A, // int8 input matrix of shape mxk - const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn - int4* __restrict__ C, // int32 global_reduce buffer of shape - // (max_par*16*4)xn, as int8 tensor core's output is - // int32 dtype - int4* __restrict__ D, // fp16 output buffer of shape mxn - const float* __restrict__ s_tok, // fp32 activation per-token quantization - // scales of shape mx1 - const int4* __restrict__ s_ch, // fp32 weight per-channel quantization - // scales of shape 1xn - const int4* __restrict__ s_group, // fp16 weight per-group quantization - // scales of shape (k/groupsize)xn, when - // group_blocks=-1, it should be nullptr - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization -) { - // Marlin is not implemented yet for SM < 8.0 - assert(false); - return; -} - -#endif - -// 8 warps are a good choice since every SM has 4 schedulers and having more -// than 1 warp per schedule allows some more latency hiding. At the same time, -// we want relatively few warps to have many registers per warp and small tiles. 
-const int USER_THREADS = - 256; // Note: This is only used with user-provided thread_k/n -const int STAGES = 4; // 4 pipeline stages fit into shared memory - -static constexpr int min_thread_n = 64; -static constexpr int min_thread_k = 64; - -static constexpr int tile_size = 16; -static constexpr int max_par = 16; - -static constexpr int pack_factor_4bit = - 8; // We have 8 4-bit vals inside a 32 bit - -#define __CALL_IF(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ - GROUP_BLOCKS, NUM_THREADS) \ - else if (thread_m_blocks == THREAD_M_BLOCKS && \ - thread_n_blocks == THREAD_N_BLOCKS && \ - thread_k_blocks == THREAD_K_BLOCKS && \ - group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) { \ - cudaFuncSetAttribute(Marlin, \ - cudaFuncAttributeMaxDynamicSharedMemorySize, \ - max_shared_mem); \ - Marlin \ - <<>>( \ - A_ptr, B_ptr, C_ptr, D_ptr, s_tok_ptr, s_ch_ptr, s_group_ptr, \ - prob_m, prob_n, prob_k, locks); \ - } - -typedef struct { - int thread_k; - int thread_n; - int num_threads; -} thread_config_t; - -thread_config_t small_batch_thread_configs[] = { - // Ordered by priority - - // thread_k, thread_n, num_threads - {128, 128, 256}, // Default - {128, 64, 128}, // Reduce N 2X, same K - {64, 256, 256}, // Reduce K 2X, increase N 2X - {64, 128, 128}, // Reduce K 2X, same N -}; - -thread_config_t large_batch_thread_configs[] = { - // Ordered by priority - - // thread_k, thread_n, num_threads - {64, 256, 256}, // Default - {128, 128, 256}, // Reduce N 2X, increase K 2X - {64, 128, 128}, // Reduce N 2X, same K - {128, 64, 128}, // Reduce N 4X, increase K 2X -}; - -bool is_valid_config(thread_config_t const& th_config, int prob_m, int prob_n, - int prob_k) { - // Sanity - if (th_config.thread_k == -1 || th_config.thread_n == -1 || - th_config.num_threads == -1) { - return false; - } - - // Verify K/N are divisible by thread K/N - if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) { - return false; - } - - // thread_k can be only 128 
or 64 (because it must be less than groupsize - // which is 128) - if (th_config.thread_k != 128 && th_config.thread_k != 64) { - return false; - } - - // Verify min for thread K/N - if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) { - return false; - } - - // num_threads must be at least 128 (= 4 warps) - if (th_config.num_threads < 128) { - return false; - } - - return true; -} - -thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) { - if (prob_m <= 16) { - for (auto th_config : small_batch_thread_configs) { - if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { - return th_config; - } - } - - } else { - for (auto th_config : large_batch_thread_configs) { - if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { - return th_config; - } - } - } - - return thread_config_t{-1, -1, -1}; -} - -#define CALL_IF(N_BLOCKS, K_BLOCKS, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(2, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(2, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(3, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(3, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ - __CALL_IF(4, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ - __CALL_IF(4, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) - -void marlin_qqq_cuda(const void* A, const void* B, void* C, void* D, - void* s_tok, void* s_ch, void* s_group, int prob_m, - int prob_n, int prob_k, void* workspace, - int groupsize = -1, int dev = 0, cudaStream_t stream = 0, - int thread_k = -1, int thread_n = -1, int sms = -1, - int max_par = 16) { - int tot_m = prob_m; - int tot_m_blocks = ceildiv(tot_m, 16); - int pad = 16 * tot_m_blocks - tot_m; - - if (sms == -1) - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev); - - int max_shared_mem = 0; - 
cudaDeviceGetAttribute(&max_shared_mem, - cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); - TORCH_CHECK(max_shared_mem > 0); - - // Set thread config - thread_config_t th_config; - if (thread_k != -1 && thread_n != -1) { - // User-defined config - th_config = thread_config_t{thread_k, thread_n, USER_THREADS}; - } else { - // Auto config - th_config = determine_thread_config(prob_m, prob_n, prob_k); - } - - if (!is_valid_config(th_config, prob_m, prob_n, prob_k)) { - throw std::runtime_error( - "Invalid thread config: thread_k = " + str(th_config.thread_k) + - ", thread_n = " + str(th_config.thread_n) + - ", num_threads = " + str(th_config.num_threads) + " for MKN = [" + - str(prob_m) + ", " + str(prob_k) + ", " + str(prob_n) + "]"); - } - - int num_threads = th_config.num_threads; - thread_k = th_config.thread_k; - thread_n = th_config.thread_n; - - int thread_k_blocks = thread_k / 16; - int thread_n_blocks = thread_n / 16; - int group_blocks = (groupsize == -1) ? -1 : groupsize / 16; - int blocks = sms; - - if (prob_m == 0 || prob_n == 0 || prob_k == 0) { - return; - } - - TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, - " is not divisible by thread_n = ", thread_n); - TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, - " is not divisible by thread_k = ", thread_k); - if (group_blocks != -1) { - TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, - " is not divisible by group_blocks = ", group_blocks); - } - - const int4* A_ptr = (const int4*)A; - const int4* B_ptr = (const int4*)B; - int4* C_ptr = (int4*)C; - int4* D_ptr = (int4*)D; - const float* s_tok_ptr = (const float*)s_tok; - const int4* s_ch_ptr = (const int4*)s_ch; - const int4* s_group_ptr = (const int4*)s_group; - - int* locks = (int*)workspace; - - for (int i = 0; i < tot_m_blocks; i += 4) { - int thread_m_blocks = tot_m_blocks - i; - prob_m = tot_m - 16 * i; - int par = 1; - if (thread_m_blocks > 4) { - // Note that parallel > 1 currently only works for inputs without any - // 
padding - par = (16 * thread_m_blocks - pad) / 64; - if (par > max_par) par = max_par; - prob_m = 64 * par; - i += 4 * (par - 1); - thread_m_blocks = 4; - } - - // For compilation speed, we only define the kernel configurations that have - // seemed useful (in terms of performance) in our testing, however many more - // are, in principle, possible. - if (false) { - } - CALL_IF(8, 8, 256) - CALL_IF(16, 4, 256) - CALL_IF(8, 4, 128) - CALL_IF(4, 8, 128) - else { - throw std::runtime_error("Unsupported shapes: MKN = [" + str(prob_m) + - ", " + str(prob_k) + ", " + str(prob_n) + "]" + - ", groupsize = " + str(groupsize) + - ", thread_m_blocks = " + str(thread_m_blocks) + - ", thread_n_blocks = " + str(thread_n_blocks) + - ", thread_k_blocks = " + str(thread_k_blocks)); - } - - A_ptr += 16 * thread_m_blocks * (prob_k / 16) * par; - D_ptr += 16 * thread_m_blocks * (prob_n / 8) * par; - s_tok_ptr += 16 * thread_m_blocks * par; - } -} -} // anonymous namespace - -torch::Tensor marlin_qqq_gemm(torch::Tensor const& a, - torch::Tensor const& b_q_weight, - torch::Tensor const& s_tok, - torch::Tensor const& s_ch, - torch::Tensor const& s_group, - torch::Tensor& workspace, int64_t size_m, - int64_t size_n, int64_t size_k) { - // Verify M - TORCH_CHECK(size_m == a.size(0), - "Shape mismatch: a.size(0) = " + str(a.size(0)) + - ", size_m = " + str(size_m)); - TORCH_CHECK(size_m == s_tok.numel(), - "Shape mismatch: s_tok.numel() = " + str(s_tok.numel()) + - ", size_m = " + str(size_m)); - - // Verify K - TORCH_CHECK(size_k == a.size(1), - "Shape mismatch: a.size(1) = " + str(a.size(1)) + - ", size_k = " + str(size_k)); - TORCH_CHECK(size_k % tile_size == 0, - "size_k = " + str(size_k) + - " is not divisible by tile_size = " + str(tile_size)); - TORCH_CHECK( - (size_k / tile_size) == b_q_weight.size(0), - "Shape mismatch: b_q_weight.size(0) = " + str(b_q_weight.size(0)) + - ", size_k = " + str(size_k) + ", tile_size = " + str(tile_size)); - - int groupsize = (s_group.numel() == 0) ? 
-1 : size_k / s_group.size(0); - // Verify groupsize - TORCH_CHECK(groupsize == -1 || groupsize == 128, - "Unexpected groupsize = " + str(groupsize)); - - // Verify N - TORCH_CHECK(s_ch.numel() == size_n, - "Shape mismatch: s_ch.numel() = " + str(s_ch.numel()) + - ", size_n = " + str(size_n)); - TORCH_CHECK(b_q_weight.size(1) % tile_size == 0, - "b_q_weight.size(1) = " + str(b_q_weight.size(1)) + - " is not divisible by tile_size = " + str(tile_size)); - if (groupsize != -1) { - TORCH_CHECK(s_group.size(1) == size_n, - "Shape mismatch: s_group.size(1) = " + str(s_group.size(1)) + - ", size_n = " + str(size_n)); - TORCH_CHECK( - size_k % s_group.size(0) == 0, - "size_k = " + str(size_k) + - ", is not divisible by s_group.size(0) = " + str(s_group.size(0))); - } - - int actual_size_n = (b_q_weight.size(1) / tile_size) * pack_factor_4bit; - TORCH_CHECK(size_n == actual_size_n, - "Shape mismatch: size_n = " + str(size_n) + - ", actual_size_n = " + str(actual_size_n)); - - // Verify A device and strides - TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); - TORCH_CHECK(a.is_contiguous(), "A is not contiguous"); - - // Verify B device and strides - TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); - TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous"); - - // Verify s_tok device, strides and dtype - TORCH_CHECK(s_tok.device().is_cuda(), "s_tok is not on GPU"); - TORCH_CHECK(s_tok.is_contiguous(), "s_tok is not contiguous"); - TORCH_CHECK(s_tok.dtype() == torch::kFloat32, "s_tok's dtype is not float32"); - - // Verify s_ch device, strides and dtype - TORCH_CHECK(s_ch.device().is_cuda(), "s_ch is not on GPU"); - TORCH_CHECK(s_ch.is_contiguous(), "s_ch is not contiguous"); - TORCH_CHECK(s_ch.dtype() == torch::kFloat32, "s_ch's dtype is not float32"); - - // Verify s_group device, strides and dtype - TORCH_CHECK(s_group.device().is_cuda(), "s_group is not on GPU"); - TORCH_CHECK(s_group.is_contiguous(), "s_group is not 
contiguous"); - TORCH_CHECK(s_group.dtype() == torch::kFloat16, - "s_group's dtype is not float16"); - - // Verify workspace size - TORCH_CHECK(size_n % min_thread_n == 0, - "size_n = " + str(size_n) + - ", is not divisible by min_thread_n = " + str(min_thread_n)); - int min_workspace_size = (size_n / min_thread_n) * max_par; - TORCH_CHECK(workspace.numel() >= min_workspace_size, - "workspace.numel = " + str(workspace.numel()) + - " is below min_workspace_size = " + str(min_workspace_size)); - - // Alloc C matrix - const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); - auto options_c = torch::TensorOptions().dtype(torch::kInt).device(a.device()); - torch::Tensor c = torch::empty({max_par * 64, size_n}, options_c); - - // Alloc D matrix - auto options_d = - torch::TensorOptions().dtype(torch::kFloat16).device(a.device()); - torch::Tensor d = torch::empty({size_m, size_n}, options_d); - - // thread_k: `k` size of a thread_tile in `weights` (can usually be left as - // auto -1) - int thread_k = -1; - // thread_n: `n` size of a thread_tile in `weights` (can usually be left as - // auto -1) - int thread_n = -1; - // sms: number of SMs to use for the kernel (can usually be left as auto -1) - int sms = -1; - - int dev = a.get_device(); - marlin_qqq_cuda( - a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), d.data_ptr(), - s_tok.data_ptr(), s_ch.data_ptr(), s_group.data_ptr(), size_m, size_n, - size_k, workspace.data_ptr(), groupsize, dev, - at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, max_par); - - return d; -} - -TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { - m.impl("marlin_qqq_gemm", &marlin_qqq_gemm); -} diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 3a0ff6eaa7904..60710f62c064b 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -241,14 +241,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // custom types: // 
https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA - // Marlin (Dense) Optimized Quantized GEMM for GPTQ. - ops.def( - "marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, " - "Tensor! workspace, SymInt size_m, SymInt size_n, SymInt size_k) -> " - "Tensor", - {stride_tag}); - // conditionally compiled so impl in source file - // Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ. ops.def( "gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, " @@ -353,15 +345,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("ggml_moe_get_block_size", &ggml_moe_get_block_size); #ifndef USE_ROCM - // marlin_qqq_gemm for QQQ. - ops.def( - "marlin_qqq_gemm(Tensor a, Tensor b_q_weight, " - "Tensor s_tok, Tensor s_ch, Tensor s_group, " - "Tensor! workspace, SymInt size_m, SymInt size_n, " - "SymInt size_k) -> Tensor", - {stride_tag}); - // conditionally compiled so impl registration is in source file - // CUTLASS nvfp4 block scaled GEMM ops.def( "cutlass_scaled_fp4_mm(Tensor! 
out, Tensor a, Tensor b," diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index a2fc6ffeb8b26..84178344a5f36 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -53,12 +53,6 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None): "quantization": "gptq_marlin_24" })) - if is_quant_method_supported("marlin"): - TEST_MODELS.append( - ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", { - "quantization": "marlin" - })) - if not current_platform.is_rocm() and is_quant_method_supported("awq"): TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { "quantization": "AWQ" diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py index a842d2f1cbe8d..0e09661c955e4 100644 --- a/tests/kernels/quantization/test_machete_mm.py +++ b/tests/kernels/quantization/test_machete_mm.py @@ -95,23 +95,23 @@ TEST_TYPES = [ token_scale_type=None) for w_type in [scalar_types.uint4, scalar_types.uint8] for a_type in [torch.float16, torch.bfloat16]), - # QQQ style - *(TypeConfig(act_type=torch.int8, - weight_type=scalar_types.uint4b8, - output_type=torch.float16, - group_scale_type=group_scale_type, - group_zero_type=None, - channel_scale_type=torch.float, - token_scale_type=torch.float) - for group_scale_type in [None, torch.float16]), - *(TypeConfig(act_type=torch.float8_e4m3fn, - weight_type=scalar_types.uint4b8, - output_type=torch.float16, - group_scale_type=group_scale_type, - group_zero_type=None, - channel_scale_type=torch.float, - token_scale_type=torch.float) - for group_scale_type in [None, torch.float16]), + # # QQQ style + # *(TypeConfig(act_type=torch.int8, + # weight_type=scalar_types.uint4b8, + # output_type=torch.float16, + # group_scale_type=group_scale_type, + # group_zero_type=None, + # channel_scale_type=torch.float, + # token_scale_type=torch.float) + # for group_scale_type in [None, torch.float16]), + # 
*(TypeConfig(act_type=torch.float8_e4m3fn, + # weight_type=scalar_types.uint4b8, + # output_type=torch.float16, + # group_scale_type=group_scale_type, + # group_zero_type=None, + # channel_scale_type=torch.float, + # token_scale_type=torch.float) + # for group_scale_type in [None, torch.float16]), ] # TODO: in future PR refactor this and `is_quant_method_supported` in the kernel diff --git a/tests/kernels/quantization/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py index cea7700ac3293..ad077e0b94732 100644 --- a/tests/kernels/quantization/test_marlin_gemm.py +++ b/tests/kernels/quantization/test_marlin_gemm.py @@ -13,11 +13,7 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES) -from vllm.model_executor.layers.quantization.qqq import ( - MARLIN_QQQ_MAX_PARALLEL, MARLIN_QQQ_MIN_THREAD_N, - MARLIN_QQQ_SUPPORTED_GROUP_SIZES, MARLIN_QQQ_SUPPORTED_NUM_BITS) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( - GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, MARLIN_SUPPORTED_GROUP_SIZES, marlin_make_empty_g_idx, marlin_make_workspace_new, marlin_permute_bias, marlin_permute_scales, query_marlin_supported_quant_types) @@ -31,8 +27,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( marlin_weights) from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( marlin_24_quantize) -from vllm.model_executor.layers.quantization.utils.marlin_utils_test_qqq import ( # noqa: E501 - marlin_qqq_quantize) from vllm.model_executor.layers.quantization.utils.quant_utils import ( awq_pack, gptq_pack, gptq_quantize_weights, quantize_weights, sort_weights) from vllm.scalar_type import scalar_types @@ -449,68 +443,6 @@ def test_hqq_marlin_gemm( assert max_diff < 0.04 -@pytest.mark.skipif(not 
is_quant_method_supported("qqq"), - reason="Marlin is not supported on this GPU type.") -@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS) -@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS) -@pytest.mark.parametrize("num_bits", MARLIN_QQQ_SUPPORTED_NUM_BITS) -@pytest.mark.parametrize("group_size", MARLIN_QQQ_SUPPORTED_GROUP_SIZES) -@pytest.mark.parametrize("mnk_factors", MNK_FACTORS) -def test_marlin_qqq_gemm( - k_chunk, - n_chunk, - num_bits, - group_size, - mnk_factors, -): - int8_traits = torch.iinfo(torch.int8) - m_factor, n_factor, k_factor = mnk_factors - - size_m = m_factor - size_k = k_chunk * k_factor - size_n = n_chunk * n_factor - - a_input = rand_data((size_m, size_k)) - b_weight = rand_data((size_k, size_n)) - - # Quantize activations - s_a = a_input.abs().max(dim=-1, keepdim=True)[0].div(int8_traits.max).to( - torch.float) - q_a = (a_input / s_a).round().clamp(int8_traits.min, - int8_traits.max).to(torch.int8) - - # Quantize weights - w_ref, marlin_qqq_q_w, marlin_qqq_s_group, marlin_qqq_s_channel = \ - marlin_qqq_quantize(b_weight, num_bits, group_size) - - workspace = MarlinWorkspace(size_n, MARLIN_QQQ_MIN_THREAD_N, - MARLIN_QQQ_MAX_PARALLEL) - - opcheck(torch.ops._C.marlin_qqq_gemm, - (q_a, marlin_qqq_q_w, s_a, marlin_qqq_s_channel, - marlin_qqq_s_group, workspace.scratch, a_input.shape[0], - b_weight.shape[1], a_input.shape[1])) - - output = ops.marlin_qqq_gemm( - q_a, - marlin_qqq_q_w, - s_a, - marlin_qqq_s_channel, - marlin_qqq_s_group, - workspace.scratch, - a_input.shape[0], - b_weight.shape[1], - a_input.shape[1], - ) - output_ref = torch.matmul(q_a.half() * s_a.half(), w_ref) - - torch.cuda.synchronize() - - max_diff = compute_max_diff(output, output_ref) - - assert max_diff < 0.04 - - def test_marlin_gemm_subset_input(): quant_type = scalar_types.uint4b8 group_size = 128 @@ -602,18 +534,3 @@ def test_marlin_gemm_with_bias(size_m): max_diff = compute_max_diff(output, output_ref) assert max_diff < 0.04 - - -def 
test_marlin_gemm_opcheck(): - size_m = 2048 - size_n = 4096 - size_k = 4096 - a = torch.rand((size_m, size_n), device='cuda', dtype=torch.float16) - w = torch.randint(-5, 5, (256, 8192), device='cuda', dtype=torch.int32) - s = torch.full((32, size_k), 0.125, device='cuda', dtype=torch.float16) - wk = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N, - GPTQ_MARLIN_MAX_PARALLEL).scratch - x = torch.ops._C.marlin_gemm(a, w, s, wk, size_m, size_n, size_k) - y = torch.ops._C.marlin_gemm(a, w, s, wk, size_m, size_n, size_k) - torch.testing.assert_close(x, y) - opcheck(torch.ops._C.marlin_gemm, (a, w, s, wk, size_m, size_n, size_k)) diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 8cf8402436ff5..1843bffd21159 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -22,22 +22,12 @@ class ModelPair: MODEL_ARG_EXPTYPES = [ # AUTOGPTQ # compat: autogptq <=0.7.1 is_marlin_format: bool - # Model Serialized in Marlin Format should always use Marlin kernel. - ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", None, "marlin"), - ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "marlin", "marlin"), - ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "gptq", "marlin"), - ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "awq", "ERROR"), # Model Serialized in Exllama Format. ("TheBloke/Llama-2-7B-Chat-GPTQ", None, "gptq_marlin"), ("TheBloke/Llama-2-7B-Chat-GPTQ", "marlin", "gptq_marlin"), ("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq", "gptq"), ("TheBloke/Llama-2-7B-Chat-GPTQ", "awq", "ERROR"), # compat: autogptq >=0.8.0 use checkpoint_format: str - # Model Serialized in Marlin Format should always use Marlin kernel. 
- ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", None, "marlin"), - ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "marlin", "marlin"), - ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "gptq", "marlin"), - ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "awq", "ERROR"), # Model Serialized in Exllama Format. ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", None, "gptq_marlin"), ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "marlin", "gptq_marlin"), diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index 11f78a23bb4c0..5ec8b27c1571f 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -11,7 +11,6 @@ import torch from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod from vllm.model_executor.layers.quantization.gptq_marlin import ( GPTQMarlinLinearMethod) -from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod from vllm.model_executor.layers.vocab_parallel_embedding import ( UnquantizedEmbeddingMethod) @@ -19,9 +18,7 @@ PROMPT = "On the surface of Mars, we found" MODELS_QUANT = [ ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head", True), - ("ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024", False), ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False), - ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False) ] @@ -41,8 +38,7 @@ def test_lm_head( lm_head_layer = model.lm_head if lm_head_quantized: assert isinstance(lm_head_layer.quant_method, - (GPTQLinearMethod, GPTQMarlinLinearMethod, - MarlinLinearMethod)) + (GPTQLinearMethod, GPTQMarlinLinearMethod)) else: assert isinstance(lm_head_layer.quant_method, UnquantizedEmbeddingMethod) diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt index 1b797074096ed..cc18c9ff1f096 100644 --- a/tests/weight_loading/models.txt +++ b/tests/weight_loading/models.txt @@ -26,9 +26,5 @@ compressed-tensors, 
nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-W8A8-testing awq, casperhansen/mixtral-instruct-awq, main awq_marlin, casperhansen/mixtral-instruct-awq, main fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main -marlin, nm-testing/zephyr-beta-7b-marlin-g128, main -marlin, robertgshaw2/zephyr-7b-beta-channelwise-marlin, main -qqq, HandH1998/QQQ-Llama-3-8b-g128, main -qqq, HandH1998/QQQ-Llama-3-8b, main hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main None, mgleize/fairseq2-dummy-Llama-3.2-1B, main \ No newline at end of file diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 39da08847b2e7..59f2d7737f19d 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -387,14 +387,6 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor, torch.ops._C.gptq_shuffle(q_weight, q_perm, bit) -# marlin -def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, - b_scales: torch.Tensor, workspace: torch.Tensor, size_m: int, - size_n: int, size_k: int) -> torch.Tensor: - return torch.ops._C.marlin_gemm(a, b_q_weight, b_scales, workspace, size_m, - size_n, size_k) - - # marlin_24 def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, b_meta: torch.Tensor, b_scales: torch.Tensor, @@ -437,25 +429,6 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"): is_zp_float: bool = False) -> torch.Tensor: return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype) - @register_fake("_C::marlin_qqq_gemm") - def _marlin_qqq_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor, - s_tok: torch.Tensor, s_ch: torch.Tensor, - s_group: torch.Tensor, workspace: torch.Tensor, - size_m: torch.SymInt, size_n: torch.SymInt, - size_k: torch.SymInt) -> torch.Tensor: - return torch.empty((size_m, size_n), - dtype=torch.float16, - device=a.device) - - @register_fake("_C::marlin_gemm") - def _marlin_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor, - b_scales: torch.Tensor, workspace: torch.Tensor, - size_m: torch.SymInt, size_n: torch.SymInt, - size_k: torch.SymInt) 
-> torch.Tensor: - return torch.empty((size_m, size_n), - dtype=torch.float16, - device=a.device) - @register_fake("_C::awq_dequantize") def _awq_dequantize_fake(qweight: torch.Tensor, scales: torch.Tensor, zeros: torch.Tensor, split_k_iters: torch.SymInt, @@ -1348,15 +1321,6 @@ def scaled_int8_quant( return output, input_scales, input_azp -# qqq ops -def marlin_qqq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, - s_tok: torch.Tensor, s_ch: torch.Tensor, - s_group: torch.Tensor, workspace: torch.Tensor, - size_m: int, size_n: int, size_k: int) -> torch.Tensor: - return torch.ops._C.marlin_qqq_gemm(a, b_q_weight, s_tok, s_ch, s_group, - workspace, size_m, size_n, size_k) - - # gguf def ggml_dequantize(W: torch.Tensor, quant_type: int, m: int, n: int, dtype: Optional[torch.dtype]) -> torch.Tensor: diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 5b5d477ef066b..62dfd4333bee8 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1112,9 +1112,9 @@ class ModelConfig: def _verify_quantization(self) -> None: supported_quantization = me_quant.QUANTIZATION_METHODS optimized_quantization_methods = [ - "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin", - "awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8", - "quark", "modelopt_fp4", "bitblas", "gptq_bitblas", "inc" + "fp8", "modelopt", "gptq_marlin_24", "gptq_marlin", "awq_marlin", + "fbgemm_fp8", "compressed-tensors", "experts_int8", "quark", + "modelopt_fp4", "bitblas", "gptq_bitblas", "inc" ] if self.quantization is not None: self.quantization = cast(me_quant.QuantizationMethods, @@ -1137,7 +1137,6 @@ class ModelConfig: # `override_quantization_method` method) must be checked in order # of preference (this is particularly important for GPTQ). 
overrides = [ - "marlin", "bitblas", "gptq_marlin_24", "gptq_marlin", diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index de5933d6d41e5..24a05d310d108 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -48,9 +48,6 @@ def _get_lora_device(base_layer: nn.Module) -> torch.device: # GPTQ/AWQ elif hasattr(base_layer, "qweight"): return base_layer.qweight.device - # marlin - elif hasattr(base_layer, "B"): - return base_layer.B.device # HQQ marlin elif hasattr(base_layer, "W_q"): return base_layer.W_q.device diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index d3b6b2089f426..654e2ec7b2fa0 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -42,7 +42,6 @@ WEIGHT_LOADER_V2_SUPPORTED = [ "GPTQMarlinLinearMethod", "Fp8LinearMethod", "MarlinLinearMethod", - "QQQLinearMethod", "GPTQMarlin24LinearMethod", "TPUInt8LinearMethod", "GPTQLinearMethod", diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index a4c2671225f57..ea51468422dcd 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -15,7 +15,6 @@ QuantizationMethods = Literal[ "fbgemm_fp8", "modelopt", "modelopt_fp4", - "marlin", "bitblas", "gguf", "gptq_marlin_24", @@ -25,7 +24,6 @@ QuantizationMethods = Literal[ "gptq", "compressed-tensors", "bitsandbytes", - "qqq", "hqq", "experts_int8", "neuron_quant", @@ -106,13 +104,11 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: from .hqq_marlin import HQQMarlinConfig from .inc import INCConfig from .ipex_quant import IPEXConfig - from .marlin import MarlinConfig from .modelopt import ModelOptFp8Config, ModelOptNvFp4Config from .moe_wna16 import MoeWNA16Config from .mxfp4 import Mxfp4Config from .neuron_quant import NeuronQuantConfig from .ptpc_fp8 import PTPCFp8Config - from .qqq import QQQConfig from .rtn 
import RTNConfig from .torchao import TorchAOConfig from .tpu_int8 import Int8TpuConfig @@ -125,7 +121,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: "fbgemm_fp8": FBGEMMFp8Config, "modelopt": ModelOptFp8Config, "modelopt_fp4": ModelOptNvFp4Config, - "marlin": MarlinConfig, "bitblas": BitBLASConfig, "gguf": GGUFConfig, "gptq_marlin_24": GPTQMarlin24Config, @@ -136,7 +131,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: "compressed-tensors": CompressedTensorsConfig, "bitsandbytes": BitsAndBytesConfig, "ptpc_fp8": PTPCFp8Config, - "qqq": QQQConfig, "hqq": HQQMarlinConfig, "experts_int8": ExpertsInt8Config, "neuron_quant": NeuronQuantConfig, diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py deleted file mode 100644 index 18d1c13373df9..0000000000000 --- a/vllm/model_executor/layers/quantization/marlin.py +++ /dev/null @@ -1,263 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Any, Optional - -import torch -from torch.nn.parameter import Parameter - -from vllm import _custom_ops as ops -from vllm.logger import init_logger -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase -from vllm.model_executor.layers.quantization import QuantizationMethods -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead -from vllm.model_executor.parameter import (BasevLLMParameter, - ChannelQuantScaleParameter, - GroupQuantScaleParameter, - PackedvLLMParameter) - -logger = init_logger(__name__) - - -class MarlinConfig(QuantizationConfig): - """Config class for Marlin. 
- - Reference: https://github.com/IST-DASLab/marlin/tree/master - """ - - def __init__( - self, - group_size: int, - lm_head_quantized: bool, - ) -> None: - super().__init__() - - # Group size for the quantization. - self.group_size = group_size - self.lm_head_quantized = lm_head_quantized - if self.group_size != 128 and self.group_size != -1: - raise ValueError( - "Currently, only group size 128 and -1 (channelwise) " - "is supported for Marlin, but got group_size of " - f"{self.group_size}") - - # 4 Bits packed into 32 bit datatype. - self.pack_factor = 32 // 4 - - # Tile size used by marlin kernels. - self.tile_size = 16 - - # Min out_features dim - self.min_n_threads = 64 - - # Min in_features dim - self.min_k_threads = 128 - - # Max parallel problems to solve at once (improves large - # batch performance) - self.max_parallel = 16 - - # Permutation length used by the marlin kernels. - self.perm_len = 1024 - - def __repr__(self) -> str: - return (f"MarlinConfig(group_size={self.group_size}, " - f"lm_head_quantized={self.lm_head_quantized})") - - @classmethod - def get_name(cls) -> QuantizationMethods: - return "marlin" - - @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: - return [torch.half] - - @classmethod - # Need to figure it out - def get_min_capability(cls) -> int: - return 80 - - @classmethod - def get_config_filenames(cls) -> list[str]: - return ["quantize_config.json"] - - @classmethod - def from_config(cls, config: dict[str, Any]) -> "MarlinConfig": - group_size = cls.get_from_keys(config, ["group_size"]) - lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], - default=False) - return cls(group_size, lm_head_quantized) - - @classmethod - def override_quantization_method( - cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]: - # compat: autogptq >=0.8.0 use checkpoint_format: str - # compat: autogptq <=0.7.1 is_marlin_format: bool - is_marlin_format = (hf_quant_cfg.get("checkpoint_format") == "marlin" - or 
hf_quant_cfg.get("is_marlin_format", False)) - - is_valid_user_quant = (user_quant is None or user_quant == "gptq" - or user_quant == "marlin") - - if is_marlin_format and is_valid_user_quant: - msg = ("The model is serialized in {} format. Using {} kernel.". - format(cls.get_name(), cls.get_name())) - logger.info(msg) - return cls.get_name() - - return None - - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["MarlinLinearMethod"]: - if (isinstance(layer, LinearBase) or - (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)): - return MarlinLinearMethod(self) - return None - - -class MarlinLinearMethod(LinearMethodBase): - """Linear method for Marlin. - - Args: - quant_config: The Marlin quantization config. - """ - - def __init__(self, quant_config: MarlinConfig): - self.quant_config = quant_config - - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: list[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - del output_size # Unused. 
- weight_loader = extra_weight_attrs["weight_loader"] - - if params_dtype != torch.float16: - raise ValueError( - f"The params dtype must be float16, but got {params_dtype}") - - # Validate output_size_per_partition - output_size_per_partition = sum(output_partition_sizes) - if output_size_per_partition % self.quant_config.min_n_threads != 0: - raise ValueError( - f"Weight output_size_per_partition = " - f"{output_size_per_partition} is not divisible by " - f"min_n_threads = {self.quant_config.min_n_threads}.") - if output_size_per_partition % self.quant_config.pack_factor != 0: - raise ValueError( - f"Weight output_size_per_partition = " - f"{output_size_per_partition} is not divisible by " - f"pack_factor = {self.quant_config.pack_factor}.") - - # Validate input_size_per_partition - if input_size_per_partition % self.quant_config.min_k_threads != 0: - raise ValueError( - f"Weight input_size_per_partition = " - f"{input_size_per_partition} is not divisible by " - f"min_k_threads = {self.quant_config.min_k_threads}.") - if (self.quant_config.group_size != -1 and - input_size_per_partition % self.quant_config.group_size != 0): - raise ValueError(f"Weight input_size_per_partition = " - f"{input_size_per_partition} is not divisible by " - f"group_size = {self.quant_config.group_size}.") - - # Check that we have at least 4 tiles horizontally in the shard - num_tiles_per_perm = self.quant_config.perm_len // ( - self.quant_config.tile_size**2) - if output_size_per_partition % num_tiles_per_perm != 0: - raise ValueError( - "Each permutation group must reside on the same gpu") - - # Quantized 4Bit weights packed into Int32. 
- qweight = PackedvLLMParameter( - data=torch.empty( - input_size_per_partition // self.quant_config.tile_size, - output_size_per_partition * self.quant_config.tile_size // - self.quant_config.pack_factor, - device="cuda", - dtype=torch.int32, - ), - input_dim=0, - output_dim=1, - packed_dim=1, - packed_factor=self.quant_config.pack_factor, - marlin_tile_size=self.quant_config.tile_size, - weight_loader=weight_loader) - - # Determine if channelwise or not - input_groups = (1 if self.quant_config.group_size == -1 else - input_size_per_partition // - self.quant_config.group_size) - - weight_scale_args = { - "data": - torch.empty( - input_groups, - output_size_per_partition, - device="cuda", - dtype=params_dtype, - ), - "weight_loader": - weight_loader - } - if input_groups == 1: - scales = ChannelQuantScaleParameter(output_dim=1, - **weight_scale_args) - else: - scales = GroupQuantScaleParameter(output_dim=1, - input_dim=0, - **weight_scale_args) - - # Allocate workspace (Used for internal locking mechanism) - max_workspace_size = ( - output_size_per_partition // - self.quant_config.min_n_threads) * self.quant_config.max_parallel - - workspace = BasevLLMParameter(data=torch.zeros(max_workspace_size, - device="cuda", - dtype=torch.int), - weight_loader=weight_loader) - - layer.register_parameter("B", qweight) - layer.register_parameter("s", scales) - layer.register_parameter("workspace", workspace) - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - # required by torch.compile - layer.B = Parameter(layer.B.data, requires_grad=False) - layer.s = Parameter(layer.s.data, requires_grad=False) - layer.workspace = Parameter(layer.workspace.data, requires_grad=False) - - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - qweight = layer.B - scales = layer.s - workspace = layer.workspace - - x_2d = x.view(-1, x.shape[-1]) - - size_m = x_2d.shape[0] - size_k = x_2d.shape[1] - 
size_n = scales.shape[1] - - output_2d = ops.marlin_gemm(x_2d, qweight, scales, workspace, size_m, - size_n, size_k) - - output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], )) - - if bias is not None: - output.add_(bias) # In-place add - - return output diff --git a/vllm/model_executor/layers/quantization/qqq.py b/vllm/model_executor/layers/quantization/qqq.py deleted file mode 100644 index 25978cb13b3ab..0000000000000 --- a/vllm/model_executor/layers/quantization/qqq.py +++ /dev/null @@ -1,275 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Any, Optional - -import torch -from torch.nn.parameter import Parameter - -from vllm import _custom_ops as ops -from vllm.logger import init_logger -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase -from vllm.model_executor.layers.quantization import QuantizationMethods -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) -from vllm.model_executor.parameter import (BasevLLMParameter, - ChannelQuantScaleParameter, - GroupQuantScaleParameter, - PackedvLLMParameter) - -logger = init_logger(__name__) - -MARLIN_QQQ_TILE = 16 -MARLIN_QQQ_MIN_THREAD_N = 64 -MARLIN_QQQ_MIN_THREAD_K = 128 -MARLIN_QQQ_MAX_PARALLEL = 16 - -MARLIN_QQQ_SUPPORTED_NUM_BITS = [4] -MARLIN_QQQ_SUPPORTED_GROUP_SIZES = [-1, 128] -MARLIN_QQQ_SUPPORTED_SYM = [True] - - -class QQQConfig(QuantizationConfig): - """Config class for QQQ - - Reference: https://arxiv.org/pdf/2406.09904 - """ - - def __init__( - self, - weight_bits: int, - group_size: int, - is_sym: bool = True, - ) -> None: - super().__init__() - self.weight_bits = weight_bits - self.group_size = group_size - self.is_sym = is_sym - - # Verify - if self.weight_bits not in MARLIN_QQQ_SUPPORTED_NUM_BITS: - raise ValueError( - f"QQQ does not support weight_bits = {self.weight_bits}. 
" - f"Only weight_bits = {MARLIN_QQQ_SUPPORTED_NUM_BITS} " - "are supported.") - if self.group_size not in MARLIN_QQQ_SUPPORTED_GROUP_SIZES: - raise ValueError( - f"QQQ does not support group_size = {self.group_size}. " - f"Only group_sizes = {MARLIN_QQQ_SUPPORTED_GROUP_SIZES} " - "are supported.") - if self.is_sym not in MARLIN_QQQ_SUPPORTED_SYM: - raise ValueError( - f"QQQ does not support is_sym = {self.is_sym}. " - f"Only sym = {MARLIN_QQQ_SUPPORTED_SYM} are supported.") - - # 4 Bits packed into 32 bit datatype. - self.pack_factor = 32 // self.weight_bits - - # Tile size used by QQQ kernels. - self.tile_size = MARLIN_QQQ_TILE - - # Min out_features dim - self.min_n_threads = MARLIN_QQQ_MIN_THREAD_N - - # Min in_features dim - self.min_k_threads = MARLIN_QQQ_MIN_THREAD_K - - # Max parallel problems to solve at once (improves large - # batch performance) - self.max_parallel = MARLIN_QQQ_MAX_PARALLEL - - # Permutation length used by the QQQ kernels. - self.perm_len = 1024 - - def __repr__(self) -> str: - return "QQQConfig(weight_bits={}, group_size={})".format( - self.weight_bits, self.group_size) - - @classmethod - def get_name(cls) -> QuantizationMethods: - return "qqq" - - @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: - return [torch.half] - - @classmethod - def get_min_capability(cls) -> int: - return 80 - - @classmethod - def get_config_filenames(cls) -> list[str]: - """List of filenames to search for in the model directory.""" - return [ - "quant_config.json", - "quantize_config.json", - ] - - @classmethod - def from_config(cls, config: dict[str, Any]) -> "QQQConfig": - weight_bits = cls.get_from_keys(config, ["wbits"]) - group_size = cls.get_from_keys(config, ["group_size"]) - return cls(weight_bits, group_size) - - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["QQQLinearMethod"]: - if isinstance(layer, LinearBase): - return QQQLinearMethod(self) - return None - - -class 
QQQLinearMethod(LinearMethodBase): - """Linear method for QQQ. - - Args: - quant_config: The QQQ quantization config. - """ - - def __init__(self, quant_config: QQQConfig): - self.quant_config = quant_config - - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: list[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - weight_loader = extra_weight_attrs["weight_loader"] - if params_dtype != torch.float16: - raise ValueError( - f"The params dtype must be float16, but got {params_dtype}") - - # Validate output_size_per_partition - output_size_per_partition = sum(output_partition_sizes) - if output_size_per_partition % self.quant_config.min_n_threads != 0: - raise ValueError( - f"Weight output_size_per_partition = " - f"{output_size_per_partition} is not divisible by " - f"min_n_threads = {self.quant_config.min_n_threads}.") - if output_size_per_partition % self.quant_config.pack_factor != 0: - raise ValueError( - f"Weight output_size_per_partition = " - f"{output_size_per_partition} is not divisible by " - f"pack_factor = {self.quant_config.pack_factor}.") - - # Validate input_size_per_partition - if input_size_per_partition % self.quant_config.min_k_threads != 0: - raise ValueError( - f"Weight input_size_per_partition = " - f"{input_size_per_partition} is not divisible by " - f"min_k_threads = {self.quant_config.min_k_threads}.") - if (self.quant_config.group_size != -1 and - input_size_per_partition % self.quant_config.group_size != 0): - raise ValueError(f"Weight input_size_per_partition = " - f"{input_size_per_partition} is not divisible by " - f"group_size = {self.quant_config.group_size}.") - - # Check that we have at least 4 tiles horizontally in the shard - num_tiles_per_perm = self.quant_config.perm_len // ( - self.quant_config.tile_size**2) - if output_size_per_partition % num_tiles_per_perm != 0: - raise ValueError( - "Each permutation group 
must reside on the same gpu") - - # Quantized 4Bit weights packed into Int32. - qweight = PackedvLLMParameter( - data=torch.empty( - input_size_per_partition // self.quant_config.tile_size, - output_size_per_partition * self.quant_config.tile_size // - self.quant_config.pack_factor, - device="cuda", - dtype=torch.int32, - ), - input_dim=0, - output_dim=1, - packed_dim=1, - packed_factor=self.quant_config.pack_factor, - marlin_tile_size=self.quant_config.tile_size, - weight_loader=weight_loader) - - s_channel = ChannelQuantScaleParameter(data=torch.empty( - 1, - output_size_per_partition, - device="cuda", - dtype=torch.float, - ), - weight_loader=weight_loader, - output_dim=1) - - if self.quant_config.group_size == -1: - s_group_data = torch.tensor( - [], - device="cuda", - dtype=torch.half, - ) - else: - s_group_data = torch.empty( - input_size_per_partition // self.quant_config.group_size, - output_size_per_partition, - device="cuda", - dtype=torch.half, - ) - - s_group_attr = {"data": s_group_data, "weight_loader": weight_loader} - - if self.quant_config.group_size == -1: - s_group = BasevLLMParameter(**s_group_attr) - else: - s_group = GroupQuantScaleParameter(output_dim=1, - input_dim=0, - **s_group_attr) - - # Allocate workspace (Used for internal locking mechanism) - max_workspace_size = ( - output_size_per_partition // - self.quant_config.min_n_threads) * self.quant_config.max_parallel - - workspace = BasevLLMParameter(data=torch.zeros(max_workspace_size, - device="cuda", - dtype=torch.int), - weight_loader=weight_loader) - - layer.register_parameter("B", qweight) - layer.register_parameter("s_channel", s_channel) - layer.register_parameter("s_group", s_group) - layer.register_parameter("workspace", workspace) - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - # required by torch.compile - layer.B = Parameter(layer.B.data, requires_grad=False) - layer.s_channel = Parameter(layer.s_channel.data, requires_grad=False) - layer.s_group 
= Parameter(layer.s_group.data, requires_grad=False) - layer.workspace = Parameter(layer.workspace.data, requires_grad=False) - - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - qweight = layer.B - s_ch = layer.s_channel - s_group = layer.s_group - workspace = layer.workspace - - x_2d = x.view(-1, x.shape[-1]) - - size_m = x_2d.shape[0] - size_k = x_2d.shape[1] - size_n = s_ch.shape[1] - - x_int8, s_tok, _ = ops.scaled_int8_quant(x_2d) - - output_2d = ops.marlin_qqq_gemm(x_int8, qweight, s_tok, s_ch, s_group, - workspace, size_m, size_n, size_k) - - output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], )) - - if bias is not None: - output.add_(bias) # In-place add - - return output diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py deleted file mode 100644 index 8a64bebae04c9..0000000000000 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +++ /dev/null @@ -1,126 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import numpy -import torch - -from .marlin_utils_test import marlin_permute_weights -from .quant_utils import get_pack_factor, qqq_quantize_weights - - -def marlin_qqq_weights(q_w, size_k, size_n, num_bits, perm, group_size): - # Permute - q_w = marlin_permute_weights(q_w, size_k, size_n, perm) - - # Pack - pack_factor = get_pack_factor(num_bits) - orig_device = q_w.device - - q_w = q_w.cpu().numpy().astype(numpy.uint32) - - q_packed = numpy.zeros((q_w.shape[0], q_w.shape[1] // pack_factor), - dtype=numpy.uint32) - if group_size == size_k: - for i in range(pack_factor): - q_packed |= (q_w[:, i::pack_factor] & 0xF) << num_bits * i - else: - for i in range(pack_factor): - q_packed |= q_w[:, i::pack_factor] << num_bits * i - - q_packed = 
torch.from_numpy(q_packed.astype(numpy.int32)).to(orig_device) - - return q_packed - - -def get_qqq_scale_perms(): - scale_perm: list[int] = [] - for i in range(8): - scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single: list[int] = [] - for i in range(4): - scale_perm_single.extend( - [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) - return scale_perm, scale_perm_single - - -# NOTE(HandH1998): QQQ employs different perms for per-group and per-channel weight quantization. # noqa: E501 -def get_qqq_weight_perm(num_bits: int, quant_type: str): - perm_list: list[int] = [] - for i in range(32): - perm1: list[int] = [] - col = i // 4 - for block in [0, 1]: - for row in [ - 4 * (i % 4), - 4 * (i % 4) + 1, - 4 * (i % 4) + 2, - 4 * (i % 4) + 3, - ]: - perm1.append(16 * row + col + 8 * block) - for j in range(4): - perm_list.extend([p + 256 * j for p in perm1]) - - perm = numpy.array(perm_list) - - assert quant_type in ["per-channel", - "per-group"], "not supported quantization type" - if num_bits == 4: - if quant_type == "per-channel": - interleave = numpy.array([4, 0, 5, 1, 6, 2, 7, 3]) - else: - interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7]) - else: - raise Exception("num_bits must be 4, got {}".format(num_bits)) - - perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel() - perm = torch.from_numpy(perm) - return perm - - -def marlin_qqq_permute_scales(s_group, s_channel, size_k, size_n, group_size): - scale_perm, scale_perm_single = get_qqq_scale_perms() - if group_size < size_k and group_size != -1: - s_group = s_group.reshape((-1, len(scale_perm)))[:, scale_perm] - s_channel = s_channel.reshape( - (-1, len(scale_perm_single)))[:, scale_perm_single] - s_group = s_group.reshape((-1, size_n)).contiguous() - else: - s_channel = s_channel.reshape( - (-1, len(scale_perm_single)))[:, scale_perm_single] - s_channel = s_channel.reshape((-1, size_n)).contiguous() - - return s_group, s_channel - - -def marlin_qqq_quantize( - w: torch.Tensor, - 
num_bits: int, - group_size: int, -): - size_k, size_n = w.shape - - # Normalize group_size - if group_size == -1: - group_size = size_k - assert group_size <= size_k - quant_type = "per-channel" if group_size == size_k else "per-group" - - # Quantize - w_ref, q_w, s_group, s_channel = qqq_quantize_weights( - w, num_bits, group_size) - - # Reformat to marlin_qqq - weight_perm = get_qqq_weight_perm(num_bits, quant_type) - marlin_qqq_q_w = marlin_qqq_weights(q_w, size_k, size_n, num_bits, - weight_perm, group_size) - marlin_qqq_s_group, marlin_qqq_s_channel = marlin_qqq_permute_scales( - s_group, s_channel, size_k, size_n, group_size) - - # Create result - res_list = [ - w_ref, marlin_qqq_q_w, marlin_qqq_s_group, marlin_qqq_s_channel - ] - for i in range(len(res_list)): - res_list[i] = res_list[i].to(w.device) - - return res_list diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index 428e9e99aa881..3cfaca6230b12 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -9,8 +9,6 @@ import numpy import torch from vllm._custom_ops import cutlass_scaled_mm_supports_fp4 -from vllm.model_executor.layers.quantization.qqq import ( - MARLIN_QQQ_SUPPORTED_NUM_BITS) from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types @@ -386,89 +384,6 @@ def gptq_quantize_weights(w: torch.Tensor, return w_ref, w_q, w_s, g_idx, rand_perm -# QQQ employs different quant schemes for per-group and -# per-channel quantization. 
-def qqq_quantize_weights(w: torch.Tensor, num_bits: int, group_size: int): - orig_device = w.device - size_k, size_n = w.shape - - assert w.is_floating_point(), "w must be float" - assert num_bits in MARLIN_QQQ_SUPPORTED_NUM_BITS, \ - f"Unsupported num_bits = {num_bits}" - assert group_size in SUPPORTED_GROUP_SIZES + [ - size_k - ], f"Unsupported groupsize = {group_size}" - - if group_size == -1: - group_size = size_k - assert group_size <= size_k - - if group_size < size_k: - # Reshape to [groupsize, -1] - w = w.reshape((-1, group_size, size_n)) - w = w.permute(1, 0, 2) - w = w.reshape((group_size, -1)) - - max_q_val = 2**num_bits - 1 - half_q_val = (max_q_val + 1) // 2 - - # Compute scale for each group - s_group = torch.max(torch.abs(w), 0, keepdim=True)[0] - s_group *= 2 / max_q_val # 2 => symmetric - - # Quantize - q_w = torch.round(w / s_group).int() - q_w += half_q_val - q_w = torch.clamp(q_w, 0, max_q_val) - # Compute ref (dequantized) - w_ref = (q_w - half_q_val).half() * s_group - - # Restore original shapes - def reshape_w(w): - w = w.reshape((group_size, -1, size_n)) - w = w.permute(1, 0, 2) - w = w.reshape((size_k, size_n)).contiguous() - return w - - q_w = reshape_w(q_w) - w_ref = reshape_w(w_ref) - - # Compute int8 quantization scale for each channel - s_channel = torch.max(torch.abs(w_ref), 0, keepdim=True)[0] - s_channel /= 127.0 - t_int8 = (w_ref / s_channel).round().clamp(-128, 127).to(torch.int8) - w_ref = t_int8.half() * s_channel - s_channel = s_channel.reshape(1, -1).to(dtype=torch.float) - - # Fuse scales - s_group = (s_group.reshape(-1, size_n).contiguous() / - s_channel).to(dtype=torch.half) - else: - max_q_val = 2**(num_bits - 1) - 1 - - # Compute scale for each channel - s_channel = torch.max(torch.abs(w), 0, keepdim=True)[0] - s_channel /= max_q_val - - # Quantize - q_w = torch.round(w / s_channel).int() - q_w = torch.clamp(q_w, -max_q_val, max_q_val) - # Compute ref (dequantized) - w_ref = q_w.half() * s_channel - - s_group = 
torch.tensor([], dtype=torch.half) - # div 2 ** (8 - self.bits)) to offset right shift in unpacking - s_channel /= (2**(8 - num_bits)) - s_channel = s_channel.reshape(-1, size_n).contiguous().to(torch.float) - - return ( - w_ref.to(device=orig_device), - q_w.to(device=orig_device), - s_group.to(device=orig_device), - s_channel.to(device=orig_device), - ) - - def sort_weights(q_w: torch.Tensor, g_idx: torch.Tensor): orig_device = q_w.device From 582bbe6bd708d01d74d6d02d6ef59b4c3c34a7b1 Mon Sep 17 00:00:00 2001 From: bigmoyan Date: Thu, 21 Aug 2025 03:59:54 +0800 Subject: [PATCH 144/361] [Fix] correct tool_id for kimi-k2 when use tool_choice=required (#21259) Co-authored-by: wangzhengtao --- .../test_completion_with_function_calling.py | 314 +++++++++++------- tests/utils.py | 10 +- vllm/entrypoints/chat_utils.py | 17 +- vllm/entrypoints/openai/protocol.py | 4 +- vllm/entrypoints/openai/serving_chat.py | 64 +++- .../tool_parsers/deepseekv3_tool_parser.py | 4 +- .../granite_20b_fc_tool_parser.py | 4 +- .../tool_parsers/granite_tool_parser.py | 4 +- .../openai/tool_parsers/hermes_tool_parser.py | 4 +- .../tool_parsers/internlm2_tool_parser.py | 4 +- .../openai/tool_parsers/jamba_tool_parser.py | 4 +- .../openai/tool_parsers/llama_tool_parser.py | 4 +- .../tool_parsers/minimax_tool_parser.py | 4 +- .../tool_parsers/phi4mini_tool_parser.py | 4 +- .../openai/tool_parsers/xlam_tool_parser.py | 4 +- 15 files changed, 283 insertions(+), 166 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index a5b081f861074..4ef5d4e8a699a 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -13,6 +13,127 @@ from ...utils import RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "Qwen/Qwen3-0.6B" +tools = [ + { + "type": "function", + "function": { + 
"name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": + "The city to find the weather for, e.g. 'Vienna'", + "default": "Vienna", + }, + "country": { + "type": + "string", + "description": + "The country that the city is in, e.g. 'Austria'", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, + "options": { + "$ref": "#/$defs/WeatherOptions", + "description": "Optional parameters for weather query", + }, + }, + "required": ["country", "unit"], + "$defs": { + "WeatherOptions": { + "title": "WeatherOptions", + "type": "object", + "additionalProperties": False, + "properties": { + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "default": "celsius", + "description": "Temperature unit", + "title": "Temperature Unit", + }, + "include_forecast": { + "type": "boolean", + "default": False, + "description": + "Whether to include a 24-hour forecast", + "title": "Include Forecast", + }, + "language": { + "type": "string", + "default": "zh-CN", + "description": "Language of the response", + "title": "Language", + "enum": ["zh-CN", "en-US", "ja-JP"], + }, + }, + }, + }, + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_forecast", + "description": "Get the weather forecast for a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": + "The city to get the forecast for, e.g. 'Vienna'", + "default": "Vienna", + }, + "country": { + "type": + "string", + "description": + "The country that the city is in, e.g. 
'Austria'", + }, + "days": { + "type": + "integer", + "description": + "Number of days to get the forecast for (1-7)", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["country", "days", "unit"], + }, + }, + }, +] + +messages = [ + { + "role": "user", + "content": "Hi! How are you doing today?" + }, + { + "role": "assistant", + "content": "I'm doing well! How can I help you?" + }, + { + "role": + "user", + "content": + "Can you tell me what the current weather is in Berlin and the "\ + "forecast for the next 5 days, in fahrenheit?", + }, +] + @pytest.fixture(scope="module") def server(): # noqa: F811 @@ -27,6 +148,8 @@ def server(): # noqa: F811 "hermes", "--reasoning-parser", "qwen3", + "--gpu-memory-utilization", + "0.4" ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -54,129 +177,6 @@ async def client(server): async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str, stream: bool, tool_choice: Union[str, dict], enable_thinking: bool): - tools = [ - { - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "city": { - "type": "string", - "description": - "The city to find the weather for, e.g. 'Vienna'", - "default": "Vienna", - }, - "country": { - "type": - "string", - "description": - "The country that the city is in, e.g. 
'Austria'", - }, - "unit": { - "type": "string", - "description": - "The unit to fetch the temperature in", - "enum": ["celsius", "fahrenheit"], - }, - "options": { - "$ref": "#/$defs/WeatherOptions", - "description": - "Optional parameters for weather query", - }, - }, - "required": ["country", "unit"], - "$defs": { - "WeatherOptions": { - "title": "WeatherOptions", - "type": "object", - "additionalProperties": False, - "properties": { - "unit": { - "type": "string", - "enum": ["celsius", "fahrenheit"], - "default": "celsius", - "description": "Temperature unit", - "title": "Temperature Unit", - }, - "include_forecast": { - "type": "boolean", - "default": False, - "description": - "Whether to include a 24-hour forecast", - "title": "Include Forecast", - }, - "language": { - "type": "string", - "default": "zh-CN", - "description": "Language of the response", - "title": "Language", - "enum": ["zh-CN", "en-US", "ja-JP"], - }, - }, - }, - }, - }, - }, - }, - { - "type": "function", - "function": { - "name": "get_forecast", - "description": "Get the weather forecast for a given location", - "parameters": { - "type": "object", - "properties": { - "city": { - "type": "string", - "description": - "The city to get the forecast for, e.g. 'Vienna'", - "default": "Vienna", - }, - "country": { - "type": - "string", - "description": - "The country that the city is in, e.g. 'Austria'", - }, - "days": { - "type": - "integer", - "description": - "Number of days to get the forecast for (1-7)", - }, - "unit": { - "type": "string", - "description": - "The unit to fetch the temperature in", - "enum": ["celsius", "fahrenheit"], - }, - }, - "required": ["country", "days", "unit"], - }, - }, - }, - ] - - messages = [ - { - "role": "user", - "content": "Hi! How are you doing today?" - }, - { - "role": "assistant", - "content": "I'm doing well! How can I help you?" 
- }, - { - "role": - "user", - "content": - "Can you tell me what the current weather is in Berlin and the "\ - "forecast for the next 5 days, in fahrenheit?", - }, - ] if not stream: # Non-streaming test chat_completion = await client.chat.completions.create( @@ -216,3 +216,71 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str, output.extend(chunk.choices[0].delta.tool_calls) assert len(output) > 0 + + +@pytest.fixture(scope="module") +def k2_server(): # noqa: F811 + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "half", + "--enable-auto-tool-choice", + "--guided-decoding-backend", + "xgrammar", + "--tool-call-parser", + "hermes", + "--reasoning-parser", + "qwen3", + "--gpu-memory-utilization", + "0.4", + ] + # hack to test kimi_k2 tool use tool_id format. + # avoid error in is_deepseek_mla check by setting kv_lora_rank=null + with RemoteOpenAIServer(MODEL_NAME, + args, + override_hf_configs={ + "model_type": 'kimi_k2', + 'kv_lora_rank': None + }) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def k2_client(k2_server): + async with k2_server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("stream", [True, False]) +@pytest.mark.parametrize("tool_choice", ["required"]) +async def test_tool_id_kimi_k2(k2_client: openai.AsyncOpenAI, model_name: str, + stream: bool, tool_choice: str): + + if not stream: + # Non-streaming test + chat_completion = await k2_client.chat.completions.create( + messages=messages, + model=model_name, + tools=tools, + tool_choice=tool_choice) + assert chat_completion.choices[0].message.tool_calls is not None + assert len(chat_completion.choices[0].message.tool_calls) > 0 + assert chat_completion.choices[0].message.tool_calls[ + 0].id == 'functions.get_current_weather:0' + else: + # Streaming test + output_stream = await 
k2_client.chat.completions.create( + messages=messages, + model=model_name, + tools=tools, + tool_choice=tool_choice, + stream=True) + + output = [] + async for chunk in output_stream: + if chunk.choices and chunk.choices[0].delta.tool_calls: + output.extend(chunk.choices[0].delta.tool_calls) + for o in output: + assert o.id is None or o.id == 'functions.get_current_weather:0' diff --git a/tests/utils.py b/tests/utils.py index e98707fb44475..4dba5494665a3 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -5,6 +5,7 @@ import asyncio import copy import functools import importlib +import json import os import signal import subprocess @@ -101,7 +102,8 @@ class RemoteOpenAIServer: env_dict: Optional[dict[str, str]] = None, seed: Optional[int] = 0, auto_port: bool = True, - max_wait_seconds: Optional[float] = None) -> None: + max_wait_seconds: Optional[float] = None, + override_hf_configs: Optional[dict[str, Any]] = None) -> None: if auto_port: if "-p" in vllm_serve_args or "--port" in vllm_serve_args: raise ValueError("You have manually specified the port " @@ -120,6 +122,12 @@ class RemoteOpenAIServer: vllm_serve_args = vllm_serve_args + ["--seed", str(seed)] + if override_hf_configs is not None: + vllm_serve_args = vllm_serve_args + [ + "--hf-overrides", + json.dumps(override_hf_configs) + ] + parser = FlexibleArgumentParser( description="vLLM's remote OpenAI server.") subparsers = parser.add_subparsers(required=False, dest="subparser") diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 74c8093f49674..87772a499f423 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1345,5 +1345,18 @@ def apply_mistral_chat_template( "template") raise ValueError(str(e)) from e -def random_tool_call_id() -> str: - return f"chatcmpl-tool-{random_uuid()}" +def get_history_tool_calls_cnt(conversation: list[ConversationMessage]): + idx = 0 + for msg in conversation: + if msg['role'] == 'assistant': + tool_calls = 
msg.get('tool_calls') + idx += len(list(tool_calls)) if tool_calls is not None else 0 # noqa + return idx + +def make_tool_call_id(id_type:str='random', func_name=None, idx=None): + + if id_type=='kimi_k2': + return f'functions.{func_name}:{idx}' + else: + # by default return random + return f"chatcmpl-tool-{random_uuid()}" diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 39facd4d53d32..a44868973f5d8 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -38,7 +38,7 @@ from typing_extensions import TypeAlias from vllm import envs from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, - random_tool_call_id) + make_tool_call_id) from vllm.entrypoints.score_utils import (ScoreContentPartParam, ScoreMultiModalParam) from vllm.logger import init_logger @@ -1634,7 +1634,7 @@ class FunctionCall(OpenAIBaseModel): class ToolCall(OpenAIBaseModel): - id: str = Field(default_factory=random_tool_call_id) + id: str = Field(default_factory=make_tool_call_id) type: Literal["function"] = "function" function: FunctionCall diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index d57868847eedd..65aac23ee618e 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -19,7 +19,8 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, ConversationMessage, - random_tool_call_id) + get_history_tool_calls_cnt, + make_tool_call_id) from vllm.entrypoints.harmony_utils import ( get_developer_message, get_stop_tokens_for_assistant_actions, get_streamable_parser_for_assistant, get_system_message, parse_chat_input, @@ -133,6 +134,10 @@ class OpenAIServingChat(OpenAIServing): source = "model" if source == "auto" else source logger.info("Using default chat sampling params from %s: %s", source, 
self.default_sampling_params) + if self.model_config.hf_config.model_type == 'kimi_k2': + self.tool_call_id_type = 'kimi_k2' + else: + self.tool_call_id_type = 'random' self.use_harmony = model_config.hf_config.model_type == "gpt_oss" if self.use_harmony: @@ -379,6 +384,7 @@ class OpenAIServingChat(OpenAIServing): current_text: Optional[str], delta_text: str, function_name_returned: bool, + tool_call_idx: Optional[int] = None ) -> tuple[Optional[DeltaMessage], bool]: if current_text is None or current_text == "": # if the current text is empty, we cannot parse it @@ -424,8 +430,12 @@ class OpenAIServingChat(OpenAIServing): current_tool_call = obj[-2] function_name_returned = True + tool_call_id = make_tool_call_id( + id_type=self.tool_call_id_type, + func_name=current_tool_call["name"], + idx=tool_call_idx) delta_message = DeltaMessage(tool_calls=[ - DeltaToolCall(id=random_tool_call_id(), + DeltaToolCall(id=tool_call_id, function=DeltaFunctionCall( name=current_tool_call["name"], arguments=arguments), @@ -491,6 +501,10 @@ class OpenAIServingChat(OpenAIServing): all_previous_token_ids: Optional[list[list[int]]] function_name_returned = [False] * num_choices + if self.tool_call_id_type == 'kimi_k2': + history_tool_call_cnt = get_history_tool_calls_cnt(conversation) + else: + history_tool_call_cnt = 0 # Always track previous_texts for comprehensive output logging previous_texts = [""] * num_choices @@ -673,7 +687,6 @@ class OpenAIServingChat(OpenAIServing): previous_text = previous_texts[i] previous_token_ids = all_previous_token_ids[i] current_text = previous_text + delta_text - # avoid the None + list error. 
if previous_token_ids: current_token_ids = previous_token_ids + as_list( @@ -733,7 +746,7 @@ class OpenAIServingChat(OpenAIServing): index=i) else: delta_tool_call = DeltaToolCall( - id=random_tool_call_id(), + id=make_tool_call_id(), type="function", function=DeltaFunctionCall( name=tool_choice_function_name, @@ -764,7 +777,11 @@ class OpenAIServingChat(OpenAIServing): previous_text=previous_text, current_text=content, delta_text=delta_text, - function_name_returned=fn_name_returned)) + function_name_returned=fn_name_returned, + tool_call_idx=history_tool_call_cnt)) + if (delta_message and delta_message.tool_calls and + delta_message.tool_calls[0].id is not None): + history_tool_call_cnt += 1 # update the previous values for the next iteration previous_texts[i] = current_text @@ -1089,6 +1106,10 @@ class OpenAIServingChat(OpenAIServing): assert final_res is not None choices: list[ChatCompletionResponseChoice] = [] + if self.tool_call_id_type == 'kimi_k2': + history_tool_call_cnt = get_history_tool_calls_cnt(conversation) + else: + history_tool_call_cnt = 0 role = self.get_chat_request_role(request) for output in final_res.outputs: @@ -1194,17 +1215,26 @@ class OpenAIServingChat(OpenAIServing): assert content is not None tool_calls = TypeAdapter( list[FunctionDefinition]).validate_json(content) + tool_call_ids = [] + for tool_call in tool_calls: + tool_call_ids.append( + make_tool_call_id(id_type=self.tool_call_id_type, + func_name=tool_call.name, + idx=history_tool_call_cnt)) + history_tool_call_cnt += 1 message = ChatMessage( role=role, content="", - reasoning_content=reasoning_content, tool_calls=[ - tool_call_class(function=FunctionCall( - name=tool_call.name, - arguments=json.dumps(tool_call.parameters, - ensure_ascii=False))) - for tool_call in tool_calls - ]) + tool_call_class(id=tool_call_ids[i], + function=FunctionCall( + name=tool_call.name, + arguments=json.dumps( + tool_call.parameters, + ensure_ascii=False))) + for i, tool_call in enumerate(tool_calls) 
+ ], + reasoning_content=reasoning_content) # if the request doesn't use tool choice # OR specifies to not use a tool @@ -1248,7 +1278,6 @@ class OpenAIServingChat(OpenAIServing): if (tool_call_info.content and len(tool_call_info.content) > 0): ret_content = tool_call_info.content - message = ChatMessage(role=role, reasoning_content=reasoning_content, content=ret_content) @@ -1327,12 +1356,11 @@ class OpenAIServingChat(OpenAIServing): elif choice.message.tool_calls: # For tool calls, log the function name and arguments tool_call_descriptions = [] - for tool_call in choice.message.tool_calls: - if hasattr(tool_call.function, "name") and hasattr( - tool_call.function, "arguments"): + for tc in choice.message.tool_calls: + if hasattr(tc.function, "name") and hasattr( + tc.function, "arguments"): tool_call_descriptions.append( - f"{tool_call.function.name}({tool_call.function.arguments})" - ) + f"{tc.function.name}({tc.function.arguments})") tool_calls_str = ", ".join(tool_call_descriptions) output_text = f"[tool_calls: {tool_calls_str}]" diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py index da4760ad1b642..ac272b0c3b205 100644 --- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py @@ -6,7 +6,7 @@ from typing import Union import regex as re -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -267,7 +267,7 @@ class DeepSeekV3ToolParser(ToolParser): DeltaToolCall( index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True), diff --git 
a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index 5508ba6a39408..824b100f357b5 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -10,7 +10,7 @@ import partial_json_parser import regex as re from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -203,7 +203,7 @@ class Granite20bFCToolParser(ToolParser): delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index fcc5b7edda83f..ac517616a95b4 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -8,7 +8,7 @@ from typing import Union import partial_json_parser from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -185,7 +185,7 @@ class GraniteToolParser(ToolParser): delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py 
b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index d126130ab9bc3..a6ce33af6bd00 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -9,7 +9,7 @@ import partial_json_parser import regex as re from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -307,7 +307,7 @@ class Hermes2ProToolParser(ToolParser): return DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index 92004de030d14..6ef8fadf59ac5 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -8,7 +8,7 @@ from typing import Union import partial_json_parser from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -107,7 +107,7 @@ class Internlm2ToolParser(ToolParser): delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py index 66b483d8b0f66..3b41f6034704c 
100644 --- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -9,7 +9,7 @@ import partial_json_parser import regex as re from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -222,7 +222,7 @@ class JambaToolParser(ToolParser): delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 194a144ad576e..31b19c8db4163 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -10,7 +10,7 @@ import regex as re from partial_json_parser.core.options import Allow from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -213,7 +213,7 @@ class Llama3JsonToolParser(ToolParser): delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, type="function", - id=random_tool_call_id(), + id=make_tool_call_id(), function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py index 226309ef293a9..283e6095013d6 100644 --- a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +++ 
b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py @@ -7,7 +7,7 @@ from typing import Any, Optional, Union import regex as re -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -394,7 +394,7 @@ class MinimaxToolParser(ToolParser): sent_tools.append({ "sent_name": False, "sent_arguments": "", - "id": random_tool_call_id(), + "id": make_tool_call_id(), }) while len(tool_ids) < tool_count: diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py index 5501028cf36b8..85dd56213c6ac 100644 --- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py @@ -8,7 +8,7 @@ from typing import Any, Optional import regex as re from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage, ExtractedToolCallInformation, @@ -74,7 +74,7 @@ class Phi4MiniJsonToolParser(ToolParser): tool_calls: list[ToolCall] = [ ToolCall( - id=random_tool_call_id(), + id=make_tool_call_id(), type="function", function=FunctionCall( name=raw_function_call["name"], diff --git a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py index 321718b1c950b..87cd413b37200 100644 --- a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py @@ -7,7 +7,7 @@ from typing import Any, Optional, Union import regex as re -from vllm.entrypoints.chat_utils import random_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from 
vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -226,7 +226,7 @@ class xLAMToolParser(ToolParser): function_name = name_match.group(1) # The test expects us to send just the name first - tool_id = random_tool_call_id() + tool_id = make_tool_call_id() delta = DeltaMessage(tool_calls=[ DeltaToolCall( index=0, From b95697d7310637399998ebf1f21a26b523aa6611 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 20 Aug 2025 13:03:37 -0700 Subject: [PATCH 145/361] [Frontend] improve error logging of chat completion (#22957) Signed-off-by: Chen Zhang --- vllm/entrypoints/openai/api_server.py | 74 +++++++++++++++++++++------ 1 file changed, 57 insertions(+), 17 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 24148bcef2353..14ba8aa641837 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -600,8 +600,11 @@ async def create_responses(request: ResponsesRequest, raw_request: Request): if handler is None: return base(raw_request).create_error_response( message="The model does not support Responses API") - - generator = await handler.create_responses(request, raw_request) + try: + generator = await handler.create_responses(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -618,7 +621,11 @@ async def retrieve_responses(response_id: str, raw_request: Request): return base(raw_request).create_error_response( message="The model does not support Responses API") - response = await handler.retrieve_responses(response_id) + try: + response = await handler.retrieve_responses(response_id) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(response, 
ErrorResponse): return JSONResponse(content=response.model_dump(), @@ -633,7 +640,11 @@ async def cancel_responses(response_id: str, raw_request: Request): return base(raw_request).create_error_response( message="The model does not support Responses API") - response = await handler.cancel_responses(response_id) + try: + response = await handler.cancel_responses(response_id) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(response, ErrorResponse): return JSONResponse(content=response.model_dump(), @@ -667,9 +678,11 @@ async def create_chat_completion(request: ChatCompletionRequest, if handler is None: return base(raw_request).create_error_response( message="The model does not support Chat Completions API") - - generator = await handler.create_chat_completion(request, raw_request) - + try: + generator = await handler.create_chat_completion(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.error.code) @@ -742,7 +755,11 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): return base(raw_request).create_error_response( message="The model does not support Embeddings API") - generator = await handler.create_embedding(request, raw_request) + try: + generator = await handler.create_embedding(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -770,8 +787,11 @@ async def create_pooling(request: PoolingRequest, raw_request: Request): if handler is None: return base(raw_request).create_error_response( message="The model does not support Pooling API") - - generator = 
await handler.create_pooling(request, raw_request) + try: + generator = await handler.create_pooling(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.error.code) @@ -791,7 +811,11 @@ async def create_classify(request: ClassificationRequest, return base(raw_request).create_error_response( message="The model does not support Classification API") - generator = await handler.create_classify(request, raw_request) + try: + generator = await handler.create_classify(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.error.code) @@ -820,7 +844,11 @@ async def create_score(request: ScoreRequest, raw_request: Request): return base(raw_request).create_error_response( message="The model does not support Score API") - generator = await handler.create_score(request, raw_request) + try: + generator = await handler.create_score(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.error.code) @@ -878,8 +906,12 @@ async def create_transcriptions(raw_request: Request, message="The model does not support Transcriptions API") audio_data = await request.file.read() - generator = await handler.create_transcription(audio_data, request, - raw_request) + try: + generator = await handler.create_transcription(audio_data, request, + raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if 
isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -919,8 +951,12 @@ async def create_translations(request: Annotated[TranslationRequest, message="The model does not support Translations API") audio_data = await request.file.read() - generator = await handler.create_translation(audio_data, request, - raw_request) + try: + generator = await handler.create_translation(audio_data, request, + raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -949,7 +985,11 @@ async def do_rerank(request: RerankRequest, raw_request: Request): if handler is None: return base(raw_request).create_error_response( message="The model does not support Rerank (Score) API") - generator = await handler.do_rerank(request, raw_request) + try: + generator = await handler.do_rerank(request, raw_request) + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.error.code) From bf7c99dfc40bff6844b2ae57554516922eb93b71 Mon Sep 17 00:00:00 2001 From: Saurabh Misra Date: Wed, 20 Aug 2025 13:17:11 -0700 Subject: [PATCH 146/361] [Perf] Speed up function `_convert_tokens_to_string_with_added_encoders` by 13.7x (#20413) Signed-off-by: Saurabh Misra Signed-off-by: Aseem Saxena Co-authored-by: codeflash-ai[bot] <148906541+codeflash-ai[bot]@users.noreply.github.com> Co-authored-by: Aseem Saxena --- vllm/transformers_utils/detokenizer_utils.py | 25 ++++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index be1040c3e0147..101f31d39cc1f 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ 
b/vllm/transformers_utils/detokenizer_utils.py @@ -23,27 +23,32 @@ def _convert_tokens_to_string_with_added_encoders( # NOTE(woosuk): The following code is slow because it runs a for loop over # the output_tokens. In Python, running a for loop over a list can be slow # even when the loop body is very simple. + # Performance improvements: avoid repeated attribute and function lookups; + # localize frequently used objects; + sub_texts: list[str] = [] current_sub_text: list[str] = [] - all_special_tokens = set(tokenizer.all_special_tokens) + convert_tokens_to_string = tokenizer.convert_tokens_to_string + added_vocab_set = set(tokenizer.get_added_vocab()) + all_special_tokens = set( + tokenizer.all_special_tokens) if skip_special_tokens else () + for token in output_tokens: - if skip_special_tokens and token in all_special_tokens: + # Use precomputed set for skip-special check + if token in all_special_tokens: continue - if token in tokenizer.get_added_vocab(): + if token in added_vocab_set: if current_sub_text: - sub_text = tokenizer.convert_tokens_to_string(current_sub_text) - sub_texts.append(sub_text) - current_sub_text = [] + sub_texts.append(convert_tokens_to_string(current_sub_text)) + current_sub_text.clear() sub_texts.append(token) else: current_sub_text.append(token) if current_sub_text: - sub_text = tokenizer.convert_tokens_to_string(current_sub_text) - sub_texts.append(sub_text) + sub_texts.append(convert_tokens_to_string(current_sub_text)) if spaces_between_special_tokens: return " ".join(sub_texts) - else: - return "".join(sub_texts) + return "".join(sub_texts) # 5 is an arbitrary value that should work for all From 4e51fa8cbaba2c6fd516b4615a533b0a94796516 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 20 Aug 2025 16:28:30 -0400 Subject: [PATCH 147/361] Do not use eval() to convert unknown types (#23266) Signed-off-by: Russell Bryant --- .../openai/tool_parsers/qwen3coder_tool_parser.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 
deletions(-) diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py index cf4d0b231aee1..2501d6739e8f6 100644 --- a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py @@ -208,15 +208,10 @@ class Qwen3CoderToolParser(ToolParser): "valid JSON object in tool '%s', will try other " "methods to parse it.", param_value, param_name, func_name) - try: - converted_value = eval(param_value) - return converted_value - except Exception: - logger.warning( - "Parsed value '%s' of parameter '%s' cannot be " - "converted via Python `eval()` in tool '%s', " - "degenerating to string.", param_value, param_name, - func_name) + logger.warning( + "Parameter '%s' has unknown type '%s'. " + "The value will be treated as a string.", param_name, + param_type) return param_value # Extract function name From 4fbda0b20cc539f72314375c2abc6100ebac8392 Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Thu, 21 Aug 2025 05:07:28 +0800 Subject: [PATCH 148/361] [Feature] use --eplb_config to set eplb param (#20562) Signed-off-by: rongfu.leng Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: rongfu.leng Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/__init__.py | 3 +- vllm/config/parallel.py | 108 +++++++++++++++++----- vllm/distributed/eplb/eplb_state.py | 4 +- vllm/engine/arg_utils.py | 63 +++++++++---- vllm/model_executor/models/deepseek_v2.py | 4 +- vllm/model_executor/models/glm4_moe.py | 4 +- vllm/model_executor/models/qwen3_moe.py | 7 +- vllm/v1/worker/gpu_model_runner.py | 4 +- vllm/v1/worker/gpu_worker.py | 4 +- 9 files changed, 149 insertions(+), 52 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 62dfd4333bee8..959f111ced22e 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -33,7 +33,8 @@ from 
vllm.config.cache import (BlockSize, CacheConfig, CacheDType, MambaDType, PrefixCachingHashAlgo) from vllm.config.compilation import (CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig) -from vllm.config.parallel import DistributedExecutorBackend, ParallelConfig +from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig, + ParallelConfig) from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy from vllm.config.utils import ConfigType, config from vllm.logger import init_logger diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 7a9e68f0ea332..2b716a77066ac 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -6,7 +6,7 @@ from dataclasses import field from typing import TYPE_CHECKING, Any, Literal, Optional, Union import torch -from pydantic import model_validator +from pydantic import TypeAdapter, model_validator from pydantic.dataclasses import dataclass from torch.distributed import ProcessGroup, ReduceOp from typing_extensions import Self @@ -32,6 +32,38 @@ logger = init_logger(__name__) DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"] +@config +@dataclass +class EPLBConfig: + """Configuration for Expert Parallel Load Balancing (EP).""" + + window_size: int = 1000 + """Window size for expert load recording.""" + step_interval: int = 3000 + """ + Interval for rearranging experts in expert parallelism. + + Note that if this is greater than the EPLB window size, only the metrics + of the last `lb_window_size` steps will be used for rearranging experts. + """ + + num_redundant_experts: int = 0 + """Number of redundant experts to use for expert parallelism.""" + + log_balancedness: bool = False + """ + Log the balancedness each step of expert parallelism. + This is turned off by default since it will cause communication overhead. + """ + + @classmethod + def from_cli(cls, cli_value: str) -> "EPLBConfig": + """Parse the CLI value for the compilation config. 
+ -O1, -O2, -O3, etc. is handled in FlexibleArgumentParser. + """ + return TypeAdapter(EPLBConfig).validate_json(cli_value) + + @config @dataclass class ParallelConfig: @@ -75,22 +107,24 @@ class ParallelConfig: """Use expert parallelism instead of tensor parallelism for MoE layers.""" enable_eplb: bool = False """Enable expert parallelism load balancing for MoE layers.""" - num_redundant_experts: int = 0 - """Number of redundant experts to use for expert parallelism.""" - eplb_window_size: int = 1000 - """Window size for expert load recording.""" - eplb_step_interval: int = 3000 - """ - Interval for rearranging experts in expert parallelism. - - Note that if this is greater than the EPLB window size, only the metrics - of the last `eplb_window_size` steps will be used for rearranging experts. - """ - eplb_log_balancedness: bool = False - """ - Log the balancedness each step of expert parallelism. - This is turned off by default since it will cause communication overhead. - """ + eplb_config: EPLBConfig = field(default_factory=EPLBConfig) + """Expert parallelism configuration.""" + num_redundant_experts: Optional[int] = None + """`num_redundant_experts` is deprecated and has been replaced with + `eplb_config.num_redundant_experts`. This will be removed in v0.12.0. + Please use `eplb_config.num_redundant_experts` instead.""" + eplb_window_size: Optional[int] = None + """`eplb_window_size` is deprecated and has been replaced with + `eplb_config.window_size`. This will be removed in v0.12.0. + Please use `eplb_config.window_size` instead.""" + eplb_step_interval: Optional[int] = None + """`eplb_step_interval` is deprecated and has been replaced with + `eplb_config.step_interval`. This will be removed in v0.12.0. + Please use `eplb_config.step_interval` instead.""" + eplb_log_balancedness: Optional[bool] = None + """`eplb_log_balancedness` is deprecated and has been replaced with + `eplb_config.log_balancedness`. This will be removed in v0.12.0. 
+ Please use `eplb_config.log_balancedness` instead.""" max_parallel_loading_workers: Optional[int] = None """Maximum number of parallel loading workers when loading model @@ -237,6 +271,38 @@ class ParallelConfig: return hashlib.sha256(str(factors).encode()).hexdigest() def __post_init__(self) -> None: + # Forward deprecated fields to their new location + if self.num_redundant_experts is not None: + self.eplb_config.num_redundant_experts = ( + self.num_redundant_experts) + logger.warning_once( + "num_redundant_experts is deprecated and has been replaced " + "with eplb_config.num_redundant_experts. This will be removed " + "in v0.12.0. Changing this field after initialization will " + "have no effect.") + if self.eplb_window_size is not None: + self.eplb_config.window_size = self.eplb_window_size + logger.warning_once( + "eplb_window_size is deprecated and has been replaced " + "with eplb_config.window_size. This will be removed " + "in v0.12.0. Changing this field after initialization will " + "have no effect.") + if self.eplb_step_interval is not None: + self.eplb_config.step_interval = self.eplb_step_interval + logger.warning_once( + "eplb_step_interval is deprecated and has been replaced " + "with eplb_config.step_interval. This will be removed " + "in v0.12.0. Changing this field after initialization will " + "have no effect.") + if self.eplb_log_balancedness is not None: + self.eplb_config.log_balancedness = self.eplb_log_balancedness + logger.warning_once( + "eplb_log_balancedness is deprecated and has been replaced " + "with eplb_config.log_balancedness. This will be removed " + "in v0.12.0. 
Changing this field after initialization will " + "have no effect.") + + # Continue with the rest of the initialization self.world_size = self.pipeline_parallel_size * \ self.tensor_parallel_size @@ -275,10 +341,10 @@ class ParallelConfig: raise ValueError( "Expert parallelism load balancing is only supported on " "CUDA devices now.") - if self.num_redundant_experts < 0: + if self.eplb_config.num_redundant_experts < 0: raise ValueError( "num_redundant_experts must be non-negative, but got " - f"{self.num_redundant_experts}.") + f"{self.eplb_config.num_redundant_experts}.") if not self.enable_expert_parallel: raise ValueError( "enable_expert_parallel must be True to use EPLB.") @@ -289,10 +355,10 @@ class ParallelConfig: f"TP={self.tensor_parallel_size},DP={self.data_parallel_size}." ) else: - if self.num_redundant_experts != 0: + if self.eplb_config.num_redundant_experts != 0: raise ValueError( "num_redundant_experts should be used with EPLB." - f"{self.num_redundant_experts}.") + f"{self.eplb_config.num_redundant_experts}.") if self.distributed_executor_backend is None and self.world_size > 1: # We use multiprocessing by default if world_size fits on the # current node and we aren't in a ray placement group. 
diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index 979f2a06cec9f..042acf40d67c2 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -244,7 +244,7 @@ class EplbState: dtype=torch.int32, device=device, ) - expert_load_window_size = parallel_config.eplb_window_size + expert_load_window_size = parallel_config.eplb_config.window_size expert_load_window = torch.zeros( (expert_load_window_size, model.num_moe_layers, model.num_physical_experts), @@ -253,7 +253,7 @@ class EplbState: ) # Set the initial progress of rearrangement to 3/4 - eplb_step_interval = parallel_config.eplb_step_interval + eplb_step_interval = parallel_config.eplb_config.step_interval expert_rearrangement_step = max( 0, eplb_step_interval - eplb_step_interval // 4) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6869c3f23f315..dcf78758946f9 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -25,7 +25,7 @@ import vllm.envs as envs from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, ConfigFormat, ConfigType, ConvertOption, DecodingConfig, DetailedTraceModules, Device, - DeviceConfig, DistributedExecutorBackend, + DeviceConfig, DistributedExecutorBackend, EPLBConfig, GuidedDecodingBackend, HfOverrides, KVEventsConfig, KVTransferConfig, LoadConfig, LogprobsMode, LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig, @@ -305,11 +305,12 @@ class EngineArgs: data_parallel_hybrid_lb: bool = False data_parallel_backend: str = ParallelConfig.data_parallel_backend enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel + eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config") enable_eplb: bool = ParallelConfig.enable_eplb - num_redundant_experts: int = ParallelConfig.num_redundant_experts - eplb_window_size: int = ParallelConfig.eplb_window_size - eplb_step_interval: int = ParallelConfig.eplb_step_interval - eplb_log_balancedness: bool = 
ParallelConfig.eplb_log_balancedness + num_redundant_experts: int = EPLBConfig.num_redundant_experts + eplb_window_size: int = EPLBConfig.window_size + eplb_step_interval: int = EPLBConfig.step_interval + eplb_log_balancedness: bool = EPLBConfig.log_balancedness max_parallel_loading_workers: Optional[ int] = ParallelConfig.max_parallel_loading_workers block_size: Optional[BlockSize] = CacheConfig.block_size @@ -454,6 +455,9 @@ class EngineArgs: if isinstance(self.compilation_config, dict): self.compilation_config = CompilationConfig( **self.compilation_config) + if isinstance(self.eplb_config, dict): + self.eplb_config = EPLBConfig.from_cli(json.dumps( + self.eplb_config)) # Setup plugins from vllm.plugins import load_general_plugins load_general_plugins() @@ -661,14 +665,32 @@ class EngineArgs: **parallel_kwargs["enable_expert_parallel"]) parallel_group.add_argument("--enable-eplb", **parallel_kwargs["enable_eplb"]) - parallel_group.add_argument("--num-redundant-experts", - **parallel_kwargs["num_redundant_experts"]) - parallel_group.add_argument("--eplb-window-size", - **parallel_kwargs["eplb_window_size"]) - parallel_group.add_argument("--eplb-step-interval", - **parallel_kwargs["eplb_step_interval"]) - parallel_group.add_argument("--eplb-log-balancedness", - **parallel_kwargs["eplb_log_balancedness"]) + parallel_group.add_argument("--eplb-config", + **parallel_kwargs["eplb_config"]) + parallel_group.add_argument( + "--num-redundant-experts", + type=int, + help= + "[DEPRECATED] --num-redundant-experts will be removed in v0.12.0.", + deprecated=True) + parallel_group.add_argument( + "--eplb-window-size", + type=int, + help="[DEPRECATED] --eplb-window-size will be removed in v0.12.0.", + deprecated=True) + parallel_group.add_argument( + "--eplb-step-interval", + type=int, + help= + "[DEPRECATED] --eplb-step-interval will be removed in v0.12.0.", + deprecated=True) + parallel_group.add_argument( + "--eplb-log-balancedness", + action=argparse.BooleanOptionalAction, 
+ help= + "[DEPRECATED] --eplb-log-balancedness will be removed in v0.12.0.", + deprecated=True) + parallel_group.add_argument( "--max-parallel-loading-workers", **parallel_kwargs["max_parallel_loading_workers"]) @@ -1244,6 +1266,16 @@ class EngineArgs: "Currently, speculative decoding is not supported with " "async scheduling.") + # Forward the deprecated CLI args to the EPLB config. + if self.num_redundant_experts is not None: + self.eplb_config.num_redundant_experts = self.num_redundant_experts + if self.eplb_window_size is not None: + self.eplb_config.window_size = self.eplb_window_size + if self.eplb_step_interval is not None: + self.eplb_config.step_interval = self.eplb_step_interval + if self.eplb_log_balancedness is not None: + self.eplb_config.log_balancedness = self.eplb_log_balancedness + parallel_config = ParallelConfig( pipeline_parallel_size=self.pipeline_parallel_size, tensor_parallel_size=self.tensor_parallel_size, @@ -1257,10 +1289,7 @@ class EngineArgs: data_parallel_hybrid_lb=self.data_parallel_hybrid_lb, enable_expert_parallel=self.enable_expert_parallel, enable_eplb=self.enable_eplb, - num_redundant_experts=self.num_redundant_experts, - eplb_window_size=self.eplb_window_size, - eplb_step_interval=self.eplb_step_interval, - eplb_log_balancedness=self.eplb_log_balancedness, + eplb_config=self.eplb_config, max_parallel_loading_workers=self.max_parallel_loading_workers, disable_custom_all_reduce=self.disable_custom_all_reduce, ray_workers_use_nsight=self.ray_workers_use_nsight, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index f199da135ec76..d56224b4b7b30 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -132,10 +132,10 @@ class DeepseekV2MoE(nn.Module): # Load balancing settings. 
vllm_config = get_current_vllm_config() - parallel_config = vllm_config.parallel_config + eplb_config = vllm_config.parallel_config.eplb_config self.enable_eplb = enable_eplb - self.n_redundant_experts = parallel_config.num_redundant_experts + self.n_redundant_experts = eplb_config.num_redundant_experts self.n_logical_experts = self.n_routed_experts self.n_physical_experts = (self.n_logical_experts + self.n_redundant_experts) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index aff491f9596c3..fe5e46a99826f 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -131,10 +131,10 @@ class Glm4MoE(nn.Module): # Load balancing settings. vllm_config = get_current_vllm_config() - parallel_config = vllm_config.parallel_config + eplb_config = vllm_config.parallel_config.eplb_config self.enable_eplb = enable_eplb - self.n_redundant_experts = parallel_config.num_redundant_experts + self.n_redundant_experts = eplb_config.num_redundant_experts self.n_logical_experts = self.n_routed_experts self.n_physical_experts = (self.n_logical_experts + self.n_redundant_experts) diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 05bbb0d2e8995..2812f79a66b70 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -121,11 +121,11 @@ class Qwen3MoeSparseMoeBlock(nn.Module): # Load balancing settings. 
vllm_config = get_current_vllm_config() - parallel_config = vllm_config.parallel_config + eplb_config = vllm_config.parallel_config.eplb_config self.enable_eplb = enable_eplb self.n_logical_experts = self.n_routed_experts - self.n_redundant_experts = parallel_config.num_redundant_experts + self.n_redundant_experts = eplb_config.num_redundant_experts self.n_physical_experts = (self.n_logical_experts + self.n_redundant_experts) self.n_local_physical_experts = self.n_physical_experts // self.ep_size @@ -363,7 +363,8 @@ class Qwen3MoeModel(nn.Module): quant_config = vllm_config.quant_config parallel_config = vllm_config.parallel_config enable_eplb = parallel_config.enable_eplb - self.num_redundant_experts = parallel_config.num_redundant_experts + eplb_config = parallel_config.eplb_config + self.num_redundant_experts = eplb_config.num_redundant_experts self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d9770226b14ee..33747d6917a5a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1435,7 +1435,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): model, is_dummy, is_profile, - log_stats=self.parallel_config.eplb_log_balancedness, + log_stats=self.parallel_config.eplb_config.log_balancedness, ) def get_dp_padding(self, @@ -1977,7 +1977,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): global_expert_load, old_global_expert_indices = ( EplbState.recv_state()) num_logical_experts = global_expert_load.shape[1] - self.parallel_config.num_redundant_experts = ( + self.parallel_config.eplb_config.num_redundant_experts = ( num_local_physical_experts * new_ep_size - num_logical_experts) assert old_global_expert_indices.shape[ 1] % num_local_physical_experts == 0 diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 22e639b97d09c..d61177d4245dd 100644 
--- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -515,7 +515,7 @@ class Worker(WorkerBase): assert self.model_runner.eplb_state is not None new_physical_experts = \ self.model_runner.eplb_state.physical_to_logical_map.shape[1] - parallel_config.num_redundant_experts = ( + parallel_config.eplb_config.num_redundant_experts = ( new_physical_experts - self.model_runner.eplb_state.logical_replica_count.shape[1]) global_expert_load = None @@ -531,7 +531,7 @@ class Worker(WorkerBase): assert self.model_runner.eplb_state is not None global_expert_load = self.model_runner.eplb_state.rearrange( self.model_runner.model, execute_shuffle=False) - parallel_config.num_redundant_experts = ( + parallel_config.eplb_config.num_redundant_experts = ( new_physical_experts - global_expert_load.shape[1]) prepare_communication_buffer_for_model(self.model_runner.model) self.model_runner.model.update_physical_experts_metadata( From 1b125004bea9f4cd120d3ce96dc1d3a2962ebace Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Aug 2025 05:15:34 +0800 Subject: [PATCH 149/361] [misc] fix multiple arch wheels for the nightly index (#23110) Signed-off-by: youkaichao --- .buildkite/generate_index.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py index 7045d8810493e..6b5a2a99356aa 100644 --- a/.buildkite/generate_index.py +++ b/.buildkite/generate_index.py @@ -8,7 +8,8 @@ template = """

Links for vLLM

-
{wheel}
+ {x86_wheel}
+ {arm_wheel}
""" @@ -21,7 +22,20 @@ filename = os.path.basename(args.wheel) with open("index.html", "w") as f: print(f"Generated index.html for {args.wheel}") + if "x86_64" in filename: + x86_wheel = filename + arm_wheel = filename.replace("x86_64", "aarch64") + elif "aarch64" in filename: + x86_wheel = filename.replace("aarch64", "x86_64") + arm_wheel = filename + else: + raise ValueError(f"Unsupported wheel: {filename}") # cloudfront requires escaping the '+' character f.write( - template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B")) + template.format( + x86_wheel=x86_wheel, + x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"), + arm_wheel=arm_wheel, + arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"), + ) ) From a4fbb32fab3d2f91b3672bf581565378aaa18d6c Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 20 Aug 2025 17:43:17 -0400 Subject: [PATCH 150/361] Remove chunked_prefill_enabled flag in V1 MLA (#23183) Signed-off-by: Matthew Bonanni --- vllm/v1/attention/backends/mla/common.py | 50 +++++++++++------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index f2610671f769e..646e4fec836bd 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -416,7 +416,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): self.model_config = vllm_config.model_config cache_config = vllm_config.cache_config parallel_config = vllm_config.parallel_config - self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled self.num_heads = self.model_config.get_num_attention_heads( parallel_config) self.mla_dims = get_mla_dims(self.model_config) @@ -426,30 +425,28 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): if self.aot_schedule: self.page_size = self.kv_cache_spec.block_size - if self.chunked_prefill_enabled: - self.chunked_prefill_workspace_size = min( - # Max sure there 
is enough for 8 full length request or at least - # 4 pages of cache per request - max( - 8 * self.model_config.max_model_len, 4 * - scheduler_config.max_num_seqs * cache_config.block_size), - # For long-context models try not to over-allocate limiting - # kv-cache space, limiting it to 64k tokens, - # which would result in the workspace being: - # 2*(576)*(64*1024) = 144mb - # (assuming 576 MLA head dim, and fp16) - # which would result in up-projected context being - # 2*(192*128)*(64*1024) = 3gb - # (assuming 192 QK head dim, 128 heads, and fp16) - 128 * 1024) - assert self.chunked_prefill_workspace_size >= \ - scheduler_config.max_num_seqs * cache_config.block_size - self.chunked_prefill_workspace = torch.empty( - (self.chunked_prefill_workspace_size, - self.model_config.get_head_size()), - dtype=self.model_config.dtype, - device=device, - ) + self.chunked_prefill_workspace_size = min( + # Max sure there is enough for 8 full length request or at least + # 4 pages of cache per request + max(8 * self.model_config.max_model_len, + 4 * scheduler_config.max_num_seqs * cache_config.block_size), + # For long-context models try not to over-allocate limiting + # kv-cache space, limiting it to 64k tokens, + # which would result in the workspace being: + # 2*(576)*(64*1024) = 144mb + # (assuming 576 MLA head dim, and fp16) + # which would result in up-projected context being + # 2*(192*128)*(64*1024) = 3gb + # (assuming 192 QK head dim, 128 heads, and fp16) + 128 * 1024) + assert self.chunked_prefill_workspace_size >= \ + scheduler_config.max_num_seqs * cache_config.block_size + self.chunked_prefill_workspace = torch.empty( + (self.chunked_prefill_workspace_size, + self.model_config.get_head_size()), + dtype=self.model_config.dtype, + device=device, + ) self._use_cudnn_prefill = use_cudnn_prefill() self._use_fi_prefill = use_flashinfer_prefill() @@ -620,8 +617,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): reqs_start:] - query_start_loc[reqs_start] 
chunked_context_metadata = None - if self.chunked_prefill_enabled and num_prefills > 0 \ - and max_context_len_cpu > 0: + if max_context_len_cpu > 0: # NOTE: it is recommend you read the `Chunked Prefill` section # in the comment at the top of the file before trying to # understand the following code From 10cc12ba66834e33659f1ce3a00235506db20dd5 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 20 Aug 2025 17:46:47 -0400 Subject: [PATCH 151/361] Feature/mla tests (#23195) Signed-off-by: Matthew Bonanni Signed-off-by: Matthew Bonanni --- tests/v1/attention/test_attention_backends.py | 26 +- tests/v1/attention/test_mla_backends.py | 522 ++++++++++++++++++ tests/v1/attention/utils.py | 11 +- vllm/v1/attention/backends/mla/common.py | 16 +- 4 files changed, 551 insertions(+), 24 deletions(-) create mode 100644 tests/v1/attention/test_mla_backends.py diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py index ac08b9052cd80..60e04ad9069e7 100644 --- a/tests/v1/attention/test_attention_backends.py +++ b/tests/v1/attention/test_attention_backends.py @@ -150,15 +150,15 @@ def create_and_prepopulate_kv_cache( # Permute the context blocks (excluding block 0 which is null) if randomize_blocks: - perm = torch.randperm( - blocks_end - 1) + 1 # Random permutation starting from block 1 + # Random permutation starting from block 1 + perm = torch.randperm(blocks_end - 1) + 1 else: - perm = torch.arange( - 1, blocks_end) # Sequential order starting from block 1 + # Sequential order starting from block 1 + perm = torch.arange(1, blocks_end) inv_perm = torch.zeros(blocks_end, dtype=torch.long, device=device) - inv_perm[1:] = torch.argsort( - perm) + 1 # Add 1 to account for starting from block 1 + # Add 1 to account for starting from block 1 + inv_perm[1:] = torch.argsort(perm) + 1 kv_cache[:, 1:blocks_end, ...] = kv_cache[:, perm, ...] 
# Construct the right block table @@ -281,7 +281,8 @@ def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec, @pytest.mark.parametrize("batch_spec_name", [ "small_decode", "small_prefill", "mixed_small", "medium_decode", - "medium_prefill", "mixed_medium" + "medium_prefill", "mixed_medium", "large_decode", "large_prefill", + "single_decode", "single_prefill" ]) @pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"]) def test_backend_correctness(batch_spec_name: str, model: str): @@ -302,7 +303,8 @@ def test_backend_correctness(batch_spec_name: str, model: str): """ batch_spec = BATCH_SPECS[batch_spec_name] vllm_config = create_vllm_config(model_name=model, - max_model_len=max(batch_spec.seq_lens)) + max_model_len=max(batch_spec.seq_lens), + num_gpu_blocks=8192) device = torch.device("cuda:0") kv_cache_spec = create_standard_kv_cache_spec(vllm_config) @@ -465,12 +467,6 @@ def test_backend_correctness(batch_spec_name: str, model: str): rtol=rtol, atol=atol) - if not all_close: - print(f"[{backend_name}] output differs from SDPA baseline. " - f"Max diff: {max_diff:.6f} (rel: {max_rel_diff:.6f})") - print(f"[{backend_name}] output: {backend_output}") - print(f"[{backend_name}] SDPA baseline: {sdpa_output}") - assert all_close, ( f"[{backend_name}] output differs from SDPA baseline. 
" - f"Max diff: {max_diff:.6f} (rel: {max_rel_diff:.6f})") + f"Max diff: {max_diff:.6f}, max rel diff: {max_rel_diff:.6f})") \ No newline at end of file diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py new file mode 100644 index 0000000000000..24070358799ef --- /dev/null +++ b/tests/v1/attention/test_mla_backends.py @@ -0,0 +1,522 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for v1 MLA backends without GPUModelRunner dependency.""" + +import pytest +import torch + +from tests.v1.attention.utils import (BatchSpec, _Backend, + create_common_attn_metadata, + create_standard_kv_cache_spec, + create_vllm_config, + get_attention_backend) +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv +from vllm.v1.attention.backends.utils import CommonAttentionMetadata +from vllm.v1.kv_cache_interface import FullAttentionSpec + +BACKENDS_TO_TEST = [ + _Backend.CUTLASS_MLA, _Backend.FLASHMLA_VLLM_V1, + _Backend.TRITON_MLA_VLLM_V1 +] + +# Remove CUTLASS_MLA from the list if not using sm100 +if not torch.cuda.is_available() or torch.cuda.get_device_properties( + 0).major < 10: + BACKENDS_TO_TEST.remove(_Backend.CUTLASS_MLA) + +torch.manual_seed(42) + + +def _convert_dtype_to_torch(dtype): + """Convert ModelDType to torch.dtype.""" + if isinstance(dtype, str): + if dtype == "auto": + return torch.float16 # Default dtype for testing + elif dtype in STR_DTYPE_TO_TORCH_DTYPE: + return STR_DTYPE_TO_TORCH_DTYPE[dtype] + else: + raise ValueError(f"Unknown dtype: {dtype}") + elif isinstance(dtype, torch.dtype): + return dtype + else: + raise ValueError(f"Unknown dtype: {dtype}") + + +# Define common batch configurations +BATCH_SPECS = { + "small_decode": + BatchSpec(seq_lens=[32, 40], query_lens=[1, 1]), + "small_prefill": + BatchSpec(seq_lens=[32, 40], query_lens=[8, 8]), + "mixed_small": + BatchSpec(seq_lens=[32, 40, 48, 56], query_lens=[1, 1, 5, 5]), + 
"medium_decode": + BatchSpec(seq_lens=[128, 256, 512, 1024, 128, 256, 512, 1024], + query_lens=[1, 1, 1, 1, 1, 1, 1, 1]), + "medium_prefill": + BatchSpec(seq_lens=[256, 512, 1024, 2048], query_lens=[16, 16, 16, 16]), + "mixed_medium": + BatchSpec(seq_lens=[512, 1024, 2048, 512, 1024, 2048], + query_lens=[1, 1, 1, 7, 7, 7]), + "large_decode": + BatchSpec(seq_lens=[2048] * 32, query_lens=[1] * 32), + "large_prefill": + BatchSpec(seq_lens=[4096] * 8, query_lens=[32] * 8), + "single_decode": + BatchSpec(seq_lens=[1024], query_lens=[1]), + "single_prefill": + BatchSpec(seq_lens=[1024], query_lens=[64]), +} + + +def create_dummy_kv_cache(kv_cache_spec: FullAttentionSpec, + device: torch.device, + num_blocks: int = 100) -> torch.Tensor: + """Create a dummy KV cache tensor for testing.""" + kv_cache = torch.randn( + num_blocks, + kv_cache_spec.block_size, + kv_cache_spec.head_size, # latent dimension + dtype=_convert_dtype_to_torch(kv_cache_spec.dtype), + device=device, + ) + return kv_cache + + +def create_and_prepopulate_kv_cache( + kv_c_contexts: list[torch.Tensor], + k_pe_contexts: list[torch.Tensor], + block_size: int, + num_kv_heads: int, + head_size: int, + dtype: torch.dtype, + device: torch.device, + num_blocks: int, + common_attn_metadata: CommonAttentionMetadata, + randomize_blocks: bool = True) -> torch.Tensor: + """Create and prepopulate an MLA KV cache with context data. 
+ + Args: + kv_c_contexts: List of latent KV context tensors for each sequence + k_pe_contexts: List of key positional embedding context tensors + for each sequence + block_size: Size of each block + num_kv_heads: Number of KV heads (should be 1 for MLA) + head_size: Size of each head (latent dimension) + dtype: Data type for the cache + device: Device to create the cache on + num_blocks: Total number of blocks in the cache + common_attn_metadata: Common attention metadata + randomize_blocks: Whether to randomly permute blocks + or use sequential order + + Returns: + MLA KV cache tensor + """ + batch_size = len(kv_c_contexts) + seq_lens = common_attn_metadata.seq_lens_cpu + query_lens = common_attn_metadata.query_start_loc_cpu[ + 1:] - common_attn_metadata.query_start_loc_cpu[:-1] + context_lens = common_attn_metadata.num_computed_tokens_cpu + block_table = common_attn_metadata.block_table_tensor + slot_mapping = common_attn_metadata.slot_mapping + + # Create MLA KV cache: (num_blocks, block_size, head_size) + kv_cache = torch.empty(num_blocks, + block_size, + head_size, + dtype=dtype, + device=device) + kv_cache_flat = kv_cache.view(-1, head_size) + + # Populate the cache with the context tokens + # Start from block_id=1 since block_id=0 is considered the null block + start_block_idx = 1 + for i in range(batch_size): + kv_c_context, k_pe_context = kv_c_contexts[i], k_pe_contexts[i] + kv_context = torch.cat([kv_c_context, k_pe_context.squeeze(1)], dim=-1) + start = start_block_idx * block_size + end = start + kv_context.shape[0] + kv_cache_flat[start:end, ...] 
= kv_context + + # Stay block aligned and allocate enough blocks for the new tokens + start_block_idx += cdiv(int(seq_lens[i]), block_size) + + blocks_end = start_block_idx + + # Permute the context blocks (excluding block 0 which is null) + if randomize_blocks: + perm = torch.randperm( + blocks_end - 1) + 1 # Random permutation starting from block 1 + else: + perm = torch.arange( + 1, blocks_end) # Sequential order starting from block 1 + + inv_perm = torch.zeros(blocks_end, dtype=torch.long, device=device) + inv_perm[1:] = torch.argsort( + perm) + 1 # Add 1 to account for starting from block 1 + kv_cache[1:blocks_end, ...] = kv_cache[perm, ...] + + # Construct the right block table + # Start from block_id=1 since block_id=0 is considered the null block + start_block_idx = 1 + for i in range(batch_size): + num_blocks_for_seq = cdiv(int(seq_lens[i]), block_size) + start = start_block_idx + end = start + num_blocks_for_seq + block_table[i, :num_blocks_for_seq] = inv_perm[start:end] + start_block_idx += num_blocks_for_seq + + # Create a realistic slot mapping that corresponds to the block table + for i in range(batch_size): + token_offsets = torch.arange(int(query_lens[i])) + int(context_lens[i]) + block_indices = token_offsets // block_size + token_inter_block_offsets = token_offsets % block_size + start = common_attn_metadata.query_start_loc_cpu[i] + end = common_attn_metadata.query_start_loc_cpu[i + 1] + slot_mapping[start:end] = block_table[ + i, + block_indices] * block_size + token_inter_block_offsets.to(device) + + return kv_cache + + +class MockAttentionLayer: + """A mock attention layer for testing.""" + + def __init__(self, device: torch.device): + self._q_scale = torch.tensor(1.0, device=device) + self._k_scale = torch.tensor(1.0, device=device) + self._v_scale = torch.tensor(1.0, device=device) + + +def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec, + layer_names: list[str], vllm_config, + device: torch.device, + 
common_attn_metadata: CommonAttentionMetadata, + query: torch.Tensor, kv_c: torch.Tensor, + k_pe: torch.Tensor, kv_cache: torch.Tensor, + kv_lora_rank: int, qk_nope_head_dim: int, + qk_rope_head_dim: int, v_head_dim: int, + mock_kv_b_proj) -> torch.Tensor: + """Run attention computation using the specified backend's AttentionImpl.""" + + builder_cls, impl_cls = get_attention_backend(backend) + + # Build metadata + builder = builder_cls(kv_cache_spec, layer_names, vllm_config, device) + attn_metadata = builder.build( + common_prefix_len=0, + common_attn_metadata=common_attn_metadata, + ) + + # Instantiate MLA implementation + num_heads = vllm_config.model_config.get_num_attention_heads( + vllm_config.parallel_config) + num_kv_heads = vllm_config.model_config.get_num_kv_heads( + vllm_config.parallel_config) + head_size = vllm_config.model_config.get_head_size() + scale = 1.0 / (head_size**0.5) + impl = impl_cls( + num_heads=num_heads, + head_size=head_size, + scale=scale, + num_kv_heads=num_kv_heads, + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype="auto", + logits_soft_cap=None, + attn_type="decoder", + kv_sharing_target_layer_name=None, + q_lora_rank=None, + kv_lora_rank=kv_lora_rank, + qk_nope_head_dim=qk_nope_head_dim, + qk_rope_head_dim=qk_rope_head_dim, + qk_head_dim=qk_nope_head_dim + qk_rope_head_dim, + v_head_dim=v_head_dim, + kv_b_proj=mock_kv_b_proj, + ) + + # Process weights to create W_UK_T and W_UV attributes needed by MLA + act_dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype) + impl.process_weights_after_loading(act_dtype) + + # Create mock layer and output buffer + mock_layer = MockAttentionLayer(device) + num_tokens = query.shape[0] + output = torch.empty(num_tokens, + num_heads * v_head_dim, + dtype=query.dtype, + device=query.device) + + # Run forward pass + # NOTE: The query, key, and value are already shaped correctly + # in the calling test function. 
+ output = impl.forward(mock_layer, + query, + kv_c, + k_pe, + kv_cache, + attn_metadata, + output=output) + + return output + + +@pytest.mark.parametrize("batch_spec_name", [ + "small_decode", "small_prefill", "mixed_small", "medium_decode", + "medium_prefill", "mixed_medium", "large_decode", "large_prefill", + "single_decode", "single_prefill" +]) +@pytest.mark.parametrize("model", ["deepseek-ai/DeepSeek-V2-Lite-Chat"]) +def test_backend_correctness(dist_init, batch_spec_name: str, model: str): + """ + Test that all backends produce similar outputs to a reference implementation + using torch.nn.functional.scaled_dot_product_attention. + + This test works by: + 1. Generating a batch of sequences with specified context and query lengths. + 2. Computing a ground-truth attention output using torch.sdpa on + contiguous Q, K, and V tensors. + 3. Simulating vLLM's paged KV cache: It takes the context portion of the + K/V tensors and manually places them into a paged buffer according to + the test's (randomly generated) block table. + 4. Running each vLLM attention backend with the new queries and the + simulated paged KV cache. + 5. Comparing the vLLM backend's output to the ground-truth SDPA output. + """ + batch_spec = BATCH_SPECS[batch_spec_name] + vllm_config = create_vllm_config(model_name=model, + max_model_len=max(batch_spec.seq_lens), + num_gpu_blocks=2048) + device = torch.device("cuda:0") + + kv_cache_spec = create_standard_kv_cache_spec(vllm_config) + + # 1. 
Setup + batch_size = batch_spec.batch_size + seq_lens = batch_spec.seq_lens + query_lens = batch_spec.query_lens + num_q_heads = vllm_config.model_config.get_num_attention_heads( + vllm_config.parallel_config) + num_kv_heads = vllm_config.model_config.get_num_kv_heads( + vllm_config.parallel_config) + head_size = vllm_config.model_config.get_head_size() + dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype) + block_size = vllm_config.cache_config.block_size + kv_lora_rank = 512 + qk_rope_head_dim = 64 + qk_nope_head_dim = 128 + v_head_dim = 128 + total_head_size = kv_lora_rank + qk_rope_head_dim + assert kv_lora_rank + qk_rope_head_dim == head_size, \ + f"MLA dimensions don't match: {total_head_size} != {head_size}" + scale = 1.0 / (total_head_size**0.5) + + # 2. Generate data and compute SDPA reference output for MLA + all_q_vllm, all_kv_c_vllm, all_k_pe_vllm = [], [], [] + all_sdpa_outputs = [] + kv_c_contexts, k_pe_contexts = [], [] + + # Create shared MLA weight matrices for consistency across all sequences + W_UK = torch.randn(kv_lora_rank, + num_q_heads, + qk_nope_head_dim, + dtype=dtype, + device=device) + W_UV = torch.randn(kv_lora_rank, + num_q_heads, + v_head_dim, + dtype=dtype, + device=device) + kv_b_proj_weight = torch.cat([W_UK, W_UV], dim=-1) + + for i in range(batch_size): + s_len = seq_lens[i] + q_len = query_lens[i] + context_len = s_len - q_len + + # Generate MLA tensors + # Q has both nope and rope components: + # [q_len, num_heads, qk_nope_head_dim + qk_rope_head_dim] + q_c = torch.randn(q_len, + num_q_heads, + qk_nope_head_dim + qk_rope_head_dim, + dtype=dtype, + device=device) + + # KV_C (latent K/V): [s_len, kv_lora_rank] + kv_c_full = torch.randn(s_len, + kv_lora_rank, + dtype=dtype, + device=device) + + # K_PE (rope component): [s_len, 1, qk_rope_head_dim] + k_pe_full = torch.randn(s_len, + 1, + qk_rope_head_dim, + dtype=dtype, + device=device) + + # Determine if this is decode (single token) + # or prefill (multiple tokens) + 
is_decode = q_len == 1 + + # Split q into nope and rope components + q_nope, q_pe = q_c.split([qk_nope_head_dim, qk_rope_head_dim], dim=-1) + + if is_decode: + # Decode path: MQA-style attention in latent space + # Transform q_nope to latent space: q_nope @ W_UK + # q_nope: [1, num_heads, qk_nope_head_dim] + # W_UK: [kv_lora_rank, num_heads, qk_nope_head_dim] + ql_nope = torch.einsum("qnh,lnh->qnl", q_nope, + W_UK) # [1, num_heads, kv_lora_rank] + + # Build MQA attention inputs + # Q: [1, num_heads, kv_lora_rank + qk_rope_head_dim] + q_mqa = torch.cat([ql_nope, q_pe], dim=-1) + # K: [s_len, kv_lora_rank + qk_rope_head_dim] + # (broadcasted to all heads) + k_mqa = torch.cat([kv_c_full, k_pe_full.squeeze(1)], dim=-1) + k_mqa = k_mqa.unsqueeze(1).expand(-1, num_q_heads, -1) + # V: [s_len, kv_lora_rank] (broadcasted to all heads) + v_mqa = kv_c_full.unsqueeze(1).expand(-1, num_q_heads, -1) + + # SDPA expects (N, H, L, D) + q_sdpa_in = q_mqa.unsqueeze(0).transpose(1, 2) + k_sdpa_in = k_mqa.unsqueeze(0).transpose(1, 2) + v_sdpa_in = v_mqa.unsqueeze(0).transpose(1, 2) + + sdpa_out_i = torch.nn.functional.scaled_dot_product_attention( + q_sdpa_in, k_sdpa_in, v_sdpa_in, is_causal=False, scale=scale) + sdpa_out_i = sdpa_out_i.transpose(1, 2).squeeze( + 0) # [1, num_heads, kv_lora_rank] + + # Project back to output space: sdpa_out @ W_UV + sdpa_out_i = torch.einsum("qnl,lnv->qnv", sdpa_out_i, W_UV) + sdpa_out_i = sdpa_out_i.flatten(start_dim=-2) + else: + # Prefill path: MHA-style attention with full sequence + # Apply kv_b_proj to the full kv_c tensor + kv_nope_full = torch.einsum("sl,lnh->snh", kv_c_full, + kv_b_proj_weight) + k_nope_full, v_full = kv_nope_full.split( + [qk_nope_head_dim, v_head_dim], dim=-1) + + # Build attention inputs for full sequence + q_mha = torch.cat([q_nope, q_pe], + dim=-1) # [q_len, num_heads, total_dim] + k_pe_full_expanded = k_pe_full.expand(-1, num_q_heads, -1) + k_full = torch.cat([k_nope_full, k_pe_full_expanded], dim=-1) + + # Create custom 
attention mask: + # - Query tokens can attend to all context tokens + # - Query tokens can only attend to query tokens up to their pos + attn_mask = torch.ones(q_len, + s_len, + dtype=torch.bool, + device=device) + # Apply causal mask only to the query portion (context_len onwards) + causal_mask = torch.tril(torch.ones(q_len, q_len, device=device)) + attn_mask[:, context_len:] = causal_mask + + # SDPA expects (N, H, L, D) + q_sdpa_in = q_mha.unsqueeze(0).transpose(1, 2) + k_sdpa_in = k_full.unsqueeze(0).transpose(1, 2) + v_sdpa_in = v_full.unsqueeze(0).transpose(1, 2) + + # Single attention call with custom mask + sdpa_out_i = torch.nn.functional.scaled_dot_product_attention( + q_sdpa_in, + k_sdpa_in, + v_sdpa_in, + attn_mask=attn_mask, + scale=scale) + sdpa_out_i = sdpa_out_i.transpose(1, 2).squeeze(0) + sdpa_out_i = sdpa_out_i.flatten(start_dim=-2) + + all_sdpa_outputs.append(sdpa_out_i) + + # Inputs for vLLM MLA backends are just the new tokens + all_q_vllm.append(q_c) + all_kv_c_vllm.append(kv_c_full[context_len:]) # New kv_c tokens + all_k_pe_vllm.append(k_pe_full[context_len:]) # New k_pe tokens + + # Contextual K/V data used to populate the paged cache (MLA format) + kv_c_contexts.append(kv_c_full[:context_len]) + k_pe_contexts.append(k_pe_full[:context_len]) + + # Concatenate all sequences (no reordering needed) + query_vllm = torch.cat(all_q_vllm, dim=0) + kv_c_vllm = torch.cat(all_kv_c_vllm, dim=0) + k_pe_vllm = torch.cat(all_k_pe_vllm, dim=0) + sdpa_output = torch.cat(all_sdpa_outputs, dim=0) + + # Create mock kv_b_proj using the same weights as reference implementation + from vllm.model_executor.layers.linear import ColumnParallelLinear + mock_kv_b_proj = ColumnParallelLinear(input_size=kv_lora_rank, + output_size=num_q_heads * + (qk_nope_head_dim + v_head_dim), + bias=False).to(device=device, + dtype=dtype) + + # Set the mock weights to match our reference implementation + # Reshape W_UK and W_UV to match the expected kv_b_proj format + # 
[kv_lora_rank, num_heads, qk_nope_head_dim + v_head_dim] + kv_b_proj_weight = kv_b_proj_weight.view( + kv_lora_rank, num_q_heads * (qk_nope_head_dim + v_head_dim)) + mock_kv_b_proj.weight = torch.nn.Parameter(kv_b_proj_weight.T) + + # Create metadata using original batch spec + common_attn_metadata = create_common_attn_metadata( + batch_spec, vllm_config.cache_config.block_size, device) + + # 3. Simulate Paged KV Cache and a realistic slot_mapping + kv_cache = create_and_prepopulate_kv_cache( + kv_c_contexts=kv_c_contexts, + k_pe_contexts=k_pe_contexts, + block_size=block_size, + num_kv_heads=num_kv_heads, + head_size=head_size, + dtype=dtype, + device=device, + num_blocks=vllm_config.cache_config.num_gpu_blocks, + common_attn_metadata=common_attn_metadata, + randomize_blocks=True) + + # 4. Run vLLM backends and compare + for backend_name in BACKENDS_TO_TEST: + backend_output = run_attention_backend( + backend_name, kv_cache_spec, ["placeholder"], vllm_config, device, + common_attn_metadata, query_vllm, kv_c_vllm, k_pe_vllm, kv_cache, + kv_lora_rank, qk_nope_head_dim, qk_rope_head_dim, v_head_dim, + mock_kv_b_proj) + + # Check shape and dtype consistency + assert backend_output.shape == sdpa_output.shape, ( + f"[{backend_name}] shape {backend_output.shape} != " + f"SDPA shape {sdpa_output.shape}") + assert backend_output.dtype == sdpa_output.dtype, ( + f"[{backend_name}] dtype {backend_output.dtype} != " + f"SDPA dtype {sdpa_output.dtype}") + + assert torch.isfinite(backend_output).all(), ( + f"[{backend_name}] produced non-finite values") + + # Check numerical similarity + rtol = 1e-2 + atol = 5e-1 + + max_diff = torch.max(torch.abs(backend_output - sdpa_output)).item() + max_rel_diff = torch.max( + torch.abs(backend_output - sdpa_output) / + torch.abs(sdpa_output)).item() + all_close = torch.allclose(backend_output, + sdpa_output, + rtol=rtol, + atol=atol) + + assert all_close, ( + f"[{backend_name}] output differs from SDPA baseline. 
" + f"Max diff: {max_diff:.6f}, max rel diff: {max_rel_diff:.6f})") diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index e547e71e0cdb7..6a08cdc56f736 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -135,6 +135,12 @@ def get_attention_backend(backend_name: _Backend): "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend", _Backend.XFORMERS_VLLM_V1: "vllm.v1.attention.backends.xformers.XFormersAttentionBackend", + _Backend.CUTLASS_MLA: + "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend", + _Backend.FLASHMLA_VLLM_V1: + "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend", + _Backend.TRITON_MLA_VLLM_V1: + "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend", } if backend_name not in backend_map: @@ -167,9 +173,11 @@ def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B", tensor_parallel_size: int = 1, max_model_len: int = 1024, dtype: Union[ModelDType, torch.dtype] = "auto", + num_gpu_blocks: int = 1000, block_size: int = 16, max_num_seqs: int = 256, max_num_batched_tokens: int = 8192, + enable_chunked_prefill: bool = True, add_mock_model_methods: bool = True) -> VllmConfig: """Create a VllmConfig for testing with reasonable defaults.""" @@ -189,7 +197,7 @@ def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B", ) # Set cache blocks for testing # (these may be set during initialization normally) - cache_config.num_gpu_blocks = 1000 + cache_config.num_gpu_blocks = num_gpu_blocks cache_config.num_cpu_blocks = 0 parallel_config = ParallelConfig( @@ -198,6 +206,7 @@ def create_vllm_config(model_name: str = "meta-llama/Meta-Llama-3-8B", scheduler_config = SchedulerConfig( max_num_seqs=max_num_seqs, max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=enable_chunked_prefill, ) device_config = DeviceConfig() diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 646e4fec836bd..03028ebfe76ad 100755 
--- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -24,7 +24,7 @@ Main reference: DeepseekV2 paper, and FlashInfer Implementation (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551). Deepseek's MLA attention works the following way: -* Use a single latent vector to represent the per-token entry of the KV cache. +* Use a single latent vector to represent the per-token entry of the KV cache. * For decode (i.e. the memory friendly approach) the attention "simulates" a multi-head attention, while the compute is similar to multi-query attention. @@ -82,7 +82,7 @@ spda_o = scaled_dot_product_attention( torch.cat([q_nope, q_pe], dim=-1), torch.cat([k_nope, k_pe.unsqueeze(1).expand(-1, N, -1)], dim=-1), v -) +) return spda_o @ W_O NOTE: in the actual code, @@ -120,20 +120,20 @@ return o.view(-1, N * V) @ self.num_heads @ W_O ## Chunked Prefill -For chunked prefill we want to use the compute friendly algorithm. We are -assuming sufficiently large Sq / Skv ratio, in the future may want to switch to +For chunked prefill we want to use the compute friendly algorithm. We are +assuming sufficiently large Sq / Skv ratio, in the future may want to switch to the data-movement friendly approach if the chunk (i.e. `Sq`) is small. However, the compute-friendly approach can potentially run out of memory if Skv is large due to: `k_nope = (kv_c @ W_UK).view(Skv, N, P)` -To mitigate this, we chunk the computation of attention with respect to the -current context (i.e. `cache_kv_c` and `cache_k_pe`) so that we can used a +To mitigate this, we chunk the computation of attention with respect to the +current context (i.e. `cache_kv_c` and `cache_k_pe`) so that we can used a fixed workspace size. 
The chunked prefill approach is as follows: -MCC Max chunk of context to process per iter, computed dynamically, +MCC Max chunk of context to process per iter, computed dynamically, used to bound the memory usage q_c = h_t @ W_DQ @@ -155,7 +155,7 @@ curr_o, curr_lse = scaled_dot_product_attention( new_v, casual=True, return_softmax_lse=True -) +) // Compute attention with the already existing context for chunk_idx in range(cdiv(C, MCC)): From c86af22f31838ee654c856279ac5110ae3fdb2cc Mon Sep 17 00:00:00 2001 From: shixianc <49539556+shixianc@users.noreply.github.com> Date: Wed, 20 Aug 2025 15:04:21 -0700 Subject: [PATCH 152/361] [Fix] remove is_marlin param in benchmark_moe (#23286) From 4b795020eda910ecf16c289a23c4a6c119a4b43b Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Wed, 20 Aug 2025 16:46:06 -0700 Subject: [PATCH 153/361] [EP] Add logging for experts map (#22685) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> Co-authored-by: Simon Mo --- vllm/model_executor/layers/fused_moe/layer.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index aa8ceda1bb25a..b16c21b7013a0 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -695,6 +695,26 @@ def determine_expert_map( return (local_num_experts, expert_map) +def get_compressed_expert_map(expert_map: torch.Tensor) -> str: + """ + Compresses the expert map by removing any -1 entries. + + Args: + expert_map (torch.Tensor): A tensor of shape (global_num_experts,) + mapping from global to local index. Contains -1 for experts not + assigned to the current rank. + + Returns: + str: A string mapping from local to global index. + Using str to support hashing for logging once only. 
+ """ + global_indices = torch.where(expert_map != -1)[0] + local_indices = expert_map[global_indices] + return ", ".join( + f"{local_index.item()}->{global_index.item()}" + for local_index, global_index in zip(local_indices, global_indices)) + + @CustomOp.register("fused_moe") class FusedMoE(CustomOp): """FusedMoE layer for MoE models. @@ -795,6 +815,12 @@ class FusedMoE(CustomOp): ep_size=self.ep_size, ep_rank=self.ep_rank, global_num_experts=self.global_num_experts) + logger.info_once( + "[EP Rank %s/%s] Expert parallelism is enabled. Local/global" + " number of experts: %s/%s. Experts local to global index map:" + " %s.", self.ep_rank, self.ep_size, self.local_num_experts, + self.global_num_experts, + get_compressed_expert_map(self.expert_map)) else: self.local_num_experts, self.expert_map = (self.global_num_experts, None) From f5aa307d7795b8400d3719087c502c2a227030c7 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 20 Aug 2025 20:14:59 -0400 Subject: [PATCH 154/361] Remove duplicate entry in vllm.attention.__all__ (#23296) Signed-off-by: Russell Bryant --- vllm/attention/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py index 344040586a532..dcb2aa68fbee9 100644 --- a/vllm/attention/__init__.py +++ b/vllm/attention/__init__.py @@ -14,7 +14,6 @@ __all__ = [ "AttentionMetadata", "AttentionType", "AttentionMetadataBuilder", - "Attention", "AttentionState", "get_attn_backend", ] From bbea1cefdd1a29b53355b1655f5d2ae343921f85 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 20 Aug 2025 20:18:12 -0400 Subject: [PATCH 155/361] [CI Bugfix] Fix CI by fully removing --enable-prompt-adapter (#23284) Signed-off-by: mgoin --- vllm/engine/arg_utils.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index dcf78758946f9..f3afc015f669c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -888,12 +888,6 @@ class EngineArgs: 
parser.add_argument('--disable-log-stats', action='store_true', help='Disable logging statistics.') - parser.add_argument('--enable-prompt-adapter', - action='store_true', - deprecated=True, - help='[DEPRECATED] Prompt adapter has been ' - 'removed. Setting this flag to True or False' - ' has no effect on vLLM behavior.') return parser From b029de9902aa3ac58806c8c17776c7074175b6db Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 20 Aug 2025 18:25:56 -0700 Subject: [PATCH 156/361] [Optimization] Make new_block_ids None if empty (#23262) Signed-off-by: Woosuk Kwon --- vllm/v1/core/kv_cache_manager.py | 30 ++++++++++++++++++++++++++---- vllm/v1/core/sched/output.py | 2 +- vllm/v1/core/sched/scheduler.py | 24 ++++++++++++------------ vllm/v1/worker/gpu_model_runner.py | 14 +++++++++----- vllm/v1/worker/tpu_model_runner.py | 14 +++++++++----- 5 files changed, 57 insertions(+), 27 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index bfaa7ab08f5cf..fd0bdb2c80fc5 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import Optional +from typing import Literal, Optional, overload from vllm.distributed.kv_events import KVCacheEvent from vllm.logger import init_logger @@ -37,7 +37,24 @@ class KVCacheBlocks: tuple(blk1 + blk2 for blk1, blk2 in zip(self.blocks, other.blocks))) - def get_block_ids(self) -> tuple[list[int], ...]: + @overload + def get_block_ids( + self, + allow_none: Literal[False] = False, + ) -> tuple[list[int], ...]: + ... + + @overload + def get_block_ids( + self, + allow_none: Literal[True] = True, + ) -> Optional[tuple[list[int], ...]]: + ... + + def get_block_ids( + self, + allow_none: bool = False, + ): """ Converts the KVCacheBlocks instance to block_ids. 
@@ -46,6 +63,8 @@ class KVCacheBlocks: * the outer tuple corresponds to KV cache groups * each inner list contains the block_ids of the blocks in that group """ + if allow_none and all(len(group) == 0 for group in self.blocks): + return None return tuple([blk.block_id for blk in group] for group in self.blocks) def get_unhashed_block_ids(self) -> list[int]: @@ -348,10 +367,13 @@ class KVCacheManager: """ return self.block_pool.take_events() + def get_blocks(self, request_id: str) -> KVCacheBlocks: + """Get the blocks of a request.""" + return KVCacheBlocks(self.coordinator.get_blocks(request_id)) + def get_block_ids(self, request_id: str) -> tuple[list[int], ...]: """Get the block ids of a request.""" - return KVCacheBlocks( - self.coordinator.get_blocks(request_id)).get_block_ids() + return self.get_blocks(request_id).get_block_ids() def cache_blocks(self, request: Request, num_computed_tokens: int) -> None: """Cache the blocks for the request, if enabled.""" diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index fac07f97195bd..9ba7ec9d96932 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -91,7 +91,7 @@ class CachedRequestData: # NOTE(woosuk): new_token_ids is only used for pipeline parallelism. # When PP is not used, new_token_ids will be empty. 
new_token_ids: list[list[int]] - new_block_ids: list[tuple[list[int], ...]] + new_block_ids: list[Optional[tuple[list[int], ...]]] num_computed_tokens: list[int] @property diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 4b167da5c8f81..0b528587b9339 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -19,7 +19,7 @@ from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager, compute_encoder_budget) -from vllm.v1.core.kv_cache_manager import KVCacheManager +from vllm.v1.core.kv_cache_manager import KVCacheBlocks, KVCacheManager from vllm.v1.core.sched.interface import SchedulerInterface from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, SchedulerOutput) @@ -185,7 +185,7 @@ class Scheduler(SchedulerInterface): # uses structured decoding. structured_output_request_ids: dict[str, int] = {} - req_to_new_block_ids: dict[str, tuple[list[int], ...]] = {} + req_to_new_blocks: dict[str, KVCacheBlocks] = {} num_scheduled_tokens: dict[str, int] = {} token_budget = self.max_num_scheduled_tokens # Encoder-related. @@ -288,8 +288,7 @@ class Scheduler(SchedulerInterface): # Therefore, we might introduce some additional # cycle to fill in the bitmask, which could be a big no-op. 
structured_output_request_ids[request.request_id] = req_index - req_to_new_block_ids[request.request_id] = ( - new_blocks.get_block_ids()) + req_to_new_blocks[request.request_id] = new_blocks num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens req_index += 1 @@ -496,8 +495,8 @@ class Scheduler(SchedulerInterface): if self.lora_config and request.lora_request: scheduled_loras.add(request.lora_request.lora_int_id) - req_to_new_block_ids[request.request_id] = ( - self.kv_cache_manager.get_block_ids(request.request_id)) + req_to_new_blocks[request.request_id] = ( + self.kv_cache_manager.get_blocks(request.request_id)) num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens request.status = RequestStatus.RUNNING @@ -546,8 +545,8 @@ class Scheduler(SchedulerInterface): ) # Construct the scheduler output. new_reqs_data = [ - NewRequestData.from_request(req, - req_to_new_block_ids[req.request_id]) + NewRequestData.from_request( + req, req_to_new_blocks[req.request_id].get_block_ids()) for req in scheduled_new_reqs ] cached_reqs_data = self._make_cached_request_data( @@ -555,7 +554,7 @@ class Scheduler(SchedulerInterface): scheduled_resumed_reqs, num_scheduled_tokens, scheduled_spec_decode_tokens, - req_to_new_block_ids, + req_to_new_blocks, ) scheduler_output = SchedulerOutput( scheduled_new_reqs=new_reqs_data, @@ -628,11 +627,11 @@ class Scheduler(SchedulerInterface): resumed_reqs: list[Request], num_scheduled_tokens: dict[str, int], spec_decode_tokens: dict[str, list[int]], - req_to_new_block_ids: dict[str, tuple[list[int], ...]], + req_to_new_blocks: dict[str, KVCacheBlocks], ) -> CachedRequestData: req_ids: list[str] = [] new_token_ids: list[list[int]] = [] - new_block_ids: list[tuple[list[int], ...]] = [] + new_block_ids: list[Optional[tuple[list[int], ...]]] = [] num_computed_tokens: list[int] = [] use_connector = self.connector is not None @@ -655,7 +654,8 @@ class Scheduler(SchedulerInterface): # 
out of bounds errors. TODO: Remove this once the KVConnector # is updated to handle token IDs properly. new_token_ids.append([]) - new_block_ids.append(req_to_new_block_ids[req_id]) + new_block_ids.append( + req_to_new_blocks[req_id].get_block_ids(allow_none=True)) num_computed_tokens.append(req.num_computed_tokens) # Because resumed_reqs is usually empty, it is more efficient to do # in-place appending so that we don't need to allocate a new list. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 33747d6917a5a..cc86f9826491f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -574,11 +574,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Update the block IDs. if not resumed_from_preemption: - # Append the new blocks to the existing block IDs. - for block_ids, new_ids in zip(req_state.block_ids, - new_block_ids): - block_ids.extend(new_ids) + if new_block_ids is not None: + # Append the new blocks to the existing block IDs. + for block_ids, new_ids in zip(req_state.block_ids, + new_block_ids): + block_ids.extend(new_ids) else: + assert new_block_ids is not None # The request is resumed from preemption. # Replace the existing block IDs with the new ones. req_state.block_ids = new_block_ids @@ -594,7 +596,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Update the persistent batch. self.input_batch.num_computed_tokens_cpu[req_index] = ( num_computed_tokens) - self.input_batch.block_table.append_row(new_block_ids, req_index) + if new_block_ids is not None: + self.input_batch.block_table.append_row( + new_block_ids, req_index) # For the last rank, we don't need to update the token_ids_cpu # because the sampled tokens are already cached. 
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 9196c62377b91..0f569500cdf6b 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -418,11 +418,13 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Update the cached states. req_state.num_computed_tokens = num_computed_tokens if not resumed_from_preemption: - # Append the new blocks to the existing block IDs. - for block_ids, new_ids in zip(req_state.block_ids, - new_block_ids): - block_ids.extend(new_ids) + if new_block_ids is not None: + # Append the new blocks to the existing block IDs. + for block_ids, new_ids in zip(req_state.block_ids, + new_block_ids): + block_ids.extend(new_ids) else: + assert new_block_ids is not None # The request is resumed from preemption. # Replace the existing block IDs with the new ones. req_state.block_ids = new_block_ids @@ -438,7 +440,9 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Update the persistent batch. self.input_batch.num_computed_tokens_cpu[req_index] = ( num_computed_tokens) - self.input_batch.block_table.append_row(new_block_ids, req_index) + if new_block_ids is not None: + self.input_batch.block_table.append_row( + new_block_ids, req_index) # Add the new or resumed requests to the persistent batch. # The smaller empty indices are filled first. 
From 7be5d113d8784536b79f27f24cfa91958dc291b0 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Thu, 21 Aug 2025 09:34:24 +0800 Subject: [PATCH 157/361] [CPU] Refactor CPU W8A8 scaled_mm (#23071) Signed-off-by: jiang1.li --- .../scripts/hardware_ci/run-cpu-test.sh | 7 +- cmake/cpu_extension.cmake | 59 +- csrc/cpu/cpu_types_x86.hpp | 8 +- csrc/cpu/dnnl_helper.cpp | 346 +++++++ csrc/cpu/dnnl_helper.h | 169 ++++ csrc/cpu/dnnl_helper.hpp | 206 ---- csrc/cpu/dnnl_kernels.cpp | 494 +++++++++ csrc/cpu/quant.cpp | 951 ------------------ csrc/cpu/torch_bindings.cpp | 92 +- tests/kernels/test_onednn.py | 144 +++ vllm/_custom_ops.py | 83 ++ vllm/model_executor/layers/fused_moe/layer.py | 11 +- vllm/model_executor/layers/linear.py | 8 +- .../kernels/scaled_mm/__init__.py | 4 +- .../quantization/kernels/scaled_mm/cpu.py | 206 ++++ .../quantization/kernels/scaled_mm/cutlass.py | 4 +- vllm/model_executor/layers/utils.py | 6 + 17 files changed, 1525 insertions(+), 1273 deletions(-) create mode 100644 csrc/cpu/dnnl_helper.cpp create mode 100644 csrc/cpu/dnnl_helper.h delete mode 100644 csrc/cpu/dnnl_helper.hpp create mode 100644 csrc/cpu/dnnl_kernels.cpp delete mode 100644 csrc/cpu/quant.cpp create mode 100644 tests/kernels/test_onednn.py create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 57a7bc4e5f5df..9dec9f8e9eb32 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -46,6 +46,11 @@ function cpu_tests() { set -e python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + # Run kernel tests + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pytest -v -s tests/kernels/test_onednn.py" + # Run basic model test docker exec cpu-test-"$NUMA_NODE" bash -c " set -e @@ -99,4 +104,4 @@ function cpu_tests() { # All of CPU tests are expected to be finished 
less than 40 mins. export -f cpu_tests -timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" +timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index e0da46e2accaa..cc38cd41a5b24 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -182,17 +182,17 @@ endif() # # Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms) # Flag to enable ACL kernels for AARCH64 platforms -if ( VLLM_BUILD_ACL STREQUAL "ON") +if (VLLM_BUILD_ACL STREQUAL "ON") set(USE_ACL ON) else() set(USE_ACL OFF) endif() -if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND) +if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) FetchContent_Declare( oneDNN GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git - GIT_TAG v3.8.1 + GIT_TAG v3.9 GIT_PROGRESS TRUE GIT_SHALLOW TRUE ) @@ -204,7 +204,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND) endif() set(ONEDNN_AARCH64_USE_ACL "ON") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/") - endif() + endif() set(ONEDNN_LIBRARY_TYPE "STATIC") set(ONEDNN_BUILD_DOC "OFF") @@ -217,38 +217,23 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND) set(ONEDNN_ENABLE_ITT_TASKS "OFF") set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF") set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF") + set(ONEDNN_VERBOSE "OFF") set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) FetchContent_MakeAvailable(oneDNN) - - list(APPEND LIBS dnnl) -elseif(POWER10_FOUND) - FetchContent_Declare( - oneDNN - GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git - GIT_TAG v3.7.2 - GIT_PROGRESS TRUE - GIT_SHALLOW TRUE + add_library(dnnl_ext OBJECT "csrc/cpu/dnnl_helper.cpp") + target_include_directories( + dnnl_ext + PUBLIC ${oneDNN_SOURCE_DIR}/include + PUBLIC ${oneDNN_BINARY_DIR}/include + PRIVATE ${oneDNN_SOURCE_DIR}/src ) - - set(ONEDNN_LIBRARY_TYPE "STATIC") - set(ONEDNN_BUILD_DOC "OFF") - 
set(ONEDNN_BUILD_EXAMPLES "OFF") - set(ONEDNN_BUILD_TESTS "OFF") - set(ONEDNN_ENABLE_WORKLOAD "INFERENCE") - set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER") - set(ONEDNN_BUILD_GRAPH "OFF") - set(ONEDNN_ENABLE_JIT_PROFILING "OFF") - set(ONEDNN_ENABLE_ITT_TASKS "OFF") - set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF") - set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF") - set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) - - set(DNNL_CPU_RUNTIME "OMP") - - FetchContent_MakeAvailable(oneDNN) - - list(APPEND LIBS dnnl) + target_link_libraries(dnnl_ext dnnl) + target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC) + list(APPEND LIBS dnnl_ext) + set(USE_ONEDNN ON) +else() + set(USE_ONEDNN OFF) endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") @@ -275,7 +260,6 @@ set(VLLM_EXT_SRC if (AVX512_FOUND AND NOT AVX512_DISABLED) set(VLLM_EXT_SRC - "csrc/cpu/quant.cpp" "csrc/cpu/shm.cpp" ${VLLM_EXT_SRC}) if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI) @@ -289,14 +273,11 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) ${VLLM_EXT_SRC}) add_compile_definitions(-DCPU_CAPABILITY_AVX512) endif() -elseif(POWER10_FOUND) - set(VLLM_EXT_SRC - "csrc/cpu/quant.cpp" - ${VLLM_EXT_SRC}) endif() -if (ASIMD_FOUND) + +if(USE_ONEDNN) set(VLLM_EXT_SRC - "csrc/cpu/quant.cpp" + "csrc/cpu/dnnl_kernels.cpp" ${VLLM_EXT_SRC}) endif() diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 3952c43cbc727..982f7c07a13bd 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -89,7 +89,7 @@ struct FP16Vec16 : public Vec { explicit FP16Vec16(const FP32Vec16&); - void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } + void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -126,7 +126,7 @@ struct BF16Vec16 : public Vec { explicit BF16Vec16(const FP32Vec16&); - void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } + void save(void* ptr) const { 
_mm256_storeu_si256((__m256i*)ptr, reg); } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -180,8 +180,8 @@ struct BF16Vec32 : public Vec { (__m128i)vec8_data.reg, 1)) {} void save(void* ptr) const { - *reinterpret_cast<__m256i*>(ptr) = reg_low; - *reinterpret_cast<__m256i*>((__m256i*)ptr + 1) = reg_high; + _mm256_storeu_si256((__m256i*)ptr, reg_low); + _mm256_storeu_si256((__m256i*)ptr + 1, reg_high); } }; #endif diff --git a/csrc/cpu/dnnl_helper.cpp b/csrc/cpu/dnnl_helper.cpp new file mode 100644 index 0000000000000..f3f00edb36068 --- /dev/null +++ b/csrc/cpu/dnnl_helper.cpp @@ -0,0 +1,346 @@ +#include +#include + +#include "common/memory_desc.hpp" +#include "common/memory.hpp" + +#include "dnnl_helper.h" + +static dnnl::engine& default_engine() { + static dnnl::engine engine(dnnl::engine::kind::cpu, 0); + return engine; +} + +static dnnl::stream& default_stream() { + static dnnl::stream stream(default_engine()); + return stream; +} + +void release_dnnl_matmul_handler(int64_t handler) { + DNNLMatMulPrimitiveHandler* ptr = + reinterpret_cast(handler); + delete ptr; +} + +template +class DNNLPrimitiveCache { + public: + using cache_value_t = std::pair; + using result_value_t = VT; + using container_t = std::list; + using value_iterator_t = typename container_t::iterator; + using map_t = std::unordered_map; + using creator_t = VT (*)(); + + public: + DNNLPrimitiveCache(size_t capacity) + : capacity_(capacity), + values_(), + key_to_value_(std::min(256lu, capacity)) { + assert(capacity > 0); + } + + template + result_value_t get_or_create(const KT& key, F&& creator) { + std::optional value = get_value(key); + if (value.has_value()) { + return value.value()->second; + } else { + return add_value({key, creator()})->second; + } + } + + size_t size() const { return values_.size(); } + + private: + void dump_data() { + std::stringstream ss; + ss << "table_id: " << std::hex << reinterpret_cast(this) << std::dec + << "\n"; + ss << 
"container: ["; + for (auto&& iter : values_) { + ss << "(" << iter.first << ", " << std::hex + << reinterpret_cast(iter.second.get()) << "), " << std::dec; + } + ss << "]\n"; + + ss << "map: ["; + for (auto&& iter : key_to_value_) { + ss << "(" << iter.first << ", " << iter.second->first << ", " << std::hex + << reinterpret_cast(iter.second->second.get()) << std::dec + << "), "; + } + ss << "]\n"; + std::printf("%s\n", ss.str().c_str()); + } + + value_iterator_t add_value(cache_value_t&& new_value) { + if (size() == capacity_) { + cache_value_t& last_item = values_.back(); + key_to_value_.erase(last_item.first); + values_.pop_back(); + } + + auto& added_value_ = values_.emplace_front(std::move(new_value)); + key_to_value_.emplace(added_value_.first, values_.begin()); + return values_.begin(); + } + + std::optional get_value(const KT& key) { + if (key_to_value_.size() > 0 && key == values_.begin()->first) { + return values_.begin(); + } + + auto value_map_iterator = key_to_value_.find(key); + if (value_map_iterator != key_to_value_.end()) { + values_.splice(values_.begin(), values_, value_map_iterator->second); + return value_map_iterator->second; + } else { + return {}; + } + } + + private: + const size_t capacity_; + container_t values_; + map_t key_to_value_; +}; + +DNNLMatMulPrimitiveHandler::DNNLMatMulPrimitiveHandler( + const Args& args, dnnl::memory::data_type b_type) + : b_n_size_(args.b_n_size), + b_n_stride_(args.b_n_stride), + b_k_size_(args.b_k_size), + b_k_stride_(args.b_k_stride), + b_type_(b_type), + c_type_(args.c_type), + runtime_memory_ptrs_(8), + primitive_cache_size_(args.primitive_cache_size) { + assert(primitive_cache_size_ > 0); +} + +void DNNLMatMulPrimitiveHandler::prepack_weight( + void* original_b_ptr, dnnl::memory::desc b_target_mem_desc) { + dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_, + {b_k_stride_, b_n_stride_}); + dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr); + dnnl::memory 
packed_weight(b_target_mem_desc, default_engine()); + { + dnnl::reorder(original_weight, packed_weight) + .execute(default_stream(), original_weight, packed_weight); + default_stream().wait(); + } + memory_cache_[DNNL_ARG_WEIGHTS] = packed_weight; + b_target_mem_desc_ = b_target_mem_desc; +} + +void DNNLMatMulPrimitiveHandler::set_runtime_memory_ptr( + size_t index, dnnl_memory* memory_ptr) { + dnnl::impl::memory_storage_t* mem_storage_ptr = memory_ptr->memory_storage(); + dnnl_memory_desc* mem_desc = const_cast(memory_ptr->md()); + runtime_memory_ptrs_[index] = {mem_storage_ptr, mem_desc}; +} + +std::pair +DNNLMatMulPrimitiveHandler::get_runtime_memory_ptr(size_t index) { + return runtime_memory_ptrs_[index]; +} + +namespace std { +template <> +struct hash { + size_t operator()( + const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const { + return hash()(val.b_n_size) ^ hash()(val.b_k_size) ^ + hash()(static_cast(val.a_qs)) ^ + hash()(static_cast(val.b_qs)) ^ hash()(val.use_azp) ^ + hash()(static_cast(val.c_type)); + } +}; + +template <> +struct hash { + size_t operator()( + const W8A8MatMulPrimitiveHandler::MSizeCacheKey& val) const { + return hash()(val.a_m_size) ^ hash()(val.use_bias) ^ + hash()(static_cast(val.bias_type)); + } +}; +} // namespace std + +bool operator==(const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& l, + const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& r) { + return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size && + l.a_qs == r.a_qs && l.b_qs == r.b_qs && l.use_azp == r.use_azp && + l.c_type == r.c_type; +} + +bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l, + const W8A8MatMulPrimitiveHandler::MSizeCacheKey& r) { + return l.use_bias == r.use_bias && l.a_m_size == r.a_m_size && + l.bias_type == r.bias_type; +} + +static std::shared_ptr +get_w8a8_class_primitive_cache( + const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& key, + int64_t cache_size) { + static 
W8A8MatMulPrimitiveHandler::ClassMatmulCache cache(128); + assert(cache_size > 0); + return cache.get_or_create(key, [&]() { + return std::make_shared(cache_size); + }); +} + +W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args) + : DNNLMatMulPrimitiveHandler( + static_cast(args), + dnnl::memory::data_type::s8), + use_azp_(args.use_a_zero_point), + a_qs_(args.a_quantization_strategy), + b_qs_(args.b_quantization_strategy), + m_size_cache_(nullptr) { + assert(a_qs_ != QuantizationStrategy::PER_OUTPUT_CHANNEL); + assert(b_qs_ != QuantizationStrategy::PER_TOKEN); + if (a_qs_ == QuantizationStrategy::PER_TOKEN) { + assert(!use_azp_); + }; + prepack_weight(args.b_ptr, + create_primitive_desc( + MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL, + .use_bias = false, + .bias_type = dnnl::memory::data_type::undef}, + true) + .weights_desc()); + init_runtime_memory_cache(args); +} + +void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) { + auto&& [a_storage, a_mem_desc] = get_runtime_memory_ptr(0); + auto&& [c_storage, c_mem_desc] = get_runtime_memory_ptr(1); + a_storage->set_data_handle((void*)args.a_ptr); + a_mem_desc->dims[0] = args.a_m_size; + c_storage->set_data_handle((void*)args.c_ptr); + c_mem_desc->dims[0] = args.a_m_size; + + if (a_qs_ == QuantizationStrategy::PER_TENSOR) { + auto&& [a_scale_storage, a_scale_mem_desc] = get_runtime_memory_ptr(2); + a_scale_storage->set_data_handle((void*)args.a_scales_ptr); + } + if (use_azp_) { + auto&& [a_zero_point_storage, a_zero_point_mem_desc] = + get_runtime_memory_ptr(3); + a_zero_point_storage->set_data_handle((void*)args.a_zero_points_ptr); + } + + if (args.use_bias) { + auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(4); + bias_storage->set_data_handle((void*)args.bias_ptr); + } + + dnnl::matmul matmul = get_matmul_cache(args); + matmul.execute(default_stream(), memory_cache_); + default_stream().wait(); +} + +dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache( + const 
MSizeCacheKey& key) { + if (m_size_cache_.get() == nullptr) { + ClassMatmulCacheKey key = {.b_n_size = b_n_size_, + .b_k_size = b_k_size_, + .a_qs = a_qs_, + .b_qs = b_qs_, + .use_azp = use_azp_, + .c_type = c_type_}; + m_size_cache_ = get_w8a8_class_primitive_cache(key, primitive_cache_size_); + } + + return m_size_cache_->get_or_create(key, [&]() { + dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false); + return dnnl::matmul(desc); + }); +} + +void W8A8MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) { + memory_cache_[DNNL_ARG_SRC] = dnnl::memory({{1, b_k_size_}, + dnnl::memory::data_type::s8, + dnnl::memory::format_tag::ab}, + default_engine(), nullptr); + set_runtime_memory_ptr(0, memory_cache_[DNNL_ARG_SRC].get()); + memory_cache_[DNNL_ARG_DST] = + dnnl::memory({{1, b_n_size_}, c_type_, dnnl::memory::format_tag::ab}, + default_engine(), nullptr); + set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get()); + + // For PER_TOKEN, scales will be applied in outside epilogue + if (a_qs_ == QuantizationStrategy::PER_TENSOR) { + memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC] = dnnl::memory( + {{1}, dnnl::memory::data_type::f32, {1}}, default_engine(), nullptr); + set_runtime_memory_ptr( + 2, memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC].get()); + if (use_azp_) { + memory_cache_[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC] = dnnl::memory( + {{1}, dnnl::memory::data_type::s32, {1}}, default_engine(), nullptr); + set_runtime_memory_ptr( + 3, memory_cache_[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC].get()); + } + } + + if (b_qs_ == QuantizationStrategy::PER_TENSOR) { + memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = + dnnl::memory({{1}, dnnl::memory::data_type::f32, {1}}, default_engine(), + (void*)args.b_scales_ptr); + } else if (b_qs_ == QuantizationStrategy::PER_OUTPUT_CHANNEL) { + memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = + dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, + 
default_engine(), (void*)args.b_scales_ptr); + } + + memory_cache_[DNNL_ARG_BIAS] = + dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, + default_engine(), nullptr); + set_runtime_memory_ptr(4, memory_cache_[DNNL_ARG_BIAS].get()); +} + +dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc( + const MSizeCacheKey& key, bool first_time) { + dnnl::memory::desc a_md({key.a_m_size, b_k_size_}, + dnnl::memory::data_type::s8, + dnnl::memory::format_tag::ab); + dnnl::memory::desc b_md; + if (first_time) { + b_md = + dnnl::memory::desc({b_k_size_, b_n_size_}, dnnl::memory::data_type::s8, + dnnl::memory::format_tag::any); + } else { + b_md = b_target_mem_desc_; + } + dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_, + dnnl::memory::format_tag::ab); + + dnnl::primitive_attr attr; + // For PER_TOKEN, scales will be applied in outside epilogue + if (a_qs_ == QuantizationStrategy::PER_TENSOR) { + attr.set_scales_mask(DNNL_ARG_SRC, 0); + if (use_azp_) { + attr.set_zero_points_mask(DNNL_ARG_SRC, 0); + } + } + + if (b_qs_ == QuantizationStrategy::PER_TENSOR) { + attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0); + } else if (b_qs_ == QuantizationStrategy::PER_OUTPUT_CHANNEL) { + attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2); + } + + if (key.use_bias) { + // For PER_TOKEN, bias will be applied in epilogue + assert(a_qs_ == QuantizationStrategy::PER_TENSOR); + dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1}); + return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md, + c_md, attr); + } else { + return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md, + attr); + } +} diff --git a/csrc/cpu/dnnl_helper.h b/csrc/cpu/dnnl_helper.h new file mode 100644 index 0000000000000..54ceefced9e98 --- /dev/null +++ b/csrc/cpu/dnnl_helper.h @@ -0,0 +1,169 @@ +#ifndef DNNL_HELPER_H +#define DNNL_HELPER_H + +#include +#include + +#include "oneapi/dnnl/dnnl.hpp" + +namespace c10 { +struct BFloat16; +struct Half; 
+} // namespace c10 + +namespace dnnl { +namespace impl { +struct memory_storage_t; +struct matmul_pd_t; +struct matmul_desc_t; +} // namespace impl +} // namespace dnnl +struct dnnl_memory_desc; + +template +class DNNLPrimitiveCache; + +template +struct DNNLType { + static constexpr dnnl::memory::data_type type = + dnnl::memory::data_type::undef; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16; +}; + +template +constexpr inline dnnl::memory::data_type get_dnnl_type() { + return DNNLType>::type; +} + +class DNNLMatMulPrimitiveHandler { + public: + virtual ~DNNLMatMulPrimitiveHandler() = default; + + protected: + struct Args { + dnnl_dim_t b_n_size; + dnnl_dim_t b_n_stride; + dnnl_dim_t b_k_size; + dnnl_dim_t b_k_stride; + void* b_ptr; + dnnl::memory::data_type c_type; + size_t primitive_cache_size; + }; + + protected: + DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type); + + void prepack_weight(void* original_b_ptr, + dnnl::memory::desc b_target_mem_desc); + + void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr); + + std::pair + get_runtime_memory_ptr(size_t index); + + protected: + const dnnl_dim_t b_n_size_; + const dnnl_dim_t b_n_stride_; + const dnnl_dim_t b_k_size_; + const dnnl_dim_t b_k_stride_; + dnnl::memory::data_type b_type_; + dnnl::memory::data_type c_type_; + std::unordered_map memory_cache_; + std::vector> + runtime_memory_ptrs_; + dnnl::memory::desc b_target_mem_desc_; + int64_t 
primitive_cache_size_; +}; + +class W8A8MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler { + public: + enum class QuantizationStrategy { PER_TOKEN, PER_TENSOR, PER_OUTPUT_CHANNEL }; + + struct Args : public DNNLMatMulPrimitiveHandler::Args { + bool use_a_zero_point; + QuantizationStrategy a_quantization_strategy; + QuantizationStrategy b_quantization_strategy; + float* b_scales_ptr; + }; + + struct ClassMatmulCacheKey { + dnnl_dim_t b_n_size; + dnnl_dim_t b_k_size; + QuantizationStrategy a_qs; + QuantizationStrategy b_qs; + bool use_azp; + dnnl::memory::data_type c_type; + + friend bool operator==(const ClassMatmulCacheKey& l, + const ClassMatmulCacheKey& r); + }; + + struct MSizeCacheKey { + dnnl_dim_t a_m_size; + bool use_bias; + dnnl::memory::data_type bias_type; + + friend bool operator==(const MSizeCacheKey& l, const MSizeCacheKey& r); + }; + + using MSizeCache = DNNLPrimitiveCache; + using ClassMatmulCache = + DNNLPrimitiveCache>; + + struct ExecArgs : public MSizeCacheKey { + const int8_t* a_ptr; + const float* a_scales_ptr; + const int32_t* a_zero_points_ptr; + const void* bias_ptr; + void* c_ptr; + }; + + public: + W8A8MatMulPrimitiveHandler(const Args& args); + + QuantizationStrategy get_input_scale_strategy() const { return a_qs_; } + + bool get_input_use_zero_point() const { return use_azp_; } + + void execute(ExecArgs& args); + + private: + dnnl::matmul::primitive_desc create_primitive_desc(const MSizeCacheKey& key, + bool first_time); + + void init_runtime_memory_cache(const Args& args); + + dnnl::matmul get_matmul_cache(const MSizeCacheKey& key); + + private: + const bool use_azp_; + const QuantizationStrategy a_qs_; + const QuantizationStrategy b_qs_; + std::shared_ptr m_size_cache_; +}; + +#endif diff --git a/csrc/cpu/dnnl_helper.hpp b/csrc/cpu/dnnl_helper.hpp deleted file mode 100644 index 1cb8dc5b25a66..0000000000000 --- a/csrc/cpu/dnnl_helper.hpp +++ /dev/null @@ -1,206 +0,0 @@ -#ifndef DNNL_HELPER_HPP -#define DNNL_HELPER_HPP - 
-#include -#include - -#include "oneapi/dnnl/dnnl.hpp" - -namespace { -template -struct DNNLType { - static constexpr dnnl::memory::data_type type = - dnnl::memory::data_type::undef; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16; -}; - -template -constexpr inline dnnl::memory::data_type get_dnnl_type() { - return DNNLType>::type; -} -}; // namespace - -template -class DNNLPrimitiveHelper { - public: - // I8 input GEMM kernel (C = a_scales * A @ (b_scales * B^T) + bias) - // A: [M, K], row-major - // B: [K, N], column-major - // C: [M, N], row-major - // bias: [N], row-major, optional - // a_scales: [MS] - // b_scales: [NS] - // Note: Due to the limitation of oneDNN - // (https://github.com/oneapi-src/oneDNN/issues/1636), the quantized bias is - // not supported. 
- - template - static void gemm_s8s8_jit(const int8_t* a, const int8_t* b, OutputT* c, - const BiasT* bias, dnnl_dim_t M, dnnl_dim_t N, - dnnl_dim_t K, const float* a_scales, - const float* b_scales, dnnl_dim_t MS, - dnnl_dim_t NS) { - auto&& OutputType = get_dnnl_type(); - auto&& BiasType = get_dnnl_type(); - - dnnl::memory::desc a_md({M, K}, dnnl::memory::data_type::s8, {K, 1}); - dnnl::memory::desc b_md({K, N}, dnnl::memory::data_type::s8, {1, K}); - dnnl::memory::desc c_md({M, N}, OutputType, {N, 1}); - - dnnl::primitive_attr attr; - if constexpr (!InputNoScale) { - if (MS == 1) { - // per-tensor - attr.set_scales_mask(DNNL_ARG_SRC, 0); - } else { - // per-token - TORCH_CHECK(false, "per-token quantization is unsupported."); - } - } - - if (NS == 1) { - // per-tensor - attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0); - } else { - // per-channel - attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2); - } - - dnnl::matmul::primitive_desc matmul_pd; -// Create memory descriptors with format_tag::any for the primitive. This -// enables the matmul primitive to choose memory layouts for an -// optimized primitive implementation, and these layouts may differ from the -// ones provided by the user. 
-#ifdef __aarch64__ - auto mat_src_md = dnnl::memory::desc({M, K}, dnnl::memory::data_type::s8, - dnnl::memory::format_tag::any); - auto mat_weights_md = dnnl::memory::desc( - {K, N}, dnnl::memory::data_type::s8, dnnl::memory::format_tag::any); - auto mat_dst_md = - dnnl::memory::desc({M, N}, OutputType, dnnl::memory::format_tag::any); - if (bias) { - dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1}); - matmul_pd = dnnl::matmul::primitive_desc(default_engine(), mat_src_md, - mat_weights_md, bias_md, - mat_dst_md, attr); - } else { - matmul_pd = dnnl::matmul::primitive_desc( - default_engine(), mat_src_md, mat_weights_md, mat_dst_md, attr); - } -#else - if (bias) { - dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1}); - matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, - bias_md, c_md, attr); - } else { - matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, - c_md, attr); - } -#endif - dnnl::matmul matmul(matmul_pd); - - auto& engine = default_engine(); - - dnnl::memory a_m(a_md, engine, (void*)a); - dnnl::memory b_m(b_md, engine, (void*)b); - dnnl::memory c_m(c_md, engine, (void*)c); - dnnl::memory a_scales_m({{MS}, dnnl::memory::data_type::f32, {1}}, engine, - (void*)a_scales); - dnnl::memory b_scales_m({{NS}, dnnl::memory::data_type::f32, {1}}, engine, - (void*)b_scales); - - auto& stream = default_stream(); - - auto mat_src_mem = a_m; - auto mat_weights_mem = b_m; - auto mat_dst_mem = c_m; -#ifdef __aarch64__ - if (matmul_pd.weights_desc() != b_m.get_desc()) { - mat_weights_mem = dnnl::memory(matmul_pd.weights_desc(), engine); - dnnl::reorder(b_m, mat_weights_mem).execute(stream, b_m, mat_weights_mem); - } -#endif - if constexpr (InputNoScale) { - if (bias) { - dnnl::memory::desc bias_md({N}, BiasType, {1}); - dnnl::memory bias_m(bias_md, engine, (void*)bias); - matmul.execute( - stream, { - {DNNL_ARG_SRC, mat_src_mem}, - {DNNL_ARG_WEIGHTS, mat_weights_mem}, - {DNNL_ARG_BIAS, bias_m}, - {DNNL_ARG_DST, mat_dst_mem}, - 
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, - }); - } else { - matmul.execute( - stream, { - {DNNL_ARG_SRC, mat_src_mem}, - {DNNL_ARG_WEIGHTS, mat_weights_mem}, - {DNNL_ARG_DST, mat_dst_mem}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, - }); - } - } else { - if (bias) { - dnnl::memory::desc bias_md({N}, BiasType, {1}); - dnnl::memory bias_m(bias_md, engine, (void*)bias); - matmul.execute( - stream, { - {DNNL_ARG_SRC, mat_src_mem}, - {DNNL_ARG_WEIGHTS, mat_weights_mem}, - {DNNL_ARG_BIAS, bias_m}, - {DNNL_ARG_DST, mat_dst_mem}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, - }); - } else { - matmul.execute( - stream, { - {DNNL_ARG_SRC, mat_src_mem}, - {DNNL_ARG_WEIGHTS, mat_weights_mem}, - {DNNL_ARG_DST, mat_dst_mem}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, - }); - } - } - stream.wait(); - } - - private: - static dnnl::engine& default_engine() { - static dnnl::engine engine(dnnl::engine::kind::cpu, 0); - return engine; - } - - static dnnl::stream& default_stream() { - static dnnl::stream stream(default_engine()); - return stream; - } -}; -#endif diff --git a/csrc/cpu/dnnl_kernels.cpp b/csrc/cpu/dnnl_kernels.cpp new file mode 100644 index 0000000000000..acc3b9ecde143 --- /dev/null +++ b/csrc/cpu/dnnl_kernels.cpp @@ -0,0 +1,494 @@ +#include "cpu_types.hpp" +#include "dnnl_helper.h" + +namespace { +template +struct KernelVecType { + using load_vec_type = void; + using cvt_vec_type = void; +}; + +template <> +struct KernelVecType { + using load_vec_type = vec_op::FP32Vec16; + using cvt_vec_type = vec_op::FP32Vec16; +}; + +#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT) +template <> +struct KernelVecType { + using load_vec_type = vec_op::BF16Vec16; + using cvt_vec_type = vec_op::FP32Vec16; +}; +#endif + +template <> +struct KernelVecType { +#if defined(__powerpc64__) || defined(__s390x__) + // Power 
architecture-specific vector type + using load_vec_type = vec_op::FP32Vec16; +#else + // Fallback for other architectures + using load_vec_type = vec_op::FP16Vec16; +#endif + using cvt_vec_type = vec_op::FP32Vec16; +}; + +template +void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, + const float* scale, const int32_t* azp, + const int64_t num_tokens, + const int64_t input_stride, + const int64_t hidden_size) { + using load_vec_t = typename KernelVecType::load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int64_t vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + constexpr float i8_min = + static_cast(std::numeric_limits::min()); + constexpr float i8_max = + static_cast(std::numeric_limits::max()); + const cvt_vec_t inv_scale(1.0 / *scale); + const cvt_vec_t i8_min_vec(i8_min); + const cvt_vec_t i8_max_vec(i8_max); + + cvt_vec_t zp_vec; + if constexpr (AZP) { + zp_vec = cvt_vec_t(static_cast(*azp)); + } + +#pragma omp parallel for + for (int64_t i = 0; i < num_tokens; ++i) { + int64_t j = 0; + const scalar_t* input_ptr = input + i * input_stride; + int8_t* output_ptr = output + i * hidden_size; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = elems_fp32 * inv_scale; + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + zp_vec; + } + + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output_ptr + j); + } + + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = elems_fp32 * inv_scale; + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + zp_vec; + } + + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output_ptr + j, hidden_size - j); + } +} + +template +void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, + float* scale, int32_t* azp, + const 
int64_t num_tokens, + const int64_t input_stride, + const int64_t hidden_size) { + using load_vec_t = typename KernelVecType::load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + constexpr float i8_min = + static_cast(std::numeric_limits::min()); + constexpr float i8_max = + static_cast(std::numeric_limits::max()); + const cvt_vec_t i8_min_vec(i8_min); + const cvt_vec_t i8_max_vec(i8_max); + +#pragma omp parallel for + for (int64_t i = 0; i < num_tokens; ++i) { + cvt_vec_t max_value(std::numeric_limits::lowest()); + cvt_vec_t min_value(std::numeric_limits::max()); + { + int64_t j = 0; + const scalar_t* input_ptr = input + i * input_stride; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + if constexpr (AZP) { + max_value = max_value.max(elems_fp32); + min_value = min_value.min(elems_fp32); + } else { + max_value = max_value.max(elems_fp32.abs()); + } + } + + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + + if (j + vec_elem_num == hidden_size) { + if constexpr (AZP) { + max_value = max_value.max(elems_fp32); + min_value = min_value.min(elems_fp32); + } else { + max_value = max_value.max(elems_fp32.abs()); + } + } else { + if constexpr (AZP) { + max_value = max_value.max(elems_fp32, hidden_size - j); + min_value = min_value.min(elems_fp32, hidden_size - j); + } else { + max_value = max_value.max(elems_fp32.abs(), hidden_size - j); + } + } + } + + float scale_val, azp_val; + if constexpr (AZP) { + float max_scalar = max_value.reduce_max(); + float min_scalar = min_value.reduce_min(); + scale_val = (max_scalar - min_scalar) / 255.0f; + azp_val = std::nearbyint(-128.0f - min_scalar / scale_val); + azp[i] = azp_val; + scale[i] = scale_val; + } else { + scale_val = max_value.reduce_max() / 127.0f; + scale[i] = scale_val; + } + + const cvt_vec_t inv_scale(1.0 / scale_val); + const cvt_vec_t 
azp_vec(azp_val); + + { + int64_t j = 0; + const scalar_t* input_ptr = input + i * input_stride; + int8_t* output_ptr = output + i * hidden_size; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = (elems_fp32 * inv_scale); + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + azp_vec; + } + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output_ptr + j); + } + + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = (elems_fp32 * inv_scale); + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + azp_vec; + } + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output_ptr + j, hidden_size - j); + } + } +} + +template +void dynamic_quant_epilogue(const float* input, scalar_t* output, + const float* a_scale, const int32_t* azp, + const float* azp_adj, const scalar_t* bias, + const int64_t num_tokens, + const int64_t hidden_size) { + CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue) + using load_vec_t = typename KernelVecType::load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + const int64_t thread_num = omp_get_max_threads(); + if (num_tokens > thread_num) { +#pragma omp parallel for + for (int64_t i = 0; i < num_tokens; ++i) { + const float* input_ptr = input + i * hidden_size; + scalar_t* output_ptr = output + i * hidden_size; + int64_t j = 0; + cvt_vec_t token_scale_vec(a_scale[i]); + cvt_vec_t token_zp_scale_vec; + if constexpr (AZP) { + float zp_scale_val = a_scale[i] * static_cast(azp[i]); + token_zp_scale_vec = cvt_vec_t(zp_scale_val); + } + for (; j < hidden_size - vec_elem_num; ++j) { + cvt_vec_t elems_fp32(input_ptr + j); + elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + cvt_vec_t azp_adj_fp32(azp_adj + j); + elems_fp32 
= elems_fp32 - azp_adj_fp32 * token_zp_scale_vec; + } + if constexpr (Bias) { + load_vec_t bias_vec(bias + j); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + load_vec_t elems_out(elems_fp32); + elems_out.save(output_ptr + j); + } + cvt_vec_t elems_fp32(input_ptr + j); + elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + cvt_vec_t azp_adj_fp32(azp_adj + j); + elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec; + } + if constexpr (Bias) { + load_vec_t bias_vec(bias + j); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + load_vec_t elems_out(elems_fp32); + elems_out.save(output_ptr + j, hidden_size - j); + } + } else { + const int64_t vec_iteration = + (hidden_size + vec_elem_num - 1) / vec_elem_num; + const int64_t vec_iteration_per_thread = + (vec_iteration + thread_num - 1) / thread_num; + const int64_t elem_num_per_thread = vec_iteration_per_thread * vec_elem_num; +#pragma omp parallel for schedule(static, 1) + for (int64_t i = 0; i < thread_num; ++i) { + const int64_t start = elem_num_per_thread * i; + const int64_t end = std::min(hidden_size, elem_num_per_thread + start); + for (int64_t j = 0; j < num_tokens; ++j) { + cvt_vec_t token_scale_vec(a_scale[j]); + cvt_vec_t token_zp_scale_vec; + if constexpr (AZP) { + float zp_scale_val = a_scale[j] * static_cast(azp[j]); + token_zp_scale_vec = cvt_vec_t(zp_scale_val); + } + int64_t k = start; + const float* input_ptr = input + j * hidden_size; + scalar_t* output_ptr = output + j * hidden_size; + for (; k < end - vec_elem_num; k += vec_elem_num) { + cvt_vec_t elems_fp32(input_ptr + k); + elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + cvt_vec_t azp_adj_fp32(azp_adj + k); + elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec; + } + if constexpr (Bias) { + load_vec_t bias_vec(bias + k); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + load_vec_t 
elems_out(elems_fp32); + elems_out.save(output_ptr + k); + } + if (k < end) { + cvt_vec_t elems_fp32(input_ptr + k); + elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + cvt_vec_t azp_adj_fp32(azp_adj + k); + elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec; + } + if constexpr (Bias) { + load_vec_t bias_vec(bias + k); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + load_vec_t elems_out(elems_fp32); + elems_out.save(output_ptr + k, end - k); + } + } + } + } +} +} // namespace + +int64_t create_onednn_scaled_mm_handler( + const torch::Tensor& b, // [IC, OC], column-major + const torch::Tensor& b_scales, // [1] or [OC] + at::ScalarType output_type, bool dynamic_act_quant, bool use_azp, + int64_t primitive_cache_size) { + TORCH_CHECK(b.dim() == 2); + TORCH_CHECK(b.stride(0) == 1); // Column-major + TORCH_CHECK(b_scales.is_contiguous()); + + W8A8MatMulPrimitiveHandler::Args args; + args.primitive_cache_size = primitive_cache_size; + + if (b_scales.numel() == 1) { + args.b_quantization_strategy = + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR; + } else { + TORCH_CHECK_EQ(b_scales.numel(), b.size(1)); + args.b_quantization_strategy = + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_OUTPUT_CHANNEL; + } + args.b_scales_ptr = b_scales.data_ptr(); + args.b_k_size = b.size(0); + args.b_k_stride = b.stride(0); + args.b_n_size = b.size(1); + args.b_n_stride = b.stride(1); + args.b_ptr = b.data_ptr(); + + if (dynamic_act_quant) { + // dynamic per-token, bias, A scales and A zps will be applied in outside. 
+ args.a_quantization_strategy = + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TOKEN; + args.use_a_zero_point = false; + } else { + // static per-tensor + args.a_quantization_strategy = + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR; + args.use_a_zero_point = use_azp; + } + + VLLM_DISPATCH_FLOATING_TYPES(output_type, "create_onednn_scaled_mm_handler", + [&] { + if (dynamic_act_quant) { + args.c_type = get_dnnl_type(); + } else { + args.c_type = get_dnnl_type(); + } + }); + + return reinterpret_cast(new W8A8MatMulPrimitiveHandler(args)); +} + +void onednn_scaled_mm( + torch::Tensor& c, // [M, OC], row-major + const torch::Tensor& a, // [M, IC], row-major + const torch::Tensor& a_scales, // [M] or [1] + const std::optional& azp, // [M] or [1] + const std::optional& azp_adj, // [M] or [1] + const std::optional& bias, // [N] + int64_t handler) { + CPU_KERNEL_GUARD_IN(onednn_scaled_mm) + TORCH_CHECK(a.dim() == 2); + TORCH_CHECK(a.is_contiguous()); + TORCH_CHECK(c.is_contiguous()); + W8A8MatMulPrimitiveHandler* ptr = + reinterpret_cast(handler); + const int32_t* azp_ptr = nullptr; + if (azp.has_value()) { + azp_ptr = azp->data_ptr(); + } + if (ptr->get_input_scale_strategy() == + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR) { + TORCH_CHECK_EQ(a_scales.numel(), 1); + } + + W8A8MatMulPrimitiveHandler::ExecArgs exec_args; + exec_args.a_ptr = a.data_ptr(); + exec_args.a_m_size = a.size(0); + exec_args.bias_ptr = nullptr; + exec_args.use_bias = false; + exec_args.a_scales_ptr = nullptr; + exec_args.a_zero_points_ptr = nullptr; + + VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "onednn_scaled_mm", [&] { + if (ptr->get_input_scale_strategy() == + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR) { + if (bias.has_value()) { + exec_args.bias_ptr = bias->data_ptr(); + exec_args.bias_type = get_dnnl_type(); + exec_args.use_bias = true; + } + exec_args.a_scales_ptr = a_scales.data_ptr(); + exec_args.a_zero_points_ptr = azp_ptr; + 
exec_args.c_ptr = c.data_ptr(); + ptr->execute(exec_args); + } else if (ptr->get_input_scale_strategy() == + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TOKEN) { + torch::Tensor tmp_fp32_out = + torch::empty_like(c, ::at::ScalarType::Float); + exec_args.c_ptr = tmp_fp32_out.data_ptr(); + ptr->execute(exec_args); + if (bias.has_value()) { + if (azp.has_value()) { + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), azp_ptr, azp_adj->data_ptr(), + bias->data_ptr(), c.size(0), c.size(1)); + } else { + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), azp_ptr, nullptr, + bias->data_ptr(), c.size(0), c.size(1)); + } + } else { + if (azp.has_value()) { + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), azp_ptr, azp_adj->data_ptr(), + (scalar_t*)nullptr, c.size(0), c.size(1)); + } else { + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), azp_ptr, nullptr, (scalar_t*)nullptr, + c.size(0), c.size(1)); + } + } + } else { + TORCH_CHECK(false, "invalid act quant type."); + } + }); +} + +// static-per-tensor quantization. 
+void static_scaled_int8_quant( + torch::Tensor& out, // [batch, hidden_size] + const torch::Tensor& input, // [batch, hidden_size] + const torch::Tensor& scale, std::optional const& azp) { + CPU_KERNEL_GUARD_IN(static_scaled_int8_quant) + TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK_EQ(input.dim(), 2); + TORCH_CHECK_EQ(input.stride(1), 1); + TORCH_CHECK(scale.numel() == 1); + TORCH_CHECK(!azp.has_value() || azp->numel() == 1); + + const int64_t stride = input.stride(0); + const int64_t hidden_size = input.size(1); + const int64_t num_tokens = input.size(0); + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "static_scaled_int8_quant_impl", [&] { + if (azp.has_value()) { + static_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), azp->data_ptr(), num_tokens, + stride, hidden_size); + } else { + static_scaled_int8_quant_impl(input.data_ptr(), + out.data_ptr(), + scale.data_ptr(), nullptr, + num_tokens, stride, hidden_size); + } + }); +} + +// dynamic-per-token quantization. 
+void dynamic_scaled_int8_quant( + torch::Tensor& out, // [batch, hidden_size] + const torch::Tensor& input, // [batch, hidden_size] + torch::Tensor& scale, // [batch, 1] + std::optional const& azp) { + CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant) + TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK_EQ(input.dim(), 2); + TORCH_CHECK_EQ(input.stride(1), 1); + + const int64_t hidden_size = input.size(1); + const int64_t num_tokens = input.size(0); + const int64_t stride = input.stride(0); + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] { + if (azp.has_value()) { + dynamic_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), azp->data_ptr(), num_tokens, + stride, hidden_size); + } else { + dynamic_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), nullptr, num_tokens, stride, + hidden_size); + } + }); +} diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp deleted file mode 100644 index 6e120b8d20a7e..0000000000000 --- a/csrc/cpu/quant.cpp +++ /dev/null @@ -1,951 +0,0 @@ -#include "cpu_types.hpp" -#include "dnnl_helper.hpp" - -namespace { -template -struct KernelVecType { - using load_vec_type = void; - using azp_adj_load_vec_type = void; - using cvt_vec_type = void; -}; - -template <> -struct KernelVecType { - using load_vec_type = vec_op::FP32Vec16; - using azp_adj_load_vec_type = vec_op::INT32Vec16; - using cvt_vec_type = vec_op::FP32Vec16; -}; - -#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT) -template <> -struct KernelVecType { - using load_vec_type = vec_op::BF16Vec16; - using azp_adj_load_vec_type = vec_op::INT32Vec16; - using cvt_vec_type = vec_op::FP32Vec16; -}; -#endif - -template <> -struct KernelVecType { -#if defined(__powerpc64__) || defined(__s390x__) - // Power architecture-specific vector type - using load_vec_type = vec_op::FP32Vec16; -#else - // Fallback for other architectures - using load_vec_type = vec_op::FP16Vec16; -#endif - using 
azp_adj_load_vec_type = vec_op::INT32Vec16; - using cvt_vec_type = vec_op::FP32Vec16; -}; - -#if defined(__AVX512F__) || defined(__aarch64__) -template -void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - const float* scale, const int32_t* azp, - const int num_tokens, - const int hidden_size) { - using load_vec_t = typename KernelVecType::load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - constexpr float i8_min = - static_cast(std::numeric_limits::min()); - constexpr float i8_max = - static_cast(std::numeric_limits::max()); - const cvt_vec_t inv_scale(1.0 / *scale); - const cvt_vec_t i8_min_vec(i8_min); - const cvt_vec_t i8_max_vec(i8_max); - - cvt_vec_t zp_vec; - if constexpr (AZP) { - zp_vec = cvt_vec_t(static_cast(*azp)); - } - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = elems_fp32 * inv_scale; - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + zp_vec; - } - - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j); - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = elems_fp32 * inv_scale; - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + zp_vec; - } - - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j, hidden_size - j); - } -} - -template -void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - float* scale, int32_t* azp, - const int num_tokens, - const int hidden_size) { - using load_vec_t = typename KernelVecType::load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int 
vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - constexpr float i8_min = - static_cast(std::numeric_limits::min()); - constexpr float i8_max = - static_cast(std::numeric_limits::max()); - const cvt_vec_t i8_min_vec(i8_min); - const cvt_vec_t i8_max_vec(i8_max); - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t max_value(std::numeric_limits::lowest()); - cvt_vec_t min_value(std::numeric_limits::max()); - { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - if constexpr (AZP) { - max_value = max_value.max(elems_fp32); - min_value = min_value.min(elems_fp32); - } else { - max_value = max_value.max(elems_fp32.abs()); - } - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - - if (j + vec_elem_num == hidden_size) { - if constexpr (AZP) { - max_value = max_value.max(elems_fp32); - min_value = min_value.min(elems_fp32); - } else { - max_value = max_value.max(elems_fp32.abs()); - } - } else { - if constexpr (AZP) { - max_value = max_value.max(elems_fp32, hidden_size - j); - min_value = min_value.min(elems_fp32, hidden_size - j); - } else { - max_value = max_value.max(elems_fp32.abs(), hidden_size - j); - } - } - } - - float scale_val, azp_val; - if constexpr (AZP) { - float max_scalar = max_value.reduce_max(); - float min_scalar = min_value.reduce_min(); - scale_val = (max_scalar - min_scalar) / 255.0f; - azp_val = std::nearbyint(-128.0f - min_scalar / scale_val); - azp[i] = static_cast(azp_val); - scale[i] = scale_val; - } else { - scale_val = max_value.reduce_max() / 127.0f; - scale[i] = scale_val; - } - - const cvt_vec_t inv_scale(1.0 / scale_val); - const cvt_vec_t azp_vec(azp_val); - - { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale); - - if constexpr (AZP) 
{ - elems_fp32 = elems_fp32 + azp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j); - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale); - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + azp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j, hidden_size - j); - } - } -} - -template -void static_quant_epilogue(const float* input, scalar_t* output, - const float a_scale, const float* b_scale, - const int32_t* azp_with_adj, const int num_tokens, - const int hidden_size) { - CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl) - using load_vec_t = typename KernelVecType::load_vec_type; - using azp_adj_load_vec_t = - typename KernelVecType::azp_adj_load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t a_scale_vec(a_scale); - cvt_vec_t b_scale_vec(*b_scale); - cvt_vec_t scale_vec = a_scale_vec * b_scale_vec; - - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - cvt_vec_t elems_fp32(input + i * hidden_size + j); - azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - - if constexpr (PerChannel) { - b_scale_vec = cvt_vec_t(b_scale + j); - scale_vec = b_scale_vec * a_scale_vec; - } - - elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j); - } - - cvt_vec_t elems_fp32(input + i * hidden_size + j); - azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - - if constexpr (PerChannel) { - b_scale_vec = cvt_vec_t(b_scale + j); - scale_vec = b_scale_vec * 
a_scale_vec; - } - - elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } -} - -template -void dynamic_quant_epilogue(const float* input, scalar_t* output, - const float* a_scale, const float* b_scale, - const int32_t* azp, const int32_t* azp_adj, - const scalar_t* bias, const int num_tokens, - const int hidden_size) { - CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue) - using load_vec_t = typename KernelVecType::load_vec_type; - using azp_adj_load_vec_t = - typename KernelVecType::azp_adj_load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - int j = 0; - cvt_vec_t token_scale_vec(a_scale[i]); - cvt_vec_t token_zp_scale_vec; - if constexpr (AZP) { - float zp_scale_val = a_scale[i] * static_cast(azp[i]); - if constexpr (!PerChannel) { - zp_scale_val *= *b_scale; - } - token_zp_scale_vec = cvt_vec_t(zp_scale_val); - } - - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - cvt_vec_t elems_fp32(input + i * hidden_size + j); - elems_fp32 = elems_fp32 * token_scale_vec; - - if constexpr (AZP) { - azp_adj_load_vec_t azp_adj_vec(azp_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; - - if constexpr (PerChannel) { - cvt_vec_t b_scale_vec(b_scale + j); - azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; - } - - elems_fp32 = elems_fp32 - azp_adj_fp32; - } - - if constexpr (Bias) { - load_vec_t bias_vec(bias + j); - cvt_vec_t bias_vec_fp32(bias_vec); - elems_fp32 = elems_fp32 + bias_vec_fp32; - } - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j); - } - - cvt_vec_t elems_fp32(input + i * hidden_size + j); - elems_fp32 = elems_fp32 * token_scale_vec; - - if constexpr (AZP) { - azp_adj_load_vec_t azp_adj_vec(azp_adj + j); - cvt_vec_t 
azp_adj_fp32(azp_adj_vec); - azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; - - if constexpr (PerChannel) { - cvt_vec_t b_scale_vec(b_scale + j); - azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; - } - - elems_fp32 = elems_fp32 - azp_adj_fp32; - } - - if constexpr (Bias) { - load_vec_t bias_vec(bias + j); - cvt_vec_t bias_vec_fp32(bias_vec); - elems_fp32 = elems_fp32 + bias_vec_fp32; - } - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } -} -#elif defined(__powerpc64__) -template -void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - const float* scale, const int32_t* azp, - const int num_tokens, - const int hidden_size) { - using load_vec_t = typename KernelVecType::load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - constexpr float i8_min = - static_cast(std::numeric_limits::min()); - constexpr float i8_max = - static_cast(std::numeric_limits::max()); - - const cvt_vec_t inv_scale(1.0 / *scale); - const cvt_vec_t i8_min_vec(i8_min); - const cvt_vec_t i8_max_vec(i8_max); - - cvt_vec_t zp_vec; - if constexpr (AZP) { - zp_vec = cvt_vec_t(static_cast(*azp)); - } - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = elems_fp32 * inv_scale; - if constexpr (AZP) { - elems_fp32 = elems_fp32 + zp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j); - } - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = elems_fp32 * inv_scale; - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + zp_vec; - } - - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 
elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j, hidden_size - j); - } -} -template -void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - float* scale, int32_t* azp, - const int num_tokens, - const int hidden_size) { - using load_vec_t = typename KernelVecType::load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - constexpr float i8_min = - static_cast(std::numeric_limits::min()); - constexpr float i8_max = - static_cast(std::numeric_limits::max()); - const cvt_vec_t i8_min_vec(i8_min); - const cvt_vec_t i8_max_vec(i8_max); - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t max_value(std::numeric_limits::lowest()); - cvt_vec_t min_value(std::numeric_limits::max()); - { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - if constexpr (AZP) { - max_value = max_value.max(elems_fp32); - min_value = min_value.min(elems_fp32); - } else { - max_value = max_value.max(elems_fp32.abs()); - } - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - - if (j + vec_elem_num == hidden_size) { - if constexpr (AZP) { - max_value = max_value.max(elems_fp32); - min_value = min_value.min(elems_fp32); - } else { - max_value = max_value.max(elems_fp32.abs()); - } - } else { - if constexpr (AZP) { - max_value = max_value.max(elems_fp32, hidden_size - j); - min_value = min_value.min(elems_fp32, hidden_size - j); - } else { - max_value = max_value.max(elems_fp32.abs(), hidden_size - j); - } - } - } - - float scale_val, azp_val; - if constexpr (AZP) { - float max_scalar = max_value.reduce_max(); - float min_scalar = min_value.reduce_min(); - scale_val = (max_scalar - min_scalar) / 255.0f; - azp_val = std::nearbyint(-128.0f - min_scalar / scale_val); - azp[i] = static_cast(azp_val); - scale[i] = 
scale_val; - } else { - scale_val = max_value.reduce_max() / 127.0f; - scale[i] = scale_val; - } - - const cvt_vec_t inv_scale(1.0 / scale_val); - const cvt_vec_t azp_vec(azp_val); - - { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale); - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + azp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j); - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale); - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + azp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j, hidden_size - j); - } - } -} -template -void static_quant_epilogue(const float* input, scalar_t* output, - const float a_scale, const float* b_scale, - const int32_t* azp_with_adj, const int num_tokens, - const int hidden_size) { - CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl) - using load_vec_t = typename KernelVecType::load_vec_type; - using azp_adj_load_vec_t = - typename KernelVecType::azp_adj_load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t a_scale_vec(a_scale); - cvt_vec_t b_scale_vec(*b_scale); - cvt_vec_t scale_vec = a_scale_vec * b_scale_vec; - - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - cvt_vec_t elems_fp32(input + i * hidden_size + j); - azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - - if constexpr (PerChannel) { - b_scale_vec = cvt_vec_t(b_scale + j); - scale_vec = b_scale_vec * a_scale_vec; - 
} - elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j); - } - - cvt_vec_t elems_fp32(input + i * hidden_size + j); - azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - - if constexpr (PerChannel) { - b_scale_vec = cvt_vec_t(b_scale + j); - scale_vec = b_scale_vec * a_scale_vec; - } - - elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } -} -template -void dynamic_quant_epilogue(const float* input, scalar_t* output, - const float* a_scale, const float* b_scale, - const int32_t* azp, const int32_t* azp_adj, - const scalar_t* bias, const int num_tokens, - const int hidden_size) { - CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue) - using load_vec_t = typename KernelVecType::load_vec_type; - using azp_adj_load_vec_t = - typename KernelVecType::azp_adj_load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - int j = 0; - cvt_vec_t token_scale_vec(a_scale[i]); - cvt_vec_t token_zp_scale_vec; - if constexpr (AZP) { - float zp_scale_val = a_scale[i] * static_cast(azp[i]); - if constexpr (!PerChannel) { - zp_scale_val *= *b_scale; - } - token_zp_scale_vec = cvt_vec_t(zp_scale_val); - } - - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - cvt_vec_t elems_fp32(input + i * hidden_size + j); - elems_fp32 = elems_fp32 * token_scale_vec; - - if constexpr (AZP) { - azp_adj_load_vec_t azp_adj_vec(azp_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; - - if constexpr (PerChannel) { - cvt_vec_t b_scale_vec(b_scale + j); - azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; - } - - elems_fp32 = elems_fp32 - azp_adj_fp32; - } - - if constexpr (Bias) { - load_vec_t 
bias_vec(bias + j); - cvt_vec_t bias_vec_fp32(bias_vec); - elems_fp32 = elems_fp32 + bias_vec_fp32; - } - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j); - } - - cvt_vec_t elems_fp32(input + i * hidden_size + j); - elems_fp32 = elems_fp32 * token_scale_vec; - - if constexpr (AZP) { - azp_adj_load_vec_t azp_adj_vec(azp_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; - - if constexpr (PerChannel) { - cvt_vec_t b_scale_vec(b_scale + j); - azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; - } - - elems_fp32 = elems_fp32 - azp_adj_fp32; - } - - if constexpr (Bias) { - load_vec_t bias_vec(bias + j); - cvt_vec_t bias_vec_fp32(bias_vec); - elems_fp32 = elems_fp32 + bias_vec_fp32; - } - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } -} -#else -template -void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - const float* scale, const int32_t* azp, - const int num_tokens, - const int hidden_size) { - TORCH_CHECK(false, - "static_scaled_int8_quant_impl requires AVX512/powerpc64/AArch64 " - "support.") -} - -template -void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - float* scale, int32_t* azp, - const int num_tokens, - const int hidden_size) { - TORCH_CHECK(false, - "dynamic_scaled_int8_quant_impl requires " - "AVX512/powerpc64/AArch64 support.") -} - -template -void static_quant_epilogue(const float* input, scalar_t* output, - const float a_scale, const float* b_scale, - const int32_t* azp_with_adj, const int num_tokens, - const int hidden_size) { - TORCH_CHECK( - false, "static_quant_epilogue requires AVX512/powerpc64/AArch64 support.") -} - -template -void dynamic_quant_epilogue(const float* input, scalar_t* output, - const float* a_scale, const float* b_scale, - const int32_t* azp, const int32_t* azp_with_adj, - const scalar_t* bias, const int num_tokens, - const int hidden_size) { - 
TORCH_CHECK( - false, - "dynamic_quant_epilogue requires AVX512/powerpc64/AArch64 support.") -} -#endif -} // namespace - -void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major - const torch::Tensor& a, // [M, IC], row-major - const torch::Tensor& b, // [IC, OC], column-major - const torch::Tensor& a_scales, // [1] or [M] - const torch::Tensor& b_scales, // [1] or [OC] - const std::optional& bias // [OC] -) { - CPU_KERNEL_GUARD_IN(cutlass_scaled_mm) - // Checks for conformality - TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8, - "int8_scaled_mm only supports INT8 inputs.") - TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); - TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && - b.size(1) == c.size(1)); - TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); - TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); - - // Check for strides and alignment - TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major - TORCH_CHECK(b.stride(0) == 1); // Column-major - TORCH_CHECK(c.stride(0) % 16 == 0 && - b.stride(1) % 16 == 0); // 16 Byte Alignment - TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); - - if (bias) { - TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() && - bias->dim() == 1); - } - - VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm", [&] { - if (a_scales.numel() != 1) { - // per-token - // Note: oneDNN doesn't support per-token activation quantization - // Ideally we want to fuse the GEMM and the scale procedure with oneDNN - // JIT, the intermediate data is cached in registers or L1. But for now - // the oneDNN GEMM code generation only supports two quantization - // patterns: per-tensor or per-output-channel of weight. - // So we have to apply the per-token scale with a 'epilogue'. 
In C=s_a * - // s_b * (A@B) + bias, the C_inter = s_b * (A@B) is computed by oneDNN - // GEMM, then the per-token scale (and bias) is applied with the epilogue - // C=s_a * C_inter + bias. - torch::Tensor tmp_fp32_out = - torch::empty_like(c, ::at::ScalarType::Float); - // Compute C_inter=s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), - a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); - if (bias.has_value()) { - // Compute C=s_a * C_inter + bias - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), nullptr, nullptr, nullptr, - bias->data_ptr(), c.size(0), c.size(1)); - } else { - // Compute C=s_a * C_inter - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), nullptr, nullptr, nullptr, nullptr, - c.size(0), c.size(1)); - } - } else { - // per-tensor - if (bias.has_value()) { - // Compute C=s_a * s_b * (A@B) + bias - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), c.data_ptr(), - bias->data_ptr(), a.size(0), b.size(1), a.size(1), - a_scales.data_ptr(), b_scales.data_ptr(), - a_scales.numel(), b_scales.numel()); - } else { - // Compute C=s_a * s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), c.data_ptr(), - nullptr, a.size(0), b.size(1), a.size(1), - a_scales.data_ptr(), b_scales.data_ptr(), - a_scales.numel(), b_scales.numel()); - } - } - }); -} - -void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major - const torch::Tensor& a, // [M, IC], row-major - const torch::Tensor& b, // [IC, OC], column-major - const torch::Tensor& a_scales, // [1] or [M] - const torch::Tensor& b_scales, // [1] or [OC] - const torch::Tensor& azp_adj, // [OC] - const std::optional& azp, // [1] or [M] - const std::optional& bias // [OC] -) { - CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp) - // Checks for conformality - TORCH_CHECK(a.dtype() == torch::kInt8 && 
b.dtype() == torch::kInt8, - "int8_scaled_mm_azp only supports INT8 inputs.") - TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); - TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && - b.size(1) == c.size(1)); - TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); - TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); - - // Check for strides and alignment - TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major - TORCH_CHECK(b.stride(0) == 1); // Column-major - TORCH_CHECK(c.stride(0) % 16 == 0 && - b.stride(1) % 16 == 0); // 16 Byte Alignment - TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); - - if (bias) { - TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous()); - } - if (azp) { - TORCH_CHECK(azp->numel() == a.size(0) && azp->is_contiguous()); - } - TORCH_CHECK(azp_adj.numel() == b.size(1) && azp_adj.is_contiguous()); - - // azp & bias types - TORCH_CHECK(azp_adj.dtype() == torch::kInt32); - TORCH_CHECK(!azp || azp->dtype() == torch::kInt32); - TORCH_CHECK(!bias || bias->dtype() == c.dtype(), - "currently bias dtype must match output dtype ", c.dtype()); - - VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm_azp", [&] { - torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float); - if (a_scales.numel() != 1) { - // per-token - // Note: oneDNN doesn't support per-token activation quantization - // Compute C_inter=s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), - a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); - if (bias.has_value()) { - // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj + bias - if (b_scales.numel() != 1) { - // Per-Channel - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), b_scales.data_ptr(), - azp->data_ptr(), azp_adj.data_ptr(), - bias->data_ptr(), c.size(0), c.size(1)); - } 
else { - // Per-Tensor - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), b_scales.data_ptr(), - azp->data_ptr(), azp_adj.data_ptr(), - bias->data_ptr(), c.size(0), c.size(1)); - } - } else { - // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj - if (b_scales.numel() != 1) { - // Per-Channel - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), b_scales.data_ptr(), - azp->data_ptr(), azp_adj.data_ptr(), nullptr, - c.size(0), c.size(1)); - } else { - // Per-Tensor - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), b_scales.data_ptr(), - azp->data_ptr(), azp_adj.data_ptr(), nullptr, - c.size(0), c.size(1)); - } - } - } else { - // per-tensor - if (bias.has_value()) { - // Compute C_inter=s_a * s_b * (A@B) + bias - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), bias->data_ptr(), - a.size(0), b.size(1), a.size(1), a_scales.data_ptr(), - b_scales.data_ptr(), a_scales.numel(), b_scales.numel()); - } else { - // Compute C_inter=s_a * s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), - a.size(1), a_scales.data_ptr(), b_scales.data_ptr(), - a_scales.numel(), b_scales.numel()); - } - - // Compute C=C_inter - s_a * s_b * azp_adj - if (b_scales.numel() != 1) { - // Per-Channel - static_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - *a_scales.data_ptr(), b_scales.data_ptr(), - azp_adj.data_ptr(), a.size(0), b.size(1)); - } else { - // Per-Tensor - static_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - *a_scales.data_ptr(), b_scales.data_ptr(), - azp_adj.data_ptr(), a.size(0), b.size(1)); - } - } - }); -} - -// static-per-tensor quantization. 
-void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] - const torch::Tensor& input, // [..., hidden_size] - const torch::Tensor& scale, - std::optional const& azp) { - CPU_KERNEL_GUARD_IN(static_scaled_int8_quant) - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); - TORCH_CHECK(scale.numel() == 1); - TORCH_CHECK(!azp.has_value() || azp->numel() == 1); - - const int hidden_size = input.size(-1); - const int num_tokens = input.numel() / hidden_size; - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), "static_scaled_int8_quant_impl", [&] { - if (azp.has_value()) { - static_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), azp->data_ptr(), num_tokens, - hidden_size); - } else { - static_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), nullptr, num_tokens, hidden_size); - } - }); -} - -// dynamic-per-token quantization. -void dynamic_scaled_int8_quant( - torch::Tensor& out, // [..., hidden_size] - const torch::Tensor& input, // [..., hidden_size] - torch::Tensor& scale, // [..., 1] - std::optional const& azp) { - CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant) - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); - - int const hidden_size = input.size(-1); - int const num_tokens = input.numel() / hidden_size; - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] { - if (azp.has_value()) { - dynamic_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), azp->data_ptr(), num_tokens, - hidden_size); - } else { - dynamic_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), nullptr, num_tokens, hidden_size); - } - }); -} - -#if defined(__powerpc64__) -void int8_scaled_mm_ppc64le(torch::Tensor& c, // [M, OC], row-major - const torch::Tensor& a, // [M, IC], row-major - const torch::Tensor& b, // [IC, OC], column-major - const torch::Tensor& a_scales, - const 
torch::Tensor& b_scales, - const std::optional& bias // [OC] -) { - CPU_KERNEL_GUARD_IN(cutlass_scaled_mm) - // Checks for conformality - TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8, - "int8_scaled_mm_ppc64le only supports INT8 inputs."); - TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); - TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && - b.size(1) == c.size(1)); - // We dont need this - TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); - TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); - - // Check for strides and alignment - TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major - TORCH_CHECK(b.stride(0) == 1); // Column-major - TORCH_CHECK(c.stride(0) % 16 == 0 && - b.stride(1) % 16 == 0); // 16 Byte Alignment - TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); - - if (bias) { - TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() && - bias->dim() == 1); - } - VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm_ppc64le", [&] { - torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float); - // Compute C_inter=s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), - a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); - if (bias.has_value()) { - // Compute C=s_a * C_inter + bias - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), nullptr, nullptr, nullptr, - bias->data_ptr(), c.size(0), c.size(1)); - } else { - // Compute C=s_a * C_inter - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), nullptr, nullptr, nullptr, nullptr, - c.size(0), c.size(1)); - } - }); -} - -#endif diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index b20a054648428..c9f426bdf618a 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp 
@@ -6,25 +6,20 @@ std::string init_cpu_threads_env(const std::string& cpu_ids); -void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a, - const torch::Tensor& b, const torch::Tensor& a_scales, - const torch::Tensor& b_scales, - const std::optional& bias); +void release_dnnl_matmul_handler(int64_t handler); -void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a, - const torch::Tensor& b, const torch::Tensor& a_scales, - const torch::Tensor& b_scales, - const torch::Tensor& azp_adj, - const std::optional& azp, - const std::optional& bias); +int64_t create_onednn_scaled_mm_handler(const torch::Tensor& b, + const torch::Tensor& b_scales, + at::ScalarType output_type, + bool dynamic_act_quant, bool use_azp, + int64_t primitive_cache_size); -#if defined(__powerpc64__) -void int8_scaled_mm_ppc64le(torch::Tensor& c, const torch::Tensor& a, - const torch::Tensor& b, - const torch::Tensor& a_scales, - const torch::Tensor& b_scales, - const std::optional& bias); -#endif +void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a, + const torch::Tensor& a_scales, + const std::optional& azp, + const std::optional& azp_adj, + const std::optional& bias, + int64_t handler); void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query, torch::Tensor& kv_cache, double scale, @@ -151,8 +146,25 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding); // Quantization -#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) +#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) || \ + defined(__powerpc64__) at::Tag stride_tag = at::Tag::needs_fixed_stride_order; + // Helper function to release oneDNN handlers + ops.def("release_dnnl_matmul_handler(int handler) -> ()", + &release_dnnl_matmul_handler); + + // Create oneDNN W8A8 handler + ops.def( + "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType " + "output_type, bool dynamic_act_quant, bool 
use_azp, int " + "primitive_cache_size) -> int", + &create_onednn_scaled_mm_handler); + + // oneDNN scaled_mm for W8A8 with static or dynamic activation quantization + ops.def( + "onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, " + "Tensor? azp_adj, Tensor? bias, int handler) -> ()"); + ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm); // Compute int8 quantized tensor for given scaling factor. ops.def( @@ -168,50 +180,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { {stride_tag}); ops.impl("dynamic_scaled_int8_quant", torch::kCPU, &dynamic_scaled_int8_quant); - // W8A8 GEMM, supporting symmetric per-tensor or per-row/column - // quantization. - ops.def( - "cutlass_scaled_mm(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor? bias) -> ()", - {stride_tag}); - ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm); - // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column - // quantization. - ops.def( - "cutlass_scaled_mm_azp(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor azp_adj," - " Tensor? azp, Tensor? bias) -> ()", - {stride_tag}); - ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp); -#elif defined(__powerpc64__) - // Compute int8 quantized tensor for given scaling factor. - ops.def( - "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale," - "Tensor? azp) -> ()"); - ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant); - - // Compute int8 quantized tensor and scaling factor - ops.def( - "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, " - "Tensor!? azp) -> ()"); - ops.impl("dynamic_scaled_int8_quant", torch::kCPU, - &dynamic_scaled_int8_quant); - // W8A8 GEMM, supporting symmetric quantization. - ops.def( - "cutlass_scaled_mm(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor? 
bias) -> ()"); - ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm_ppc64le); - // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column - // quantization. - ops.def( - "cutlass_scaled_mm_azp(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor azp_adj," - " Tensor? azp, Tensor? bias) -> ()"); - ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp); #endif // SHM CCL diff --git a/tests/kernels/test_onednn.py b/tests/kernels/test_onednn.py new file mode 100644 index 0000000000000..17692384ac9a9 --- /dev/null +++ b/tests/kernels/test_onednn.py @@ -0,0 +1,144 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for the oneDNN W8A8 int8 scaled-mm CPU kernel vs a torch reference""" + +from typing import Optional + +import pytest +import torch + +from tests.kernels.utils import to_int8 +from vllm import _custom_ops as ops +from vllm.platforms import current_platform + +if not current_platform.is_cpu(): + pytest.skip("skipping CPU-only tests", allow_module_level=True) + +NK_FACTORS = [ + (256, 128), + (4096, 4096), + (16384, 4096), + (1023, 491), + (1001, 15), +] +M_FACTORS = [ + (16, 1, 32, 128, 64), + (1, 17, 1, 31, 17), +] +CACHE_SIZES = [2] +DTYPE = [torch.bfloat16] + + +def rand_int8(shape: tuple, device: str = "cpu"): + return to_int8(torch.rand(shape, device=device) * 255 - 128) + + +def ref_int8_scaled_mm( + a: torch.Tensor, + b: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + azp: Optional[torch.Tensor], + bias: Optional[torch.Tensor], + output_type: torch.dtype, +): + if azp is not None: + a = a.to(dtype=torch.float32) - azp.to(dtype=torch.float32) + output = torch.mm((scale_a * a.to(dtype=torch.float32)), + (scale_b * b.to(dtype=torch.float32))) + if bias is not None: + output += bias.float() + + return output.to(dtype=output_type) + + +def onednn_int8_gemm_test_helper(primitive_cache_size: int, + m: int, + n: int, + k: int, 
+ per_tensor_a_quant: bool, + per_tensor_b_quant: bool, + use_azp: bool, + use_bias: bool, + out_dtype: torch.dtype = torch.bfloat16, + device: str = "cpu"): + # Test for a oneDNN kernel with per-tensor / per-token activation + # quantization and per-tensor / per-output channel weight quantization. + a = to_int8(torch.randn((m, k), device=device) * 5) + b = to_int8(torch.randn((n, k), device=device).t() * 5) + + a_scales_shape = (1, 1) if per_tensor_a_quant else (m, 1) + b_scales_shape = (1, 1) if per_tensor_b_quant else (1, n) + + scale_a = (torch.randn(a_scales_shape, device=device, dtype=torch.float32)) + scale_b = (torch.randn(b_scales_shape, device=device, dtype=torch.float32)) + + if use_azp: + azp = torch.rand(a_scales_shape, dtype=torch.float32) * 10 + 1.5 + azp = (azp / scale_a).round().to(dtype=torch.int32) + azp_adj = scale_b * b.sum(dim=0, keepdim=True, dtype=torch.float32) + else: + azp = None + azp_adj = None + + if use_bias: + bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10 + else: + bias = None + + handler = ops.create_onednn_scaled_mm( + b, + scale_b, + out_dtype, + not per_tensor_a_quant, + use_azp, + primitive_cache_size, + ) + + out = torch.zeros((m, n), dtype=out_dtype) + ops.onednn_scaled_mm(handler, a, out, scale_a, azp, azp_adj, bias) + baseline = ref_int8_scaled_mm(a, b, scale_a, scale_b, azp, bias, out_dtype) + + torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0) + + if use_bias: + # To test runtime bias setting + out = torch.zeros((m, n), dtype=out_dtype) + ops.onednn_scaled_mm(handler, a, out, scale_a, azp, azp_adj, None) + baseline = ref_int8_scaled_mm(a, b, scale_a, scale_b, azp, None, + out_dtype) + + torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0) + + +@pytest.mark.parametrize("n,k", NK_FACTORS) +@pytest.mark.parametrize("m_list", M_FACTORS) +@pytest.mark.parametrize("per_tensor_a_scale", [True, False]) +@pytest.mark.parametrize("per_tensor_b_scale", [True, False]) 
+@pytest.mark.parametrize("use_bias", [True, False]) +@pytest.mark.parametrize("use_azp", [True, False]) +@pytest.mark.parametrize("output_type", DTYPE) +@pytest.mark.parametrize("primitive_cache_size", CACHE_SIZES) +def test_onednn_int8_scaled_gemm( + n: int, + k: int, + m_list: tuple[int], + per_tensor_a_scale: bool, + per_tensor_b_scale: bool, + use_bias: bool, + use_azp: bool, + output_type: torch.dtype, + primitive_cache_size: int, +): + for m in m_list: + onednn_int8_gemm_test_helper( + primitive_cache_size=primitive_cache_size, + m=m, + n=n, + k=k, + per_tensor_a_quant=per_tensor_a_scale, + per_tensor_b_quant=per_tensor_b_scale, + use_bias=use_bias, + use_azp=use_azp, + out_dtype=output_type, + ) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 59f2d7737f19d..3081aff114fc1 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1827,3 +1827,86 @@ if hasattr(torch.ops._C, "int8_scaled_mm_with_quant"): M = mat1.size(0) N = mat2.size(0) return torch.empty((M, N), dtype=out_dtype) + + +class CPUDNNLGEMMHandler: + + def __init__(self) -> None: + self.handler: Optional[int] = None + self.n = -1 + self.k = -1 + + def __del__(self): + if self.handler is not None: + torch.ops._C.release_dnnl_matmul_handler(self.handler) + + +def create_onednn_scaled_mm( + weight: torch.Tensor, # [K, N] + weight_scales: torch.Tensor, + output_type: torch.dtype, + dynamic_quant: bool, + use_azp: bool, + primitive_cache_size: int = 128, +) -> CPUDNNLGEMMHandler: + handler = CPUDNNLGEMMHandler() + handler.k, handler.n = weight.size() + handler.handler = torch.ops._C.create_onednn_scaled_mm_handler( + weight, weight_scales, output_type, dynamic_quant, use_azp, + primitive_cache_size) + return handler + + +def onednn_scaled_int8_quant(input: torch.Tensor, + scale: Optional[torch.Tensor] = None, + azp: Optional[torch.Tensor] = None, + symmetric: bool = True): + """ + Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp. 
+ + Args: + input: The input tensor to be quantized to int8. + scale: Optional scaling factor for the int8 quantization. + When not provided, we invoke dynamic-per-token quantization. + azp: Optional zero-point for the int8 quantization. + Must be provided for asymmetric quantization if `scale` is provided. + symmetric: Whether to use symmetric quantization (scale only, azp ignored). + + Returns: + tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp. + """ + output = torch.empty_like(input, dtype=torch.int8) + token_num = input.numel() // input.shape[-1] + input = input.view((token_num, input.shape[-1])) + if scale is not None: + # static-per-tensor quantization. + assert symmetric == ( + azp + is None), "azp must only be provided for asymmetric quantization." + torch.ops._C.static_scaled_int8_quant(output, input, scale, azp) + return output, scale, azp + + # dynamic-per-token quantization. + input_scales = torch.empty((token_num, 1), + device=input.device, + dtype=torch.float32) + input_azp = None if symmetric else torch.empty_like(input_scales, + dtype=torch.int32) + torch.ops._C.dynamic_scaled_int8_quant(output, input, input_scales, + input_azp) + return output, input_scales, input_azp + + +def onednn_scaled_mm( + dnnl_handler: CPUDNNLGEMMHandler, + x: torch.Tensor, + output: torch.Tensor, + input_scale: Optional[torch.Tensor], + input_zp: Optional[torch.Tensor], + input_zp_adj: Optional[torch.Tensor], + bias: Optional[torch.Tensor], +) -> torch.Tensor: + torch.ops._C.onednn_scaled_mm(output, x, input_scale, input_zp, + input_zp_adj, bias, dnnl_handler.handler) + + return output diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b16c21b7013a0..fcc6987d26bb2 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -360,10 +360,15 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, 
CustomOp): elif current_platform.is_cpu(): if current_platform.get_cpu_architecture() == CpuArchEnum.X86: from vllm.model_executor.layers.fused_moe import cpu_fused_moe - dtype = layer.w13_weight.dtype + from vllm.model_executor.layers.utils import ( + check_cpu_sgl_kernel) + dtype_w13 = layer.w13_weight.dtype + _, n_w13, k_w13 = layer.w13_weight.size() + dtype_w2 = layer.w2_weight.dtype + _, n_w2, k_w2 = layer.w2_weight.size() if (envs.VLLM_CPU_SGL_KERNEL - and torch._C._cpu._is_amx_tile_supported() - and dtype == torch.bfloat16): + and check_cpu_sgl_kernel(n_w13, k_w13, dtype_w13) + and check_cpu_sgl_kernel(n_w2, k_w2, dtype_w2)): packed_w13_weight = torch.ops._C.convert_weight_packed( layer.w13_weight) assert packed_w13_weight.size() == layer.w13_weight.size() diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 654e2ec7b2fa0..9b1ab7af0ac84 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -199,11 +199,10 @@ class UnquantizedLinearMethod(LinearMethodBase): def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if current_platform.is_cpu() and envs.VLLM_CPU_SGL_KERNEL: + from vllm.model_executor.layers.utils import check_cpu_sgl_kernel N, K = layer.weight.size() dtype = layer.weight.dtype - if (torch._C._cpu._is_amx_tile_supported() - and dtype == torch.bfloat16 and N % 32 == 0 - and K % 32 == 0): + if check_cpu_sgl_kernel(N, K, dtype): packed_weight = torch.ops._C.convert_weight_packed( layer.weight) assert packed_weight.size() == layer.weight.size() @@ -215,7 +214,8 @@ class UnquantizedLinearMethod(LinearMethodBase): else: logger.warning( "CPU SGL kernels require Intel AMX support," - " bfloat16 weight, IC and OC are divisible by 32.") + " bf16/fp16/int8 weight, IC and OC are divisible by " + "32 and 16.") layer.use_cpu_sgl = False def apply(self, diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py 
b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index 18f5ce04fd355..2bc68ab3ebd18 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -6,6 +6,8 @@ from typing import Optional from vllm.model_executor.layers.quantization.kernels.scaled_mm.aiter import ( AiterScaledMMLinearKernel) +from vllm.model_executor.layers.quantization.kernels.scaled_mm.cpu import ( + CPUScaledMMLinearKernel) from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import ( CutlassScaledMMLinearKernel) from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501 @@ -18,7 +20,7 @@ from vllm.platforms import PlatformEnum, current_platform # in priority/performance order (when available) _POSSIBLE_KERNELS: dict[PlatformEnum, list[type[ScaledMMLinearKernel]]] = { - PlatformEnum.CPU: [CutlassScaledMMLinearKernel], + PlatformEnum.CPU: [CPUScaledMMLinearKernel], PlatformEnum.CUDA: [CutlassScaledMMLinearKernel], PlatformEnum.ROCM: [AiterScaledMMLinearKernel, TritonScaledMMLinearKernel], PlatformEnum.TPU: [XLAScaledMMLinearKernel], diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py new file mode 100644 index 0000000000000..59d2b5bce962e --- /dev/null +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py @@ -0,0 +1,206 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Optional + +import torch + +from vllm import _custom_ops as ops +from vllm import envs +from vllm.model_executor.layers.quantization.utils import replace_parameter +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + convert_to_channelwise) +from vllm.model_executor.layers.utils import check_cpu_sgl_kernel +from vllm.platforms import 
current_platform +from vllm.platforms.interface import CpuArchEnum + +from .ScaledMMLinearKernel import (ScaledMMLinearKernel, + ScaledMMLinearLayerConfig) + + +class CPUScaledMMLinearKernel(ScaledMMLinearKernel): + + @classmethod + def get_min_capability(cls) -> int: + return 75 + + @classmethod + def can_implement( + cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]: + if not current_platform.is_cpu(): + return False, "CPUScaledMM requires running on CPU." + + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + weight = getattr(layer, self.w_q_name) + dtype = weight.dtype + N, K = weight.size() + if (current_platform.get_cpu_architecture() == CpuArchEnum.X86 + and envs.VLLM_CPU_SGL_KERNEL and self.config.input_symmetric + and check_cpu_sgl_kernel(N, K, dtype)): + self.linear_method = self._apply_weights_sgl + self.process_weights_for_sgl(layer) + else: + self.linear_method = self._apply_weights_onednn + self.process_weights_for_onednn(layer) + + def process_weights_for_onednn(self, layer: torch.nn.Module) -> None: + # WEIGHT + # Transpose to [K, N] for convenience + weight = getattr(layer, self.w_q_name) + replace_parameter( + layer, self.w_q_name, + torch.nn.Parameter(weight.t().data, requires_grad=False)) + + # WEIGHT SCALE + # oneDNN kernels support only per-tensor and per-channel. + # If we have a fused module (QKV, MLP) with per tensor scales (thus N + # scales being passed to the kernel), convert to the per-channel case. 
+ is_fused_module = len(layer.logical_widths) > 1 + weight_scale = getattr(layer, self.w_s_name) + if is_fused_module and not self.config.is_channelwise: + weight_scale = convert_to_channelwise(weight_scale, + layer.logical_widths) + replace_parameter( + layer, self.w_s_name, + torch.nn.Parameter(weight_scale.data, requires_grad=False)) + + # INPUT SCALE + if self.config.is_static_input_scheme: + input_scale = getattr(layer, self.i_s_name) + + if self.config.input_symmetric: + replace_parameter( + layer, self.i_s_name, + torch.nn.Parameter(input_scale.max(), requires_grad=False)) + setattr(layer, self.i_zp_name, None) + else: + input_zero_point = getattr(layer, self.i_zp_name) + + # reconstruct the ranges + int8_traits = torch.iinfo(torch.int8) + azps = input_zero_point.to(dtype=torch.int32) + range_max = (input_scale * (int8_traits.max - azps)).max() + range_min = (input_scale * (int8_traits.min - azps)).min() + + scale = (range_max - range_min) / (int8_traits.max - + int8_traits.min) + replace_parameter( + layer, self.i_s_name, + torch.nn.Parameter(scale, requires_grad=False)) + + azp = (int8_traits.min - + range_min / scale).round().to(dtype=torch.int32) + replace_parameter(layer, self.i_zp_name, + torch.nn.Parameter(azp, requires_grad=False)) + + else: + setattr(layer, self.i_s_name, None) + setattr(layer, self.i_zp_name, None) + + # Different from cutlass, oneDNN kernels only need the AZP adjustment + # term for dynamic quantization. And s_b should be folded into the + # term. 
Such as: + # s_a * s_b * [(A - zp_a)B] + bias = + # s_a * (s_b * AB) - s_a * s_b * zp_a * B + bias = + # s_a * GEMM_output - s_a * zp_a * adj + bias + if not (self.config.input_symmetric + and self.config.is_static_input_scheme): + weight = getattr(layer, self.w_q_name) + weight_scale = getattr(layer, self.w_s_name) + azp_adj = weight.sum(dim=0, keepdim=True, dtype=torch.float32) + azp_adj = azp_adj * weight_scale.squeeze() + setattr(layer, self.azp_adj_name, + torch.nn.Parameter(azp_adj, requires_grad=False)) + else: + setattr(layer, self.azp_adj_name, None) + + weight = getattr(layer, self.w_q_name) + self.dnnl_handler = ops.create_onednn_scaled_mm( + weight, + getattr(layer, self.w_s_name), + torch.get_default_dtype(), + getattr(layer, self.i_s_name) is None, + not self.config.input_symmetric, + 32, + ) + # weight is prepacked and maintained by the dnnl_handler, + # release the original weight + setattr(layer, self.w_q_name, None) + del weight + + def process_weights_for_sgl(self, layer: torch.nn.Module) -> None: + # WEIGHT + weight = getattr(layer, self.w_q_name) + packed_weight = torch.ops._C.convert_weight_packed(weight) + replace_parameter( + layer, self.w_q_name, + torch.nn.Parameter(packed_weight, requires_grad=False)) + + if layer.bias is not None: + bias = layer.bias + layer.register_parameter( + "bias_fp32", + torch.nn.Parameter(bias.float().data, requires_grad=False)) + + # WEIGHT SCALE + # CPU SGL kernels only support per-channel. + # For per-tensor quant, convert to the per-channel case. 
+ weight_scale = getattr(layer, self.w_s_name) + if not self.config.is_channelwise: + weight_scale = convert_to_channelwise(weight_scale, + layer.logical_widths) + replace_parameter( + layer, self.w_s_name, + torch.nn.Parameter(weight_scale.data, requires_grad=False)) + + setattr(layer, self.i_s_name, None) + setattr(layer, self.i_zp_name, None) + setattr(layer, self.azp_adj_name, None) + + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + return self.linear_method( + layer, + x, + bias, + ) + + def _apply_weights_onednn( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer) + + # ops.scaled_int8_quant supports both dynamic and static quant: + # * dynamic, i_s is None and x_s computed from x. + # * static, i_s is scalar and x_s is i_s. + x_q, x_s, x_zp = ops.onednn_scaled_int8_quant( + x, i_s, i_zp, self.config.input_symmetric) + + m = x.size(0) + n = self.dnnl_handler.n + out = torch.empty((m, n), dtype=x.dtype) + ops.onednn_scaled_mm(self.dnnl_handler, x_q, out, x_s, x_zp, azp_adj, + bias) + + return out + + def _apply_weights_sgl( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + w_q, w_s, _, _, _ = self._get_weight_params(layer) + return torch.ops._C.int8_scaled_mm_with_quant( + x, + w_q, + w_s, + layer.bias_fp32 if bias is not None else None, + x.dtype, + True, + ) diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py index 6ddd4a9ec4233..2f982f96b0d04 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py @@ -25,8 +25,8 @@ class CutlassScaledMMLinearKernel(ScaledMMLinearKernel): def can_implement( cls, c: 
ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]: - if (not current_platform.is_cuda() and not current_platform.is_cpu()): - return False, "CutlassScaledMM requires running on CUDA or CPU." + if not current_platform.is_cuda(): + return False, "CutlassScaledMM requires running on CUDA." return True, None diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index 48a347a8f5611..2897f75b3129e 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -142,6 +142,12 @@ direct_register_custom_op( ) +def check_cpu_sgl_kernel(n: int, k: int, dtype: torch.dtype): + return (torch._C._cpu._is_amx_tile_supported() + and (dtype in (torch.bfloat16, torch.int8)) and k % 32 == 0 + and n % 16 == 0) + + def cpu_unquantized_gemm(layer: torch.nn.Module, x: torch.Tensor, weight: torch.Tensor, From 2461d9e562e5852555c76e0dbed06979f9c6c688 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 21 Aug 2025 11:05:20 +0800 Subject: [PATCH 158/361] [CI/Build] Split out mm processor tests (#23260) Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 15 +++++++++++---- .../{ => processing}/test_tensor_schema.py | 7 +++---- vllm/model_executor/models/cohere2_vision.py | 2 ++ 3 files changed, 16 insertions(+), 8 deletions(-) rename tests/models/multimodal/{ => processing}/test_tensor_schema.py (98%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 745420664010a..5869ae21d5c7e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -545,6 +545,15 @@ steps: commands: - pytest -v -s models/language/pooling -m 'not core_model' +- label: Multi-Modal Processor Test + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py + - pytest -v -s 
models/multimodal/processing/test_tensor_schema.py + - label: Multi-Modal Models Test (Standard) mirror_hardwares: [amdexperimental] torch_nightly: true @@ -554,9 +563,7 @@ steps: commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip freeze | grep -E 'torch' - - pytest -v -s models/multimodal/processing - - pytest -v -s --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/test_tensor_schema.py models/multimodal -m core_model - - pytest -v -s models/multimodal/test_tensor_schema.py -m core_model # Needs mp_method="spawn" + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - label: Multi-Modal Models Test (Extended) 1 @@ -567,7 +574,7 @@ steps: - tests/models/multimodal commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model' + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing - label: Multi-Modal Models Test (Extended) 2 mirror_hardwares: [amdexperimental] diff --git a/tests/models/multimodal/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py similarity index 98% rename from tests/models/multimodal/test_tensor_schema.py rename to tests/models/multimodal/processing/test_tensor_schema.py index 143b4c8fc8c49..79164f02c3398 100644 --- a/tests/models/multimodal/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -24,9 +24,9 @@ from vllm.utils import GiB_bytes, is_list_of, set_default_torch_num_threads from vllm.v1.core.kv_cache_utils import get_kv_cache_config from vllm.v1.engine.core 
import EngineCore as V1EngineCore -from ...conftest import VllmRunner -from ..registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS -from ..utils import dummy_hf_overrides +from ....conftest import VllmRunner +from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS +from ...utils import dummy_hf_overrides ARCH_TO_SKIP = { "MolmoForCausalLM": "incompatible requirements", @@ -147,7 +147,6 @@ def get_model_id_to_test( return filtered_results -@pytest.mark.core_model @pytest.mark.parametrize( "model_arch, model_id", get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys())) diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index fca1aee835b89..179cc2af8eb3f 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -170,6 +170,8 @@ class Cohere2VisionProcessingInfo(BaseProcessingInfo): # The current implementation of get_number_of_image_patches # is incorrect, so we patch it here. + # TODO: Revert once + # https://github.com/huggingface/transformers/pull/40312 is released. 
# return image_processor.get_number_of_image_patches(image_height, # image_width, {}) From 3663870c72da246d81d8bd8f5c059890fb3f3f5d Mon Sep 17 00:00:00 2001 From: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com> Date: Thu, 21 Aug 2025 06:08:51 +0300 Subject: [PATCH 159/361] [V1][Mamba1] - Full CUDA and Piecewise CUDA Graphs Support (#23035) Signed-off-by: asafg Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com> Co-authored-by: asafg --- docs/usage/v1_guide.md | 2 +- .../models/language/generation/test_hybrid.py | 20 ++---- vllm/config/compilation.py | 1 + .../layers/mamba/mamba_mixer.py | 66 ++++++++++++++++--- vllm/model_executor/models/jamba.py | 8 ++- vllm/model_executor/models/mamba.py | 7 +- vllm/v1/attention/backends/mamba1_attn.py | 37 +++++------ vllm/v1/attention/backends/mamba2_attn.py | 45 ++----------- vllm/v1/attention/backends/mamba_attn.py | 55 ++++++++++++++++ 9 files changed, 154 insertions(+), 87 deletions(-) create mode 100644 vllm/v1/attention/backends/mamba_attn.py diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 54af970ea842d..9bf0c5842c6be 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -107,7 +107,7 @@ to enable simultaneous generation and embedding using the same engine instance i #### Mamba Models Models using selective state-space mechanisms instead of standard transformer attention are supported. -Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. Please note that these models currently require disabling prefix caching in V1. Additionally, Mamba-1 models require `enforce_eager=True`. +Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. Please note that these models currently require disabling prefix caching in V1. 
Models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`, `Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). Please note that diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index aee0a50336c09..f8c0eaa8cf3a2 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -54,16 +54,14 @@ V1_SUPPORTED_MODELS = [ "tiiuae/Falcon-H1-0.5B-Base", ] +FULL_CUDA_GRAPH_MODELS = [ + "ai21labs/Jamba-tiny-dev", + "Zyphra/Zamba2-1.2B-instruct", +] + # Avoid OOM MAX_NUM_SEQS = 4 -# Once we add support for FCG in Mamba1, this list will be removed and tests -# all test cases will use enforce_eager=False -ENFORCE_EAGER_MODELS_V1 = [ - "state-spaces/mamba-130m-hf", - "ai21labs/Jamba-tiny-dev", -] - @pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS) @pytest.mark.parametrize("max_tokens", [64]) @@ -101,19 +99,13 @@ def test_models( example_prompts, max_tokens, num_logprobs) if model in V1_SUPPORTED_MODELS: - enforce_eager = False with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") if model in HYBRID_MODELS: # required due to reorder_batch behaviour m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") - - if model in ENFORCE_EAGER_MODELS_V1: - enforce_eager = True - with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS, - enforce_eager=enforce_eager, enable_prefix_caching=False) as vllm_model: vllm_v1_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) @@ -373,7 +365,7 @@ def test_distributed_correctness( ) -@pytest.mark.parametrize("model", ["Zyphra/Zamba2-1.2B-instruct"]) +@pytest.mark.parametrize("model", FULL_CUDA_GRAPH_MODELS) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) def test_full_cuda_graph( diff --git a/vllm/config/compilation.py 
b/vllm/config/compilation.py index 56a2183f8e2c1..c654485f4fe9c 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -336,6 +336,7 @@ class CompilationConfig: "vllm.unified_attention", "vllm.unified_attention_with_output", "vllm.mamba_mixer2", + "vllm.mamba_mixer", ] def compute_hash(self) -> str: diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index 3c7322260df43..a24e72778b34b 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -27,6 +27,8 @@ from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( selective_scan_fn, selective_state_update) from vllm.model_executor.models.mamba_cache import MambaCacheParams from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform +from vllm.utils import direct_register_custom_op from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionMetadata @@ -183,22 +185,26 @@ class MambaMixer(MambaBase, CustomOp): def forward(self, hidden_states: torch.Tensor, + output: torch.Tensor, mamba_cache_params: Optional[MambaCacheParams] = None): if not envs.VLLM_USE_V1: - return CustomOp.forward(self, hidden_states, mamba_cache_params) + CustomOp.forward(self, hidden_states, output, mamba_cache_params) else: - return self.forward_cuda( + torch.ops.vllm.mamba_mixer( hidden_states, - mamba_cache_params, + output, + self.prefix, ) def forward_native(self, hidden_states: torch.Tensor, + output: torch.Tensor, mamba_cache_params: Optional[MambaCacheParams] = None): pass def forward_cuda(self, hidden_states: torch.Tensor, + output: torch.Tensor, mamba_cache_params: Optional[MambaCacheParams] = None): """ Run the Mamba-1 SSM pipeline. 
@@ -237,6 +243,7 @@ class MambaMixer(MambaBase, CustomOp): conv_state = self_kv_cache[0].transpose(-1, -2) ssm_state = self_kv_cache[1] has_initial_states = mamba1_metadata.has_initial_states + num_padded_decodes = mamba1_metadata.num_padded_decodes else: assert isinstance(attn_metadata, AttentionMetadata) assert mamba_cache_params is not None @@ -248,6 +255,7 @@ class MambaMixer(MambaBase, CustomOp): has_initial_states = None if context_lens_tensor is not None: has_initial_states = context_lens_tensor > 0 + num_padded_decodes = attn_metadata.num_decode_tokens # 1. Gated MLP's linear projection projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1) @@ -267,6 +275,7 @@ class MambaMixer(MambaBase, CustomOp): num_decodes = attn_metadata.num_decode_tokens # token count (=request) has_prefill = num_prefill_tokens > 0 has_decode = num_decode_tokens > 0 + num_actual_tokens = num_prefill_tokens + num_decode_tokens prefill_decode_split = split_batch_to_prefill_and_decode( hidden_states_BC, @@ -278,6 +287,7 @@ class MambaMixer(MambaBase, CustomOp): num_decode_tokens, num_prefills, num_decodes, + num_padded_decodes, ) hidden_states_BC_p = prefill_decode_split.hidden_states_BC_p hidden_states_BC_d = prefill_decode_split.hidden_states_BC_d @@ -371,7 +381,7 @@ class MambaMixer(MambaBase, CustomOp): else: out = self.out_proj(scan_outputs_combined.transpose(-2, -1))[0] - return out + output[:num_actual_tokens] = out def get_state_dtype(self) -> tuple[torch.dtype]: assert self.model_config is not None @@ -421,18 +431,27 @@ def split_batch_to_prefill_and_decode( num_decode_tokens: int, num_prefills: int, num_decodes: int, + num_padded_decodes: int, ) -> PrefillDecodeSplit: + num_actual_tokens = num_prefill_tokens + num_padded_decodes + if envs.VLLM_USE_V1: # In v1, decode tokens come first, then prefill tokens. 
hidden_states_BC_d, hidden_states_BC_p = torch.split( - hidden_states_BC, [num_decode_tokens, num_prefill_tokens], dim=-1) - gate_d, gate_p = torch.split(gate, - [num_decode_tokens, num_prefill_tokens], + hidden_states_BC[..., :num_actual_tokens], + [num_padded_decodes, num_prefill_tokens], + dim=-1) + gate_d, gate_p = torch.split(gate[..., :num_actual_tokens], + [num_padded_decodes, num_prefill_tokens], dim=-1) + + # num_padded_decodes accounts for CUDA graph padding when applicable state_indices_tensor_d, state_indices_tensor_p = torch.split( - state_indices_tensor, [num_decodes, num_prefills], dim=0) + state_indices_tensor[:num_padded_decodes + num_prefills], + [num_padded_decodes, num_prefills], + dim=0) query_start_loc_p = (query_start_loc[-num_prefills - 1:] - - num_decodes if num_prefills > 0 else None) + num_padded_decodes if num_prefills > 0 else None) has_initial_states_p = has_initial_states[-num_prefills:] if ( has_initial_states is not None and num_prefills > 0) else None else: @@ -459,3 +478,32 @@ def split_batch_to_prefill_and_decode( query_start_loc_p=query_start_loc_p, has_initial_states_p=has_initial_states_p, ) + + +def mamba_mixer( + hidden_states: torch.Tensor, + output: torch.Tensor, + layer_name: str, +) -> None: + forward_context: ForwardContext = get_forward_context() + self = forward_context.no_compile_layers[layer_name] + self.forward_cuda(hidden_states=hidden_states, + output=output, + mamba_cache_params=None) + + +def mamba_mixer_fake( + hidden_states: torch.Tensor, + output: torch.Tensor, + layer_name: str, +) -> None: + return + + +direct_register_custom_op( + op_name="mamba_mixer", + op_func=mamba_mixer, + mutates_args=["output"], + fake_impl=mamba_mixer_fake, + dispatch_key=current_platform.dispatch_key, +) diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 0b32d6f256590..3c1a0b68df56e 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -10,6 +10,7 
@@ from transformers import JambaConfig from vllm import envs from vllm.attention.layer import Attention +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group @@ -154,10 +155,10 @@ class JambaMambaDecoderLayer(nn.Module): hidden_states, residual = self.input_layernorm( hidden_states, residual) - hidden_states = self.mamba(hidden_states, mamba_cache_params) + output = torch.empty_like(hidden_states) + self.mamba(hidden_states, output, mamba_cache_params) # Fully Connected - hidden_states, residual = self.pre_ff_layernorm( - hidden_states, residual) + hidden_states, residual = self.pre_ff_layernorm(output, residual) hidden_states = self.feed_forward(hidden_states) return hidden_states, residual @@ -278,6 +279,7 @@ ALL_DECODER_LAYER_TYPES = { } +@support_torch_compile class JambaModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index f4aaf0c6f467c..f02499a4f96b5 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -9,6 +9,7 @@ from torch import nn from transformers import MambaConfig from vllm import envs +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.layernorm import RMSNorm @@ -81,10 +82,12 @@ class MambaDecoderLayer(nn.Module): else: hidden_states, residual = self.norm(hidden_states, residual) - hidden_states = self.mixer(hidden_states, mamba_cache_params) - return hidden_states, residual + output = torch.empty_like(hidden_states) + self.mixer(hidden_states, output, mamba_cache_params) + return output, residual +@support_torch_compile class 
MambaModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py index 6cdc509083ae9..97a1aa86dda0d 100644 --- a/vllm/v1/attention/backends/mamba1_attn.py +++ b/vllm/v1/attention/backends/mamba1_attn.py @@ -2,16 +2,16 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import ClassVar, Optional +from typing import Optional import torch from vllm.attention.backends.abstract import AttentionBackend -from vllm.config import VllmConfig -from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, - CommonAttentionMetadata, +from vllm.attention.backends.utils import PAD_SLOT_ID +from vllm.v1.attention.backends.mamba_attn import ( + BaseMambaAttentionMetadataBuilder) +from vllm.v1.attention.backends.utils import (CommonAttentionMetadata, split_decodes_and_prefills) -from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec class Mamba1AttentionBackend(AttentionBackend): @@ -31,24 +31,11 @@ class Mamba1AttentionMetadata: num_prefill_tokens: int num_decodes: int num_decode_tokens: int + num_padded_decodes: int class Mamba1AttentionMetadataBuilder( - AttentionMetadataBuilder[Mamba1AttentionMetadata]): - reorder_batch_threshold: ClassVar[int] = 1 - - def __init__( - self, - kv_cache_spec: AttentionSpec, - vllm_config: VllmConfig, - device: torch.device, - layer_names: list[str], - ): - assert isinstance(kv_cache_spec, MambaSpec) - self.kv_cache_spec = kv_cache_spec - self.device = device - self.vllm_config = vllm_config - self.layer_names = layer_names + BaseMambaAttentionMetadataBuilder[Mamba1AttentionMetadata]): def build( self, @@ -67,9 +54,18 @@ class Mamba1AttentionMetadataBuilder( decode_threshold=1)) has_initial_states = None + padded_decodes = num_decodes if num_prefills > 0: has_initial_states = context_lens_tensor > 0 + elif (num_decodes > 0 and num_decodes <= 
self.decode_cudagraph_max_bs + and self.compilation_config.full_cuda_graph): + state_indices_for_decode = state_indices_tensor[:num_decodes] + padded_decodes = self.vllm_config.pad_for_cudagraph(num_decodes) + self.state_indices_tensor[:num_decodes].copy_( + state_indices_for_decode, non_blocking=True) + state_indices_tensor = self.state_indices_tensor[:padded_decodes] + state_indices_tensor[num_decodes:] = PAD_SLOT_ID return Mamba1AttentionMetadata( query_start_loc=query_start_loc, @@ -80,4 +76,5 @@ class Mamba1AttentionMetadataBuilder( num_prefill_tokens=num_prefill_tokens, num_decodes=num_decodes, num_decode_tokens=num_decode_tokens, + num_padded_decodes=padded_decodes, ) diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py index ace078e2b27c6..ed30884fdbc94 100644 --- a/vllm/v1/attention/backends/mamba2_attn.py +++ b/vllm/v1/attention/backends/mamba2_attn.py @@ -2,18 +2,18 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from dataclasses import dataclass -from typing import ClassVar, Optional +from typing import Optional import torch from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.config import VllmConfig -from vllm.v1.attention.backends.utils import (AttentionCGSupport, - AttentionMetadataBuilder, - CommonAttentionMetadata, +from vllm.v1.attention.backends.mamba_attn import ( + BaseMambaAttentionMetadataBuilder) +from vllm.v1.attention.backends.utils import (CommonAttentionMetadata, split_decodes_and_prefills) -from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec +from vllm.v1.kv_cache_interface import AttentionSpec def _query_start_loc_to_chunk_indices_offsets(query_start_loc: torch.Tensor, @@ -88,29 +88,14 @@ class Mamba2AttentionMetadata: class Mamba2AttentionMetadataBuilder( - AttentionMetadataBuilder[Mamba2AttentionMetadata]): - cudagraph_support: ClassVar[AttentionCGSupport] = \ - 
AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE - - reorder_batch_threshold: ClassVar[int] = 1 + BaseMambaAttentionMetadataBuilder[Mamba2AttentionMetadata]): def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): - assert isinstance(kv_cache_spec, MambaSpec) - self.kv_cache_spec = kv_cache_spec + super().__init__(kv_cache_spec, layer_names, vllm_config, device) self.chunk_size = vllm_config.model_config.get_mamba_chunk_size() - self.vllm_config = vllm_config - self.compilation_config = vllm_config.compilation_config assert self.chunk_size is not None, ( "chunk_size needs to be set in the model config for Mamba2 models") - self.decode_cudagraph_max_bs = min( - self.vllm_config.scheduler_config.max_num_seqs, - self.compilation_config.max_capture_size) - self.state_indices_tensor = torch.empty( - (self.decode_cudagraph_max_bs, ), - dtype=torch.int32, - device=device, - ) def build(self, common_prefix_len: int, @@ -187,19 +172,3 @@ class Mamba2AttentionMetadataBuilder( state_indices_tensor=state_indices_tensor, ) return attn_metadata - - def build_for_cudagraph_capture( - self, common_attn_metadata: CommonAttentionMetadata): - """ - This method builds the metadata for full cudagraph capture. - Currently, only decode is supported for full cudagraphs with Mamba. - """ - m = common_attn_metadata - - assert m.num_reqs == m.num_actual_tokens, \ - "Mamba only supports decode-only full CUDAGraph capture. " \ - "Make sure all cudagraph capture sizes <= max_num_seq." 
- - m.max_query_len = 1 # decode-only - - return self.build(0, m) diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py new file mode 100644 index 0000000000000..07ef7cb69a160 --- /dev/null +++ b/vllm/v1/attention/backends/mamba_attn.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import abc +from typing import ClassVar, TypeVar + +import torch + +from vllm.config import VllmConfig +from vllm.v1.attention.backends.utils import (AttentionCGSupport, + AttentionMetadataBuilder, + CommonAttentionMetadata) +from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec + +M = TypeVar("M") + + +class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC): + reorder_batch_threshold: ClassVar[int] = 1 + cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE + + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], + vllm_config: VllmConfig, device: torch.device): + assert isinstance(kv_cache_spec, MambaSpec) + self.kv_cache_spec = kv_cache_spec + self.device = device + self.vllm_config = vllm_config + self.layer_names = layer_names + + self.compilation_config = vllm_config.compilation_config + self.decode_cudagraph_max_bs = min( + self.vllm_config.scheduler_config.max_num_seqs, + self.compilation_config.max_capture_size) + self.state_indices_tensor = torch.empty( + (self.decode_cudagraph_max_bs, ), + dtype=torch.int32, + device=device, + ) + + def build_for_cudagraph_capture( + self, common_attn_metadata: CommonAttentionMetadata) -> M: + """ + This method builds the metadata for full cudagraph capture. + Currently, only decode is supported for full cudagraphs with Mamba. + """ + m = common_attn_metadata + + assert m.num_reqs == m.num_actual_tokens, \ + "Mamba only supports decode-only full CUDAGraph capture. 
" \ + "Make sure all cudagraph capture sizes <= max_num_seq." + + m.max_query_len = 1 # decode-only + + return self.build(0, m) \ No newline at end of file From f94bf9b924afe2e720b864590c9798b911e77e66 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 20 Aug 2025 23:09:39 -0400 Subject: [PATCH 160/361] [Compile] Fix Compile Warning SM100 Cutlass MLA (#23287) Signed-off-by: yewentao256 --- csrc/attention/mla/sm100_cutlass_mla_kernel.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu index e0e95d06290df..6dd6f269f3dc9 100644 --- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu +++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu @@ -167,7 +167,7 @@ typename T::Fmha::Arguments args_from_options( // TODO(trevor-m): Change split_kv back to -1 when // https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will // perform worse with larger context length and smaller batch sizes. - num_kv_splits, // split_kv + static_cast(num_kv_splits), // split_kv nullptr, // is_var_split_kv }; // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute @@ -264,7 +264,7 @@ int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_ba // Assumes device 0 when getting sm_count. arguments.hw_info.sm_count = sm_count <= 0 ? 
cutlass::KernelHardwareInfo::query_device_multiprocessor_count(/*device_id=*/0) : sm_count; - arguments.split_kv = num_kv_splits; + arguments.split_kv = static_cast(num_kv_splits); MlaSm100Type::Fmha::set_split_kv(arguments); return MlaSm100Type::Fmha::get_workspace_size(arguments); From 655a09f6538e6b09af23771dcc4fcebd72a15b23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=A8=E5=A5=87=28yann=20qi=29?= <51905299+yannqi@users.noreply.github.com> Date: Thu, 21 Aug 2025 12:08:52 +0800 Subject: [PATCH 161/361] [Model][VLM] Support R-4B Model (#23246) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: yannqi Signed-off-by: 杨奇(yann qi) <51905299+yannqi@users.noreply.github.com> Signed-off-by: Cyrus Leung Co-authored-by: yannqiyang Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung --- docs/models/supported_models.md | 1 + examples/offline_inference/vision_language.py | 23 ++++ .../vision_language_multi_image.py | 34 ++++++ .../multimodal/processing/test_common.py | 1 + tests/models/registry.py | 2 + vllm/model_executor/models/registry.py | 1 + vllm/model_executor/models/rvl.py | 103 ++++++++++++++++++ 7 files changed, 165 insertions(+) create mode 100644 vllm/model_executor/models/rvl.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 7308d0010690a..831bfb1e939e6 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -652,6 +652,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + IE+ + VE+ | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + IE+ + VE+ | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-7B` | | ✅︎ | ✅︎ | +| `RForConditionalGeneration` | R-VL-4B | T + IE+ | `YannQi/R-4B` | | ✅︎ | ✅︎ | | `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ | | `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ | | `Step3VLForConditionalGeneration` | Step3-VL | T + I+ | `stepfun-ai/step3` | | ✅︎ | ✅︎ | diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 88bbbfdfbd188..e7a7a30dd31a6 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1436,6 +1436,28 @@ def run_qwen2_5_omni(questions: list[str], modality: str): ) +# R-4B +def run_r_vl(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + model_name = "YannQi/R-4B" + + prompts = [ + f"<|im_start|>user \n{question}<|im_end|><|im_start|>assistant\n" + for question in questions + ] + + engine_args = EngineArgs( + model=model_name, + max_model_len=16384, + limit_mm_per_prompt={modality: 1}, + ) + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # SkyworkR1V def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1622,6 +1644,7 @@ model_example_map = { "qwen2_vl": run_qwen2_vl, "qwen2_5_vl": run_qwen2_5_vl, "qwen2_5_omni": run_qwen2_5_omni, + "rvl": run_r_vl, "skywork_chat": run_skyworkr1v, "smolvlm": run_smolvlm, "step3": run_step3, diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index eabd9453f3c51..d9242efa85470 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -992,6 +992,39 @@ def load_qwen2_5_vl(question: str, 
image_urls: list[str]) -> ModelRequestData: ) +def load_r_vl(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "YannQi/R-4B" + engine_args = EngineArgs( + model=model_name, + max_model_len=16384, + max_num_seqs=16, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] + + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) + + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" @@ -1193,6 +1226,7 @@ model_example_map = { "qwen_vl_chat": load_qwen_vl_chat, "qwen2_vl": load_qwen2_vl, "qwen2_5_vl": load_qwen2_5_vl, + "rvl": load_r_vl, "smolvlm": load_smolvlm, "step3": load_step3, "tarsier": load_tarsier, diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 02aecfad8281d..adc8b2510d677 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -316,6 +316,7 @@ def _test_processing_correctness_one( "Qwen/Qwen2.5-VL-3B-Instruct", "Qwen/Qwen2-Audio-7B-Instruct", "Qwen/Qwen2.5-Omni-3B", + "YannQi/R-4B", "Skywork/Skywork-R1V-38B", "HuggingFaceTB/SmolVLM2-2.2B-Instruct", "stepfun-ai/step3", diff --git a/tests/models/registry.py b/tests/models/registry.py index 6e6acfb8cd228..4f69f90b6aae1 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -489,6 +489,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { max_model_len=4096), "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B"), 
"Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"), # noqa: E501 + "RForConditionalGeneration": _HfExamplesInfo("YannQi/R-4B", + trust_remote_code=True), "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B", trust_remote_code=True), "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct", # noqa: E501 diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 78ef270598b8e..39a3e425a46df 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -217,6 +217,7 @@ _MULTIMODAL_MODELS = { "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"), "SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"), # noqa: E501 "KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"), + "RForConditionalGeneration": ("rvl", "RForConditionalGeneration"), "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501 "Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"), "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"), diff --git a/vllm/model_executor/models/rvl.py b/vllm/model_executor/models/rvl.py new file mode 100644 index 0000000000000..efdb010046634 --- /dev/null +++ b/vllm/model_executor/models/rvl.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Mapping + +import torch +import torch.nn as nn +from transformers.activations import GELUActivation + +from vllm.config import VllmConfig +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalDataDict + +from .llava_next import (LlavaDummyInputsBuilder, LlavaNextMultiModalProcessor, + LlavaNextProcessingInfo) +from .llava_onevision import LlavaOnevisionForConditionalGeneration +from .utils 
import WeightsMapper + + +class RVLProcessingInfo(LlavaNextProcessingInfo): + + def get_hf_config(self): + return self.ctx.get_hf_config() + + def get_hf_processor(self, **kwargs: object): + return self.ctx.get_hf_processor(**kwargs) + + +class RVLDummyInputsBuilder(LlavaDummyInputsBuilder[RVLProcessingInfo]): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + image_token = "<image>" + + return image_token * num_images + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + + target_width, target_height = ( + self.info.get_image_size_with_most_features()) + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + } + + +class RVLMultiModalProjector(nn.Module): + + def __init__(self, config): + super().__init__() + self.pre_norm = nn.LayerNorm(config.vision_config.hidden_size, + eps=1e-06) + self.linear_1 = nn.Linear( + config.vision_config.hidden_size, + config.text_config.hidden_size, + bias=True, + ) + self.act = GELUActivation() + self.linear_2 = nn.Linear( + config.text_config.hidden_size, + config.text_config.hidden_size, + bias=True, + ) + + def forward(self, image_feature: torch.Tensor) -> torch.Tensor: + image_feature = self.pre_norm(image_feature) + hidden_states = self.linear_1(image_feature) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + + return hidden_states + + +@MULTIMODAL_REGISTRY.register_processor( + LlavaNextMultiModalProcessor, + info=RVLProcessingInfo, + dummy_inputs=RVLDummyInputsBuilder, +) +class RForConditionalGeneration(LlavaOnevisionForConditionalGeneration): + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + # mapping for new names in checkpoint saved after transformers + # v4.52 + "model.language_model.": "language_model.model.", + "model.vision_tower.": "vision_tower.", + 
"model.multi_modal_projector.": "multi_modal_projector.", + "model.image_newline": "image_newline", + "lm_head.": "language_model.lm_head.", + }) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__(vllm_config=vllm_config, prefix=prefix) + config = vllm_config.model_config.hf_config + self.multi_modal_projector = RVLMultiModalProjector(config) From 8993073dc1a7e2d31eda85812b76789046ae7c28 Mon Sep 17 00:00:00 2001 From: QiliangCui Date: Thu, 21 Aug 2025 04:15:20 +0000 Subject: [PATCH 162/361] [CI] Delete images older than 24h. (#23291) Signed-off-by: Qiliang Cui --- .buildkite/scripts/tpu/cleanup_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/scripts/tpu/cleanup_docker.sh b/.buildkite/scripts/tpu/cleanup_docker.sh index 209d9c4341cdd..740d81fb39bb0 100755 --- a/.buildkite/scripts/tpu/cleanup_docker.sh +++ b/.buildkite/scripts/tpu/cleanup_docker.sh @@ -17,7 +17,7 @@ if [ "$disk_usage" -gt "$threshold" ]; then # Remove dangling images (those that are not tagged and not used by any container) docker image prune -f # Remove unused volumes / force the system prune for old images as well. - docker volume prune -f && docker system prune --force --filter "until=72h" --all + docker volume prune -f && docker system prune --force --filter "until=24h" --all echo "Docker images and volumes cleanup completed." else echo "Disk usage is below $threshold%. No cleanup needed." 
From f64ee61d9e7014a5f230a8347186b952dbe483de Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 21 Aug 2025 00:21:05 -0400 Subject: [PATCH 163/361] [CI] Block the cu126 wheel build while broken (#23285) Signed-off-by: mgoin --- .buildkite/release-pipeline.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index e20ce54ca795a..f96c38bf57db7 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -27,7 +27,12 @@ steps: env: DOCKER_BUILDKIT: "1" + - block: "Build CUDA 12.6 wheel" + key: block-build-cu126-wheel + depends_on: ~ + - label: "Build wheel - CUDA 12.6" + depends_on: block-build-cu126-wheel id: build-wheel-cuda-12-6 agents: queue: cpu_queue_postmerge From f571ff8eb6d9117c6a418f7f925921968dff8ac8 Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Wed, 20 Aug 2025 21:28:32 -0700 Subject: [PATCH 164/361] [Sampler] Support returning final logprobs (#22387) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> Co-authored-by: Nick Hill Co-authored-by: Woosuk Kwon --- docs/usage/v1_guide.md | 7 ++- tests/v1/sample/test_logprobs.py | 10 ++-- vllm/config/__init__.py | 30 ++++++---- vllm/engine/arg_utils.py | 1 + vllm/v1/sample/ops/topk_topp_sampler.py | 65 ++++++++++---------- vllm/v1/sample/sampler.py | 79 +++++++++++++++++++------ vllm/v1/sample/tpu/sampler.py | 2 +- 7 files changed, 125 insertions(+), 69 deletions(-) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 9bf0c5842c6be..b89768913681e 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -154,12 +154,15 @@ differences compared to V0: ##### Logprobs Calculation -Logprobs in V1 are now returned immediately once computed from the model’s raw output (i.e. +By default, logprobs in V1 are now returned immediately once computed from the model’s raw output (i.e. 
before applying any logits post-processing such as temperature scaling or penalty adjustments). As a result, the returned logprobs do not reflect the final adjusted probabilities used during sampling. -Support for logprobs with post-sampling adjustments is in progress and will be added in future updates. +You can adjust this behavior by setting the `--logprobs-mode` flag. +Four modes are supported: `raw_logprobs` (default), `processed_logprobs`, `raw_logits`, `processed_logits`. +Raw means the values before applying any logit processors, like bad words. +Processed means the values after applying all processors, including temperature and top_k/top_p. ##### Prompt Logprobs with Prefix Caching diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 8bd142e87b06e..e835c029634ce 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -456,9 +456,7 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch): assert len(logprob) == vocab_size -@pytest.mark.parametrize( - "logprobs_mode", - ["raw_logprobs", "raw_logits", "processed_logprobs", "processed_logits"]) +@pytest.mark.parametrize("logprobs_mode", list(LogprobsMode)) def test_logprobs_mode(logprobs_mode: LogprobsMode, monkeypatch: pytest.MonkeyPatch): """Test with LLM engine with different logprobs_mode. 
@@ -487,12 +485,14 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode, for logprobs in output.logprobs: for token_id in logprobs: logprob = logprobs[token_id] - if "logprobs" in logprobs_mode: + if logprobs_mode in (LogprobsMode.RAW_LOGPROBS, + LogprobsMode.PROCESSED_LOGPROBS): assert logprob.logprob <= 0 if logprob.logprob > 0: positive_values = positive_values + 1 total_token_with_logprobs = total_token_with_logprobs + 1 assert total_token_with_logprobs >= len(results[0].outputs) - if "logits" in logprobs_mode: + if logprobs_mode in (LogprobsMode.RAW_LOGITS, + LogprobsMode.PROCESSED_LOGITS): assert positive_values > 0 del llm diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 959f111ced22e..2973cb92d195b 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -257,11 +257,16 @@ def is_init_field(cls: ConfigType, name: str) -> bool: TokenizerMode = Literal["auto", "slow", "mistral", "custom"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] -LogprobsMode = Literal["raw_logprobs", "raw_logits", "processed_logprobs", - "processed_logits"] MMEncoderTPMode = Literal["weights", "data"] +class LogprobsMode(enum.Enum): + RAW_LOGITS = "raw_logits" + RAW_LOGPROBS = "raw_logprobs" + PROCESSED_LOGITS = "processed_logits" + PROCESSED_LOGPROBS = "processed_logprobs" + + @config @dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class ModelConfig: @@ -363,12 +368,13 @@ class ModelConfig: specified in `SamplingParams`. The default value comes the default for the OpenAI Chat Completions API. -1 means no cap, i.e. all (output_length * vocab_size) logprobs are allowed to be returned and it may cause OOM.""" - logprobs_mode: LogprobsMode = "raw_logprobs" + logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS """Indicates the content returned in the logprobs and prompt_logprobs. Supported mode: 1) raw_logprobs, 2) processed_logprobs, 3) raw_logits, 4) processed_logits. 
- Raw means the values before applying logit processors, like bad words. - Processed means the values after applying such processors. + Raw means the values before applying any logit processors, like bad words. + Processed means the values after applying all processors, including + temperature and top_k/top_p. """ disable_sliding_window: bool = False """Whether to disable sliding window. If True, we will disable the sliding @@ -2586,7 +2592,7 @@ class MultiModalConfig: skip_mm_profiling: bool = False """ - When enabled, skips multimodal memory profiling and only profiles with + When enabled, skips multimodal memory profiling and only profiles with language backbone model during engine initialization. This reduces engine startup time but shifts the responsibility to users for @@ -2649,24 +2655,24 @@ class PoolerConfig: ## for embeddings models normalize: Optional[bool] = None """ - Whether to normalize the embeddings outputs. + Whether to normalize the embeddings outputs. """ dimensions: Optional[int] = None """ - Reduce the dimensions of embeddings if model + Reduce the dimensions of embeddings if model support matryoshka representation. """ ## for classification models activation: Optional[bool] = None """ - Whether to apply activation function to the classification outputs. + Whether to apply activation function to the classification outputs. """ ## for reward models softmax: Optional[bool] = None """ - Whether to apply softmax to the reward outputs. + Whether to apply softmax to the reward outputs. """ step_tag_id: Optional[int] = None """ @@ -2692,9 +2698,9 @@ class PoolerConfig: max_embed_len: Optional[int] = None """ - Maximum input length allowed for embedding generation. When set, allows + Maximum input length allowed for embedding generation. When set, allows inputs longer than max_embed_len to be accepted for embedding models. 
- This parameter enables accepting long inputs without requiring + This parameter enables accepting long inputs without requiring VLLM_ALLOW_LONG_MAX_MODEL_LEN environment variable. When an input exceeds max_embed_len, it will be handled according to the original max_model_len validation logic. Defaults to None (i.e. set to max_model_len). diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f3afc015f669c..b0f50b4429a82 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -516,6 +516,7 @@ class EngineArgs: model_group.add_argument("--max-logprobs", **model_kwargs["max_logprobs"]) model_group.add_argument("--logprobs-mode", + choices=[f.value for f in LogprobsMode], **model_kwargs["logprobs_mode"]) model_group.add_argument("--disable-sliding-window", **model_kwargs["disable_sliding_window"]) diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index e0434c8f3d713..7bd4a5a380ac0 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -8,6 +8,7 @@ import torch.nn as nn from packaging import version from vllm import envs +from vllm.config import LogprobsMode from vllm.logger import init_logger from vllm.platforms import current_platform @@ -28,9 +29,16 @@ class TopKTopPSampler(nn.Module): Implementations may update the logits tensor in-place. 
""" - def __init__(self): + def __init__( + self, + logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS) -> None: super().__init__() - if current_platform.is_cuda(): + self.logprobs_mode = logprobs_mode + # flashinfer optimization does not apply if intermediate + # logprobs/logits after top_k/top_p need to be returned + if logprobs_mode not in (LogprobsMode.PROCESSED_LOGITS, + LogprobsMode.PROCESSED_LOGPROBS + ) and current_platform.is_cuda(): if is_flashinfer_available: flashinfer_version = flashinfer.__version__ if version.parse(flashinfer_version) < version.parse("0.2.3"): @@ -63,10 +71,12 @@ class TopKTopPSampler(nn.Module): "native implementation of top-p & top-k sampling. For the " "best performance, please install FlashInfer.") self.forward = self.forward_native - elif current_platform.is_tpu(): - self.forward = self.forward_tpu else: self.forward = self.forward_native + if current_platform.is_tpu(): + self.apply_top_k_top_p = apply_top_k_top_p_tpu + else: + self.apply_top_k_top_p = apply_top_k_top_p def forward_native( self, @@ -74,15 +84,20 @@ class TopKTopPSampler(nn.Module): generators: dict[int, torch.Generator], k: Optional[torch.Tensor], p: Optional[torch.Tensor], - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """ PyTorch-native implementation of top-k and top-p sampling. The logits tensor may be updated in-place. 
""" - logits = apply_top_k_top_p(logits, k, p) + logits = self.apply_top_k_top_p(logits, k, p) + logits_to_return = None + if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: + logits_to_return = logits + elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: + logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32) probs = logits.softmax(dim=-1, dtype=torch.float32) - return random_sample(probs, generators) + return random_sample(probs, generators), logits_to_return def forward_cuda( self, @@ -90,34 +105,24 @@ class TopKTopPSampler(nn.Module): generators: dict[int, torch.Generator], k: Optional[torch.Tensor], p: Optional[torch.Tensor], - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """More optimized implementation for top-k and top-p sampling.""" - if k is None and p is None: - # We prefer `random_sample` over `flashinfer_sample` when sorting is - # not needed. This is because `random_sample` does not require - # CPU-GPU synchronization while `flashinfer_sample` does. - probs = logits.softmax(dim=-1, dtype=torch.float32) - return random_sample(probs, generators) - if generators: - logger.warning_once("FlashInfer 0.2.3+ does not support " - "per-request generators. Falling back to " - "PyTorch-native implementation.") + # We prefer `random_sample` over `flashinfer_sample` when sorting is + # not needed. This is because `random_sample` does not require + # CPU-GPU synchronization while `flashinfer_sample` does. + if (k is None and p is None) or generators: + if generators: + logger.warning_once("FlashInfer 0.2.3+ does not support " + "per-request generators. Falling back to " + "PyTorch-native implementation.") return self.forward_native(logits, generators, k, p) + assert self.logprobs_mode not in ( + LogprobsMode.PROCESSED_LOGITS, LogprobsMode.PROCESSED_LOGPROBS + ), "FlashInfer does not support returning logits/logprobs" # flashinfer sampling functions expect contiguous logits. 
# In flex_attn/triton_attn fp32 inference, logits can be non-contiguous # because of slicing operation in logits_processor. - return flashinfer_sample(logits.contiguous(), k, p, generators) - - def forward_tpu( - self, - logits: torch.Tensor, - generators: dict[int, torch.Generator], - k: Optional[torch.Tensor], - p: Optional[torch.Tensor], - ) -> torch.Tensor: - logits = apply_top_k_top_p_tpu(logits, k, p) - probs = logits.softmax(dim=-1, dtype=torch.float32) - return random_sample(probs, generators) + return flashinfer_sample(logits.contiguous(), k, p, generators), None def apply_top_k_top_p_tpu( diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 82f51298f1b59..70ec8a0c26ddf 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A layer that samples the next tokens from the model's outputs.""" +from typing import Optional + import torch import torch.nn as nn @@ -18,10 +20,50 @@ _SAMPLING_EPS = 1e-5 class Sampler(nn.Module): + """ + A layer that samples the next tokens from the model's outputs + with the following steps in order: - def __init__(self, logprobs_mode: LogprobsMode = "raw_logprobs"): + 1. If logprobs are requested: + a) If `logprobs_mode` is `raw_logprobs`, compute logprobs + as the final logprobs to return. + b) If `logprobs_mode` is `raw_logits`, clone the logits + as the final logprobs to return. + 2. Convert logits to float32. + 3. Apply allowed token ids whitelist. + 4. Apply bad words exclusion. + 5. Apply logit processors which are not argmax-invariant, + i.e. that can impact greedy sampling. + a) Min tokens processor + b) Logit bias processor + 6. Apply penalties + a) Repetition penalty + b) Frequency penalty + c) Presence penalty + 7. Sample the next tokens. `sample` method performs the following steps: + a) If not `all_random`, perform greedy sampling. 
If `all_greedy`, + return the greedily sampled tokens and final logprobs if requested. + b) Apply temperature. + c) Apply logit processors which are argmax-invariant, by default + the min_p processor. + d) Apply top_k and/or top_p. + e) Sample the next tokens with the probability distribution. + f) If `all_random` or temperature >= epsilon (1e-5), return the + randomly sampled tokens and final logprobs if requested. Else, + return the greedily sampled tokens and logprobs if requested. + 8. Gather the logprobs of the top `max_num_logprobs` and sampled token + (if requested). Note that if the sampled token is within the top + `max_num_logprobs`, the logprob will be eventually merged in + `LogprobsProcessor` during output processing. Therefore, the + final output may contain either `max_num_logprobs + 1` or + `max_num_logprobs` logprobs. + 9. Return the final `SamplerOutput`. + """ + + def __init__(self, + logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS): super().__init__() - self.topk_topp_sampler = TopKTopPSampler() + self.topk_topp_sampler = TopKTopPSampler(logprobs_mode) self.pin_memory = is_pin_memory_available() self.logprobs_mode = logprobs_mode @@ -34,13 +76,11 @@ class Sampler(nn.Module): # temperature scaling) for the top-k logprobs. # This is different from the V0 sampler, which uses the logits that # is used for sampling (after penalties and temperature scaling). - # TODO(rob): provide option for logprobs post sampling. - # See https://vllm-dev.slack.com/archives/C07UUL8E61Z/p1735907856007919 # noqa: E501 num_logprobs = sampling_metadata.max_num_logprobs if num_logprobs is not None: - if self.logprobs_mode == "raw_logprobs": + if self.logprobs_mode == LogprobsMode.RAW_LOGPROBS: raw_logprobs = self.compute_logprobs(logits) - elif self.logprobs_mode == "raw_logits": + elif self.logprobs_mode == LogprobsMode.RAW_LOGITS: raw_logprobs = logits.clone() # Use float32 for the logits. 
@@ -57,15 +97,10 @@ class Sampler(nn.Module): # Apply penalties (e.g., min_tokens, freq_penalties). logits = self.apply_penalties(logits, sampling_metadata) - # Get the process logprobs or logits. - if num_logprobs is not None: - if self.logprobs_mode == "processed_logprobs": - raw_logprobs = self.compute_logprobs(logits) - elif self.logprobs_mode == "processed_logits": - raw_logprobs = logits.clone() - # Sample the next token. - sampled = self.sample(logits, sampling_metadata) + sampled, processed_logprobs = self.sample(logits, sampling_metadata) + if processed_logprobs is not None: + raw_logprobs = processed_logprobs # Convert sampled token ids to int64 (long) type to ensure compatibility # with subsequent operations that may use these values as indices. # This conversion is necessary because FlashInfer sampling operations @@ -105,7 +140,7 @@ class Sampler(nn.Module): self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """Sample logits based on sampling metadata. The various logits processing functions called in this method @@ -119,7 +154,13 @@ class Sampler(nn.Module): else: greedy_sampled = self.greedy_sample(logits) if sampling_metadata.all_greedy: - return greedy_sampled + processed_logprobs = None + if sampling_metadata.max_num_logprobs is not None: + if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: + processed_logprobs = logits + elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: + processed_logprobs = self.compute_logprobs(logits) + return greedy_sampled, processed_logprobs assert sampling_metadata.temperature is not None @@ -132,7 +173,7 @@ class Sampler(nn.Module): logits = processor.apply(logits) # Apply top_k and/or top_p. 
- random_sampled = self.topk_topp_sampler( + random_sampled, processed_logprobs = self.topk_topp_sampler( logits, sampling_metadata.generators, sampling_metadata.top_k, @@ -140,7 +181,7 @@ class Sampler(nn.Module): ) if greedy_sampled is None: - return random_sampled + return random_sampled, processed_logprobs sampled = torch.where( sampling_metadata.temperature < _SAMPLING_EPS, @@ -148,7 +189,7 @@ class Sampler(nn.Module): random_sampled, out=greedy_sampled, # Reuse tensor ) - return sampled + return sampled, processed_logprobs def compute_logprobs(self, logits: torch.Tensor) -> torch.Tensor: return logits.log_softmax(dim=-1, dtype=torch.float32) diff --git a/vllm/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py index 2c9f4892bc247..04545d587e4a9 100644 --- a/vllm/v1/sample/tpu/sampler.py +++ b/vllm/v1/sample/tpu/sampler.py @@ -65,7 +65,7 @@ class Sampler(nn.Module): logits = self.apply_min_p(logits, sampling_metadata.min_p) # Apply top_k and/or top_p. - random_sampled = self.topk_topp_sampler( + random_sampled, _ = self.topk_topp_sampler( logits, sampling_metadata.generators, sampling_metadata.top_k, From 0c31e28e9520d96c451cc7f023fd0f0af549766a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 21 Aug 2025 13:03:00 +0800 Subject: [PATCH 165/361] [Bugfix] Fix extra whitespace in strings caused by newline (#23272) Signed-off-by: DarkLight1337 --- benchmarks/benchmark_dataset.py | 6 ++++-- examples/offline_inference/vision_language.py | 15 +++++++-------- vllm/benchmarks/datasets.py | 6 ++++-- vllm/model_executor/model_loader/tpu.py | 11 ++++++----- vllm/model_executor/models/hyperclovax_vision.py | 9 ++++----- vllm/model_executor/models/phi4mm.py | 6 +++--- vllm/transformers_utils/configs/eagle.py | 4 ++-- 7 files changed, 30 insertions(+), 27 deletions(-) diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index e1a856026c4ae..2ea4f9ccaff2b 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ 
-958,8 +958,10 @@ class InstructCoderDataset(HuggingFaceDataset): for i, item in enumerate(self.data): if len(sampled_requests) >= num_requests: break - prompt = f"{item['input']}\n\n{item['instruction']} Just output \ - the code, do not include any explanation." + prompt = ( + f"{item['input']}\n\n{item['instruction']} Just output " + "the code, do not include any explanation." + ) # apply template prompt = tokenizer.apply_chat_template( diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index e7a7a30dd31a6..8d97ba2668263 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -283,8 +283,10 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData: ) prompts = [ - f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ - {question}<|assistant|>" + ( + "<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>" + f"{question}<|assistant|>" + ) for question in questions ] @@ -767,15 +769,13 @@ def run_llava_next_video(questions: list[str], modality: str) -> ModelRequestDat def run_llava_onevision(questions: list[str], modality: str) -> ModelRequestData: if modality == "video": prompts = [ - f"<|im_start|>user