From 947e982ede0e65a5b319e1d2c007b6ae0106398d Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 28 Jul 2025 21:46:39 -0400 Subject: [PATCH 001/224] [Docs] Minimize spacing for supported_hardware.md table (#21779) --- .../quantization/supported_hardware.md | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/docs/features/quantization/supported_hardware.md b/docs/features/quantization/supported_hardware.md index 70a6a499562a3..f53e69ecc6115 100644 --- a/docs/features/quantization/supported_hardware.md +++ b/docs/features/quantization/supported_hardware.md @@ -2,19 +2,26 @@ The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: + + | Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | AWS Neuron | Google TPU | |-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------|--------------| -| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ | -| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ | -| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | -| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ❌ | +| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ | +| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | ❌ | +| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | +| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ❌ | | BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | AQLM | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ 
| ❌ | -| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | ❌ | ❌ | +| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | ❌ | ❌ | - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. - ✅︎ indicates that the quantization method is supported on the specified hardware. From 48b763d6b5c969024a8a5ae30c2bf9a91e8ac032 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 28 Jul 2025 21:47:21 -0400 Subject: [PATCH 002/224] [Refactor] Merge Compressed Tensor FP8 `CompressedTensorsW8A8Fp8MoEMethod` and `CompressedTensorsW8A8Fp8MoECutlassMethod` (#21775) Signed-off-by: yewentao256 --- .../compressed_tensors_moe.py | 389 +++++------------- 1 file changed, 100 insertions(+), 289 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 8f69636dda7bf..17b41e8a1c23c 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -45,7 +45,6 @@ class GPTQMarlinState(Enum): __all__ = [ "CompressedTensorsMoEMethod", "CompressedTensorsW8A8Fp8MoEMethod", - "CompressedTensorsW8A8Fp8MoECutlassMethod", "CompressedTensorsW8A8Int8MoEMethod", "CompressedTensorsWNA16MarlinMoEMethod", "CompressedTensorsWNA16MoEMethod", "CompressedTensorsW4A4MoeMethod" @@ -84,9 +83,8 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant): return CompressedTensorsW4A4MoeMethod() elif (quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant) - or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant)): - return CompressedTensorsW8A8Fp8MoECutlassMethod(quant_config) - elif quant_config._is_fp8_w8a8(weight_quant, input_quant): 
+ or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant) + or quant_config._is_fp8_w8a8(weight_quant, input_quant)): return CompressedTensorsW8A8Fp8MoEMethod(quant_config) elif quant_config._is_dynamic_token_w8a8(weight_quant, input_quant): return CompressedTensorsW8A8Int8MoEMethod(quant_config) @@ -378,6 +376,14 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled() + # cutlass path + self.is_fp8_w8a8_sm100 = quant_config._is_fp8_w8a8_sm100( + self.weight_quant, self.input_quant) + self.use_cutlass = (quant_config._is_fp8_w8a8_sm90( + self.weight_quant, self.input_quant) or self.is_fp8_w8a8_sm100) + self.fused_experts = None # type: ignore[assignment] + self.disable_expert_map = False + def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): @@ -558,6 +564,34 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): prepare_finalize: FusedMoEPrepareAndFinalize, moe: FusedMoEConfig, ) -> FusedMoEPermuteExpertsUnpermute: + # cutlass path + if self.use_cutlass: + from vllm.model_executor.layers.fused_moe import CutlassExpertsFp8 + + use_batched_format = (prepare_finalize.activation_format == + FusedMoEActivationFormat.BatchedExperts) + + num_dispatchers = prepare_finalize.num_dispatchers() + num_experts = (moe.num_local_experts + if use_batched_format else moe.num_experts) + + logger.debug("CutlassExpertsFp8(%s)", self.__class__.__name__) + + experts = CutlassExpertsFp8( + num_experts, + moe.in_dtype, + self.input_quant.strategy == QuantizationStrategy.TOKEN, + self.weight_quant.strategy == QuantizationStrategy.CHANNEL, + num_dispatchers=num_dispatchers, + use_batched_format=use_batched_format, + ) + + self.disable_expert_map = (num_dispatchers > 1 + or not experts.supports_expert_map()) + + return experts + + # triton path from 
vllm.model_executor.layers.fused_moe import TritonExperts from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( BatchedTritonExperts) @@ -629,6 +663,68 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): indices_type=self.topk_indices_dtype, ) + # cutlass path + if self.use_cutlass: + per_act_token = ( + self.input_quant.strategy == QuantizationStrategy.TOKEN) + per_channel_quant = ( + self.weight_quant.strategy == QuantizationStrategy.CHANNEL) + + # small-batch fallback on SM100 + if self.is_fp8_w8a8_sm100 and topk_ids.shape[0] <= 8: + from vllm.model_executor.layers.fused_moe import fused_experts + return fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + use_fp8_w8a8=True, + per_channel_quant=per_channel_quant, + global_num_experts=global_num_experts, + expert_map=None if self.disable_expert_map else expert_map, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale) + + if self.fused_experts is None: + from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + cutlass_moe_fp8) + return cutlass_moe_fp8( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights, + topk_ids, + per_act_token=per_act_token, + activation=activation, + global_num_experts=global_num_experts, + expert_map=None if self.disable_expert_map else expert_map, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + ) + else: + return self.fused_experts( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights, + topk_ids, + activation=activation, + global_num_experts=global_num_experts, + expert_map=None if self.disable_expert_map else expert_map, + w1_scale=layer.w13_weight_scale, + 
w2_scale=layer.w2_weight_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + ) + if self.rocm_aiter_moe_enabled: return self.rocm_aiter_fused_experts_func( hidden_states=x, @@ -685,291 +781,6 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): a2_scale=layer.w2_input_scale) -class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod): - - def __init__( - self, - quant_config: "CompressedTensorsConfig" # type: ignore # noqa E501 - ): - self.quant_config = quant_config - self.weight_quant = self.quant_config.target_scheme_map["Linear"].get( - "weights") - self.input_quant = self.quant_config.target_scheme_map["Linear"].get( - "input_activations") - - per_tensor = (self.weight_quant.strategy == QuantizationStrategy.TENSOR - and self.input_quant.strategy - == QuantizationStrategy.TENSOR) - per_channel = ( - self.weight_quant.strategy == QuantizationStrategy.CHANNEL - and self.input_quant.strategy == QuantizationStrategy.TOKEN) - if not (per_tensor or per_channel): - raise ValueError( - "For FP8 Fused MoE layers, we require per tensor " - "or channelwise, dynamic per token quantization. 
Found " - f"{self.weight_quant}, {self.input_quant}") - - self.static_input_scales = not self.input_quant.dynamic - if self.static_input_scales and per_channel: - raise ValueError( - "For FP8 Fused MoE layer, we require either per tensor or " - "channelwise, dynamic per token quantization.") - - self.topk_indices_dtype = None - self.fused_experts = None # type: ignore - self.disable_expert_map = False - self.is_fp8_w8a8_sm100 = self.quant_config._is_fp8_w8a8_sm100( - self.weight_quant, self.input_quant) - - def create_weights(self, layer: torch.nn.Module, num_experts: int, - hidden_size: int, intermediate_size_per_partition: int, - params_dtype: torch.dtype, **extra_weight_attrs): - - params_dtype = torch.float8_e4m3fn - - # WEIGHTS - w13_weight = torch.nn.Parameter(torch.empty( - num_experts, - 2 * intermediate_size_per_partition, - hidden_size, - dtype=params_dtype), - requires_grad=False) - layer.register_parameter("w13_weight", w13_weight) - set_weight_attrs(w13_weight, extra_weight_attrs) - - w2_weight = torch.nn.Parameter(torch.empty( - num_experts, - hidden_size, - intermediate_size_per_partition, - dtype=params_dtype), - requires_grad=False) - layer.register_parameter("w2_weight", w2_weight) - set_weight_attrs(w2_weight, extra_weight_attrs) - - # WEIGHT_SCALES - if self.weight_quant.strategy == QuantizationStrategy.TENSOR: - # Allocate 2 scales for w1 and w3 respectively. - # They are combined to a single scale after weight loading. - w13_weight_scale = torch.nn.Parameter(torch.ones( - num_experts, 2, dtype=torch.float32), - requires_grad=False) - layer.register_parameter("w13_weight_scale", w13_weight_scale) - w2_weight_scale = torch.nn.Parameter(torch.ones( - num_experts, dtype=torch.float32), - requires_grad=False) - layer.register_parameter("w2_weight_scale", w2_weight_scale) - # Add PER-TENSOR quantization for FusedMoE.weight_loader. 
- extra_weight_attrs.update( - {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}) - set_weight_attrs(w13_weight_scale, extra_weight_attrs) - set_weight_attrs(w2_weight_scale, extra_weight_attrs) - - elif self.weight_quant.strategy == QuantizationStrategy.CHANNEL: - w13_weight_scale = torch.nn.Parameter(torch.ones( - num_experts, - 2 * intermediate_size_per_partition, - 1, - dtype=torch.float32), - requires_grad=False) - layer.register_parameter("w13_weight_scale", w13_weight_scale) - w2_weight_scale = torch.nn.Parameter(torch.ones( - num_experts, hidden_size, 1, dtype=torch.float32), - requires_grad=False) - layer.register_parameter("w2_weight_scale", w2_weight_scale) - # Add PER-CHANNEL quantization for FusedMoE.weight_loader. - extra_weight_attrs.update( - {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}) - set_weight_attrs(w13_weight_scale, extra_weight_attrs) - set_weight_attrs(w2_weight_scale, extra_weight_attrs) - - # INPUT_SCALES - if self.static_input_scales: - w13_input_scale = torch.nn.Parameter(torch.ones( - num_experts, dtype=torch.float32), - requires_grad=False) - layer.register_parameter("w13_input_scale", w13_input_scale) - set_weight_attrs(w13_input_scale, extra_weight_attrs) - - w2_input_scale = torch.nn.Parameter(torch.ones( - num_experts, dtype=torch.float32), - requires_grad=False) - layer.register_parameter("w2_input_scale", w2_input_scale) - set_weight_attrs(w2_input_scale, extra_weight_attrs) - else: - layer.w13_input_scale = None - layer.w2_input_scale = None - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - # Fp8 moe kernels require a single activation scale. - # We take the max of all the scales in case they differ. 
- if self.static_input_scales: - assert self.input_quant.strategy == QuantizationStrategy.TENSOR - if (layer.w13_input_scale is None or layer.w2_input_scale is None): - raise ValueError( - "QuantConfig has static quantization, but found " - "activation scales are None.") - if (not all_close_1d(layer.w13_input_scale) - or not all_close_1d(layer.w2_input_scale)): - logger.warning_once( - "Found input_scales that are not equal for " - "fp8 MoE layer. Using the maximum across experts " - "for each layer.") - layer.w13_input_scale = torch.nn.Parameter( - layer.w13_input_scale.max(), requires_grad=False) - layer.w2_input_scale = torch.nn.Parameter( - layer.w2_input_scale.max(), requires_grad=False) - - # For Per-TENSOR case, Fp8 moe kernel needs single weight scale - # for w13 per expert. Use max then dequant and requant each expert. - if self.weight_quant.strategy == QuantizationStrategy.TENSOR: - assert layer.w13_weight_scale is not None - shard_size = layer.intermediate_size_per_partition - max_w13_scales = layer.w13_weight_scale.max(dim=1).values - for expert_id in range(layer.local_num_experts): - start = 0 - for shard_id in range(2): - dq_weight = per_tensor_dequantize( - layer.w13_weight[expert_id][start:start + - shard_size, :], - layer.w13_weight_scale[expert_id][shard_id]) - layer.w13_weight[expert_id][ - start:start + shard_size, :], _ = ops.scaled_fp8_quant( - dq_weight, max_w13_scales[expert_id]) - start += shard_size - layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales, - requires_grad=False) - - def select_gemm_impl( - self, - prepare_finalize: FusedMoEPrepareAndFinalize, - moe: FusedMoEConfig, - ) -> FusedMoEPermuteExpertsUnpermute: - from vllm.model_executor.layers.fused_moe import CutlassExpertsFp8 - - use_batched_format = (prepare_finalize.activation_format == - FusedMoEActivationFormat.BatchedExperts) - - num_dispatchers = prepare_finalize.num_dispatchers() - - num_experts = (moe.num_local_experts - if use_batched_format else moe.num_experts) 
- - logger.debug("CutlassExpertsFp8(%s)", self.__class__.__name__) - - experts = CutlassExpertsFp8( - num_experts, - moe.in_dtype, - self.input_quant.strategy == QuantizationStrategy.TOKEN, - self.weight_quant.strategy == QuantizationStrategy.CHANNEL, - num_dispatchers=num_dispatchers, - use_batched_format=use_batched_format, - ) - - self.disable_expert_map = (num_dispatchers > 1 - or not experts.supports_expert_map()) - - return experts - - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: Optional[int] = None, - num_expert_group: Optional[int] = None, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - custom_routing_function: Optional[Callable] = None, - scoring_func: str = "softmax", - e_score_correction_bias: Optional[torch.Tensor] = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: Optional[torch.Tensor] = None, - logical_to_physical_map: Optional[torch.Tensor] = None, - logical_replica_count: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - if enable_eplb: - raise NotImplementedError( - "EPLB not supported for " - "`CompressedTensorsW8A8Fp8MoECutlassMethod` yet.") - - topk_weights, topk_ids = FusedMoE.select_experts( - hidden_states=x, - router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) - - per_act_token = ( - self.input_quant.strategy == QuantizationStrategy.TOKEN) - per_channel_quant = ( - self.weight_quant.strategy == QuantizationStrategy.CHANNEL) - # Triton fused_experts is faster in small batch sizes on SM100. - # Fall back to fused_experts in small batch sizes. 
- if self.is_fp8_w8a8_sm100 and topk_ids.shape[0] <= 8: - from vllm.model_executor.layers.fused_moe import fused_experts - return fused_experts( - x, - layer.w13_weight, - layer.w2_weight, - topk_weights, - topk_ids, - inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - use_fp8_w8a8=True, - per_channel_quant=per_channel_quant, - global_num_experts=global_num_experts, - expert_map=None if self.disable_expert_map else expert_map, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, - a1_scale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale) - if self.fused_experts is None: - # If no modular kernel is provided, use cutlass_moe_fp8 - from vllm.model_executor.layers.fused_moe.cutlass_moe import ( - cutlass_moe_fp8) - return cutlass_moe_fp8( - x, - layer.w13_weight, - layer.w2_weight, - topk_weights, - topk_ids, - per_act_token=per_act_token, - activation=activation, - global_num_experts=global_num_experts, - expert_map=None if self.disable_expert_map else expert_map, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, - a1_scale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale, - ) - else: - return self.fused_experts( - x, - layer.w13_weight, - layer.w2_weight, - topk_weights, - topk_ids, - activation=activation, - global_num_experts=global_num_experts, - expert_map=None if self.disable_expert_map else expert_map, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, - a1_scale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale, - ) - - class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): def __init__( From afa26075966301887a15f958a6aec0a89a3faacd Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 28 Jul 2025 21:56:24 -0400 Subject: [PATCH 003/224] [CI] Parallelize Kernels MoE Test (#21764) Signed-off-by: mgoin --- .buildkite/test-pipeline.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git 
a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 948ce9e8667f5..ac145453dabde 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -403,17 +403,18 @@ steps: - vllm/model_executor/layers/quantization - tests/kernels/quantization commands: - - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 -- label: Kernels MoE Test +- label: Kernels MoE Test %N mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/moe/ - tests/kernels/moe - vllm/model_executor/layers/fused_moe/ commands: - - pytest -v -s kernels/moe + - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 - label: Kernels Mamba Test mirror_hardwares: [amdexperimental, amdproduction] From e18f0851033fbc4ef55c1989411f2a5666b518c6 Mon Sep 17 00:00:00 2001 From: Calvin Chen Date: Tue, 29 Jul 2025 09:59:44 +0800 Subject: [PATCH 004/224] skip fusedmoe layer for start_load_kv (#21378) Signed-off-by: calvin chen --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index d47a75461d72e..32d0e43d71afe 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -192,8 +192,16 @@ class P2pNcclConnector(KVConnectorBase_V1): # Load the KV for each request each layer for request in metadata.requests: for layer_name in forward_context.no_compile_layers: - attn_layer = forward_context.no_compile_layers[layer_name] - kv_cache_layer = attn_layer.kv_cache[ \ + layer = 
forward_context.no_compile_layers[layer_name] + + # Only process layers that have kv_cache + # attribute (attention layers) Skip non-attention + # layers like FusedMoE + kv_cache = getattr(layer, 'kv_cache', None) + if kv_cache is None: + continue + + kv_cache_layer = kv_cache[ \ forward_context.virtual_engine] kv_cache = self.p2p_nccl_engine.recv_tensor( From 12a223ef9bfebcc61e477047dce049495fe8c8a8 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Mon, 28 Jul 2025 23:35:37 -0400 Subject: [PATCH 005/224] [AMD][CI/Build][Bugfix] Guarding CUDA specific functions by ifndef ROCM (#21766) Signed-off-by: Gregory Shtrasberg --- .../quantization/compressed_tensors/int8_quant_kernels.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu index 6a81f159f46ae..d8369108d0bd3 100644 --- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu +++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu @@ -1,7 +1,9 @@ #include #include -#include "../per_token_group_quant_8bit.h" +#ifndef USE_ROCM + #include "../per_token_group_quant_8bit.h" +#endif #include @@ -339,10 +341,12 @@ void dynamic_scaled_int8_quant( }); } +#ifndef USE_ROCM void per_token_group_quant_int8(const torch::Tensor& input, torch::Tensor& output_q, torch::Tensor& output_s, int64_t group_size, double eps, double int8_min, double int8_max) { per_token_group_quant_8bit(input, output_q, output_s, group_size, eps, int8_min, int8_max); -} \ No newline at end of file +} +#endif From f1e2c095ecee01db02d0b63aae26d039b940d894 Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Mon, 28 Jul 2025 22:09:45 -0700 Subject: [PATCH 006/224] Migrate InternVLImageInputs and InternVLVideoInputs to TensorSchema (#21684) Signed-off-by: Benji Beck --- vllm/model_executor/models/internvl.py | 115 +++++++++++-------------- 1 file changed, 
51 insertions(+), 64 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 3637f037751c0..a0e98ca3f8155 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -9,7 +9,7 @@ # -------------------------------------------------------- from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Literal, Optional, TypedDict, TypeVar, Union +from typing import Annotated, Any, Literal, Optional, TypeVar, Union import numpy.typing as npt import torch @@ -37,6 +37,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) @@ -51,54 +52,60 @@ IMAGENET_MEAN = (0.485, 0.456, 0.406) IMAGENET_STD = (0.229, 0.224, 0.225) -class InternVLImagePixelInputs(TypedDict): +class InternVLImagePixelInputs(TensorSchema): + """ + Dimensions: + - bn: Batch size * number of images + - bnp: Batch size * number of images * (1 + num_patches) + - c: Number of channels (3) + - h: Height of each image patch + - w: Width of each image patch + """ type: Literal["pixel_values"] - pixel_values_flat: torch.Tensor + pixel_values_flat: Annotated[torch.Tensor, TensorShape("bnp", 3, "h", "w")] + num_patches: Annotated[torch.Tensor, TensorShape("bn")] + + +class InternVLImageEmbeddingInputs(TensorSchema): """ - Shape: - `(batch_size * num_images * (1 + num_patches), num_channels, height, width)` + Dimensions: + - n: Number of images + - f: Total image feature size + - h: Hidden size (must match the hidden size of language model backbone) """ - - num_patches: torch.Tensor - """Shape: `(batch_size * num_images)`""" - - -class 
InternVLImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: Union[torch.Tensor, list[torch.Tensor]] - """ - A tensor of shape `(num_images, total_image_feature_size, hidden_size)` - or a list of tensors of shape `(total_image_feature_size, hidden_size)` - - `hidden_size` must match the hidden size of language model backbone. - """ + data: Annotated[Union[torch.Tensor, list[torch.Tensor]], + TensorShape("n", "f", "h")] InternVLImageInputs = Union[InternVLImagePixelInputs, InternVLImageEmbeddingInputs] -class InternVLVideoPixelInputs(TypedDict): +class InternVLVideoPixelInputs(TensorSchema): + """ + Dimensions: + - bvf: Batch size * number of videos * num_frames + - bn: Batch size * number of images + - c: Number of channels (3) + - h: Height of each video frame + - w: Width of each video frame + """ type: Literal["pixel_values_videos"] - pixel_values_flat: torch.Tensor + pixel_values_flat: Annotated[torch.Tensor, TensorShape("bvf", 3, "h", "w")] + num_patches: Annotated[torch.Tensor, TensorShape("bn")] + + +class InternVLVideoEmbeddingInputs(TensorSchema): """ - Shape: - `(batch_size * num_video * num_frames, num_channels, height, width)` + Dimensions: + - n: Number of videos + - f: Total video feature size + - h: Hidden size (must match the hidden size of language model backbone) """ - - num_patches: torch.Tensor - """Shape: `(batch_size * num_images)`""" - - -class InternVLVideoEmbeddingInputs(TypedDict): type: Literal["video_embeds"] - data: Union[torch.Tensor, list[torch.Tensor]] - """ - A tensor of shape `(num_videos, total_video_feature_size, hidden_size)` - or a list of tensors of shape `(total_video_feature_size, hidden_size)` - - `hidden_size` must match the hidden size of language model backbone. 
- """ + data: Annotated[Union[torch.Tensor, list[torch.Tensor]], + TensorShape("n", "f", "h")] InternVLVideoInputs = Union[InternVLVideoPixelInputs, @@ -1151,26 +1158,6 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, vit_embeds = self.mlp1(vit_embeds) return vit_embeds - def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: - - h = w = self.config.vision_config.image_size - expected_dims = (3, h, w) - - def _validate_shape(d: torch.Tensor): - actual_dims = tuple(d.shape) - - if actual_dims != expected_dims: - expected_expr = str(expected_dims) - raise ValueError( - "The expected shape of pixel values per image per batch " - f" per patch is {expected_expr}. " - f"You supplied {tuple(d.shape)}.") - - for d in data: - _validate_shape(d) - - return data - def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[InternVLImageInputs]: pixel_values_flat = kwargs.pop("pixel_values_flat", None) @@ -1205,12 +1192,14 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, pixel_values_flat = flatten_bn(pixel_values_flat, concat=True) image_num_patches = flatten_bn(image_num_patches, concat=True) + expected_h = expected_w = self.config.vision_config.image_size + resolve_bindings = {"h": expected_h, "w": expected_w} return InternVLImagePixelInputs( type="pixel_values", - pixel_values_flat=self._validate_pixel_values( - pixel_values_flat), + pixel_values_flat=pixel_values_flat, num_patches=image_num_patches, + resolve_bindings=resolve_bindings, ) raise AssertionError("This line should be unreachable.") @@ -1225,11 +1214,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, return None if video_embeds is not None: - if not isinstance(video_embeds, (torch.Tensor, list)): - raise ValueError("Incorrect type of video embeddings. 
" - f"Got type: {type(video_embeds)}") - - return InternVLImageEmbeddingInputs( + return InternVLVideoEmbeddingInputs( type="video_embeds", data=flatten_bn(video_embeds), ) @@ -1250,12 +1235,14 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, pixel_values_flat_video = flatten_bn(pixel_values_flat_video, concat=True) video_num_patches = flatten_bn(video_num_patches, concat=True) + expected_h = expected_w = self.config.vision_config.image_size + resolve_bindings = {"h": expected_h, "w": expected_w} return InternVLVideoPixelInputs( type="pixel_values_videos", - pixel_values_flat=self._validate_pixel_values( - pixel_values_flat_video), + pixel_values_flat=pixel_values_flat_video, num_patches=video_num_patches, + resolve_bindings=resolve_bindings, ) raise AssertionError("This line should be unreachable.") From 7234fe26858f2c621901494c307c90e65fe35340 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 29 Jul 2025 06:14:47 +0100 Subject: [PATCH 007/224] [Misc] Rework process titles (#21780) Signed-off-by: Nick Hill --- vllm/entrypoints/cli/serve.py | 6 ++++-- vllm/entrypoints/openai/api_server.py | 16 ++++++++++++---- vllm/utils/__init__.py | 16 ++++++++++++---- vllm/v1/engine/coordinator.py | 7 +++---- vllm/v1/engine/core.py | 7 ++++--- vllm/v1/executor/multiproc_executor.py | 16 ++++++++++------ vllm/v1/utils.py | 6 +++--- 7 files changed, 48 insertions(+), 26 deletions(-) diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 68eb2580991c8..a69363e3d98fe 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -21,7 +21,7 @@ from vllm.entrypoints.utils import (VLLM_SUBCMD_PARSER_EPILOG, from vllm.executor.multiproc_worker_utils import _add_prefix from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, bind_process_name, get_tcp_uri +from vllm.utils import FlexibleArgumentParser, get_tcp_uri from vllm.v1.engine.core import 
EngineCoreProc from vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines from vllm.v1.executor.abstract import Executor @@ -77,7 +77,7 @@ def run_headless(args: argparse.Namespace): if args.api_server_count > 1: raise ValueError("api_server_count can't be set in headless mode") - bind_process_name("APIServer_Headless") + # set_process_title("Headless_ProcManager") # Create the EngineConfig. engine_args = vllm.AsyncEngineArgs.from_cli_args(args) usage_context = UsageContext.OPENAI_API_SERVER @@ -140,6 +140,8 @@ def run_multi_api_server(args: argparse.Namespace): num_api_servers = args.api_server_count assert num_api_servers > 0 + # set_process_title("ProcManager") + if num_api_servers > 1: setup_multiprocess_prometheus() diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 3d4c4a6b752a7..c375c8755108c 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -11,6 +11,7 @@ import multiprocessing import os import signal import socket +import sys import tempfile import uuid from argparse import Namespace @@ -94,15 +95,15 @@ from vllm.entrypoints.openai.serving_transcription import ( from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.entrypoints.utils import (cli_env_setup, load_aware_call, log_non_default_args, with_cancellation) +from vllm.executor.multiproc_worker_utils import _add_prefix from vllm.logger import init_logger from vllm.reasoning import ReasoningParserManager from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.usage.usage_lib import UsageContext -from vllm.utils import (Device, FlexibleArgumentParser, bind_process_name, - get_open_zmq_ipc_path, is_valid_ipv6_address, - set_ulimit) +from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path, + is_valid_ipv6_address, set_process_title, set_ulimit) from 
vllm.v1.metrics.prometheus import get_prometheus_registry from vllm.version import __version__ as VLLM_VERSION @@ -1805,6 +1806,13 @@ def setup_server(args): async def run_server(args, **uvicorn_kwargs) -> None: """Run a single-worker API server.""" + + # Add process-specific prefix to stdout and stderr. + process_name = "APIServer" + pid = os.getpid() + _add_prefix(sys.stdout, process_name, pid) + _add_prefix(sys.stderr, process_name, pid) + listen_address, sock = setup_server(args) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) @@ -1820,7 +1828,7 @@ async def run_server_worker(listen_address, ToolParserManager.import_tool_parser(args.tool_parser_plugin) server_index = client_config.get("client_index", 0) if client_config else 0 - bind_process_name("APIServer", str(server_index)) + set_process_title("APIServer", str(server_index)) # Load logging config for uvicorn if specified log_config = load_log_config(args.log_config_file) if log_config is not None: diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 054037b8932b7..ae978c855a8e5 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -3282,14 +3282,22 @@ def has_deep_gemm() -> bool: return _has_module("deep_gemm") -def bind_process_name(name: str, suffix: str = "") -> None: - """Bind the process name to a specific name with an optional suffix. +def set_process_title(name: str, + suffix: str = "", + append: bool = False) -> None: + """ + Set the current process title to a specific name with an + optional suffix. Args: - name: The base name to bind the process to. + name: The title to assign to the current process. suffix: An optional suffix to append to the base name. + append: Whether to append to the existing process title. 
""" - name = f"{envs.VLLM_PROCESS_NAME_PREFIX}::{name}" if suffix: name = f"{name}_{suffix}" + if append: + name = f"{setproctitle.getproctitle()}_{name}" + else: + name = f"{envs.VLLM_PROCESS_NAME_PREFIX}::{name}" setproctitle.setproctitle(name) diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py index fc45eea3a73cf..440628576bcb7 100644 --- a/vllm/v1/engine/coordinator.py +++ b/vllm/v1/engine/coordinator.py @@ -10,11 +10,10 @@ import zmq from vllm.config import ParallelConfig from vllm.logger import init_logger -from vllm.utils import get_mp_context, make_zmq_socket +from vllm.utils import get_mp_context, make_zmq_socket, set_process_title from vllm.v1.engine import EngineCoreOutputs, EngineCoreRequestType from vllm.v1.serial_utils import MsgpackDecoder -from vllm.v1.utils import (bind_process_name, get_engine_client_zmq_addr, - shutdown) +from vllm.v1.utils import get_engine_client_zmq_addr, shutdown logger = init_logger(__name__) @@ -119,7 +118,7 @@ class DPCoordinatorProc: def __init__(self, engine_count: int, min_stats_update_interval_ms: int = 100): - bind_process_name(self.__class__.__name__) + set_process_title("DPCoordinator") self.ctx = zmq.Context() self.engines = [EngineState() for _ in range(engine_count)] diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 57f60c4b289bb..cad93061e65b0 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -26,8 +26,8 @@ from vllm.lora.request import LoRARequest from vllm.tasks import POOLING_TASKS, SupportedTask from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) -from vllm.utils import (bind_process_name, make_zmq_socket, - resolve_obj_by_qualname) +from vllm.utils import (make_zmq_socket, resolve_obj_by_qualname, + set_process_title) from vllm.v1.core.kv_cache_utils import (get_kv_cache_config, unify_kv_cache_configs) from vllm.v1.core.sched.interface import SchedulerInterface @@ -425,7 +425,6 @@ class EngineCoreProc(EngineCore): 
client_handshake_address: Optional[str] = None, engine_index: int = 0, ): - bind_process_name(self.__class__.__name__, f"{engine_index}") self.input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]() self.output_queue = queue.Queue[Union[tuple[int, EngineCoreOutputs], bytes]]() @@ -630,11 +629,13 @@ class EngineCoreProc(EngineCore): parallel_config: ParallelConfig = kwargs[ "vllm_config"].parallel_config if parallel_config.data_parallel_size > 1 or dp_rank > 0: + set_process_title("DPEngineCore", str(dp_rank)) # Set data parallel rank for this engine process. parallel_config.data_parallel_rank = dp_rank parallel_config.data_parallel_rank_local = local_dp_rank engine_core = DPEngineCoreProc(*args, **kwargs) else: + set_process_title("EngineCore") engine_core = EngineCoreProc(*args, **kwargs) engine_core.run_busy_loop() diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 897174c1599df..8270385053852 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -30,8 +30,8 @@ from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator from vllm.executor.multiproc_worker_utils import ( _add_prefix, set_multiprocessing_worker_envs) from vllm.logger import init_logger -from vllm.utils import (bind_process_name, get_distributed_init_method, - get_loopback_ip, get_mp_context, get_open_port) +from vllm.utils import (get_distributed_init_method, get_loopback_ip, + get_mp_context, get_open_port, set_process_title) from vllm.v1.executor.abstract import Executor, FailureCallback from vllm.v1.outputs import ModelRunnerOutput from vllm.worker.worker_base import WorkerWrapperBase @@ -376,10 +376,14 @@ class WorkerProc: } wrapper.init_worker(all_kwargs) self.worker = wrapper - bind_process_name( - self.worker.worker.__class__.__name__, - f"TP{self.rank}_DP{vllm_config.parallel_config.data_parallel_rank}" - ) + + pp_size = vllm_config.parallel_config.pipeline_parallel_size + 
tp_size = vllm_config.parallel_config.tensor_parallel_size + pp_str = f"PP{rank // tp_size}" if pp_size > 1 else "" + tp_str = f"TP{rank % tp_size}" if tp_size > 1 else "" + suffix = f"{pp_str}{'_' if pp_str and tp_str else ''}{tp_str}" + if suffix: + set_process_title(suffix, append=True) pid = os.getpid() _add_prefix(sys.stdout, f"VllmWorker rank={rank}", pid) _add_prefix(sys.stderr, f"VllmWorker rank={rank}", pid) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index bb5a36f38386b..c74d8c543f76c 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -15,8 +15,8 @@ import torch from vllm.logger import init_logger from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message) -from vllm.utils import (bind_process_name, get_open_port, - get_open_zmq_ipc_path, get_tcp_uri, kill_process_tree) +from vllm.utils import (get_open_port, get_open_zmq_ipc_path, get_tcp_uri, + kill_process_tree) if TYPE_CHECKING: from vllm.v1.engine.coordinator import DPCoordinator @@ -144,7 +144,7 @@ class APIServerProcessManager: self.listen_address = listen_address self.sock = sock self.args = args - bind_process_name(self.__class__.__name__) + # Start API servers spawn_context = multiprocessing.get_context("spawn") self.processes: list[BaseProcess] = [] From a2480251ec92ba2a849464dde48db8a2b7f6ef81 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 29 Jul 2025 14:53:18 +0800 Subject: [PATCH 008/224] [Doc] Link to RFC for pooling optimizations (#21806) Signed-off-by: DarkLight1337 --- docs/models/pooling_models.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index a06d86523af1a..f1200103171e9 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -7,9 +7,9 @@ These models use a [Pooler][vllm.model_executor.layers.pooler.Pooler] to extract before returning them. !!! note - We currently support pooling models primarily as a matter of convenience. 
- As shown in the [Compatibility Matrix](../features/compatibility_matrix.md), most vLLM features are not applicable to - pooling models as they only work on the generation or decode stage, so performance may not improve as much. + We currently support pooling models primarily as a matter of convenience. This is not guaranteed to have any performance improvement over using HF Transformers / Sentence Transformers directly. + + We are now planning to optimize pooling models in vLLM. Please comment on if you have any suggestions! ## Configuration From a4528f0cac5d2857ccc56d2a2e1a1c43142643ce Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 29 Jul 2025 18:13:27 +0800 Subject: [PATCH 009/224] [Model]: Fused MoE for nomic-embed-text-v2-moe (#18321) Signed-off-by: isotr0py <2037008807@qq.com> Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: Isotr0py --- .../layers/fused_moe/fused_moe.py | 47 +++- vllm/model_executor/models/bert_with_rope.py | 204 +++++++++--------- 2 files changed, 140 insertions(+), 111 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 1985e8612da35..227aacf25c0b0 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -7,6 +7,7 @@ import os from typing import Any, Callable, Optional import torch +import torch.nn.functional as F import vllm.envs as envs import vllm.model_executor.layers.fused_moe.modular_kernel as mk @@ -1001,6 +1002,7 @@ def inplace_fused_experts(hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, activation: str = "silu", + is_act_and_mul: bool = True, apply_router_weight_on_input: bool = False, use_fp8_w8a8: bool = False, use_int8_w8a8: bool = False, @@ -1018,7 +1020,8 @@ def inplace_fused_experts(hidden_states: torch.Tensor, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None) -> None: 
fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True, - activation, apply_router_weight_on_input, use_fp8_w8a8, + activation, is_act_and_mul, + apply_router_weight_on_input, use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, use_int4_w4a16, use_mxfp4_w4a4, per_channel_quant, global_num_experts, expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, @@ -1032,6 +1035,7 @@ def inplace_fused_experts_fake( topk_weights: torch.Tensor, topk_ids: torch.Tensor, activation: str = "silu", + is_act_and_mul: bool = True, apply_router_weight_on_input: bool = False, use_fp8_w8a8: bool = False, use_int8_w8a8: bool = False, @@ -1167,6 +1171,7 @@ def outplace_fused_experts( topk_weights: torch.Tensor, topk_ids: torch.Tensor, activation: str = "silu", + is_act_and_mul: bool = True, apply_router_weight_on_input: bool = False, use_fp8_w8a8: bool = False, use_int8_w8a8: bool = False, @@ -1183,13 +1188,12 @@ def outplace_fused_experts( a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None) -> torch.Tensor: - return fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, - False, activation, apply_router_weight_on_input, - use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, - use_int4_w4a16, use_mxfp4_w4a4, - per_channel_quant, global_num_experts, - expert_map, w1_scale, w2_scale, w1_zp, w2_zp, - a1_scale, a2_scale, block_shape) + return fused_experts_impl( + hidden_states, w1, w2, topk_weights, topk_ids, False, activation, + is_act_and_mul, apply_router_weight_on_input, use_fp8_w8a8, + use_int8_w8a8, use_int8_w8a16, use_int4_w4a16, use_mxfp4_w4a4, + per_channel_quant, global_num_experts, expert_map, w1_scale, w2_scale, + w1_zp, w2_zp, a1_scale, a2_scale, block_shape) def outplace_fused_experts_fake( @@ -1199,6 +1203,7 @@ def outplace_fused_experts_fake( topk_weights: torch.Tensor, topk_ids: torch.Tensor, activation: str = "silu", + is_act_and_mul: bool = True, use_fp8_w8a8: bool = False, use_int8_w8a8: 
bool = False, use_int8_w8a16: bool = False, @@ -1253,6 +1258,7 @@ def fused_experts( topk_ids: torch.Tensor, inplace: bool = False, activation: str = "silu", + is_act_and_mul: bool = True, apply_router_weight_on_input: bool = False, use_fp8_w8a8: bool = False, use_int8_w8a8: bool = False, @@ -1283,6 +1289,8 @@ def fused_experts( or is_blackwell_deep_gemm_used()) if (allow_deep_gemm and use_fp8_w8a8 and should_use_deep_gemm): assert apply_router_weight_on_input is False + assert is_act_and_mul, ( + "DeepGemm only supports is_act_and_mul=True for now.") return deep_gemm_moe_fp8( hidden_states=hidden_states, w1=w1, @@ -1319,6 +1327,7 @@ def fused_experts( topk_weights=topk_weights, topk_ids=topk_ids, activation=activation, + is_act_and_mul=is_act_and_mul, apply_router_weight_on_input=apply_router_weight_on_input, use_fp8_w8a8=use_fp8_w8a8, use_int8_w8a8=use_int8_w8a8, @@ -1345,6 +1354,7 @@ def fused_experts_impl( topk_ids: torch.Tensor, inplace: bool = False, activation: str = "silu", + is_act_and_mul: bool = True, apply_router_weight_on_input: bool = False, use_fp8_w8a8: bool = False, use_int8_w8a8: bool = False, @@ -1503,14 +1513,21 @@ def fused_experts_impl( per_channel_quant=per_channel_quant, block_shape=block_shape) - if activation == "silu": + # Activation function with multiplication + if activation == "silu" and is_act_and_mul: torch.ops._C.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) - elif activation == "gelu": + elif activation == "gelu" and is_act_and_mul: torch.ops._C.gelu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) + # Activation function without multiplication + elif activation == "silu": + intermediate_cache2 = F.silu(intermediate_cache1.view(-1, N)) + elif activation == "gelu": + intermediate_cache2 = F.gelu(intermediate_cache1.view(-1, N)) else: - raise ValueError(f"Unsupported FusedMoe activation: {activation}") + raise ValueError(f"Unsupported FusedMoe activation: {activation}, " + f"with 
is_act_and_mul={is_act_and_mul}.") qintermediate_cache2, a2q_scale = moe_kernel_quantize_input( A=intermediate_cache2, @@ -1555,6 +1572,7 @@ def fused_moe( renormalize: bool, inplace: bool = False, activation: str = "silu", + is_act_and_mul: bool = True, use_grouped_topk: bool = False, num_expert_group: Optional[int] = None, topk_group: Optional[int] = None, @@ -1591,6 +1609,9 @@ def fused_moe( Defaults to False. - activation (str): The activation function to apply after the first MoE layer. + - is_act_and_mul (bool): If True, use activation-and-mul function for + activation (self-gated activation), otherwise use activation function + for activation (ungated activation). - num_expert_group: Optional[int]: additional parameter for grouped_topk - topk_group: Optional[int]: additional parameter for grouped_topk - use_grouped_topk: If True, use grouped_topk instead of fused_topk @@ -1627,6 +1648,9 @@ def fused_moe( Returns: - torch.Tensor: The output tensor after applying the MoE layer. """ + if not is_act_and_mul: + assert inplace is False, ( + "is_act_and_mul=False is not supported with inplace=True") if use_grouped_topk: assert num_expert_group is not None and topk_group is not None @@ -1647,6 +1671,7 @@ def fused_moe( topk_ids, inplace=inplace, activation=activation, + is_act_and_mul=is_act_and_mul, use_fp8_w8a8=use_fp8_w8a8, use_int8_w8a8=use_int8_w8a8, use_int8_w8a16=use_int8_w8a16, diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 0b7350f07d3f6..5249acbd84a56 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -10,9 +10,12 @@ from transformers import PretrainedConfig from vllm.attention import Attention, AttentionType from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed import (divide, 
get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce) from vllm.model_executor.layers.activation import (get_act_and_mul_fn, get_act_fn) +from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, @@ -26,6 +29,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models import SupportsV0Only from vllm.model_executor.models.interfaces import SupportsQuant from vllm.model_executor.models.utils import WeightsMapper +from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors @@ -201,114 +206,101 @@ class BertWithRopeMLP(nn.Module): return hidden_states -class NomicRouter(nn.Module): +class NomicMoE(nn.Module): - def __init__(self, hidden_size: int, moe_num_experts: int, moe_top_k: int): + def __init__( + self, + num_experts: int, + top_k: int, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + params_dtype: Optional[torch.dtype] = None, + tp_size: Optional[int] = None, + ): super().__init__() - self.moe_top_k = moe_top_k - self.layer = ReplicatedLinear(hidden_size, moe_num_experts, bias=False) - def forward( - self, x: torch.Tensor - ) -> tuple[torch.Tensor, torch.Tensor, torch.LongTensor]: - weights = self.layer(x.view(-1, x.shape[-1]))[0].softmax( - dim=-1, dtype=torch.float32) - top_weights, top_experts = torch.topk(weights, self.moe_top_k, dim=-1) - weights = weights.to(x.dtype) - top_weights = top_weights.to(x.dtype) - return weights, top_weights, top_experts # type: ignore - - -class NomicExpertMLP(nn.Module): - - def __init__(self, hidden_size: int, ffn_hidden_size: int, - moe_num_experts: int, ffn_act_fn: str): - super().__init__() + self.tp_size = tp_size or get_tensor_model_parallel_world_size() + self.num_total_experts = num_experts + 
self.top_k = top_k self.hidden_size = hidden_size - self.ffn_hidden_size = ffn_hidden_size - self.moe_num_experts = moe_num_experts + self.total_intermediate_size = intermediate_size + self.intermediate_size = divide(intermediate_size, self.tp_size) + self.hidden_act = hidden_act + if params_dtype is None: + params_dtype = torch.get_default_dtype() + self.params_dtype = params_dtype + + self.router = ReplicatedLinear(self.hidden_size, + self.num_total_experts, + bias=False) self.w1 = nn.Parameter( - torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) + torch.empty(self.num_total_experts, + self.intermediate_size, + self.hidden_size, + device=current_platform.device_type, + dtype=self.params_dtype)) self.w2 = nn.Parameter( - torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) - self.activation_fn = get_act_fn(ffn_act_fn) + torch.empty(self.num_total_experts, + self.hidden_size, + self.intermediate_size, + device=current_platform.device_type, + dtype=self.params_dtype)) + self.bias = nn.Parameter(torch.zeros(self.hidden_size)) + set_weight_attrs(self.w1, { + "weight_loader": self.weight_loader, + }) + set_weight_attrs(self.w2, { + "weight_loader": self.weight_loader, + }) - def forward(self, x: torch.Tensor, expert_idx: int) -> torch.Tensor: - expert_w1 = self.w1.view(self.moe_num_experts, self.ffn_hidden_size, - self.hidden_size)[expert_idx] - expert_w2 = self.w2.view(self.moe_num_experts, self.ffn_hidden_size, - self.hidden_size)[expert_idx] + def weight_loader( + self, + param: nn.Parameter, + loaded_weight: torch.Tensor, + weight_name: str, + ): + # NOTE: Nomic-MoE has fused experts weights with shape + # (num_experts * intermediate_size, hidden_size) + tp_rank = get_tensor_model_parallel_rank() + param_data = param.data + shard_size = self.intermediate_size + shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size) + if weight_name.endswith("w1"): + loaded_weight = loaded_weight.reshape( + self.num_total_experts, + 
self.total_intermediate_size, + self.hidden_size, + )[:, shard] + if weight_name.endswith("w2"): + loaded_weight = loaded_weight.reshape( + self.num_total_experts, + self.total_intermediate_size, + self.hidden_size, + )[:, shard].transpose(1, 2) + param_data.copy_(loaded_weight) - x1 = x.matmul(expert_w1.t()) - act_out = self.activation_fn(x1) - x2 = act_out.matmul(expert_w2) - return x2 + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + num_tokens, hidden_size = hidden_states.shape + hidden_states = hidden_states.view(-1, self.hidden_size) + # router_logits: (num_tokens, n_experts) + router_logits, _ = self.router(hidden_states) + final_hidden_states = fused_moe(hidden_states, + self.w1, + self.w2, + router_logits, + self.top_k, + renormalize=False, + inplace=False, + activation=self.hidden_act, + is_act_and_mul=False) + if self.tp_size > 1: + final_hidden_states = tensor_model_parallel_all_reduce( + final_hidden_states) -class NomicExperts(nn.Module): - - def __init__(self, config, hidden_size: int, ffn_hidden_size: int, - moe_num_experts: int): - super().__init__() - self.moe_num_experts = moe_num_experts - - self.mlp = NomicExpertMLP(hidden_size=config.n_embd, - ffn_hidden_size=config.n_inner, - moe_num_experts=moe_num_experts, - ffn_act_fn=config.hidden_act) - self.bias = nn.Parameter(torch.zeros(config.n_embd)) - - def forward(self, x: torch.Tensor, weights: torch.Tensor, - top_weights: torch.Tensor, - top_experts: torch.LongTensor) -> torch.Tensor: - q_len, hidden_size = x.shape - x = x.view(-1, hidden_size) - out = torch.zeros_like(x) - - expert_mask = nn.functional.one_hot( - top_experts, num_classes=self.moe_num_experts).permute(2, 1, 0) - for expert_idx in range(0, self.moe_num_experts): - topk_idx, token_idx = torch.where(expert_mask[expert_idx]) - if token_idx.shape[0] == 0: - continue - - token_list = token_idx.tolist() - topk_list = topk_idx.tolist() - - expert_tokens = x[None, token_list].reshape(-1, hidden_size) - expert_out = 
self.mlp( - expert_tokens, expert_idx) * top_weights[token_list, topk_list, - None] - - out.index_add_(0, token_idx, expert_out) - - out = out.reshape(q_len, hidden_size) - return out + self.bias - - -class NomicMoELayer(nn.Module): - - def __init__(self, config: PretrainedConfig): - super().__init__() - - self.router = NomicRouter( - config.n_embd, - moe_num_experts=config.num_experts, - moe_top_k=config.moe_top_k, - ) - - self.experts = NomicExperts( - config, - hidden_size=config.n_embd, - ffn_hidden_size=config.n_inner, - moe_num_experts=config.num_experts, - ) - - def forward(self, x: torch.Tensor): - weights, top_weights, top_experts = self.router(x) - out = self.experts(x, weights, top_weights, top_experts) - return out + return final_hidden_states.view(num_tokens, hidden_size) + self.bias class BertWithRopeBlock(nn.Module): @@ -332,7 +324,11 @@ class BertWithRopeBlock(nn.Module): prefix=f"{prefix}.attention") if moe: - self.mlp = NomicMoELayer(config=config, ) + self.mlp = NomicMoE(num_experts=config.num_experts, + top_k=config.moe_top_k, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act) else: if config.hidden_act in ["silu", "geglu"]: self.mlp = BertWithRopeGatedMLP( @@ -463,7 +459,11 @@ class BertWithRope(nn.Module, SupportsV0Only, SupportsQuant): param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) + if name.endswith((".w1", ".w2")): + # Nomic-MoE has fused experts weights + weight_loader(param, loaded_weight, name) + else: + weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params @@ -481,6 +481,10 @@ class NomicBertModel(BertWithRope): "mlp.fc12": "mlp.gate_proj", "mlp.fc2": "mlp.down_proj", "norm2": "mlp_ln", + # MoE mapping + "experts.mlp.": "", + "experts.": "", + "router.layer": "router", }) From 37efc63b644b2f4e3b08bf7ff198dd8cd4c3f354 Mon Sep 17 00:00:00 2001 From: Reza Barazesh 
<3146276+rzabarazesh@users.noreply.github.com> Date: Tue, 29 Jul 2025 03:15:30 -0700 Subject: [PATCH 010/224] [V0 deprecation] Guided decoding (#21347) Signed-off-by: Reza Barazesh Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 3 +- .github/CODEOWNERS | 3 - .github/mergify.yml | 3 - tests/entrypoints/llm/test_guided_generate.py | 552 ------------------ tests/entrypoints/llm/test_lazy_outlines.py | 52 +- tests/entrypoints/openai/test_chat.py | 151 +---- tests/entrypoints/openai/test_completion.py | 36 +- .../openai/test_prompt_validation.py | 26 +- .../model_executor/test_guided_processors.py | 207 ------- .../language/generation/test_mistral.py | 51 +- tests/samplers/test_no_bad_words.py | 6 +- tests/test_sampling_params.py | 3 +- tests/v1/test_oracle.py | 7 - tools/check_pickle_imports.py | 1 - vllm/config.py | 20 +- vllm/engine/arg_utils.py | 24 +- vllm/engine/async_llm_engine.py | 66 +-- vllm/engine/llm_engine.py | 48 +- vllm/engine/multiprocessing/client.py | 18 - vllm/entrypoints/llm.py | 76 +-- .../guided_decoding/__init__.py | 192 ------ .../guided_decoding/guidance_decoding.py | 63 -- .../guidance_logits_processors.py | 104 ---- .../guided_decoding/guided_fields.py | 41 -- .../lm_format_enforcer_decoding.py | 67 --- .../guided_decoding/outlines_decoding.py | 117 ---- .../outlines_logits_processors.py | 307 ---------- vllm/model_executor/guided_decoding/utils.py | 242 -------- .../guided_decoding/xgrammar_decoding.py | 426 -------------- 29 files changed, 103 insertions(+), 2809 deletions(-) delete mode 100644 tests/entrypoints/llm/test_guided_generate.py delete mode 100644 tests/model_executor/test_guided_processors.py delete mode 100644 vllm/model_executor/guided_decoding/__init__.py delete mode 100644 vllm/model_executor/guided_decoding/guidance_decoding.py delete mode 100644 
vllm/model_executor/guided_decoding/guidance_logits_processors.py delete mode 100644 vllm/model_executor/guided_decoding/guided_fields.py delete mode 100644 vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py delete mode 100644 vllm/model_executor/guided_decoding/outlines_decoding.py delete mode 100644 vllm/model_executor/guided_decoding/outlines_logits_processors.py delete mode 100644 vllm/model_executor/guided_decoding/utils.py delete mode 100644 vllm/model_executor/guided_decoding/xgrammar_decoding.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ac145453dabde..6cda800b6477d 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -128,11 +128,10 @@ steps: - tests/entrypoints/offline_mode commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - label: Entrypoints Test (API Server) # 40min diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 2441055371663..a3b2713430eb5 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -10,7 +10,6 @@ 
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth -/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm /vllm/multimodal @DarkLight1337 @ywang96 /vllm/vllm_flash_attn @LucasWilkinson /vllm/lora @jeejeelee @@ -35,9 +34,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm -/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm /tests/kernels @tlrmchlsmth @WoosukKwon -/tests/model_executor/test_guided_processors.py @mgoin @russellb /tests/models @DarkLight1337 @ywang96 /tests/multi_step @alexm-redhat @comaniac /tests/multimodal @DarkLight1337 @ywang96 diff --git a/.github/mergify.yml b/.github/mergify.yml index 5c878ac02069f..d8ae509e0ac30 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -149,9 +149,6 @@ pull_request_rules: - files=examples/offline_inference/structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py - - files~=^vllm/model_executor/guided_decoding/ - - files=tests/model_executor/test_guided_processors.py - - files=tests/entrypoints/llm/test_guided_generate.py - files~=^tests/v1/structured_output/ - files=tests/v1/entrypoints/llm/test_guided_generate.py - files~=^vllm/v1/structured_output/ diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py deleted file mode 100644 index 55578341cb2e7..0000000000000 --- a/tests/entrypoints/llm/test_guided_generate.py +++ /dev/null @@ -1,552 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project - -import json -import weakref -from enum import Enum - -import jsonschema -import pytest -import regex as re -from pydantic import BaseModel - -from vllm.distributed import cleanup_dist_env_and_memory -from vllm.entrypoints.llm import LLM -from vllm.outputs import RequestOutput -from vllm.sampling_params import GuidedDecodingParams, SamplingParams - -MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" - -# Separate backends which support grammars vs ones -# which only support regex based constraints in tests. -GRAMMAR_DECODING_BACKENDS = [ - # (backend, disable_any_whitespace), - ("lm-format-enforcer", False), - ("xgrammar", True), - ("guidance", True), -] - -ALL_DECODING_BACKENDS = ([("outlines", False)] + GRAMMAR_DECODING_BACKENDS) - - -@pytest.fixture(scope="module") -def llm(): - # pytest caches the fixture so we use weakref.proxy to - # enable garbage collection - llm = LLM(model=MODEL_NAME, max_model_len=1024, seed=0) - - with llm.deprecate_legacy_api(): - yield weakref.proxy(llm) - del llm - cleanup_dist_env_and_memory() - - -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", - ALL_DECODING_BACKENDS) -def test_guided_regex(sample_regex, llm, guided_decoding_backend: str, - disable_any_whitespace: bool): - sampling_params = SamplingParams( - temperature=0.8, - top_p=0.95, - guided_decoding=GuidedDecodingParams( - regex=sample_regex, - backend=guided_decoding_backend, - disable_any_whitespace=disable_any_whitespace)) - - outputs = llm.generate(prompts=[ - f"Give an example IPv4 address with this regex: {sample_regex}" - ] * 2, - sampling_params=sampling_params, - use_tqdm=True) - - assert outputs is not None - for output in outputs: - assert output is not None - assert isinstance(output, RequestOutput) - prompt = output.prompt - generated_text = output.outputs[0].text - print(generated_text) - assert generated_text is not None - assert re.fullmatch(sample_regex, 
generated_text) is not None - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", - ALL_DECODING_BACKENDS) -def test_guided_json_completion(sample_json_schema, llm, - guided_decoding_backend: str, - disable_any_whitespace: bool): - sampling_params = SamplingParams( - temperature=1.0, - max_tokens=1000, - guided_decoding=GuidedDecodingParams( - json=sample_json_schema, - backend=guided_decoding_backend, - disable_any_whitespace=disable_any_whitespace)) - outputs = llm.generate(prompts=[ - f"Give an example JSON for an employee profile " - f"that fits this schema: {sample_json_schema}" - ] * 2, - sampling_params=sampling_params, - use_tqdm=True) - - assert outputs is not None - - for output in outputs: - assert output is not None - assert isinstance(output, RequestOutput) - prompt = output.prompt - - generated_text = output.outputs[0].text - assert generated_text is not None - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - output_json = json.loads(generated_text) - jsonschema.validate(instance=output_json, schema=sample_json_schema) - - -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", - ALL_DECODING_BACKENDS) -def test_guided_complex_json_completion(sample_complex_json_schema, llm, - guided_decoding_backend: str, - disable_any_whitespace: bool): - sampling_params = SamplingParams( - temperature=1.0, - max_tokens=1000, - guided_decoding=GuidedDecodingParams( - json=sample_complex_json_schema, - backend=guided_decoding_backend, - disable_any_whitespace=disable_any_whitespace)) - outputs = llm.generate(prompts=[ - f"Give an example JSON for an assignment grade " - f"that fits this schema: {sample_complex_json_schema}" - ] * 2, - sampling_params=sampling_params, - use_tqdm=True) - - assert outputs is not None - - for output in outputs: - assert output is not None - 
assert isinstance(output, RequestOutput) - prompt = output.prompt - - generated_text = output.outputs[0].text - assert generated_text is not None - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - output_json = json.loads(generated_text) - jsonschema.validate(instance=output_json, - schema=sample_complex_json_schema) - - -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", - ALL_DECODING_BACKENDS) -def test_guided_definition_json_completion(sample_definition_json_schema, llm, - guided_decoding_backend: str, - disable_any_whitespace: bool): - sampling_params = SamplingParams( - temperature=1.0, - max_tokens=1000, - guided_decoding=GuidedDecodingParams( - json=sample_definition_json_schema, - backend=guided_decoding_backend, - disable_any_whitespace=disable_any_whitespace)) - outputs = llm.generate(prompts=[ - f"Give an example JSON for solving 8x + 7 = -23 " - f"that fits this schema: {sample_definition_json_schema}" - ] * 2, - sampling_params=sampling_params, - use_tqdm=True) - - assert outputs is not None - - for output in outputs: - assert output is not None - assert isinstance(output, RequestOutput) - prompt = output.prompt - - generated_text = output.outputs[0].text - assert generated_text is not None - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - output_json = json.loads(generated_text) - jsonschema.validate(instance=output_json, - schema=sample_definition_json_schema) - - -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", - ALL_DECODING_BACKENDS) -def test_guided_enum_json_completion(sample_enum_json_schema, llm, - guided_decoding_backend: str, - disable_any_whitespace: bool): - sampling_params = SamplingParams( - temperature=1.0, - max_tokens=1000, - guided_decoding=GuidedDecodingParams( - json=sample_enum_json_schema, - backend=guided_decoding_backend, - disable_any_whitespace=disable_any_whitespace)) - 
outputs = llm.generate(prompts=[ - "Create a bug report JSON that fits this schema: " - f"{sample_enum_json_schema}. Make it for a high priority critical bug." - ] * 2, - sampling_params=sampling_params, - use_tqdm=True) - - assert outputs is not None - - for output in outputs: - assert output is not None - assert isinstance(output, RequestOutput) - prompt = output.prompt - - generated_text = output.outputs[0].text - assert generated_text is not None - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - output_json = json.loads(generated_text) - jsonschema.validate(instance=output_json, - schema=sample_enum_json_schema) - - # Additional assertions to verify enum values - assert output_json["status"] in ["active", "inactive", "pending"] - assert output_json["priority"] in ["low", "medium", "high", "critical"] - assert output_json["category"]["type"] in [ - "bug", "feature", "improvement" - ] - assert output_json["category"]["severity"] in [1, 2, 3, 4, 5] - for flag in output_json["flags"]: - assert flag in ["urgent", "blocked", "needs_review", "approved"] - - -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", - ALL_DECODING_BACKENDS) -def test_guided_choice_completion(sample_guided_choice, llm, - guided_decoding_backend: str, - disable_any_whitespace: bool): - sampling_params = SamplingParams( - temperature=0.8, - top_p=0.95, - guided_decoding=GuidedDecodingParams( - choice=sample_guided_choice, - backend=guided_decoding_backend, - disable_any_whitespace=disable_any_whitespace)) - outputs = llm.generate( - prompts="The best language for type-safe systems programming is ", - sampling_params=sampling_params, - use_tqdm=True) - - assert outputs is not None - for output in outputs: - assert output is not None - assert isinstance(output, RequestOutput) - prompt = output.prompt - generated_text = output.outputs[0].text - print(generated_text) - assert generated_text is not None - assert 
generated_text in sample_guided_choice - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", - GRAMMAR_DECODING_BACKENDS) -def test_guided_grammar(sample_sql_statements, llm, - guided_decoding_backend: str, - disable_any_whitespace: bool): - sampling_params = SamplingParams( - temperature=0.8, - top_p=0.95, - max_tokens=1000, - guided_decoding=GuidedDecodingParams( - grammar=sample_sql_statements, - backend=guided_decoding_backend, - disable_any_whitespace=disable_any_whitespace)) - outputs = llm.generate( - prompts=("Generate a sql state that select col_1 from " - "table_1 where it is equals to 1"), - sampling_params=sampling_params, - use_tqdm=True, - ) - - assert outputs is not None - for output in outputs: - assert output is not None - assert isinstance(output, RequestOutput) - prompt = output.prompt - - generated_text = output.outputs[0].text - assert generated_text is not None - # use Lark to parse the output, and make sure it's a valid parse tree - from lark import Lark - parser = Lark(sample_sql_statements) - parser.parse(generated_text) - - # remove spaces for comparison b/c we removed them in the grammar - ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace( - " ", "") - - assert generated_text.strip() == ground_truth - - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - -@pytest.mark.skip_global_cleanup -def test_guided_options_request_deprecation_warning(sample_regex, llm): - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - with pytest.warns(DeprecationWarning, match="guided_options_request"): - llm.generate(prompts="This should fail", - sampling_params=sampling_params, - use_tqdm=True, - guided_options_request=dict(guided_regex=sample_regex)) - - -@pytest.mark.skip_global_cleanup -def test_validation_against_both_guided_decoding_options(sample_regex, llm): - sampling_params = 
SamplingParams( - temperature=0.8, - top_p=0.95, - guided_decoding=GuidedDecodingParams(regex=sample_regex)) - - with pytest.raises(ValueError, match="Cannot set both"): - llm.generate(prompts="This should fail", - sampling_params=sampling_params, - use_tqdm=True, - guided_options_request=dict(guided_regex=sample_regex)) - - -@pytest.mark.skip_global_cleanup -def test_disable_guided_decoding_fallback(sample_regex, llm): - # see has_xgrammar_unsupported_json_features() - unsupported_json = { - "type": "object", - "properties": { - "example": { - "type": "string", - "minLength": 5 # unsupported by xgrammar - } - } - } - sampling_params = SamplingParams(temperature=0.8, - top_p=0.95, - guided_decoding=GuidedDecodingParams( - json=unsupported_json, - backend="xgrammar", - disable_fallback=True)) - - with pytest.raises( - ValueError, - match="xgrammar does not support advanced JSON schema features " - "like string length, item limits, or property bounds."): - llm.generate(prompts="This should fail", - sampling_params=sampling_params, - use_tqdm=True) - - -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", - GRAMMAR_DECODING_BACKENDS) -def test_guided_json_object(llm, guided_decoding_backend: str, - disable_any_whitespace: bool): - sampling_params = SamplingParams( - temperature=1.0, - max_tokens=100, - n=2, - guided_decoding=GuidedDecodingParams( - json_object=True, - backend=guided_decoding_backend, - disable_any_whitespace=disable_any_whitespace)) - - outputs = llm.generate( - prompts=("Generate a JSON object with curly braces for a person with " - "name and age fields for John Smith who is 31 years old."), - sampling_params=sampling_params, - use_tqdm=True) - - assert outputs is not None - for output in outputs: - assert output is not None - assert isinstance(output, RequestOutput) - - for i in range(2): - generated_text = output.outputs[i].text - print(generated_text) - assert generated_text is not None - - 
if disable_any_whitespace: - assert "\n" not in generated_text - - # Parse to verify it is valid JSON - parsed_json = json.loads(generated_text) - # A list is not what was intended, but is still valid - # json. - assert isinstance(parsed_json, (dict, list)) - - -class CarType(str, Enum): - sedan = "sedan" - suv = "SUV" - truck = "Truck" - coupe = "Coupe" - - -class CarDescription(BaseModel): - brand: str - model: str - car_type: CarType - - -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", - ALL_DECODING_BACKENDS) -def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str, - disable_any_whitespace: bool): - json_schema = CarDescription.model_json_schema() - sampling_params = SamplingParams( - temperature=1.0, - max_tokens=1000, - guided_decoding=GuidedDecodingParams( - json=json_schema, - backend=guided_decoding_backend, - disable_any_whitespace=disable_any_whitespace)) - outputs = llm.generate( - prompts="Generate a JSON with the brand, model and car_type of" - "the most iconic car from the 90's", - sampling_params=sampling_params, - use_tqdm=True) - - assert outputs is not None - for output in outputs: - assert output is not None - assert isinstance(output, RequestOutput) - prompt = output.prompt - - generated_text = output.outputs[0].text - assert generated_text is not None - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - output_json = json.loads(generated_text) - jsonschema.validate(instance=output_json, schema=json_schema) - - -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", - ALL_DECODING_BACKENDS) -def test_guided_number_range_json_completion(llm, guided_decoding_backend: str, - disable_any_whitespace: bool): - sample_output_schema = { - "type": "object", - "properties": { - "age": { - "type": "integer", - "minimum": 18, - "maximum": 99 - }, - "score": { - "type": "number", - "minimum": 0.0, - 
"maximum": 100.0 - }, - "zipcode": { - "type": "string", - "pattern": r"^\d{5}(-\d{4})?$" - }, - }, - "required": ["age", "score", "zipcode"], - } - sampling_params = SamplingParams( - temperature=1.0, - max_tokens=1000, - guided_decoding=GuidedDecodingParams( - json=sample_output_schema, - backend=guided_decoding_backend, - disable_any_whitespace=disable_any_whitespace), - ) - outputs = llm.generate( - prompts=[ - "Create a JSON object for a user with age, score, and zipcode." - ] * 2, - sampling_params=sampling_params, - use_tqdm=True, - ) - - assert outputs is not None - - for output in outputs: - assert output is not None - assert isinstance(output, RequestOutput) - prompt = output.prompt - - generated_text = output.outputs[0].text - assert generated_text is not None - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - output_json = json.loads(generated_text) - jsonschema.validate(instance=output_json, schema=sample_output_schema) - assert 18 <= output_json["age"] <= 99 - assert 0.0 <= output_json["score"] <= 100.0 - assert (re.fullmatch(r"^\d{5}(-\d{4})?$", output_json["zipcode"]) - is not None) - - -@pytest.mark.skip_global_cleanup -def test_guidance_no_additional_properties(llm): - schema = { - 'type': 'object', - 'properties': { - 'a1': { - 'type': 'string' - }, - 'a2': { - 'type': 'string' - }, - 'a3': { - 'type': 'string' - } - }, - 'required': ['a1', 'a2', 'a3'], - } - - prompt = ( - "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. 
You are a " - "helpful assistant.<|im_end|>\n<|im_start|>user\nPlease generate a " - "large JSON object with key-value pairs a1=b1, a2=b2, ..., a20=b20" - "<|im_end|>\n<|im_start|>assistant\n") - - def generate_with_backend(backend, disable_additional_properties): - guided_params = GuidedDecodingParams( - json=schema, - backend=backend, - disable_any_whitespace=True, - disable_additional_properties=disable_additional_properties) - sampling_params = SamplingParams(temperature=0, - max_tokens=256, - guided_decoding=guided_params) - - outputs = llm.generate(prompts=prompt, sampling_params=sampling_params) - assert outputs is not None - generated_text = outputs[0].outputs[0].text - assert generated_text is not None - parsed_json = json.loads(generated_text) - assert isinstance(parsed_json, dict) - jsonschema.validate(instance=parsed_json, schema=schema) - return parsed_json - - base_generated = generate_with_backend("guidance", False) - assert "a1" in base_generated - assert "a2" in base_generated - assert "a3" in base_generated - # by default additional keys are generated - assert "a4" in base_generated - assert "a5" in base_generated - assert "a6" in base_generated - - generated = generate_with_backend("guidance", True) - assert "a1" in generated - assert "a2" in generated - assert "a3" in generated - assert "a4" not in generated - assert "a5" not in generated - assert "a6" not in generated diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index 61b6b4fbf8e35..ac0b7e134c55a 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -4,43 +4,11 @@ import sys from contextlib import nullcontext -import pytest from vllm_test_utils import BlameResult, blame from vllm import LLM, SamplingParams from vllm.distributed import cleanup_dist_env_and_memory - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - V1 only supports xgrammar so 
this is irrelevant. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -def run_normal_opt125m(): - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - # Create an LLM without guided decoding as a baseline. - llm = LLM(model="facebook/opt-125m", - enforce_eager=True, - gpu_memory_utilization=0.3) - outputs = llm.generate(prompts, sampling_params) - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - # Destroy the LLM object and free up the GPU memory. - del llm - cleanup_dist_env_and_memory() +from vllm.sampling_params import GuidedDecodingParams def run_normal(): @@ -67,20 +35,22 @@ def run_normal(): cleanup_dist_env_and_memory() -def run_lmfe(sample_regex): +def run_xgrammar(sample_regex): # Create an LLM with guided decoding enabled. 
llm = LLM(model="distilbert/distilgpt2", enforce_eager=True, - guided_decoding_backend="lm-format-enforcer", + guided_decoding_backend="xgrammar", gpu_memory_utilization=0.3) - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + prompt = f"Give an example IPv4 address with this regex: {sample_regex}" + guided_decoding = GuidedDecodingParams(regex=sample_regex) + sampling_params = SamplingParams(temperature=0.8, + top_p=0.95, + guided_decoding=guided_decoding) outputs = llm.generate( - prompts=[ - f"Give an example IPv4 address with this regex: {sample_regex}" - ] * 2, + prompts=[prompt] * 2, sampling_params=sampling_params, use_tqdm=True, - guided_options_request=dict(guided_regex=sample_regex)) + ) for output in outputs: prompt = output.prompt @@ -103,7 +73,7 @@ def test_lazy_outlines(sample_regex): lambda: module_name in sys.modules) if use_blame else nullcontext() with context as result: run_normal() - run_lmfe(sample_regex) + run_xgrammar(sample_regex) if use_blame: assert isinstance(result, BlameResult) print(f"the first import location is:\n{result.trace_stack}") diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index e7c3ffaa6a9f2..5ad29d70f10df 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -488,7 +488,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, @pytest.mark.asyncio async def test_guided_choice_chat(client: openai.AsyncOpenAI, - sample_guided_choice): + sample_guided_choice, is_v1_server: bool): + if not is_v1_server: + pytest.skip("Guided decoding is only supported in v1 engine") messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -524,8 +526,10 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_guided_json_chat(client: openai.AsyncOpenAI, - sample_json_schema): +async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, + 
is_v1_server: bool): + if not is_v1_server: + pytest.skip("Guided decoding is only supported in v1 engine") messages = [{ "role": "system", @@ -568,7 +572,10 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex): +async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex, + is_v1_server: bool): + if not is_v1_server: + pytest.skip("Guided decoding is only supported in v1 engine") messages = [{ "role": "system", @@ -653,7 +660,10 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema): +async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema, + is_v1_server: bool): + if not is_v1_server: + pytest.skip("Tool use is only supported in v1 engine") messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -741,131 +751,6 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema): assert json1["age"] != json2["age"] -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_required_tool_use(client: openai.AsyncOpenAI, - is_v1_server: bool, model_name: str): - if is_v1_server: - pytest.skip( - "tool_choice='required' requires features unsupported on V1") - - tools = [ - { - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "city": { - "type": "string", - "description": - "The city to find the weather for, e.g. 'Vienna'", - "default": "Vienna", - }, - "country": { - "type": - "string", - "description": - "The country that the city is in, e.g. 
'Austria'", - }, - "unit": { - "type": "string", - "description": - "The unit to fetch the temperature in", - "enum": ["celsius", "fahrenheit"], - }, - }, - "required": ["country", "unit"], - }, - }, - }, - { - "type": "function", - "function": { - "name": "get_forecast", - "description": "Get the weather forecast for a given location", - "parameters": { - "type": "object", - "properties": { - "city": { - "type": "string", - "description": - "The city to get the forecast for, e.g. 'Vienna'", - "default": "Vienna", - }, - "country": { - "type": - "string", - "description": - "The country that the city is in, e.g. 'Austria'", - }, - "days": { - "type": - "integer", - "description": - "Number of days to get the forecast for (1-7)", - }, - "unit": { - "type": "string", - "description": - "The unit to fetch the temperature in", - "enum": ["celsius", "fahrenheit"], - }, - }, - "required": ["country", "days", "unit"], - }, - }, - }, - ] - - messages = [ - { - "role": "user", - "content": "Hi! How are you doing today?" - }, - { - "role": "assistant", - "content": "I'm doing well! How can I help you?" 
- }, - { - "role": - "user", - "content": - "Can you tell me what the current weather is in Berlin and the "\ - "forecast for the next 5 days, in fahrenheit?", - }, - ] - - # Non-streaming test - chat_completion = await client.chat.completions.create( - messages=messages, - model=model_name, - tools=tools, - tool_choice="required", - ) - - assert chat_completion.choices[0].message.tool_calls is not None - assert len(chat_completion.choices[0].message.tool_calls) > 0 - - # Streaming test - stream = await client.chat.completions.create( - messages=messages, - model=model_name, - tools=tools, - tool_choice="required", - stream=True, - ) - - output = [] - async for chunk in stream: - if chunk.choices and chunk.choices[0].delta.tool_calls: - output.extend(chunk.choices[0].delta.tool_calls) - - assert len(output) > 0 - - @pytest.mark.asyncio async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI, sample_json_schema): @@ -948,7 +833,11 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_response_format_json_schema(client: openai.AsyncOpenAI): +async def test_response_format_json_schema(client: openai.AsyncOpenAI, + is_v1_server: bool): + if not is_v1_server: + pytest.skip( + "JSON schema response format is only supported in v1 engine") prompt = 'what is 1+1? 
The format is "result": 2' # Check that this prompt cannot lead to a valid JSON without json_schema for _ in range(2): diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 6eca3e767f3f0..74ef6deeea16b 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -28,7 +28,7 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" # but we're not testing generation quality here LORA_NAME = "typeof/zephyr-7b-beta-lora" -GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"] +GUIDED_DECODING_BACKENDS = ["outlines", "xgrammar", "guidance"] @pytest.fixture(scope="module") @@ -95,6 +95,14 @@ def server(default_server_args, request): os.environ['VLLM_USE_V1'] = original_value +@pytest.fixture +def is_v1_server(server): + import os + + # For completion tests, we assume v0 since there's no explicit v1 setup + return os.environ.get('VLLM_USE_V1', '0') == '1' + + @pytest_asyncio.fixture async def client(server): async with server.get_async_client() as async_client: @@ -631,7 +639,10 @@ async def test_allowed_token_ids(client: openai.AsyncOpenAI): @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_json_completion(client: openai.AsyncOpenAI, guided_decoding_backend: str, - sample_json_schema): + sample_json_schema, is_v1_server: bool): + if not is_v1_server: + pytest.skip("Guided decoding is only supported in v1 engine") + completion = await client.completions.create( model=MODEL_NAME, prompt=f"Give an example JSON for an employee profile " @@ -653,7 +664,10 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI, @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_regex_completion(client: openai.AsyncOpenAI, guided_decoding_backend: str, - sample_regex): + sample_regex, is_v1_server: bool): + if not is_v1_server: + pytest.skip("Guided decoding is only 
supported in v1 engine") + completion = await client.completions.create( model=MODEL_NAME, prompt=f"Give an example IPv4 address with this regex: {sample_regex}", @@ -674,7 +688,11 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI, @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_choice_completion(client: openai.AsyncOpenAI, guided_decoding_backend: str, - sample_guided_choice): + sample_guided_choice, + is_v1_server: bool): + if not is_v1_server: + pytest.skip("Guided decoding is only supported in v1 engine") + completion = await client.completions.create( model=MODEL_NAME, prompt="The best language for type-safe systems programming is ", @@ -692,7 +710,9 @@ async def test_guided_choice_completion(client: openai.AsyncOpenAI, @pytest.mark.asyncio async def test_guided_grammar(client: openai.AsyncOpenAI, - sample_sql_statements): + sample_sql_statements, is_v1_server: bool): + if not is_v1_server: + pytest.skip("Guided grammar is only supported in v1 engine") completion = await client.completions.create( model=MODEL_NAME, @@ -754,7 +774,11 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI, @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, guided_decoding_backend: str, - sample_json_schema, sample_regex): + sample_json_schema, sample_regex, + is_v1_server: bool): + if not is_v1_server: + pytest.skip("Guided decoding is only supported in v1 engine") + with pytest.raises(openai.BadRequestError): _ = await client.completions.create( model=MODEL_NAME, diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index ff0730c77032c..e31a1d077608f 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -9,6 +9,11 @@ import regex as re from ...utils import 
RemoteOpenAIServer +@pytest.fixture(scope="function", autouse=True) +def use_v1_only(monkeypatch): + monkeypatch.setenv('VLLM_USE_V1', '1') + + @pytest.mark.asyncio async def test_empty_prompt(): model_name = "gpt2" @@ -37,24 +42,3 @@ async def test_out_of_vocab_token_ids(): prompt=[999999], max_tokens=5, temperature=0.0) - - -@pytest.mark.asyncio -async def test_reject_multistep_with_guided_decoding(): - model_name = "gpt2" - server_args = ["--enforce-eager", "--num-scheduler-steps", "8"] - with RemoteOpenAIServer(model_name, server_args) as remote_server: - client = remote_server.get_async_client() - - with pytest.raises( - openai.BadRequestError, - match=re.compile( - '.*Guided decoding .* multi-step decoding.*').pattern): - await client.completions.create( - model=model_name, - prompt="Hello", - max_tokens=5, - temperature=0.0, - extra_body={"response_format": { - "type": "json_object" - }}) diff --git a/tests/model_executor/test_guided_processors.py b/tests/model_executor/test_guided_processors.py deleted file mode 100644 index 2cf0ba2fe6866..0000000000000 --- a/tests/model_executor/test_guided_processors.py +++ /dev/null @@ -1,207 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import json -import pickle - -import pytest -import torch -from transformers import AutoTokenizer - -from vllm.config import ModelConfig -from vllm.model_executor.guided_decoding import ( - get_guided_decoding_logits_processor, - get_local_guided_decoding_logits_processor) -from vllm.model_executor.guided_decoding.outlines_logits_processors import ( - JSONLogitsProcessor, RegexLogitsProcessor) -from vllm.sampling_params import GuidedDecodingParams - -MODEL_NAME = 'HuggingFaceH4/zephyr-7b-beta' -GUIDED_DECODING_BACKENDS = [ - "outlines", "lm-format-enforcer", "xgrammar", "guidance" -] -GUIDED_DECODING_BACKENDS_WITH_REASONING_SUPPORT = ["outlines", "xgrammar"] -REASONING_MODEL_NAME = 
"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" - - -# Initialize the tokenizer for the model here to avoid repeated loading -@pytest.fixture(scope="module") -def zephyr_7B_tokenzer(): - return AutoTokenizer.from_pretrained(MODEL_NAME) - - -@pytest.fixture(scope="module") -def deepseek_r1_qwen_tokenizer(): - return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME) - - -def test_guided_logits_processors(zephyr_7B_tokenzer, sample_regex, - sample_json_schema): - """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor.""" - regex_LP = RegexLogitsProcessor(sample_regex, - zephyr_7B_tokenzer, - reasoner=None) - json_LP = JSONLogitsProcessor(sample_json_schema, - zephyr_7B_tokenzer, - whitespace_pattern=None, - reasoner=None) - - tensor = torch.rand(32000) - original_tensor = torch.clone(tensor) - tensor = regex_LP([], tensor) - assert tensor.shape == original_tensor.shape - assert not torch.allclose(tensor, original_tensor) - - tensor = torch.rand(32000) - original_tensor = torch.clone(tensor) - tensor = json_LP([], tensor) - assert tensor.shape == original_tensor.shape - assert not torch.allclose(tensor, original_tensor) - - -@pytest.mark.asyncio -@pytest.mark.parametrize("backend", GUIDED_DECODING_BACKENDS) -@pytest.mark.parametrize("is_local", [True, False]) -async def test_guided_logits_processor_black_box(backend: str, is_local: bool, - sample_regex, - sample_json_schema, - zephyr_7B_tokenzer): - - config = ModelConfig( - MODEL_NAME, - runner="generate", - seed=0, - dtype="bfloat16", - ) - regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend) - - regex_lp = get_local_guided_decoding_logits_processor( - regex_request, zephyr_7B_tokenzer, config) if is_local else \ - await get_guided_decoding_logits_processor( - regex_request, zephyr_7B_tokenzer, config) - assert regex_lp is not None - tensor = torch.rand(32000) - original_tensor = torch.clone(tensor) - # allowed tokens at state 0 - tensor = regex_lp([], tensor) - assert tensor.shape == 
original_tensor.shape - assert not torch.allclose(tensor, original_tensor) - - json_request = GuidedDecodingParams(json=sample_json_schema, - backend=backend) - json_lp = await get_guided_decoding_logits_processor( - json_request, zephyr_7B_tokenzer, config) - assert json_lp is not None - tensor = torch.rand(32000) - original_tensor = torch.clone(tensor) - tensor = json_lp([], tensor) - assert tensor.shape == original_tensor.shape - assert not torch.allclose(tensor, original_tensor) - - -@pytest.mark.asyncio -@pytest.mark.parametrize("backend", - GUIDED_DECODING_BACKENDS_WITH_REASONING_SUPPORT) -@pytest.mark.parametrize("is_local", [True, False]) -@pytest.mark.parametrize("reasoning_backend", ["deepseek_r1"]) -async def test_guided_logits_processor_with_reasoning( - backend: str, is_local: bool, reasoning_backend: str, sample_regex, - sample_json_schema, deepseek_r1_qwen_tokenizer): - - config = ModelConfig( - REASONING_MODEL_NAME, - runner="generate", - seed=0, - dtype="bfloat16", - ) - token_ids = deepseek_r1_qwen_tokenizer.encode( - "here is the thinking process") - regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend) - - regex_lp = get_local_guided_decoding_logits_processor(regex_request, - deepseek_r1_qwen_tokenizer, config, - reasoning_backend) if is_local else \ - await get_guided_decoding_logits_processor( - regex_request, deepseek_r1_qwen_tokenizer, config, - reasoning_backend) - assert regex_lp is not None - tensor = torch.rand(151664) - original_tensor = torch.clone(tensor) - tensor = regex_lp(token_ids, tensor) - assert tensor.shape == original_tensor.shape - assert torch.allclose(tensor, original_tensor) - - token_ids = deepseek_r1_qwen_tokenizer.encode( - "here is the thinking process") - json_request = GuidedDecodingParams(json=sample_json_schema, - backend=backend) - json_lp = get_local_guided_decoding_logits_processor( - json_request, deepseek_r1_qwen_tokenizer, config, - reasoning_backend) if is_local else \ - await 
get_guided_decoding_logits_processor( - json_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend) - assert json_lp is not None - tensor = torch.rand(151664) - original_tensor = torch.clone(tensor) - tensor = json_lp(token_ids, tensor) - assert tensor.shape == original_tensor.shape - assert torch.allclose(tensor, original_tensor) - - # Thinking is over, so the tensor should change. - token_ids = deepseek_r1_qwen_tokenizer.encode( - "here is the thinking process") - json_request = GuidedDecodingParams(json=sample_json_schema, - backend=backend) - json_lp = get_local_guided_decoding_logits_processor( - json_request, deepseek_r1_qwen_tokenizer, config, - reasoning_backend) if is_local else \ - await get_guided_decoding_logits_processor( - json_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend) - assert json_lp is not None - tensor = torch.rand(151664) - original_tensor = torch.clone(tensor) - tensor = json_lp(token_ids, tensor) - assert tensor.shape == original_tensor.shape - assert not torch.allclose(tensor, original_tensor) - - -def test_multiple_guided_options_not_allowed(sample_json_schema, sample_regex): - with pytest.raises(ValueError, - match="You can only use one kind of guided"): - GuidedDecodingParams(json=sample_json_schema, regex=sample_regex) - - with pytest.raises(ValueError, - match="You can only use one kind of guided"): - GuidedDecodingParams(json=sample_json_schema, json_object=True) - - with pytest.raises(ValueError, - match="You can only use one kind of guided"): - GuidedDecodingParams(json=sample_json_schema, choice=["a", "b"]) - - with pytest.raises(ValueError, - match="You can only use one kind of guided"): - GuidedDecodingParams(json=sample_json_schema, grammar="test grammar") - - -def test_pickle_xgrammar_tokenizer_data(): - try: - import xgrammar as xgr - except ImportError: - pytest.skip("Could not import xgrammar to run test") - - from vllm.model_executor.guided_decoding.xgrammar_decoding import ( - TokenizerData) - 
tokenizer_data = TokenizerData( - metadata= - '{"vocab_type":2,"vocab_size":151665,"add_prefix_space":false,"stop_token_ids":[151645]}', - encoded_vocab=['!', '"', '#', '$', '%'], - ) - pickled = pickle.dumps(tokenizer_data) - - assert pickled is not None - - depickled: TokenizerData = pickle.loads(pickled) - - assert depickled is not None - assert json.loads( - depickled.metadata)['vocab_type'] == xgr.VocabType.BYTE_LEVEL.value diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py index 81a88f2d485eb..af51a60edfd62 100644 --- a/tests/models/language/generation/test_mistral.py +++ b/tests/models/language/generation/test_mistral.py @@ -3,13 +3,11 @@ import copy import json -import jsonschema -import jsonschema.exceptions import pytest from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( MistralToolCall, MistralToolParser) -from vllm.sampling_params import GuidedDecodingParams, SamplingParams +from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer import MistralTokenizer from ...utils import check_logprobs_close @@ -274,53 +272,6 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None: assert parsed_message.content is None -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("guided_backend", - ["outlines", "lm-format-enforcer", "xgrammar"]) -def test_mistral_guided_decoding( - monkeypatch: pytest.MonkeyPatch, - vllm_runner, - model: str, - guided_backend: str, -) -> None: - with monkeypatch.context() as m: - # Guided JSON not supported in xgrammar + V1 yet - m.setenv("VLLM_USE_V1", "0") - - with vllm_runner( - model, - dtype='bfloat16', - tokenizer_mode="mistral", - guided_decoding_backend=guided_backend, - ) as vllm_model: - guided_decoding = GuidedDecodingParams(json=SAMPLE_JSON_SCHEMA) - params = SamplingParams(max_tokens=512, - temperature=0.7, - guided_decoding=guided_decoding) - - messages = [{ - "role": 
"system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - f"Give an example JSON for an employee profile that " - f"fits this schema: {SAMPLE_JSON_SCHEMA}" - }] - outputs = vllm_model.llm.chat(messages, sampling_params=params) - - generated_text = outputs[0].outputs[0].text - json_response = json.loads(generated_text) - assert outputs is not None - - try: - jsonschema.validate(instance=json_response, - schema=SAMPLE_JSON_SCHEMA) - except jsonschema.exceptions.ValidationError: - pytest.fail("Generated response is not valid with JSON schema") - - def test_mistral_function_call_nested_json(): """Ensure that the function-name regex captures the entire outer-most JSON block, including nested braces.""" diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py index 11803b8d7a5eb..128e8f552a161 100644 --- a/tests/samplers/test_no_bad_words.py +++ b/tests/samplers/test_no_bad_words.py @@ -14,9 +14,9 @@ from vllm import LLM, SamplingParams @pytest.fixture(autouse=True) -def v1(run_with_both_engines): - """We can run both engines for this test.""" - pass +def v1(monkeypatch): + """Only run on vLLM v1.""" + monkeypatch.setenv('VLLM_USE_V1', '1') def _generate( diff --git a/tests/test_sampling_params.py b/tests/test_sampling_params.py index be6427dd6bde5..7330f61e67689 100644 --- a/tests/test_sampling_params.py +++ b/tests/test_sampling_params.py @@ -56,8 +56,7 @@ def test_sampling_params_from_request_with_no_guided_decoding_backend( @pytest.mark.parametrize("request_level_guided_decoding_backend,expected", - [("xgrammar", "xgrammar"), - ("lm-format-enforcer", "lm-format-enforcer"), + [("xgrammar", "xgrammar"), ("guidance", "guidance"), ("outlines", "outlines")]) def test_sampling_params_from_request_with_guided_decoding_backend( request_level_guided_decoding_backend: str, expected: str, diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index cc59287a9fbe6..b68ed298a1895 100644 --- 
a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -47,13 +47,6 @@ def test_unsupported_configs(monkeypatch): }, ).create_engine_config() - with pytest.raises(NotImplementedError): - AsyncEngineArgs( - model=MODEL, - guided_decoding_backend="lm-format-enforcer", - guided_decoding_disable_fallback=True, - ).create_engine_config() - with pytest.raises(NotImplementedError): AsyncEngineArgs( model=MODEL, diff --git a/tools/check_pickle_imports.py b/tools/check_pickle_imports.py index ef197d1fbace1..5e99dc63ebe0c 100644 --- a/tools/check_pickle_imports.py +++ b/tools/check_pickle_imports.py @@ -34,7 +34,6 @@ ALLOWED_FILES = set([ 'vllm/model_executor/models/registry.py', 'tests/test_utils.py', 'tests/tokenization/test_cached_tokenizer.py', - 'tests/model_executor/test_guided_processors.py', 'vllm/distributed/utils.py', 'vllm/distributed/parallel_state.py', 'vllm/engine/multiprocessing/client.py', diff --git a/vllm/config.py b/vllm/config.py index 3bcbbe60652b7..7ae615f477057 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1774,8 +1774,8 @@ class CacheConfig: - "builtin" is Python's built-in hash.\n - "sha256" is collision resistant but with certain overheads. This option uses Pickle for object serialization before hashing.\n - - "sha256_cbor_64bit" provides a reproducible, cross-language compatible - hash. It serializes objects using canonical CBOR and hashes them with + - "sha256_cbor_64bit" provides a reproducible, cross-language compatible + hash. It serializes objects using canonical CBOR and hashes them with SHA-256. 
The resulting hash consists of the lower 64 bits of the SHA-256 digest.""" cpu_offload_gb: float = 0 @@ -3721,12 +3721,7 @@ def get_served_model_name(model: str, return served_model_name -GuidedDecodingBackendV0 = Literal["auto", "outlines", "lm-format-enforcer", - "xgrammar", "guidance"] - -GuidedDecodingBackendV1 = Literal["auto", "xgrammar", "guidance", "outlines"] -GuidedDecodingBackend = Literal[GuidedDecodingBackendV0, - GuidedDecodingBackendV1] +GuidedDecodingBackend = Literal["auto", "xgrammar", "guidance", "outlines"] @config @@ -3734,7 +3729,7 @@ GuidedDecodingBackend = Literal[GuidedDecodingBackendV0, class DecodingConfig: """Dataclass which contains the decoding strategy of the engine.""" - backend: GuidedDecodingBackend = "auto" if envs.VLLM_USE_V1 else "xgrammar" + backend: GuidedDecodingBackend = "auto" """Which engine will be used for guided decoding (JSON schema / regex etc) by default. With "auto", we will make opinionated choices based on request contents and what the backend libraries currently support, so the behavior @@ -3776,13 +3771,6 @@ class DecodingConfig: return hash_str def __post_init__(self): - if envs.VLLM_USE_V1: - valid_guided_backends = get_args(GuidedDecodingBackendV1) - else: - valid_guided_backends = get_args(GuidedDecodingBackendV0) - if self.backend not in valid_guided_backends: - raise ValueError(f"Invalid backend '{self.backend}'," - f" must be one of {valid_guided_backends}") if (self.disable_any_whitespace and self.backend not in ("xgrammar", "guidance")): raise ValueError("disable_any_whitespace is only supported for " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d4d6001a428d2..6bdc3c361af34 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -25,14 +25,14 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, ConfigFormat, ConfigType, ConvertOption, DecodingConfig, DetailedTraceModules, Device, DeviceConfig, DistributedExecutorBackend, - 
GuidedDecodingBackend, GuidedDecodingBackendV1, - HfOverrides, KVEventsConfig, KVTransferConfig, - LoadConfig, LogprobsMode, LoRAConfig, ModelConfig, - ModelDType, ModelImpl, MultiModalConfig, - ObservabilityConfig, ParallelConfig, PoolerConfig, - PrefixCachingHashAlgo, RunnerOption, SchedulerConfig, - SchedulerPolicy, SpeculativeConfig, TaskOption, - TokenizerMode, VllmConfig, get_attr_docs, get_field) + GuidedDecodingBackend, HfOverrides, KVEventsConfig, + KVTransferConfig, LoadConfig, LogprobsMode, + LoRAConfig, ModelConfig, ModelDType, ModelImpl, + MultiModalConfig, ObservabilityConfig, ParallelConfig, + PoolerConfig, PrefixCachingHashAlgo, RunnerOption, + SchedulerConfig, SchedulerPolicy, SpeculativeConfig, + TaskOption, TokenizerMode, VllmConfig, get_attr_docs, + get_field) from vllm.logger import init_logger from vllm.platforms import CpuArchEnum, current_platform from vllm.plugins import load_general_plugins @@ -1343,14 +1343,6 @@ class EngineArgs: recommend_to_remove=True) return False - if self.guided_decoding_backend not in get_args( - GuidedDecodingBackendV1): - _raise_or_fallback( - feature_name= - f"--guided-decoding-backend={self.guided_decoding_backend}", - recommend_to_remove=False) - return False - # Need at least Ampere for now (FA support required). 
# Skip this check if we are running on a non-GPU platform, # or if the device capability is not available diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 39642d89167bd..06bb4eeab69eb 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio -import copy import time import weakref from functools import partial @@ -24,8 +23,6 @@ from vllm.inputs import PromptType from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.model_executor.guided_decoding import ( - get_guided_decoding_logits_processor) from vllm.model_executor.layers.sampler import SamplerOutput from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams @@ -469,19 +466,6 @@ class _AsyncLLMEngine(LLMEngine): tokenization_kwargs=tokenization_kwargs, ) - if isinstance(params, SamplingParams) and \ - params.guided_decoding is not None: - # Guided decoding has an async implementation for building logits - # processors in a separate threadpool. 
- # We want to invoke that here instead of using the blocking - # implementation in the LLMEngine - params = await build_guided_decoding_logits_processor_async( - sampling_params=params, - tokenizer=await self.get_tokenizer_async(lora_request), - default_guided_backend=self.decoding_config.backend, - reasoning_backend=self.decoding_config.reasoning_backend, - model_config=self.model_config) - self._add_processed_request( request_id=request_id, processed_inputs=processed_inputs, @@ -503,48 +487,6 @@ class _AsyncLLMEngine(LLMEngine): raise NotImplementedError -async def build_guided_decoding_logits_processor_async( - sampling_params: SamplingParams, tokenizer: AnyTokenizer, - default_guided_backend: str, reasoning_backend: Optional[str], - model_config: ModelConfig) -> SamplingParams: - """Constructs logits processors based on the guided_decoding, - logits_bias, and allowed_token_ids fields in sampling_params. Deletes - those fields and adds the constructed logits processors to the - logits_processors field. Modifies sampling params in-place and returns - the modified sampling params.""" - if sampling_params.guided_decoding is None: - return sampling_params - - # Defensively copy sampling params since guided decoding logits - # processors can have different state for each request - sampling_params = copy.copy(sampling_params) - guided_decoding = sampling_params.guided_decoding - - logger.debug( - "Building guided decoding logits processor. 
" - "guided_decoding: %s%s", guided_decoding, - f", reasoning_backend: {reasoning_backend}" - if reasoning_backend is not None else "") - - guided_decoding.backend = guided_decoding.backend or default_guided_backend - - processor = await get_guided_decoding_logits_processor( - guided_params=guided_decoding, - tokenizer=tokenizer, - reasoning_backend=reasoning_backend, - model_config=model_config) - - if processor: - if sampling_params.logits_processors is None: - sampling_params.logits_processors = [] - sampling_params.logits_processors.append(processor) - - # Unset guided decoding params after constructing the lp from them - sampling_params.guided_decoding = None - - return sampling_params - - class AsyncLLMEngine(EngineClient): """An asynchronous wrapper for [`LLMEngine`][vllm.LLMEngine]. @@ -1028,7 +970,7 @@ class AsyncLLMEngine(EngineClient): ``` # Please refer to entrypoints/api_server.py for # the complete example. - + # initialize the engine and the example input # note that engine_args here is AsyncEngineArgs instance engine = AsyncLLMEngine.from_engine_args(engine_args) @@ -1036,13 +978,13 @@ class AsyncLLMEngine(EngineClient): "input": "What is LLM?", "request_id": 0, } - + # start the generation results_generator = engine.encode( example_input["input"], PoolingParams(), example_input["request_id"]) - + # get the results final_output = None async for request_output in results_generator: @@ -1052,7 +994,7 @@ class AsyncLLMEngine(EngineClient): # Return or raise an error ... final_output = request_output - + # Process and return the final output ... 
``` diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e7919d90442f9..3f30a34170ffe 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import copy import time from collections import Counter as collectionsCounter from collections import deque @@ -36,8 +35,6 @@ from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.logits_process import get_bad_words_logits_processors from vllm.lora.request import LoRARequest -from vllm.model_executor.guided_decoding import ( - get_local_guided_decoding_logits_processor) from vllm.model_executor.layers.sampler import SamplerOutput from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.processing import EncDecMultiModalProcessor @@ -686,11 +683,10 @@ class LLMEngine: "Priority scheduling is not enabled.") if isinstance(params, SamplingParams) \ - and (params.guided_decoding or params.logits_processors) \ + and params.logits_processors \ and self.scheduler_config.num_scheduler_steps > 1: raise ValueError( - "Guided decoding and logits processors are not supported " - "in multi-step decoding") + "Logits processors are not supported in multi-step decoding") if arrival_time is None: arrival_time = time.time() @@ -1226,7 +1222,7 @@ class LLMEngine: engine = LLMEngine.from_engine_args(engine_args) example_inputs = [(0, "What is LLM?", SamplingParams(temperature=0.0))] - + # Start the engine with an event loop while True: if example_inputs: @@ -1983,43 +1979,13 @@ class LLMEngine: def _build_logits_processors( self, sampling_params: SamplingParams, lora_request: Optional[LoRARequest]) -> SamplingParams: - """Constructs logits processors based on the guided_decoding, - logits_bias, and allowed_token_ids fields in sampling_params. 
Deletes - those fields and adds the constructed logits processors to the - logits_processors field. Returns the modified sampling params.""" + """Constructs logits processors based on the logits_bias, and + allowed_token_ids fields in sampling_params. Deletes those fields and + adds the constructed logits processors to the logits_processors field. + Returns the modified sampling params.""" logits_processors = [] - if sampling_params.guided_decoding is not None: - # Defensively copy sampling params since guided decoding logits - # processors can have different state for each request - sampling_params = copy.copy(sampling_params) - guided_decoding = sampling_params.guided_decoding - - logger.debug( - "Building guided decoding logits processor in " - "LLMEngine. Params: %s", guided_decoding) - - tokenizer = self.get_tokenizer(lora_request=lora_request) - guided_decoding.backend = guided_decoding.backend or \ - self.decoding_config.backend - - if self.decoding_config.reasoning_backend: - logger.debug("Building with reasoning backend %s", - self.decoding_config.reasoning_backend) - - processor = get_local_guided_decoding_logits_processor( - guided_params=guided_decoding, - tokenizer=tokenizer, - model_config=self.model_config, - reasoning_backend=self.decoding_config.reasoning_backend, - ) - if processor: - logits_processors.append(processor) - - # Unset so this doesn't get passed down to the model - sampling_params.guided_decoding = None - if (sampling_params.logit_bias or sampling_params.allowed_token_ids): tokenizer = self.get_tokenizer(lora_request=lora_request) diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index cde8fc367fb54..f69f72edf6a52 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -20,8 +20,6 @@ from vllm.config import DecodingConfig, ModelConfig, VllmConfig from vllm.core.scheduler import SchedulerOutputs # yapf conflicts with isort for this block # yapf: disable 
-from vllm.engine.async_llm_engine import ( - build_guided_decoding_logits_processor_async) from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT, IPC_HEALTH_EXT, IPC_INPUT_EXT, IPC_OUTPUT_EXT, RPC_REQUEST_T, @@ -537,22 +535,6 @@ class MQLLMEngineClient(EngineClient): if request_id in self.output_queues: raise ValueError(f"Request {request_id} already exists") - # Constructing guided decoding logits processors is expensive, so we do - # it here to avoid contending with cpu resources and the GIL on the - # backend process. - if isinstance(params, SamplingParams) and \ - params.guided_decoding is not None: - params = await \ - build_guided_decoding_logits_processor_async( - sampling_params=params, - tokenizer=await self.get_tokenizer(lora_request), - default_guided_backend=(self.decoding_config.backend - if self.decoding_config - else DecodingConfig.backend), - model_config=self.model_config, - reasoning_backend=self.decoding_config.reasoning_backend, - ) - # 1) Create output queue for this requests. 
queue: asyncio.Queue[Union[RequestOutput, BaseException]] = asyncio.Queue() diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 04dd193966421..adef350931f3d 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools -import warnings from collections.abc import Sequence from contextlib import contextmanager from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union, @@ -40,15 +39,13 @@ from vllm.inputs import PromptType, SingletonPrompt, TextPrompt, TokensPrompt from vllm.inputs.parse import parse_and_batch_prompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.model_executor.guided_decoding.guided_fields import ( - GuidedDecodingRequest, LLMGuidedOptions) from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.outputs import (ClassificationRequestOutput, EmbeddingRequestOutput, PoolingRequestOutput, RequestOutput, ScoringRequestOutput) from vllm.pooling_params import PoolingParams -from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, - RequestOutputKind, SamplingParams) +from vllm.sampling_params import (BeamSearchParams, RequestOutputKind, + SamplingParams) from vllm.tasks import PoolingTask from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer, get_cached_tokenizer) @@ -330,8 +327,6 @@ class LLM: *, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - guided_options_request: Optional[Union[LLMGuidedOptions, - GuidedDecodingRequest]] = None, ) -> list[RequestOutput]: ... 
@@ -345,8 +340,6 @@ class LLM: prompt_token_ids: Optional[list[int]] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - guided_options_request: Optional[Union[LLMGuidedOptions, - GuidedDecodingRequest]] = None, ) -> list[RequestOutput]: ... @@ -360,8 +353,6 @@ class LLM: prompt_token_ids: Optional[list[list[int]]] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - guided_options_request: Optional[Union[LLMGuidedOptions, - GuidedDecodingRequest]] = None, ) -> list[RequestOutput]: ... @@ -376,8 +367,6 @@ class LLM: prompt_token_ids: list[int], use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - guided_options_request: Optional[Union[LLMGuidedOptions, - GuidedDecodingRequest]] = None, ) -> list[RequestOutput]: ... @@ -392,8 +381,6 @@ class LLM: prompt_token_ids: list[list[int]], use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - guided_options_request: Optional[Union[LLMGuidedOptions, - GuidedDecodingRequest]] = None, ) -> list[RequestOutput]: ... @@ -406,8 +393,6 @@ class LLM: prompt_token_ids: Union[list[int], list[list[int]]], use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - guided_options_request: Optional[Union[LLMGuidedOptions, - GuidedDecodingRequest]] = None, ) -> list[RequestOutput]: ... 
@@ -425,8 +410,6 @@ class LLM: prompt_token_ids: Optional[Union[list[int], list[list[int]]]] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - guided_options_request: Optional[Union[LLMGuidedOptions, - GuidedDecodingRequest]] = None, priority: Optional[list[int]] = None, ) -> list[RequestOutput]: """Generates the completions for the input prompts. @@ -478,14 +461,6 @@ class LLM: parsed_prompts = cast(Union[PromptType, Sequence[PromptType]], prompts) - if isinstance(guided_options_request, dict): - if len(guided_options_request) > 1: - raise ValueError( - "You can only use one guided decoding but multiple is " - f"specified: {guided_options_request}") - guided_options_request = GuidedDecodingRequest( - **guided_options_request) - if sampling_params is None: # Use default sampling params. sampling_params = self.get_default_sampling_params() @@ -507,7 +482,6 @@ class LLM: params=sampling_params, use_tqdm=use_tqdm, lora_request=lora_request, - guided_options=guided_options_request, tokenization_kwargs=tokenization_kwargs, priority=priority, ) @@ -1361,17 +1335,17 @@ class LLM: of your inputs into a single list and pass it to this method. Supports both text and multi-modal data (images, etc.) when used with - appropriate multi-modal models. For multi-modal inputs, ensure the + appropriate multi-modal models. For multi-modal inputs, ensure the prompt structure matches the model's expected input format. Args: - data_1: Can be a single prompt, a list of prompts or - `ScoreMultiModalParam`, which can contain either text or - multi-modal data. When a list, it must have the same length as + data_1: Can be a single prompt, a list of prompts or + `ScoreMultiModalParam`, which can contain either text or + multi-modal data. When a list, it must have the same length as the `data_2` list. 
- data_2: The data to pair with the query to form the input to + data_2: The data to pair with the query to form the input to the LLM. Can be text or multi-modal data. See [PromptType] - [vllm.inputs.PromptType] for more details about the format of + [vllm.inputs.PromptType] for more details about the format of each prompt. use_tqdm: If `True`, shows a tqdm progress bar. If a callable (e.g., `functools.partial(tqdm, leave=False)`), @@ -1582,17 +1556,8 @@ class LLM: use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]], tokenization_kwargs: Optional[dict[str, Any]] = None, - guided_options: Optional[GuidedDecodingRequest] = None, priority: Optional[list[int]] = None, ) -> None: - if guided_options is not None: - warnings.warn( - "guided_options_request is deprecated, use " - "SamplingParams.guided_decoding instead", - DeprecationWarning, - stacklevel=2, - ) - if isinstance(prompts, (str, dict)): # Convert a single prompt to a list. 
prompts = [prompts] @@ -1608,8 +1573,6 @@ class LLM: for sp in params if isinstance(params, Sequence) else (params, ): if isinstance(sp, SamplingParams): - self._add_guided_params(sp, guided_options) - # We only care about the final output sp.output_kind = RequestOutputKind.FINAL_ONLY @@ -1647,29 +1610,6 @@ class LLM: priority=priority, ) - def _add_guided_params( - self, - params: SamplingParams, - guided_options: Optional[GuidedDecodingRequest] = None): - if guided_options is None: - return params - - if params.guided_decoding is not None: - raise ValueError("Cannot set both guided_options_request and " - "params.guided_decoding.") - - params.guided_decoding = GuidedDecodingParams( - json=guided_options.guided_json, - regex=guided_options.guided_regex, - choice=guided_options.guided_choice, - grammar=guided_options.guided_grammar, - json_object=guided_options.guided_json_object, - backend=guided_options.guided_decoding_backend, - whitespace_pattern=guided_options.guided_whitespace_pattern, - structural_tag=guided_options.structural_tag, - ) - return params - def _run_engine( self, *, diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py deleted file mode 100644 index 7540e6344a498..0000000000000 --- a/vllm/model_executor/guided_decoding/__init__.py +++ /dev/null @@ -1,192 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from __future__ import annotations - -from typing import TYPE_CHECKING - -from vllm.logger import init_logger -from vllm.model_executor.guided_decoding.utils import ( - convert_lark_to_gbnf, grammar_is_likely_lark, - has_lmf_unsupported_json_features, has_xgrammar_unsupported_json_features) -from vllm.reasoning import ReasoningParserManager - -if TYPE_CHECKING: - from transformers import PreTrainedTokenizer - - from vllm.config import ModelConfig - from vllm.logits_process import LogitsProcessor - from 
vllm.sampling_params import GuidedDecodingParams - -logger = init_logger(__name__) - - -def maybe_backend_fallback( - guided_params: GuidedDecodingParams) -> GuidedDecodingParams: - - def fallback_or_error(guided_params: GuidedDecodingParams, message: str, - fallback: str) -> None: - """Change the backend to the specified fallback with a warning log, - or raise a ValueError if the `disable_fallback` option is specified.""" - if guided_params.disable_fallback: - raise ValueError(message) - - logger.warning("%s Falling back to use %s instead.", message, fallback) - guided_params.backend = fallback - - # `auto` was added for V1 to explicitly declare a mode that has fallbacks - # in place. If that is specified with V0, treat it as `xgrammar`, as we have - # fallbacks enabled for that and it is the V0 default. - if guided_params.backend == "auto": - guided_params.backend = "xgrammar" - - # lm-format-enforce doesn't support grammar, fallback to xgrammar - if guided_params.backend == "lm-format-enforcer": - if guided_params.grammar is not None: - fallback_or_error( - guided_params, - "lm-format-enforcer does not support grammar guided decoding.", - "xgrammar") - - # lm-format-enforcer doesn't support some JSON schema features - elif (guided_params.json is not None - and has_lmf_unsupported_json_features(guided_params.json)): - fallback_or_error( - guided_params, - "lm-format-enforcer does not support advanced JSON schema " - "features like patterns or numeric ranges.", "outlines") - - if guided_params.backend == "xgrammar": - from vllm.model_executor.guided_decoding.xgrammar_decoding import ( - xgr_installed) - - # xgrammar doesn't support some JSON schema features - if (guided_params.json is not None and - has_xgrammar_unsupported_json_features(guided_params.json)): - fallback_or_error( - guided_params, - "xgrammar does not support advanced JSON schema features like " - "string length, item limits, or property bounds.", "outlines") - - # xgrammar only supports GBNF 
grammars, so we must convert Lark. - # We must check if the grammar is likely Lark and if that - # grammar is convertible to GBNF - elif (guided_params.grammar is not None - and grammar_is_likely_lark(guided_params.grammar)): - try: - convert_lark_to_gbnf(guided_params.grammar) - except Exception: - fallback_or_error( - guided_params, - "xgrammar does not support Lark grammars and the " - "grammar failed to convert to GBNF.", "guidance") - - # If the xgrammar module cannot be imported successfully, - # we should still allow users to use guided decoding with a fallback. - elif not xgr_installed: - fallback_or_error( - guided_params, - "xgrammar module cannot be imported successfully.", "guidance") - - if guided_params.backend == "outlines": - if guided_params.json_object is not None: - # outlines doesn't support json_object, fallback to guidance - fallback_or_error(guided_params, - "outlines does not support json_object.", - "guidance") - elif guided_params.grammar is not None: - # outlines grammar support has been removed, fallback to guidance - # if it is a lark-based grammar and xgrammar otherwise - if grammar_is_likely_lark(guided_params.grammar): - fallback_or_error(guided_params, - "outlines no longer supports grammars.", - "guidance") - else: - # The grammar is likely already GBNF format. 
- fallback_or_error(guided_params, - "outlines no longer supports grammars.", - "xgrammar") - - return guided_params - - -async def get_guided_decoding_logits_processor( - guided_params: GuidedDecodingParams, - tokenizer: PreTrainedTokenizer, - model_config: ModelConfig, - reasoning_backend: str | None = None) -> LogitsProcessor | None: - - reasoner = None - if reasoning_backend: - reasoner_class = ReasoningParserManager.get_reasoning_parser( - reasoning_backend) - reasoner = reasoner_class(tokenizer) - - guided_params = maybe_backend_fallback(guided_params) - - if guided_params.backend == 'outlines': - # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193 - from vllm.model_executor.guided_decoding.outlines_decoding import ( # noqa - get_outlines_guided_decoding_logits_processor) - return await get_outlines_guided_decoding_logits_processor( - guided_params, tokenizer, reasoner) - if guided_params.backend == 'lm-format-enforcer': - from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import ( # noqa - get_local_lm_format_enforcer_guided_decoding_logits_processor) - return get_local_lm_format_enforcer_guided_decoding_logits_processor( - guided_params, tokenizer) - if guided_params.backend == 'xgrammar': - from vllm.model_executor.guided_decoding.xgrammar_decoding import ( # noqa - get_local_xgrammar_guided_decoding_logits_processor) - return get_local_xgrammar_guided_decoding_logits_processor( - guided_params, tokenizer, model_config, reasoner) - if guided_params.backend == 'guidance': - from vllm.model_executor.guided_decoding.guidance_decoding import ( - get_local_guidance_guided_decoding_logits_processor) - return get_local_guidance_guided_decoding_logits_processor( - guided_params, tokenizer) - raise ValueError( - f"Unknown guided decoding backend '{guided_params.backend}'. 
" - "Must be one of 'outlines, 'lm-format-enforcer', 'xgrammar', 'guidance'" - ) - - -def get_local_guided_decoding_logits_processor( - guided_params: GuidedDecodingParams, - tokenizer: PreTrainedTokenizer, - model_config: ModelConfig, - reasoning_backend: str | None = None) -> LogitsProcessor | None: - guided_params = maybe_backend_fallback(guided_params) - - reasoner = None - if reasoning_backend: - reasoner_class = ReasoningParserManager.get_reasoning_parser( - reasoning_backend) - reasoner = reasoner_class(tokenizer) - - if guided_params.backend == 'outlines': - # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193 - from vllm.model_executor.guided_decoding.outlines_decoding import ( # noqa - get_local_outlines_guided_decoding_logits_processor) - return get_local_outlines_guided_decoding_logits_processor( - guided_params, tokenizer, reasoner) - if guided_params.backend == 'lm-format-enforcer': - from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import ( # noqa - get_local_lm_format_enforcer_guided_decoding_logits_processor) - return get_local_lm_format_enforcer_guided_decoding_logits_processor( - guided_params, tokenizer) - if guided_params.backend == 'xgrammar': - from vllm.model_executor.guided_decoding.xgrammar_decoding import ( # noqa - get_local_xgrammar_guided_decoding_logits_processor) - return get_local_xgrammar_guided_decoding_logits_processor( - guided_params, tokenizer, model_config, reasoner) - if guided_params.backend == 'guidance': - from vllm.model_executor.guided_decoding.guidance_decoding import ( - get_local_guidance_guided_decoding_logits_processor) - return get_local_guidance_guided_decoding_logits_processor( - guided_params, tokenizer) - - raise ValueError( - f"Unknown guided decoding backend '{guided_params.backend}'. 
" - "Must be one of 'outlines, 'lm-format-enforcer', 'xgrammar', 'guidance'" - ) diff --git a/vllm/model_executor/guided_decoding/guidance_decoding.py b/vllm/model_executor/guided_decoding/guidance_decoding.py deleted file mode 100644 index 05b6a1c3239f1..0000000000000 --- a/vllm/model_executor/guided_decoding/guidance_decoding.py +++ /dev/null @@ -1,63 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import json - -import llguidance -from regex import escape as regex_escape -from transformers import PreTrainedTokenizerBase - -from vllm.model_executor.guided_decoding.guidance_logits_processors import ( - GuidanceLogitsProcessor) -from vllm.sampling_params import GuidedDecodingParams -from vllm.v1.structured_output.backend_guidance import ( - process_for_additional_properties) - - -def get_local_guidance_guided_decoding_logits_processor( - guided_params: GuidedDecodingParams, - tokenizer: PreTrainedTokenizerBase) -> GuidanceLogitsProcessor: - """ - Given an OpenAI-compatible request, check for guided decoding parameters - and get the necessary logits processor for the given guide. 
- """ - - grm = "" - any_whitespace = not guided_params.disable_any_whitespace - if (guide_json := guided_params.json) is not None: - # Optionally set additionalProperties to False at the top-level - # By default, other backends do not allow additional top-level - # properties, so this makes guidance more similar to other backends - if guided_params.disable_additional_properties: - if not isinstance(guide_json, str): - guide_json = json.dumps(guide_json) - guide_json = process_for_additional_properties(guide_json) - - grm = llguidance.LLMatcher.grammar_from_json_schema( - guide_json, - overrides={"whitespace_pattern": guided_params.whitespace_pattern}, - defaults={ - "whitespace_flexible": any_whitespace, - }) - elif guided_params.json_object: - grm = llguidance.LLMatcher.grammar_from_json_schema( - '{"type": "object"}', - overrides={"whitespace_pattern": guided_params.whitespace_pattern}, - defaults={ - "whitespace_flexible": any_whitespace, - }) - elif guided_params.regex: - grm = llguidance.grammar_from("regex", guided_params.regex) - elif guided_params.choice: - # choice just uses regex - choices = (regex_escape(str(choice)) - for choice in guided_params.choice) - choices_regex = "(" + "|".join(choices) + ")" - grm = llguidance.grammar_from("regex", choices_regex) - elif guided_params.grammar: - # this supports Lark and GBNF - grm = llguidance.grammar_from("grammar", guided_params.grammar) - - if grm: - return GuidanceLogitsProcessor(grm, tokenizer) - - raise ValueError("Unknown guided decoding mode") diff --git a/vllm/model_executor/guided_decoding/guidance_logits_processors.py b/vllm/model_executor/guided_decoding/guidance_logits_processors.py deleted file mode 100644 index 379b5eaa38a76..0000000000000 --- a/vllm/model_executor/guided_decoding/guidance_logits_processors.py +++ /dev/null @@ -1,104 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import copy -import os -from typing import Any 
- -import llguidance -import llguidance.hf -import llguidance.torch -import torch -from transformers import PreTrainedTokenizerBase - -from vllm.logger import init_logger - -logger = init_logger(__name__) - - -class GuidanceLogitsProcessor: - """Base Guidance Logits Processor""" - - cached_tokenizers: dict[str, Any] = {} - - def __init__( - self, - grammar: str, - tokenizer: PreTrainedTokenizerBase, - ) -> None: - """Base Guidance Logits Processor - - Args: - grammar (str) - grammar to guide the generation - tokenizer (PreTrainedTokenizerBase) - model's tokenizer - """ - self.grammar = grammar - self.tokenizer = tokenizer - self.tokenizer_name = tokenizer.name_or_path - self.ll_tokenizer = None - self.ll_matcher = None - self.bitmask = None - self.new_sampling = False - self.initialized = False - - def clone(self) -> "GuidanceLogitsProcessor": - cloned = copy.copy(self) - if self.initialized: - cloned.ll_matcher = llguidance.LLMatcher( - self.ll_tokenizer, # type: ignore[assignment] - self.grammar, - log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")), - ) - self.bitmask = llguidance.torch.allocate_token_bitmask( - 1, self.ll_tokenizer.vocab_size) # type: ignore[attr-defined] - return cloned - - def _initialize(self): - if self.initialized: - return - - ll_tokenizer = self.cached_tokenizers.get(self.tokenizer.name_or_path, - None) - if ll_tokenizer is None: - ll_tokenizer = llguidance.hf.from_tokenizer(self.tokenizer, None) - self.cached_tokenizers[self.tokenizer.name_or_path] = ll_tokenizer - - self.ll_tokenizer = ll_tokenizer - self.ll_matcher = llguidance.LLMatcher( - self.ll_tokenizer, - self.grammar, - log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")), - ) - - # create reusable bitmask - self.bitmask = llguidance.torch.allocate_token_bitmask( - 1, self.ll_tokenizer.vocab_size) # type: ignore[attr-defined] - - self.initialized = True - - def __call__( - self, - input_ids: list[int], - scores: torch.Tensor, - ) -> torch.Tensor: - # we initialize 
the guidance model here - # to avoid pickling ll_tokenizer and ll_interpreter - self._initialize() - - if self.new_sampling and len(input_ids) > 0: - self.ll_matcher.consume_token( # type: ignore[attr-defined] - input_ids[-1]) - err = self.ll_matcher.get_error() # type: ignore[attr-defined] - if err: - logger.warning("Error in LLMatcher: %s", err) - - llguidance.torch.fill_next_token_bitmask(self.ll_matcher, self.bitmask, - 0) - llguidance.torch.apply_token_bitmask_inplace( - scores, - self.bitmask.to(scores.device)) # type: ignore[attr-defined] - - self.new_sampling = True - - return scores diff --git a/vllm/model_executor/guided_decoding/guided_fields.py b/vllm/model_executor/guided_decoding/guided_fields.py deleted file mode 100644 index fa97b6dbf5115..0000000000000 --- a/vllm/model_executor/guided_decoding/guided_fields.py +++ /dev/null @@ -1,41 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from dataclasses import dataclass -from typing import Optional, TypedDict, Union - - -# These classes are deprecated, see SamplingParams -class LLMGuidedOptions(TypedDict, total=False): - guided_json: Union[dict, str] - guided_regex: str - guided_choice: list[str] - guided_grammar: str - guided_decoding_backend: str - guided_whitespace_pattern: str - guided_json_object: bool - - -@dataclass -class GuidedDecodingRequest: - """One of the fields will be used to retrieve the logit processor.""" - guided_json: Optional[Union[dict, str]] = None - guided_regex: Optional[str] = None - guided_choice: Optional[list[str]] = None - guided_grammar: Optional[str] = None - guided_decoding_backend: Optional[str] = None - guided_whitespace_pattern: Optional[str] = None - guided_json_object: Optional[bool] = None - structural_tag: Optional[str] = None - - def __post_init__(self): - """Validate that some fields are mutually exclusive.""" - guide_count = sum(x is not None - for x in (self.guided_json, self.guided_regex, - 
self.guided_choice, self.guided_grammar, - self.guided_json_object, - self.structural_tag)) - if guide_count > 1: - raise ValueError( - "You can only use one kind of guided decoding but multiple are " - f"specified: {self.__dict__}") diff --git a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py deleted file mode 100644 index f9b51f4c15745..0000000000000 --- a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +++ /dev/null @@ -1,67 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from functools import lru_cache -from json import loads as json_loads -from typing import Optional, Union - -from lmformatenforcer import (CharacterLevelParser, JsonSchemaParser, - RegexParser, StringParser, - TokenEnforcerTokenizerData, UnionParser) -from lmformatenforcer.integrations.vllm import ( - build_vllm_logits_processor, build_vllm_token_enforcer_tokenizer_data) -from transformers import PreTrainedTokenizerBase - -from vllm.logits_process import LogitsProcessor -from vllm.sampling_params import GuidedDecodingParams - - -def get_local_lm_format_enforcer_guided_decoding_logits_processor( - guided_params: GuidedDecodingParams, - tokenizer) -> Optional[LogitsProcessor]: - """ - Given an OpenAI-compatible request, check for guided decoding parameters - and get the necessary logits processor for the given guide. - We cache logit processors by (guide, tokenizer), and on cache hit - we make a shallow copy to reuse the same underlying FSM. 
- """ - - tokenizer_data = _cached_build_vllm_token_enforcer_tokenizer_data( - tokenizer) - character_level_parser: CharacterLevelParser - if guided_params.json: - schema_dict = _normalize_json_schema_object(guided_params.json) - character_level_parser = JsonSchemaParser(schema_dict) - elif guided_params.choice: - character_level_parser = UnionParser( - [StringParser(choice) for choice in guided_params.choice]) - elif guided_params.regex: - character_level_parser = RegexParser(guided_params.regex) - elif guided_params.grammar: - # CFG grammar not supported by LMFE - raise ValueError("Cannot construct a guided decoding logits processor" - " using the grammar option with the" - " lm_format_enforcer backend.") - elif guided_params.json_object: - # None means any json object - character_level_parser = JsonSchemaParser(None) - else: - return None - - logits_processor = build_vllm_logits_processor(tokenizer_data, - character_level_parser) - return logits_processor - - -def _normalize_json_schema_object(schema: Union[str, dict]) -> dict: - if isinstance(schema, str): - return json_loads(schema) - if isinstance(schema, dict): - return schema - raise AssertionError(f"Unsupported schema type {schema}") - - -@lru_cache -def _cached_build_vllm_token_enforcer_tokenizer_data( - tokenizer: PreTrainedTokenizerBase) -> TokenEnforcerTokenizerData: - return build_vllm_token_enforcer_tokenizer_data(tokenizer) diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py deleted file mode 100644 index 7e365b294438b..0000000000000 --- a/vllm/model_executor/guided_decoding/outlines_decoding.py +++ /dev/null @@ -1,117 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import concurrent.futures -import os -from enum import Enum -from json import dumps as json_dumps -from typing import Optional, Union - -from regex import escape as 
regex_escape -from transformers import PreTrainedTokenizerBase - -from vllm.model_executor.guided_decoding.outlines_logits_processors import ( - JSONLogitsProcessor, RegexLogitsProcessor) -from vllm.reasoning import ReasoningParser -from vllm.sampling_params import GuidedDecodingParams - - -class GuidedDecodingMode(Enum): - JSON = "json" - REGEX = "regex" - CHOICE = "choice" - - -global_thread_pool = None # used for generating logits processor fsm - -# It's not yet clear that using more provides a benefit, and it could -# potentially starve other processes on the machine. We'll cap this for now and -# adjust later if testing proves it to help overcome a bottleneck. -_MAX_THREADPOOL_WORKERS = 16 - - -async def get_outlines_guided_decoding_logits_processor( - guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizerBase, - reasoner: Optional[ReasoningParser] -) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, None]: - """ - Given an OpenAI-compatible request, check for guided decoding parameters - and get the necessary logits processor for the given guide. 
- """ - global global_thread_pool - guide, mode = _get_guide_and_mode(guided_params) - if not guide or not mode: - return None - - if global_thread_pool is None: - max_workers = os.cpu_count() or 2 - if max_workers > _MAX_THREADPOOL_WORKERS: - max_workers = _MAX_THREADPOOL_WORKERS - global_thread_pool = concurrent.futures.ThreadPoolExecutor( - max_workers=max_workers) - loop = asyncio.get_running_loop() - return await loop.run_in_executor(global_thread_pool, - _get_logits_processor, guide, tokenizer, - mode, guided_params.whitespace_pattern, - reasoner) - - -def get_local_outlines_guided_decoding_logits_processor( - guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizerBase, - reasoner: Optional[ReasoningParser] -) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, None]: - """ - Given an OpenAI-compatible request, check for guided decoding parameters - and get the necessary logits processor for the given guide. - """ - guide, mode = _get_guide_and_mode(guided_params) - if not guide or not mode: - return None - - return _get_logits_processor(guide, tokenizer, mode, - guided_params.whitespace_pattern, reasoner) - - -def _get_guide_and_mode( - guided_params: GuidedDecodingParams -) -> Union[tuple[str, GuidedDecodingMode], tuple[None, None]]: - if guided_params.json: - if isinstance(guided_params.json, dict): - # turn dict into hashable string - json = json_dumps(guided_params.json) - else: - json = guided_params.json - return json, GuidedDecodingMode.JSON - elif guided_params.regex: - return guided_params.regex, GuidedDecodingMode.REGEX - elif guided_params.choice: - # choice just uses regex - choices = [ - regex_escape(str(choice)) for choice in guided_params.choice - ] - choices_regex = "(" + "|".join(choices) + ")" - return choices_regex, GuidedDecodingMode.CHOICE - elif guided_params.grammar: - raise ValueError( - "The `outlines` guided decoding backend no longer supports grammar " - "guided generation. 
Please use either the `xgrammar` or `guidance` " - "backend") - else: - return None, None - - -def _get_logits_processor( - guide: str, - tokenizer: PreTrainedTokenizerBase, - mode: GuidedDecodingMode, - whitespace_pattern: Union[str, None], - reasoner: Optional[ReasoningParser], -) -> Union[JSONLogitsProcessor, RegexLogitsProcessor]: - if mode == GuidedDecodingMode.JSON: - return JSONLogitsProcessor(guide, tokenizer, whitespace_pattern, - reasoner) - elif mode == GuidedDecodingMode.REGEX or mode == GuidedDecodingMode.CHOICE: - return RegexLogitsProcessor(guide, tokenizer, reasoner) - else: - raise ValueError(f"Unknown guided decoding mode {mode}") diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py deleted file mode 100644 index 7f047a1df6a58..0000000000000 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ /dev/null @@ -1,307 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# SPDX-FileCopyrightText: Copyright 2024-present the Outlines developers -from __future__ import annotations - -import copy -import hashlib -import importlib.metadata -import json -import os -from typing import Optional, Union - -import regex as re -import torch -from cachetools import LRUCache -from diskcache import Cache -from outlines_core import Guide, Index, Vocabulary -from outlines_core.json_schema import build_regex_from_schema -from outlines_core.kernels.torch import (_apply_token_bitmask_inplace_kernel, - allocate_token_bitmask) -from pydantic import BaseModel -from transformers import PreTrainedTokenizerBase -from transformers.file_utils import SPIECE_UNDERLINE -from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode - -import vllm.envs as envs -from vllm.logger import init_logger -from vllm.reasoning import ReasoningParser -from vllm.transformers_utils.tokenizer import 
AnyTokenizer - -logger = init_logger(__name__) - -CACHE = None - - -class BaseLogitsProcessor: - - def __init__(self, guide: Guide, eos_token_id: int, - reasoner: Optional[ReasoningParser]) -> None: - self._guide: Guide = guide - self._eos_token_id: int = eos_token_id - self._reasoner: Optional[ReasoningParser] = reasoner - self._mask: Optional[torch.Tensor] = None - - def __call__(self, input_ids: list[int], - scores: torch.Tensor) -> torch.Tensor: - if self._mask is None: - self._mask = allocate_token_bitmask(scores.size(-1)) - - # Skip the structured logits processing if reasoning is not finished. - # reasoner is not None only when `--reasoning-parser` is set. - if self._reasoner is not None and not self._reasoner.is_reasoning_end( - input_ids): - return scores - - # Remove the reasoning tokens from the input_ids - # We need this because our implementation relies on the - # input_ids sequence to store the FSM state. - input_ids = (self._reasoner.extract_content_ids(input_ids) - if self._reasoner is not None else input_ids) - - # Vllm V0 engine has a weird bug where we have to repeat - # the eos token id twice for generation to stop, or at least - # that is what we have to do from here in any case. 
- # This is a patch until a better solution can be pushed - # to outlines_core - if input_ids and input_ids[-1] != self._eos_token_id: - self._guide.advance(token_id=input_ids[-1], return_tokens=False) - - self._guide.write_mask_into( - data_ptr=self._mask.data_ptr(), - numel=self._mask.numel(), - element_size=self._mask.element_size(), - ) - - # Any allowed tokens beyond the length of the scores will - # be ignored by the kernel, taking care of the issue with - # models such as Llama 3.2 Vision with an `<|image|>` token - # with id 128256, but scores.shape == torch.Size([128256]) - _apply_token_bitmask_inplace_kernel( - logits=scores.unsqueeze(dim=0), - # mask must be on same device - mask=self._mask.to(scores.device, non_blocking=True)) - self._mask.to("cpu", non_blocking=True) - - return scores - - def clone(self) -> BaseLogitsProcessor: - guide = copy.deepcopy(self._guide) - guide.reset() - return BaseLogitsProcessor(guide=guide, - eos_token_id=self._eos_token_id, - reasoner=self._reasoner) - - -class RegexLogitsProcessor(BaseLogitsProcessor): - - @classmethod - def _get_guide(cls, regex_string: str, - tokenizer: PreTrainedTokenizerBase) -> Guide: - global CACHE - if CACHE is None: - CACHE = get_cache() - vocabulary = get_vocabulary(tokenizer) # type: ignore[arg-type] - cache_key = f"{vocabulary._hash}_{regex_string}" - if CACHE is not None and cache_key in CACHE: - return Guide(CACHE[cache_key]) - - index = Index(regex_string, vocabulary.inner) - - if CACHE is not None: - CACHE[cache_key] = index - - return Guide(index) - - def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase, - reasoner: Optional[ReasoningParser]) -> None: - super().__init__( - guide=RegexLogitsProcessor._get_guide(regex_string, tokenizer), - eos_token_id=tokenizer.eos_token_id, # type: ignore - reasoner=reasoner) - - -class JSONLogitsProcessor(RegexLogitsProcessor): - - def __init__(self, schema: Union[str, dict, BaseModel], - tokenizer: PreTrainedTokenizerBase, - 
whitespace_pattern: Union[str, None], - reasoner: Optional[ReasoningParser]) -> None: - - if isinstance(schema, type(BaseModel)): - schema_str = json.dumps(schema.model_json_schema()) - elif isinstance(schema, dict): - schema_str = json.dumps(schema) - elif isinstance(schema, str): - schema_str = schema - else: - raise ValueError( - f"Cannot parse schema {schema}. The schema must be either " - f"a Pydantic object, a dictionary or a string that contains " - f"the JSON Schema specification") - - regex_string = build_regex_from_schema(schema_str, whitespace_pattern) - super().__init__(regex_string, tokenizer, reasoner) - - -class OutlinesVocabulary: - """ - Wrapper class for `outlines_core.Vocabulary`, - which allows us to store a hash with the vocabulary - """ - - def __init__(self, vocabulary: Vocabulary) -> None: - # Actual vocabulary object - self.inner = vocabulary - # Have to do abs(hash()) because python hashes can - # be negative, and we are using hash as a cache key. - hex_str = hashlib.sha256( - vocabulary.__repr__().encode('utf-8')).hexdigest() - hash_int = int(hex_str, 16) - self._hash = hash_int - - -re_llama_byte_token = re.compile(r"^<0x[0-9A-F]{2}>$") -re_replacement_seq = re.compile(r"^.{0,6}�+.{0,6}$") - - -def _reduced_vocabulary(tokenizer: AnyTokenizer, - eos_token_id: int) -> dict[bytes, list[int]]: - """Create a map from vocabulary tokens to lists of equivalent token ids. 
- - Returns: - A Dict of token string -> equivalent token ids - """ - unicode_to_bytes = {v: k for k, v in bytes_to_unicode().items()} - - def convert_token_to_string(token: str) -> str: - - string = tokenizer.convert_tokens_to_string([token]) - - # A hack to handle missing spaces to HF's Llama tokenizers - if (type(token) is str and token.startswith(SPIECE_UNDERLINE) - or token == "<0x20>"): - return " " + string - - return string - - vocabulary: dict[bytes, list[int]] = {} - empty_token_ids: list[int] = [] - for token, token_idx in tokenizer.get_vocab().items(): - if token in tokenizer.all_special_tokens: # type: ignore - continue - - token_str = convert_token_to_string(token) - if token_str: - if isinstance(token, (bytes, bytearray)): - # For BPE tokenizers where tokens are stored as bytes. - - # safe to ignore since token_str is of type (bytearray, bytes) - # by this point. - token_bytes = bytes(token_str) # type: ignore[arg-type] - - elif "\ufffd" in token_str and not re_replacement_seq.match( - token_str): - # Handle tokens with invalid UTF-8 sequences. - if re_llama_byte_token.match(token): - # Llama-like tokenizers use <0xXX> for incomplete sequences. - token_bytes = bytes([int(token[3:5], 16)]) - else: - # GPT2 tokenizers: map each byte back using unicode_to_bytes - byte_vals = [unicode_to_bytes.get(c) for c in token] - if None in byte_vals: - raise RuntimeError( - f"Cannot convert token `{token}`" - f" ({token_idx}) to bytes: {token_str}") - # safe to ignore, since if None in byte_vals, - # an error is thrown. - token_bytes = bytes(byte_vals) # type: ignore[arg-type] - else: - token_bytes = token_str.encode('utf-8') - - if token_idx != eos_token_id: - vocabulary.setdefault(token_bytes, []).append(token_idx) - else: - empty_token_ids.append(token_idx) - - return vocabulary - - -def get_vocabulary(tokenizer: AnyTokenizer) -> Vocabulary: - """Get the `Vocabulary` object for a given tokenizer. 
- """ - if hasattr(tokenizer, "_outlines_vocabulary"): - return tokenizer._outlines_vocabulary # type: ignore - - try: - if hasattr( - tokenizer, - "eos_token_id", - ) and tokenizer.eos_token_id is not None: - eos_token_id = tokenizer.eos_token_id - else: - raise ValueError( - f"Error during guided decoding setup: Tokenizer" - f" ({type(tokenizer)}) has no `eos_token_id` property, " - "but `eos_token_id` is required for guided decoding" - " to work properly.") - - reduced_vocab = _reduced_vocabulary( - tokenizer, - eos_token_id #type: ignore - ) - vocabulary = OutlinesVocabulary(Vocabulary(eos_token_id, - reduced_vocab)) - tokenizer._outlines_vocabulary = vocabulary # type: ignore - - return vocabulary - except AttributeError as e: - raise ValueError(f"Cannot get the vocabulary of the tokenizer " - f"({type(tokenizer)}). The tokenizer should have a " - "get_vocab method.") from e - - -def get_cache_path() -> str: - """Get the context object that contains previously-computed return values""" - outlines_cache_dir = os.getenv("OUTLINES_CACHE_DIR") - xdg_cache_home = os.getenv("XDG_CACHE_HOME") - home_dir = os.path.expanduser("~") - - if outlines_cache_dir: - # OUTLINES_CACHE_DIR takes precedence - return outlines_cache_dir - elif xdg_cache_home: - return os.path.join(xdg_cache_home, ".cache", "outlines") - # If homedir is "/", we may be inside a container, and thus writing to - # root would be problematic, so we fallback to using a tempfile. - # Also validate the path exists, since os.path.expanduser does - # not garuntee existence. 
- elif os.path.isdir(home_dir) and home_dir != "/": - # Default Unix fallback: ~/.cache/outlines - return os.path.join(home_dir, ".cache", "outlines") - else: - import tempfile - - # home_dir may be / inside a docker container without existing user - tempdir = tempfile.gettempdir() - return os.path.join(tempdir, ".cache", "outlines") - - -def get_cache(): - """Get the Cache instance to be used for index caching""" - - cache_dir = get_cache_path() - if envs.VLLM_V0_USE_OUTLINES_CACHE: - logger.warning("Enabling outlines cache. This is an unbounded on-disk " - "cache. It may consume a lot of disk space and should " - "not be used with untrusted clients.") - cache = Cache(cache_dir, eviction_policy="none", cull_limit=0) - outlines_version = importlib.metadata.version("outlines_core") - - cached_version = cache.get('__version__', None) - if cached_version != outlines_version: - cache.clear() - cache.set('__version__', outlines_version) - return cache - else: - return LRUCache(maxsize=128) diff --git a/vllm/model_executor/guided_decoding/utils.py b/vllm/model_executor/guided_decoding/utils.py deleted file mode 100644 index 8fdfa983e120b..0000000000000 --- a/vllm/model_executor/guided_decoding/utils.py +++ /dev/null @@ -1,242 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import regex as re - - -def has_xgrammar_unsupported_json_features(schema: dict) -> bool: - """Check if JSON schema contains features unsupported by xgrammar.""" - - def check_object(obj: dict) -> bool: - if not isinstance(obj, dict): - return False - - # Check for numeric ranges - if obj.get("type") in ("integer", "number") and ("multipleOf" in obj): - return True - - # Check for array unsupported keywords - if obj.get("type") == "array" and any(key in obj for key in [ - "uniqueItems", "contains", "minContains", "maxContains", - "minItems", "maxItems" - ]): - return True - - # Unsupported keywords for strings - if obj.get("type") 
== "string" and any( - key in obj for key in ["minLength", "maxLength", "format"]): - return True - - # Unsupported keywords for objects - if obj.get("type") == "object" and any(key in obj for key in [ - "minProperties", "maxProperties", "propertyNames", - "patternProperties" - ]): - return True - - # Recursively check all nested objects and arrays - for value in obj.values(): - if isinstance(value, dict): - if check_object(value): - return True - elif isinstance(value, list): - for item in value: - if isinstance(item, dict) and check_object(item): - return True - - return False - - return check_object(schema) - - -def has_lmf_unsupported_json_features(schema: dict) -> bool: - """ - Check if JSON schema contains features unsupported - by lm_format_enforcer. - - Known issues: - - Regex patterns: - "grade": { - "type": "string", - "pattern": "^[A-D]$" # Regex pattern - }, - """ - - def check_object(obj: dict) -> bool: - if not isinstance(obj, dict): - return False - - # Check for pattern restrictions - if "pattern" in obj: - return True - - # Recursively check all nested objects and arrays - for value in obj.values(): - if isinstance(value, dict): - if check_object(value): - return True - elif isinstance(value, list): - for item in value: - if isinstance(item, dict) and check_object(item): - return True - - return False - - return check_object(schema) - - -def grammar_is_likely_lark(grammar_str: str) -> bool: - """ - Check if grammar appears to use Lark syntax. 
- - Args: - grammar_str: Input grammar string - - Returns: - bool: True if grammar appears to be in Lark format, False otherwise - - Examples: - >>> grammar_is_likely_lark("rule: 'abc'") - True - >>> grammar_is_likely_lark("rule ::= 'abc'") - False - """ - if not grammar_str or not isinstance(grammar_str, str): - return False - - for line in grammar_str.split('\n'): - # Remove both comment styles - line = re.sub(r'(#|//).*$', '', line).strip() - if not line: - continue - - # Look for GBNF rule definition - if '::=' in line: - return False - - return True - - -def convert_lark_to_gbnf(grammar_str: str) -> str: - """ - Convert a Lark grammar string to GBNF format. - - GBNF reference: - https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md - Lark grammar reference: - https://lark-parser.readthedocs.io/en/latest/grammar.html - - Args: - grammar_str: Input grammar in Lark format - - Returns: - str: Converted grammar in GBNF format - - Examples: - >>> print(convert_lark_to_gbnf("rule: 'hello'")) - root ::= rule - rule ::= "hello" - """ - if not isinstance(grammar_str, str): - raise ValueError(f"Grammar must be a string, got {type(grammar_str)}") - if not grammar_str.strip(): - raise ValueError("Grammar string cannot be empty") - - defined_rules = set() - referenced_rules = set() - output_lines = [] - - def clean_line(line: str) -> str: - """Remove comments and whitespace from line.""" - return re.sub(r'(#|//).*$', '', line).strip() - - def check_quotes(text: str, rule_name: str, line_num: int) -> None: - """Validate quote matching in text.""" - if text.count("'") % 2 != 0 or text.count('"') % 2 != 0: - raise ValueError( - f"Mismatched quotes in {rule_name} on line {line_num}") - - def extract_references(text: str) -> set: - """Extract rule references from text.""" - # Remove quoted strings and special characters - text = re.sub(r'"[^"]*"', '', text) - text = re.sub(r'[+*?()|\[\]{}]', ' ', text) - return set(re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', text)) 
- - # First pass: Find root rule and validate rule definitions - lines = [clean_line(line) for line in grammar_str.split('\n')] - first_rule = None - - for line_num, line in enumerate(lines, 1): - if not line or line.startswith('|'): - continue - - if ':' in line: - try: - name = line.split(':', 1)[0].strip().strip('?') - defined_rules.add(name) - if first_rule is None: - first_rule = name - if name == 'start': - first_rule = 'start' - except IndexError as e: - raise ValueError(f"Invalid rule format on line {line_num}. " - "Expected 'rule_name: definition'") from e - - if not defined_rules: - raise ValueError("No valid rules found in grammar") - - # Add root rule - output_lines.append(f"root ::= {first_rule}") - - # Second pass: Process rule definitions and alternatives - current_rule = None - current_definition = [] - - for line_num, line in enumerate(lines, 1): - if not line: - continue - - try: - if ':' in line and not line.startswith('|'): - # Save previous rule if exists - if current_rule: - output_lines.append( - f"{current_rule} ::= {' | '.join(current_definition)}") - - # Process new rule - name, definition = line.split(':', 1) - current_rule = name.strip().strip('?') - - check_quotes(definition, f"rule '{current_rule}'", line_num) - definition = re.sub(r"'([^']*)'", r'"\1"', definition) - referenced_rules.update(extract_references(definition)) - current_definition = [definition.strip()] - - elif line.startswith('|'): - if not current_rule: - raise ValueError(f"Alternative '|' on line {line_num} " - "without a preceding rule definition") - - alt_def = line[1:].strip() - check_quotes(alt_def, f"alternative for rule '{current_rule}'", - line_num) - alt_def = re.sub(r"'([^']*)'", r'"\1"', alt_def) - referenced_rules.update(extract_references(alt_def)) - current_definition.append(alt_def) - - except ValueError as e: - raise ValueError(f"Error on line {line_num}: {str(e)}") from e - - # Add final rule if exists - if current_rule: - output_lines.append( - 
f"{current_rule} ::= {' | '.join(current_definition)}") - - # Validate all rules are defined - undefined_rules = referenced_rules - defined_rules - {'root'} - if undefined_rules: - raise ValueError("Referenced rules are not defined: " - f"{', '.join(sorted(undefined_rules))}") - - return '\n'.join(output_lines) diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py deleted file mode 100644 index bdd3a1a9c0a59..0000000000000 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ /dev/null @@ -1,426 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# noqa: UP007 -from __future__ import annotations - -import json -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any - -import regex as re -import torch - -import vllm.envs -from vllm.logger import init_logger - -try: - import xgrammar as xgr - xgr_installed = True -except ImportError: - xgr_installed = False - pass - -from vllm.model_executor.guided_decoding.utils import (convert_lark_to_gbnf, - grammar_is_likely_lark) -from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer - -if TYPE_CHECKING: - from transformers import PreTrainedTokenizer - - from vllm.config import ModelConfig - from vllm.reasoning import ReasoningParser - from vllm.sampling_params import GuidedDecodingParams - -logger = init_logger(__name__) - - -def get_local_xgrammar_guided_decoding_logits_processor( - guided_params: GuidedDecodingParams, - tokenizer: PreTrainedTokenizer, - model_config: ModelConfig, - reasoner: ReasoningParser | None, - max_threads: int = 8): - config = GrammarConfig.from_guided_params(guided_params=guided_params, - model_config=model_config, - tokenizer=tokenizer, - max_threads=max_threads) - return XGrammarLogitsProcessor(config, reasoner) - - -@dataclass(frozen=True) -class TokenizerData: - """Immutable container for cached 
tokenizer data.""" - metadata: str - encoded_vocab: list[str] = field(default_factory=list) - - -class TokenizerDataCache: - """Cache manager for tokenizer data to avoid repeated processing.""" - _cache: dict[int, TokenizerData] = {} - - @classmethod - def get_tokenizer_data( - cls, - tokenizer: PreTrainedTokenizer, - /, - *, - tokenizer_hash: int, - vocab_size: int, - ) -> TokenizerData: - - if tokenizer_hash not in cls._cache: - tokenizer_info = xgr.TokenizerInfo.from_huggingface( - tokenizer, - # NOTE: We will need to use lm_head's vocab_size - # to determine correct special_token_ids for this tokenizer. - # See https://github.com/mlc-ai/xgrammar/commit/70c959fb6d9cea75aae33c414763cd0602022d92 # noqa: E501 - vocab_size=vocab_size, - ) - metadata = json.loads(tokenizer_info.dump_metadata()) - - # Vendored from xgrammar logic to get encoded_vocab - # https://github.com/mlc-ai/xgrammar/blob/989222175c2a30fb7987d8bcce35bec1bf6817f2/python/xgrammar/tokenizer_info.py#L127 # noqa: E501 - try: - vocab_dict = tokenizer.get_vocab() - except AttributeError as e: - raise ValueError( - f"Cannot get the vocabulary of the tokenizer " - f"{type(tokenizer)}. The tokenizer should have a " - "get_vocab method.") from e - - # maintain tokenizer's indexing - encoded_vocab = [""] * tokenizer_info.vocab_size - for token, idx in vocab_dict.items(): - if idx < tokenizer_info.vocab_size: - encoded_vocab[idx] = token - - if isinstance(tokenizer, MistralTokenizer): - # REF: https://github.com/mlc-ai/xgrammar/blob/5e141f6ff1ca02bc31f9e512e68b61f2a8ae88e5/tests/python/test_tokenizer_info.py#L43 # noqa: E501 - metadata.update({ - "vocab_type": xgr.VocabType.BYTE_FALLBACK, - "add_prefix_space": True - }) - - cls._cache[tokenizer_hash] = TokenizerData( - encoded_vocab=encoded_vocab, - metadata=json.dumps(metadata), - ) - - return cls._cache[tokenizer_hash] - - -class GrammarCompilerCache: - """ - Cache for GrammarCompiler instances based on tokenizer. 
- - This cache reduces the overhead of creating new compiler instances when - using the same tokenizer configuration. - """ - _cache: dict[str, xgr.GrammarCompiler] = {} - - @classmethod - def get_compiler(cls, config: GrammarConfig) -> xgr.GrammarCompiler: - cache_key = str(config.tokenizer_hash) - - if cache_key not in cls._cache: - config_data = config.tokenizer_data - - # In TokenizerDataCache.get_tokenizer_data, a serializable - # tokenizer_data is created and cached. This data is used to build - # a tokenizer_info and create an xgrammar compiler. - tokenizer_info = xgr.TokenizerInfo.from_vocab_and_metadata( - encoded_vocab=config_data.encoded_vocab, - metadata=config_data.metadata, - ) - cache_size = vllm.envs.VLLM_XGRAMMAR_CACHE_MB * 1024 * 1024 - cls._cache[cache_key] = xgr.GrammarCompiler( - tokenizer_info, - max_threads=config.max_threads, - cache_enabled=True, - cache_limit_bytes=cache_size, - ) - - return cls._cache[cache_key] - - -@dataclass -class GrammarConfig: - """Serializable configuration for grammar compilation""" - tokenizer_hash: int - tokenizer_data: TokenizerData - json_str: str | None = None - grammar_str: str | None = None - json_object: bool | None = None - any_whitespace: bool = True - regex_str: str | None = None - max_threads: int = 8 - - @classmethod - def from_guided_params(cls, - guided_params: GuidedDecodingParams, - model_config: ModelConfig, - tokenizer: PreTrainedTokenizer, - max_threads: int = 8) -> GrammarConfig: - - tokenizer_hash = hash(tokenizer) - tokenizer_data = TokenizerDataCache.get_tokenizer_data( - tokenizer, - tokenizer_hash=tokenizer_hash, - vocab_size=model_config.hf_text_config.vocab_size, - ) - - if guided_params.json: - if not isinstance(guided_params.json, str): - json_str = json.dumps(guided_params.json) - else: - json_str = guided_params.json - - any_whitespace = not guided_params.disable_any_whitespace - - # Check and log if model with xgrammar and whitespace have history - # of runaway generation of 
whitespaces. - # References: - # https://github.com/vllm-project/vllm/pull/12744 - # https://github.com/mlc-ai/xgrammar/issues/212 - model_with_warn = None - - if 'Mistral' in model_config.model: - model_with_warn = 'Mistral' - elif 'Qwen' in model_config.model: - model_with_warn = 'Qwen' - - if model_with_warn is not None and any_whitespace: - logger.info_once( - "%s model detected, consider setting `disable_any_whitespace` to prevent runaway generation of whitespaces.", # noqa: E501 - model_with_warn, - ) - # Validate the schema and raise ValueError here if it is invalid. - # This is to avoid exceptions in model execution, which will crash - # the engine worker process. - try: - xgr.Grammar.from_json_schema(json_str, - any_whitespace=any_whitespace) - except RuntimeError as err: - raise ValueError(str(err)) from err - - return cls(json_str=json_str, - tokenizer_hash=tokenizer_hash, - max_threads=max_threads, - tokenizer_data=tokenizer_data, - any_whitespace=any_whitespace) - elif guided_params.grammar: - # XGrammar only supports GBNF grammars, so we must convert Lark - if grammar_is_likely_lark(guided_params.grammar): - try: - grammar_str = convert_lark_to_gbnf(guided_params.grammar) - except ValueError as e: - raise ValueError( - "Failed to convert the grammar from Lark to GBNF. " - "Please either use GBNF grammar directly or specify" - " --guided-decoding-backend=outlines.\n" - f"Conversion error: {str(e)}") from e - else: - grammar_str = guided_params.grammar - - # Validate the grammar and raise ValueError here if it is invalid. - # This is to avoid exceptions in model execution, which will crash - # the engine worker process. 
- try: - xgr.Grammar.from_ebnf(grammar_str) - except RuntimeError as err: - raise ValueError(str(err)) from err - - return cls(grammar_str=grammar_str, - tokenizer_hash=tokenizer_hash, - max_threads=max_threads, - tokenizer_data=tokenizer_data) - elif guided_params.json_object: - return cls( - json_object=True, - tokenizer_hash=tokenizer_hash, - max_threads=max_threads, - tokenizer_data=tokenizer_data, - ) - elif guided_params.choice: - choice_str = GrammarConfig.choice_as_grammar(guided_params.choice) - try: - xgr.Grammar.from_ebnf(choice_str) - except RuntimeError as err: - raise ValueError(str(err)) from err - - return cls( - grammar_str=choice_str, - tokenizer_hash=tokenizer_hash, - max_threads=max_threads, - tokenizer_data=tokenizer_data, - ) - elif guided_params.regex: - return cls( - regex_str=guided_params.regex, - tokenizer_hash=tokenizer_hash, - max_threads=max_threads, - tokenizer_data=tokenizer_data, - ) - else: - raise ValueError( - "Currently only support JSON and EBNF grammar mode for xgrammar" - ) - - @staticmethod - def escape_ebnf_string(s: str) -> str: - """Escape special characters in a EBNF string.""" - # Escape double quotes and backslashes - return re.sub(r'(["\\])', r'\\\1', s) - - @staticmethod - def choice_as_grammar(choice: list[str] | None) -> str: - if choice is None: - raise ValueError("Choice is not set") - escaped_choices = (GrammarConfig.escape_ebnf_string(c) for c in choice) - grammar = ('root ::= ' + ' | '.join(f'"{c}"' for c in escaped_choices)) - return grammar - - @staticmethod - def tokenizer_info(tokenizer_data: TokenizerData) -> xgr.TokenizerInfo: - return xgr.TokenizerInfo.from_vocab_and_metadata( - encoded_vocab=tokenizer_data.encoded_vocab, - metadata=tokenizer_data.metadata, - ) - - -@dataclass -class XGrammarLogitsProcessor: - """Wrapper class to support pickle protocol""" - config: GrammarConfig - reasoner: ReasoningParser | None = None - - ctx: xgr.CompiledGrammar | None = None - tokenizer_info: xgr.TokenizerInfo = 
None # type: ignore[assignment] - token_bitmask: torch.Tensor = None # type: ignore[assignment] - matchers: list[xgr.GrammarMatcher] = field(default_factory=list) - batch_size: int = field(default=1) - prefilled: bool = field(default=False) - - def __post_init__(self): - if self.tokenizer_info is None: - self.tokenizer_info = self.config.tokenizer_info( - self.config.tokenizer_data) - - def __getstate__(self) -> dict[str, Any]: - return {'config': self.config, 'reasoner': self.reasoner} - - def __setstate__(self, state: dict[str, Any]): - self.config = state['config'] - self.reasoner = state['reasoner'] - - self.tokenizer_info = GrammarConfig.tokenizer_info( - self.config.tokenizer_data) - self.ctx = None - self.matchers = [] - self.batch_size = 1 - self.token_bitmask = None # type: ignore[assignment] - self.prefilled = False - - def _ensure_ctx(self): - """Lazily initialize the processor in the worker process""" - if self.ctx is None: - compiler = GrammarCompilerCache.get_compiler(self.config) - if self.config.json_str is not None: - any_whitespace = self.config.any_whitespace - self.ctx = compiler\ - .compile_json_schema(self.config.json_str, - any_whitespace=any_whitespace) - elif self.config.grammar_str is not None: - self.ctx = compiler.compile_grammar(self.config.grammar_str) - elif self.config.json_object: - any_whitespace = self.config.any_whitespace - self.ctx = compiler\ - .compile_json_schema('{"type": "object"}', - any_whitespace=any_whitespace) - elif self.config.regex_str: - self.ctx = compiler.compile_regex(self.config.regex_str) - else: - raise ValueError( - "Invalid configuration for xgrammar logits processor") - - def __call__(self, input_ids: list[int], - scores: torch.Tensor) -> torch.Tensor: - - # Skip the structured logits processing if reasoning is not finished. - # reasoner is not None only when `--reasoning-parser` is set. 
- if self.reasoner is not None and \ - not self.reasoner.is_reasoning_end( - input_ids): - return scores - - if self.ctx is None: - self._ensure_ctx() - - if len(self.matchers) == 0: - self.matchers = [ - xgr.GrammarMatcher(self.ctx) for _ in range(self.batch_size) - ] - self.token_bitmask = xgr.allocate_token_bitmask( - self.batch_size, self.tokenizer_info.vocab_size) - - if not self.prefilled: - # Have not sampled a token yet - self.prefilled = True - else: - for i, matcher in enumerate(self.matchers): - if not matcher.is_terminated(): - sampled_token = input_ids[-1] - assert self.matchers[i].accept_token(sampled_token) - - for i, matcher in enumerate(self.matchers): - if not matcher.is_terminated(): - # @ubospica: ideally, fill_next_token_bitmask should be - # parallelized with model decoding - # See https://github.com/vllm-project/vllm/pull/10785/files#r1864278303 - matcher.fill_next_token_bitmask(self.token_bitmask, i) - - # token_bitmask is a CPU tensor for use with accept_token and - # fill_next_token_bitmask so we move it to the device of scores - device_type = scores.device.type - dtype = scores.dtype - if device_type != "cuda": - # xgrammar on cpu only supports float32 scores - # see: https://github.com/mlc-ai/xgrammar/blob/c1b64920cad24f44f235778c1c00bb52d57da01a/python/xgrammar/kernels/apply_token_bitmask_inplace_cpu.py#L22 - scores = scores.to("cpu").float().unsqueeze(0) - - # Note: In this method, if the tensors have different dimensions - # on CPU device fails, but on GPU it runs without error. 
Hence the - # unsqueeze above for scores, to match the token bitmask shape - xgr.apply_token_bitmask_inplace( - scores, self.token_bitmask.to(scores.device, non_blocking=True)) - if device_type != "cuda": - scores = scores.to(dtype).to(device_type).squeeze() - - return scores - - def clone(self) -> XGrammarLogitsProcessor: - """Create a new instance with shared compiled grammar - but separate state""" - new_processor = XGrammarLogitsProcessor(self.config, self.reasoner, - None, self.tokenizer_info) - - # Share the compiled grammar context (immutable after compilation) - new_processor.ctx = self.ctx - - # Create fresh matchers for the new sequence - if self.ctx is not None: - new_processor.matchers = [ - xgr.GrammarMatcher(self.ctx) for _ in range(self.batch_size) - ] - - # Create a new token bitmask with the same size - if hasattr(self, 'token_bitmask') and self.token_bitmask is not None: - new_processor.token_bitmask = self.token_bitmask - - # Copy simple attributes - new_processor.batch_size = self.batch_size - # Reset prefilled state for new sequence - new_processor.prefilled = False - - return new_processor From 61a6905ab036fd00eafdb1b0ca130d5feccfe686 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 29 Jul 2025 18:25:07 +0800 Subject: [PATCH 011/224] [Model] Refactor JambaForCausalLM (#21394) Signed-off-by: Jee Jee Li --- vllm/model_executor/models/jamba.py | 231 ++++++++++++++-------------- 1 file changed, 116 insertions(+), 115 deletions(-) diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 34281b2e99ee8..263f4c8379cf2 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -25,6 +25,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from 
vllm.model_executor.models.llama import LlamaMLP as JambaMLP from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -33,7 +34,7 @@ from vllm.utils import LayerBlockType from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP, SupportsV0Only) -from .utils import (is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -87,23 +88,6 @@ class JambaMoE(nn.Module): return hidden_states.view(orig_shape) -class JambaMLP(JambaMoE): - - def __init__(self, - config: JambaConfig, - params_dtype: Optional[torch.dtype] = None, - tp_size: Optional[int] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): - super().__init__(config, - num_experts=1, - top_k=1, - params_dtype=params_dtype, - tp_size=tp_size, - quant_config=quant_config, - prefix=prefix) - - class JambaMambaDecoderLayer(nn.Module): def __init__(self, @@ -132,10 +116,20 @@ class JambaMambaDecoderLayer(nn.Module): ) num_experts = config.layers_num_experts[layer_idx] - ffn_layer_class = JambaMoE if num_experts > 1 else JambaMLP - self.feed_forward = ffn_layer_class(config, - quant_config=quant_config, - prefix=f"{prefix}.feed_forward") + if num_experts > 1: + self.feed_forward = JambaMoE( + config, + quant_config=quant_config, + prefix=f"{prefix}.feed_forward", + ) + else: + self.feed_forward = JambaMLP( + config.hidden_size, + config.intermediate_size, + config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.feed_forward", + ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.pre_ff_layernorm = RMSNorm(config.hidden_size, @@ -216,10 +210,20 @@ class JambaAttentionDecoderLayer(nn.Module): ) num_experts = config.layers_num_experts[layer_idx] - ffn_layer_class = JambaMoE if num_experts > 1 else JambaMLP - 
self.feed_forward = ffn_layer_class(config, - quant_config=quant_config, - prefix=f"{prefix}.feed_forward") + if num_experts > 1: + self.feed_forward = JambaMoE( + config, + quant_config=quant_config, + prefix=f"{prefix}.feed_forward", + ) + else: + self.feed_forward = JambaMLP( + config.hidden_size, + config.intermediate_size, + config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.feed_forward", + ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.pre_ff_layernorm = RMSNorm(config.hidden_size, @@ -359,15 +363,97 @@ class JambaModel(nn.Module): hidden_states, _ = self.final_layernorm(hidden_states, residual) return hidden_states + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + return FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.num_experts) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + ] + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + expert_params_mapping = self.get_expert_mapping() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + if 'experts' in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip layers on other devices. 
+ if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for ( + param_name, + weight_name, + expert_id, + shard_id, + ) in expert_params_mapping: + if weight_name not in name: + continue + + if is_pp_missing_parameter(name, self): + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, IsHybrid, SupportsV0Only): + hf_to_vllm_mapper = WeightsMapper(orig_to_new_substr={ + ".self_attn.": ".", + ".A_log": ".A" + }, ) packed_modules_mapping = { "qkv_proj": [ "q_proj", "k_proj", "v_proj", ], + "gate_up_proj": ["gate_proj", "up_proj"], "in_proj": ["in_proj"], } @@ -468,96 +554,11 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) - # Params for weights, fp8 weight scales, fp8 activation scales - # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = FusedMoE.make_expert_params_mapping( - ckpt_gate_proj_name="gate_proj", - ckpt_down_proj_name="down_proj", - 
ckpt_up_proj_name="up_proj", - num_experts=self.config.num_experts) - - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - - if "A_log" in name: - name = name.replace("A_log", "A") - - if ".self_attn." in name: - name = name.replace(".self_attn", "") - - if "feed_forward" in name and not _is_moe_layer(name): - ## map MLP layers to expert with ID=0 - name = name.replace("feed_forward", "feed_forward.experts.0") - - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - if 'experts' in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - - if name.endswith(".bias") and name not in params_dict: - continue - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - for ( - param_name, - weight_name, - expert_id, - shard_id, - ) in expert_params_mapping: - if weight_name not in name: - continue - - if is_pp_missing_parameter(name, self): - continue - name = name.replace(weight_name, param_name) - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, - loaded_weight, - name, - shard_id=shard_id, - expert_id=expert_id) - break - else: - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - - -def _is_moe_layer(name: str): - return any( - [experts_name in name for experts_name in [ - "experts", - "router", - ]]) + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() class JambaForSequenceClassification(JambaForCausalLM): From 2470419119aa5bc2734b4b8972bbfa348ccdc8b1 Mon Sep 17 00:00:00 2001 From: Kay Yan Date: Tue, 29 Jul 2025 19:56:27 +0800 Subject: [PATCH 012/224] [Docs] Fix the outdated URL for installing from vLLM binaries (#21523) Signed-off-by: Kay Yan Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/contributing/ci/update_pytorch_version.md | 3 +-- docs/getting_started/installation/gpu/cuda.inc.md | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/contributing/ci/update_pytorch_version.md b/docs/contributing/ci/update_pytorch_version.md index 5046db11a4715..699d0531ac768 100644 --- a/docs/contributing/ci/update_pytorch_version.md +++ b/docs/contributing/ci/update_pytorch_version.md @@ -57,8 +57,7 @@ cc the PyTorch release team to initiate discussion on how to address them. ## Update CUDA version -The PyTorch release matrix includes both stable and experimental [CUDA versions](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix). Due to limitations, only the latest stable CUDA version (for example, -`torch2.7.0+cu12.6`) is uploaded to PyPI. However, vLLM may require a different CUDA version, +The PyTorch release matrix includes both stable and experimental [CUDA versions](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix). 
Due to limitations, only the latest stable CUDA version (for example, torch `2.7.1+cu126`) is uploaded to PyPI. However, vLLM may require a different CUDA version, such as 12.8 for Blackwell support. This complicates the process as we cannot use the out-of-the-box `pip install torch torchvision torchaudio` command. The solution is to use diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md index 5ca5296d0a657..5298c22c8435e 100644 --- a/docs/getting_started/installation/gpu/cuda.inc.md +++ b/docs/getting_started/installation/gpu/cuda.inc.md @@ -38,10 +38,10 @@ We recommend leveraging `uv` to [automatically select the appropriate PyTorch in As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions: ```bash -# Install vLLM with CUDA 11.8. -export VLLM_VERSION=0.6.1.post1 -export PYTHON_VERSION=312 -uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +# Install vLLM with a specific CUDA version (e.g., 11.8 or 12.6). 
+export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//') +export CUDA_VERSION=118 # or 126 +uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION} ``` [](){ #install-the-latest-code } From 755fa8b657e3666cc93b08a6d2b9a50d0f46c37e Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 29 Jul 2025 04:58:29 -0700 Subject: [PATCH 013/224] [KVCache] Make KVCacheSpec hashable (#21791) Signed-off-by: Chen Zhang --- tests/v1/core/test_kv_cache_utils.py | 34 +++++++- .../v1/e2e/test_correctness_sliding_window.py | 8 +- vllm/v1/core/kv_cache_coordinator.py | 31 ++++--- vllm/v1/core/kv_cache_utils.py | 35 ++++---- vllm/v1/kv_cache_interface.py | 80 +++++++------------ 5 files changed, 100 insertions(+), 88 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index ebe3a30e3352d..e9c6f1f95cd71 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -17,7 +17,7 @@ from vllm.v1.core.kv_cache_utils import ( estimate_max_model_len, generate_block_hash_extra_keys, get_kv_cache_config, get_max_concurrency_for_kv_cache_config, hash_block_tokens, hash_request_tokens, init_none_hash, - unify_kv_cache_configs) + is_kv_cache_type_uniform, unify_kv_cache_configs) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheTensor, SlidingWindowSpec) @@ -685,6 +685,38 @@ def test_merge_kv_cache_spec(): assert merged_layer_spec.sliding_window == 1 +def test_is_kv_cache_type_uniform(): + kv_cache_spec = { + "layer_1": new_kv_cache_spec(num_kv_heads=32), + "layer_2": new_kv_cache_spec(num_kv_heads=32), + } + assert is_kv_cache_type_uniform(kv_cache_spec) + + kv_cache_spec = { + "layer_1": new_kv_cache_spec(num_kv_heads=32), + 
"layer_2": new_kv_cache_spec(num_kv_heads=32, sliding_window=1), + } + assert is_kv_cache_type_uniform(kv_cache_spec) + + kv_cache_spec = { + "layer_1": new_kv_cache_spec(num_kv_heads=32), + "layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=1), + } + assert not is_kv_cache_type_uniform(kv_cache_spec) + + kv_cache_spec = { + "layer_1": new_sliding_window_spec(num_kv_heads=32, sliding_window=1), + "layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=1), + } + assert is_kv_cache_type_uniform(kv_cache_spec) + + kv_cache_spec = { + "layer_1": new_sliding_window_spec(num_kv_heads=32, sliding_window=1), + "layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=2), + } + assert not is_kv_cache_type_uniform(kv_cache_spec) + + @pytest.mark.parametrize( ("model_id", "max_model_len", "want_estimated_max_len"), [ ("Qwen/Qwen1.5-7B", 16385, 16384), diff --git a/tests/v1/e2e/test_correctness_sliding_window.py b/tests/v1/e2e/test_correctness_sliding_window.py index 277ea3c838505..4dfe1d3bb33fa 100644 --- a/tests/v1/e2e/test_correctness_sliding_window.py +++ b/tests/v1/e2e/test_correctness_sliding_window.py @@ -30,7 +30,9 @@ model_config = { ]) @pytest.mark.parametrize("batch_size", [5]) @pytest.mark.parametrize("seed", [1]) -def test_sliding_window_retrieval(monkeypatch, model, batch_size, seed): +@pytest.mark.parametrize("disable_hybrid_kv_cache_manager", [True, False]) +def test_sliding_window_retrieval(monkeypatch, model, batch_size, seed, + disable_hybrid_kv_cache_manager): """ The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then asks for value of one of them (which is outside the sliding window). 
@@ -42,7 +44,9 @@ def test_sliding_window_retrieval(monkeypatch, model, batch_size, seed): test_config = model_config[model] - llm = LLM(model=model) + llm = LLM( + model=model, + disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager) sampling_params = SamplingParams(temperature=0.0, max_tokens=100) prompts, answer, indices = prep_prompts(batch_size, diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py index de72e60434ad7..0cce2ec81e08a 100644 --- a/vllm/v1/core/kv_cache_coordinator.py +++ b/vllm/v1/core/kv_cache_coordinator.py @@ -7,7 +7,8 @@ from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock from vllm.v1.core.single_type_kv_cache_manager import ( FullAttentionManager, get_manager_for_kv_cache_spec) -from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheConfig +from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, + KVCacheSpec) from vllm.v1.request import Request @@ -258,44 +259,40 @@ class HybridKVCacheCoordinator(KVCacheCoordinator): one of them is full attention. Then, split the kv cache groups into full attention groups and other groups. 
""" - full_attention_type_id: Optional[str] = None - other_type_id: Optional[str] = None + full_attention_spec: Optional[FullAttentionSpec] = None + other_spec: Optional[KVCacheSpec] = None self.full_attention_group_ids: list[int] = [] self.other_group_ids: list[int] = [] for i, g in enumerate(self.kv_cache_config.kv_cache_groups): if isinstance(g.kv_cache_spec, FullAttentionSpec): - if full_attention_type_id is None: - full_attention_type_id = g.kv_cache_spec.type_id + if full_attention_spec is None: + full_attention_spec = g.kv_cache_spec else: - assert full_attention_type_id == g.kv_cache_spec.type_id, ( + assert full_attention_spec == g.kv_cache_spec, ( "HybridKVCacheCoordinator assumes exactly one type of " "full attention groups now.") self.full_attention_group_ids.append(i) else: - if other_type_id is None: - other_type_id = g.kv_cache_spec.type_id + if other_spec is None: + other_spec = g.kv_cache_spec else: - assert other_type_id == g.kv_cache_spec.type_id, ( + assert other_spec == g.kv_cache_spec, ( "HybridKVCacheCoordinator assumes " "exactly one other type of groups now.") self.other_group_ids.append(i) - assert full_attention_type_id is not None, ( + assert full_attention_spec is not None, ( "HybridKVCacheCoordinator assumes exactly one type of full " "attention groups now.") - assert other_type_id is not None, ( + assert other_spec is not None, ( "HybridKVCacheCoordinator assumes exactly one type of other " "groups now.") self.full_attention_manager_cls = FullAttentionManager self.other_attention_cls = self.single_type_managers[ self.other_group_ids[0]].__class__ - - self.full_attention_spec = self.kv_cache_config.kv_cache_groups[ - self.full_attention_group_ids[0]].kv_cache_spec - self.other_spec = self.kv_cache_config.kv_cache_groups[ - self.other_group_ids[0]].kv_cache_spec - + self.full_attention_spec = full_attention_spec + self.other_spec = other_spec self.full_attention_block_size = self.full_attention_spec.block_size self.other_block_size = 
self.other_spec.block_size diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 5b0218640a8c8..3a72ac271afa6 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -5,7 +5,7 @@ import os from collections import defaultdict, deque from collections.abc import Iterable, Sequence -from dataclasses import dataclass +from dataclasses import astuple, dataclass from typing import Any, Callable, NamedTuple, Optional from vllm.config import VllmConfig @@ -727,7 +727,9 @@ def create_kv_cache_group_specs( def is_kv_cache_type_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool: """ - Whether all layers in the given KVCacheSpec have the same type of KV cache. + Whether all layers in the given KVCacheSpec have the same KV cache spec. + Note that we regard FullAttentionSpec with and without sliding window as + the same type. Args: kv_cache_spec: The kv cache spec of each attention layer in the model @@ -736,8 +738,12 @@ def is_kv_cache_type_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool: True if all layers have the same type, False otherwise. """ - layer_keys = set(layer.type_id for layer in kv_cache_spec.values()) - return len(layer_keys) == 1 + try: + kv_cache_spec_values = list(kv_cache_spec.values()) + _ = kv_cache_spec_values[0].merge(kv_cache_spec_values) + except AssertionError: + return False + return True def get_max_concurrency_for_kv_cache_config( @@ -928,12 +934,12 @@ def _get_kv_cache_config_uniform_page_size( Returns: The generated KVCacheConfig """ - # Group all layers by type_id. + # Group all layers by kv_cache_spec. # E.g., 2 full attention layers and 3 sliding window attention layers, # -> (full.0, full.1), (sw.0, sw.1, sw.2). 
- same_type_layers: dict[str, list[str]] = defaultdict(list) + same_type_layers: dict[KVCacheSpec, list[str]] = defaultdict(list) for layer_name, layer_spec in kv_cache_spec.items(): - same_type_layers[layer_spec.type_id].append(layer_name) + same_type_layers[layer_spec].append(layer_name) # Split each group into smaller groups, to make the number of layers in each # group identical. Add padding to the last group of each type if necessary. @@ -1017,12 +1023,7 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): kv_cache_spec: The kv cache spec of each attention layer in the model """ - def is_hybrid(kv_cache_spec: dict[str, KVCacheSpec]) -> bool: - type_ids = set(layer_spec.type_id - for layer_spec in kv_cache_spec.values()) - return len(type_ids) > 1 - - if not is_hybrid(kv_cache_spec): + if is_kv_cache_type_uniform(kv_cache_spec): return logger.warning( @@ -1060,7 +1061,7 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): attention_chunk_size=spec.attention_chunk_size, ) - if is_hybrid(kv_cache_spec): + if not is_kv_cache_type_uniform(kv_cache_spec): raise ValueError("Hybrid KV cache manager is disabled but failed to " "convert the KV cache specs to one unified type.") @@ -1119,11 +1120,11 @@ def unify_kv_cache_configs(kv_cache_configs: list[KVCacheConfig]): in-place modified to make them consistent. """ - # Sort the kv cache groups by the type_id of their KV cache spec. + # Sort the kv cache groups by their KV cache spec. # This can avoid the inconsistency caused by the order of groups. for kv_cache_config in kv_cache_configs: - kv_cache_config.kv_cache_groups.sort( - key=lambda x: x.kv_cache_spec.type_id) + kv_cache_config.kv_cache_groups.sort(key=lambda x: (type( + x.kv_cache_spec).__name__, astuple(x.kv_cache_spec))) # Verify that the groups of each rank are the same. 
for kv_cache_config in kv_cache_configs[1:]: diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 1da5230116d26..4ff96f9786b88 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy -from dataclasses import dataclass +from dataclasses import dataclass, fields from math import prod from typing import Optional @@ -16,7 +16,7 @@ from vllm.utils import cdiv, get_dtype_size logger = init_logger(__name__) -@dataclass +@dataclass(frozen=True) class KVCacheSpec: """ A base class for specifying the KV cache format of one layer. @@ -25,20 +25,6 @@ class KVCacheSpec: # number of tokens in a block block_size: int - @property - def type_id(self) -> str: - """ - The type identifier of this KV cache. - Return different strings for layers with different KV cache type (e.g., - different number of tokens like full attention vs sliding window - attention, different KV cache size per token like layers with different - number of heads) - - Returns: - The type identifier of this KV cache. - """ - raise NotImplementedError - @property def page_size_bytes(self) -> int: """ @@ -63,13 +49,12 @@ class KVCacheSpec: """ Merge a list of KVCacheSpec objects into a single KVCacheSpec object. 
""" - assert all(spec.type_id == specs[0].type_id for spec in specs[1:]), ( - "All layers in the same KV cache group must share the same " - "type_id.") + assert all(spec == specs[0] for spec in specs[1:]), ( + "All layers in the same KV cache group must be the same.") return copy.deepcopy(specs[0]) -@dataclass +@dataclass(frozen=True) class AttentionSpec(KVCacheSpec): num_kv_heads: int head_size: int @@ -84,7 +69,7 @@ class AttentionSpec(KVCacheSpec): * get_dtype_size(self.dtype) -@dataclass +@dataclass(frozen=True) class FullAttentionSpec(AttentionSpec): sliding_window: Optional[int] = None attention_chunk_size: Optional[int] = None @@ -98,10 +83,6 @@ class FullAttentionSpec(AttentionSpec): Default to None for not using sliding window attention. """ - @property - def type_id(self) -> str: - return f"full_attention_{self.block_size}_{self.page_size_bytes}" - def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: max_model_len = vllm_config.model_config.max_model_len return cdiv(max_model_len, self.block_size) * self.page_size_bytes @@ -123,15 +104,28 @@ class FullAttentionSpec(AttentionSpec): Merge a list of FullAttentionSpec objects into a single FullAttentionSpec object. 
""" - merged_spec = super().merge(specs) + assert all(isinstance(spec, FullAttentionSpec) for spec in specs), ( + "All attention layers in the same KV cache group must be " + "FullAttentionSpec.") + sliding_window = set(spec.sliding_window for spec in specs if spec.sliding_window is not None) attention_chunk_size = set(spec.attention_chunk_size for spec in specs if spec.attention_chunk_size is not None) - - merged_spec.sliding_window = cls.merge_window_sizes(sliding_window) - merged_spec.attention_chunk_size = ( - cls.merge_window_sizes(attention_chunk_size)) + merged_spec = cls( + block_size=specs[0].block_size, + num_kv_heads=specs[0].num_kv_heads, + head_size=specs[0].head_size, + dtype=specs[0].dtype, + use_mla=specs[0].use_mla, + sliding_window=cls.merge_window_sizes(sliding_window), + attention_chunk_size=cls.merge_window_sizes(attention_chunk_size), + ) + for spec in specs: + for f in fields(AttentionSpec): + assert getattr(spec, f.name) == getattr(merged_spec, f.name), ( + "All attention layers in the same KV cache group must have " + "the same attention spec.") assert ( (merged_spec.sliding_window is not None) + (merged_spec.attention_chunk_size is not None) <= 1 @@ -140,16 +134,10 @@ class FullAttentionSpec(AttentionSpec): return merged_spec -@dataclass +@dataclass(frozen=True) class ChunkedLocalAttentionSpec(AttentionSpec): attention_chunk_size: int - @property - def type_id(self) -> str: - return ( - f"local_attention_{self.attention_chunk_size}_{self.block_size}_{self.page_size_bytes}" - ) # noqa - def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: max_model_len = vllm_config.model_config.max_model_len max_num_batched_tokens = ( @@ -165,17 +153,13 @@ class ChunkedLocalAttentionSpec(AttentionSpec): return cdiv(num_tokens, self.block_size) * self.page_size_bytes -@dataclass +@dataclass(frozen=True) class SlidingWindowSpec(AttentionSpec): sliding_window: int def __post_init__(self): assert not self.use_mla, "MLA is not supported for sliding 
window" - @property - def type_id(self) -> str: - return f"sliding_window_{self.sliding_window}_{self.block_size}_{self.page_size_bytes}" # noqa - def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: max_model_len = vllm_config.model_config.max_model_len max_num_batched_tokens = ( @@ -195,23 +179,17 @@ class SlidingWindowSpec(AttentionSpec): return (cdiv(num_tokens, self.block_size) + 1) * self.page_size_bytes -@dataclass +@dataclass(frozen=True) class MambaSpec(KVCacheSpec): shapes: tuple[tuple[int, ...], ...] dtype: torch.dtype page_size_padded: Optional[int] = None mamba_type: str = "mamba2" - def __post_init__(self): - self.num_elements = sum(prod(shape) for shape in self.shapes) - - @property - def type_id(self) -> str: - return f"mamba_{self.shapes}_{self.dtype}_{self.mamba_type}" - @property def page_size_bytes(self) -> int: - page_size = self.num_elements * get_dtype_size(self.dtype) + num_elements = sum(prod(shape) for shape in self.shapes) + page_size = num_elements * get_dtype_size(self.dtype) if self.page_size_padded is not None: assert self.page_size_padded >= page_size return self.page_size_padded From ab714131e4a83469e8bebaf456853aa73b51324d Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 29 Jul 2025 21:29:51 +0800 Subject: [PATCH 014/224] [Doc] Update compatibility matrix for pooling and multimodal models (#21831) Signed-off-by: DarkLight1337 --- docs/features/compatibility_matrix.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/features/compatibility_matrix.md b/docs/features/compatibility_matrix.md index 8be1585f8e76b..259a447984cb0 100644 --- a/docs/features/compatibility_matrix.md +++ b/docs/features/compatibility_matrix.md @@ -34,23 +34,25 @@ th:not(:first-child) { } -| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | pooling | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search | 
+| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search | |---|---|---|---|---|---|---|---|---|---|---|---|---|---|---| | [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | | | [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | | | [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | | | [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | | CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | -| pooling | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | | | | | | | | | +| [pooling](../models/pooling_models.md) | ✅\* | ✅\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | enc-dec | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | | logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | | async output | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | | multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | -| mm | ✅ | [🟠](gh-pr:8348) | [🟠](gh-pr:4194) | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | +| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194) | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | | best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | | beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | +\* Chunked prefill and prefix caching are only applicable to last-token pooling. 
+ [](){ #feature-x-hardware } ## Feature x Hardware @@ -62,9 +64,9 @@ th:not(:first-child) { | [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | -| pooling | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❌ | +| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | enc-dec | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | -| mm | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | +| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | async output | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | From 04e38500eeaa683f107fc16011aee65981afc6cd Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Tue, 29 Jul 2025 09:35:58 -0400 Subject: [PATCH 015/224] [Bugfix] VLLM_V1 supports passing other compilation levels (#19340) Signed-off-by: Richard Zou --- tests/compile/test_config.py | 55 ++++++++++++++++++++++++++++-- vllm/compilation/counter.py | 2 ++ vllm/config.py | 21 ++++++++++-- vllm/v1/worker/gpu_model_runner.py | 13 ++++++- vllm/worker/model_runner.py | 2 ++ 5 files changed, 88 insertions(+), 5 deletions(-) diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index 0ba59f4b5a056..90e8e0ff95858 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -26,6 +26,8 @@ def test_use_cudagraphs_dynamic(monkeypatch): assert not vllm_config.compilation_config.use_cudagraph +# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 +@pytest.mark.forked # NB: We don't test VLLM_DISABLE_COMPILE_CACHE=0 because that depends # on the state of the cache directory on the current machine, which # may be influenced by other tests. @@ -33,8 +35,8 @@ def test_use_cudagraphs_dynamic(monkeypatch): def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val): assert vllm.envs.VLLM_USE_V1 - # spawn means that the counters are in the same process. 
- monkeypatch.setenv('VLLM_WORKER_MULTIPROC_METHOD', "spawn") + # Disable multiprocessing so that the counter is in the same process + monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0') monkeypatch.setenv('VLLM_DISABLE_COMPILE_CACHE', val) compilation_config = { @@ -50,6 +52,8 @@ def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val): pass +# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 +@pytest.mark.forked @pytest.mark.parametrize("enabled", [True, False]) def test_use_cudagraphs(vllm_runner, monkeypatch, enabled): assert vllm.envs.VLLM_USE_V1 @@ -72,3 +76,50 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled): compilation_config=compilation_config, gpu_memory_utilization=0.4) as _): pass + + +# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 +@pytest.mark.forked +def test_dynamo_as_is(vllm_runner, monkeypatch): + # Disable multiprocessing so that the counter is in the same process + monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0') + + with ( + compilation_counter.expect(dynamo_as_is_count=1), + # loading the model causes compilation (if enabled) to happen + vllm_runner('facebook/opt-125m', + compilation_config={"level": 1}, + gpu_memory_utilization=0.4) as _): + pass + + +# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 +@pytest.mark.forked +def test_no_compilation(vllm_runner, monkeypatch): + # Disable multiprocessing so that the counter is in the same process + monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0') + + with ( + compilation_counter.expect(num_graphs_seen=0, + dynamo_as_is_count=0), + # loading the model causes compilation (if enabled) to happen + vllm_runner('facebook/opt-125m', + compilation_config={"level": 0}, + gpu_memory_utilization=0.4) as _): + pass + + +# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 +@pytest.mark.forked +def test_enforce_eager(vllm_runner, monkeypatch): + 
# Disable multiprocessing so that the counter is in the same process + monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0') + + with ( + compilation_counter.expect(num_graphs_seen=0, + dynamo_as_is_count=0), + # loading the model causes compilation (if enabled) to happen + vllm_runner('facebook/opt-125m', + enforce_eager=True, + gpu_memory_utilization=0.4) as _): + pass diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py index 6acb8abb3deb1..e01dd3915a3a1 100644 --- a/vllm/compilation/counter.py +++ b/vllm/compilation/counter.py @@ -27,6 +27,8 @@ class CompilationCounter: num_cache_entries_updated: int = 0 # The number of standalone_compile compiled artifacts saved num_compiled_artifacts_saved: int = 0 + # Number of times a model was loaded with CompilationLevel.DYNAMO_AS_IS + dynamo_as_is_count: int = 0 def clone(self) -> "CompilationCounter": return copy.deepcopy(self) diff --git a/vllm/config.py b/vllm/config.py index 7ae615f477057..86c3b9eae64cb 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4106,9 +4106,11 @@ class CompilationConfig: certain small batchsizes, where inductor is good at optimizing. """ # Top-level Compilation control - level: int = 0 + level: Optional[int] = None """The level of compilation: + - None: If None, we will select the default compilation level. + For V1 engine this is 3, for V0 engine this is 0. - 0: no compilation. - 1: dynamo as is. - 2: dynamo once. @@ -4664,6 +4666,22 @@ class VllmConfig: "To workaround this limitation, vLLM will set 'ieee' input " "precision for chunked prefill triton kernels.") + # If the user does not explicitly set a compilation level, then + # we use the default level. The default level depends on other + # settings (see the below code). 
+ if self.compilation_config.level is None: + if envs.VLLM_USE_V1: + if (self.model_config is not None + and not self.model_config.enforce_eager): + self.compilation_config.level = CompilationLevel.PIECEWISE + else: + self.compilation_config.level = \ + CompilationLevel.NO_COMPILATION + else: + # NB: Passing both --enforce-eager and a compilation level + # in V0 means the compilation level wins out. + self.compilation_config.level = CompilationLevel.NO_COMPILATION + # async tp is built on top of sequence parallelism # and requires it to be enabled. if self.compilation_config.pass_config.enable_async_tp: @@ -4676,7 +4694,6 @@ class VllmConfig: # By default, V1 uses piecewise CUDA graphs. If full_cuda_graph # is set to True, full CUDA graphs will be used. self.compilation_config.cudagraph_num_of_warmups = 1 - self.compilation_config.level = CompilationLevel.PIECEWISE self.compilation_config.set_splitting_ops_for_v1() self._set_cudagraph_sizes() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index fc55d09fc97e7..84ad582c9c9de 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -43,7 +43,7 @@ from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, GiB_bytes, LazyLoader, check_use_alibi, get_dtype_size, - is_pin_memory_available, round_up) + is_pin_memory_available, round_up, supports_dynamo) from vllm.v1.attention.backends.mamba_selectors import get_mamba_attn_backend from vllm.v1.attention.backends.utils import ( AttentionMetadataBuilder, CommonAttentionMetadata, @@ -1930,6 +1930,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): rank_mapping, ) + if ( + self.vllm_config.compilation_config.level == \ + CompilationLevel.DYNAMO_AS_IS and supports_dynamo() + ): + backend = self.vllm_config.compilation_config.init_backend( + 
self.vllm_config) + compilation_counter.dynamo_as_is_count += 1 + self.model.compile( + fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, + backend=backend) + def reload_weights(self) -> None: assert getattr(self, "model", None) is not None, \ "Cannot reload weights before model is loaded." diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 5a185e7451ade..20b9b733cd3b9 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -22,6 +22,7 @@ import vllm.envs as envs from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention.backends.abstract import AttentionState from vllm.attention.backends.utils import CommonAttentionState +from vllm.compilation.counter import compilation_counter from vllm.config import CompilationLevel, VllmConfig from vllm.core.scheduler import SchedulerOutputs from vllm.distributed import broadcast_tensor_dict, get_pp_group @@ -1121,6 +1122,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): CompilationLevel.DYNAMO_AS_IS and supports_dynamo(): backend = self.vllm_config.compilation_config.init_backend( self.vllm_config) + compilation_counter.dynamo_as_is_count += 1 self.model = torch.compile( self.model, fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, From f693b067a28768e16534cfd49672c020c41071b0 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 29 Jul 2025 15:22:50 +0100 Subject: [PATCH 016/224] [Docs] Merge design docs for a V1 only future (#21832) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/.nav.yml | 4 +- docs/design/automatic_prefix_caching.md | 40 --------------- docs/design/huggingface_integration.md | 2 +- docs/design/{v1 => }/metrics.md | 0 docs/design/{v1 => }/multiprocessing.md | 0 docs/design/{v1 => }/p2p_nccl_connector.md | 56 +++++++++++---------- docs/design/{kernel => }/paged_attention.md | 4 ++ docs/design/{v1 => }/prefix_caching.md | 0 docs/design/{v1 => 
}/torch_compile.md | 0 9 files changed, 35 insertions(+), 71 deletions(-) delete mode 100644 docs/design/automatic_prefix_caching.md rename docs/design/{v1 => }/metrics.md (100%) rename docs/design/{v1 => }/multiprocessing.md (100%) rename docs/design/{v1 => }/p2p_nccl_connector.md (95%) rename docs/design/{kernel => }/paged_attention.md (99%) rename docs/design/{v1 => }/prefix_caching.md (100%) rename docs/design/{v1 => }/torch_compile.md (100%) diff --git a/docs/.nav.yml b/docs/.nav.yml index ab54dc3e535bd..ad742be3d6947 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -56,9 +56,7 @@ nav: - contributing/model/tests.md - contributing/model/multimodal.md - CI: contributing/ci - - Design Documents: - - V0: design - - V1: design/v1 + - Design Documents: design - API Reference: - Summary: api/README.md - Contents: diff --git a/docs/design/automatic_prefix_caching.md b/docs/design/automatic_prefix_caching.md deleted file mode 100644 index 60e21f6ad0fcb..0000000000000 --- a/docs/design/automatic_prefix_caching.md +++ /dev/null @@ -1,40 +0,0 @@ -# Automatic Prefix Caching - -The core idea of [PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand. - -To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block. 
- -```text - Block 1 Block 2 Block 3 - [A gentle breeze stirred] [the leaves as children] [laughed in the distance] -Block 1: |<--- block tokens ---->| -Block 2: |<------- prefix ------>| |<--- block tokens --->| -Block 3: |<------------------ prefix -------------------->| |<--- block tokens ---->| -``` - -In the example above, the KV cache in the first block can be uniquely identified with the tokens “A gentle breeze stirred”. The third block can be uniquely identified with the tokens in the block “laughed in the distance”, along with the prefix tokens “A gentle breeze stirred the leaves as children”. Therefore, we can build the following one-to-one mapping: - -```text -hash(prefix tokens + block tokens) <--> KV Block -``` - -With this mapping, we can add another indirection in vLLM’s KV cache management. Previously, each sequence in vLLM maintained a mapping from their logical KV blocks to physical blocks. To achieve automatic caching of KV blocks, we map the logical KV blocks to their hash value and maintain a global hash table of all the physical blocks. In this way, all the KV blocks sharing the same hash value (e.g., shared prefix blocks across two requests) can be mapped to the same physical block and share the memory space. - -This design achieves automatic prefix caching without the need of maintaining a tree structure among the KV blocks. More specifically, all of the blocks are independent of each other and can be allocated and freed by itself, which enables us to manages the KV cache as ordinary caches in operating system. - -## Generalized Caching Policy - -Keeping all the KV blocks in a hash table enables vLLM to cache KV blocks from earlier requests to save memory and accelerate the computation of future requests. For example, if a new request shares the system prompt with the previous request, the KV cache of the shared prompt can directly be used for the new request without recomputation. 
However, the total KV cache space is limited and we have to decide which KV blocks to keep or evict when the cache is full. - -Managing KV cache with a hash table allows us to implement flexible caching policies. As an example, in current vLLM, we implement the following eviction policy: - -* When there are no free blocks left, we will evict a KV block with reference count (i.e., number of current requests using the block) equals 0. -* If there are multiple blocks with reference count equals to 0, we prioritize to evict the least recently used block (LRU). -* If there are multiple blocks whose last access time are the same, we prioritize the eviction of the block that is at the end of the longest prefix (i.e., has the maximum number of blocks before it). - -Note that this eviction policy effectively implements the exact policy as in [RadixAttention](https://lmsys.org/blog/2024-01-17-sglang/) when applied to models with full attention, which prioritizes to evict reference count zero and least recent used leaf nodes in the prefix tree. - -However, the hash-based KV cache management gives us the flexibility to handle more complicated serving scenarios and implement more complicated eviction policies beyond the policy above: - -* Multi-LoRA serving. When serving requests for multiple LoRA adapters, we can simply let the hash of each KV block to also include the LoRA ID the request is querying for to enable caching for all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency. -* Multi-modal models. When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, perceptual hashing for images to cache similar input images. 
diff --git a/docs/design/huggingface_integration.md b/docs/design/huggingface_integration.md index 7b01313ddb00a..5a7582c86d49f 100644 --- a/docs/design/huggingface_integration.md +++ b/docs/design/huggingface_integration.md @@ -1,4 +1,4 @@ -# Integration with HuggingFace +# Integration with Hugging Face This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run `vllm serve`. diff --git a/docs/design/v1/metrics.md b/docs/design/metrics.md similarity index 100% rename from docs/design/v1/metrics.md rename to docs/design/metrics.md diff --git a/docs/design/v1/multiprocessing.md b/docs/design/multiprocessing.md similarity index 100% rename from docs/design/v1/multiprocessing.md rename to docs/design/multiprocessing.md diff --git a/docs/design/v1/p2p_nccl_connector.md b/docs/design/p2p_nccl_connector.md similarity index 95% rename from docs/design/v1/p2p_nccl_connector.md rename to docs/design/p2p_nccl_connector.md index 9d334f8873d97..082dff15ef2c8 100644 --- a/docs/design/v1/p2p_nccl_connector.md +++ b/docs/design/p2p_nccl_connector.md @@ -1,8 +1,10 @@ +# P2P NCCL Connector + An implementation of xPyD with dynamic scaling based on point-to-point communication, partly inspired by Dynamo. -# Detailed Design +## Detailed Design -## Overall Process +### Overall Process As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow: 1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface. @@ -15,7 +17,7 @@ As shown in Figure 1, the overall process of this **PD disaggregation** solution ![image1](https://github.com/user-attachments/assets/fb01bde6-755b-49f7-ad45-48a94b1e10a7) -## Proxy/Router (Demo) +### Proxy/Router (Demo) A simple HTTP service acts as the entry point for client requests and starts a background thread to listen for P/D instances reporting their HTTP IP and PORT, as well as ZMQ IP and PORT. 
It maintains a dictionary of `http_addr -> zmq_addr`. The `http_addr` is the IP:PORT for the vLLM instance's request, while the `zmq_addr` is the address for KV cache handshake and metadata reception. @@ -29,13 +31,13 @@ Currently, to quickly verify whether xPyD can work, a round-robin selection of 1 Each P/D instance periodically sends a heartbeat packet to the Proxy/Router (currently every 3 seconds) to register (i.e., report `http_addr -> zmq_addr`) and keep the connection alive. If an instance crashes and fails to send a ping for a certain period of time, the Proxy/Router will remove the timed-out instance (this feature has not yet been developed). -## KV Cache Transfer Methods +### KV Cache Transfer Methods There are three methods for KVCache transfer: PUT, GET, and PUT_ASYNC. These methods can be specified using the `--kv-transfer-config` and `kv_connector_extra_config` parameters, specifically through the `send_type` field. Both PUT and PUT_ASYNC involve the P instance actively sending KVCache to the D instance. The difference is that PUT is a synchronous transfer method that blocks the main process, while PUT_ASYNC is an asynchronous transfer method. PUT_ASYNC uses a dedicated thread for sending KVCache, which means it does not block the main process. In contrast, the GET method involves the P instance saving the KVCache to the memory buffer after computing the prefill. The D instance then actively retrieves the computed KVCache from the P instance once it has allocated space for the KVCache. Experimental results have shown that the performance of these methods, from highest to lowest, is as follows: PUT_ASYNC → GET → PUT. -## P2P Communication via ZMQ & NCCL +### P2P Communication via ZMQ & NCCL As long as the address of the counterpart is known, point-to-point KV cache transfer (using NCCL) can be performed, without being constrained by rank and world size. To support dynamic scaling (expansion and contraction) of instances with PD disaggregation. 
This means that adding or removing P/D instances does not require a full system restart. @@ -43,7 +45,7 @@ Each P/D instance only needs to create a single `P2pNcclEngine` instance. This i When a P instance and a D instance transmit KVCache for the first time, they need to establish a ZMQ connection and an NCCL group. For subsequent KVCache transmissions, this ZMQ connection and NCCL group are reused. The NCCL group consists of only two ranks, meaning the world size is equal to 2. This design is intended to support dynamic scaling, which means that adding or removing P/D instances does not require a full system restart. As long as the address of the counterpart is known, point-to-point KVCache transmission can be performed, without being restricted by rank or world size. -## NCCL Group Topology +### NCCL Group Topology Currently, only symmetric TP (Tensor Parallelism) methods are supported for KVCache transmission. Asymmetric TP and PP (Pipeline Parallelism) methods will be supported in the future. Figure 2 illustrates the 1P2D setup, where each instance has a TP (Tensor Parallelism) degree of 2. There are a total of 7 NCCL groups: three vLLM instances each have one NCCL group with TP=2. Additionally, the 0th GPU card of the P instance establishes an NCCL group with the 0th GPU card of each D instance. Similarly, the 1st GPU card of the P instance establishes an NCCL group with the 1st GPU card of each D instance. @@ -51,7 +53,7 @@ Currently, only symmetric TP (Tensor Parallelism) methods are supported for KVCa Each NCCL group occupies a certain amount of GPU memory buffer for communication, the size of which is primarily influenced by the `NCCL_MAX_NCHANNELS` environment variable. When `NCCL_MAX_NCHANNELS=16`, an NCCL group typically occupies 100MB, while when `NCCL_MAX_NCHANNELS=8`, it usually takes up 52MB. For large-scale xPyD configurations—such as DeepSeek's 96P144D—this implementation is currently not feasible. 
Moving forward, we are considering using RDMA for point-to-point communication and are also keeping an eye on UCCL. -## GPU Memory Buffer and Tensor Memory Pool +### GPU Memory Buffer and Tensor Memory Pool The trade-off in the size of the memory buffer is as follows: For P instances, the memory buffer is not required in PUT and PUT_ASYNC modes, but it is necessary in GET mode. For D instances, a memory buffer is needed in all three modes. The memory buffer for D instances should not be too large. Similarly, for P instances in GET mode, the memory buffer should also not be too large. The memory buffer of D instances is used to temporarily store KVCache sent by P instances. If it is too large, it will reduce the KVCache space available for normal inference by D instances, thereby decreasing the inference batch size and ultimately leading to a reduction in output throughput. The size of the memory buffer is configured by the parameter `kv_buffer_size`, measured in bytes, and is typically set to 5%~10% of the memory size. @@ -59,15 +61,15 @@ If the `--max-num-seqs` parameter for P instances is set to a large value, due t To address the above issues, I have designed and developed a local Tensor memory pool for storing KVCache, inspired by the buddy system used in Linux memory modules. Since the memory is sufficiently large, typically in the TB range on servers, there is no need to consider prefix caching or using block-based designs to reuse memory, thereby saving space. When the memory buffer is insufficient, KVCache can be directly stored in the Tensor memory pool, and D instances can subsequently retrieve KVCache from it. The read and write speed is that of PCIe, with PCIe 4.0 having a speed of approximately 21 GB/s, which is usually faster than the Prefill speed. Otherwise, solutions like Mooncake and lmcache would not be necessary. The Tensor memory pool acts as a flood diversion area, typically unused except during sudden traffic surges. 
In the worst-case scenario, my solution performs no worse than the normal situation with a Cache store. -# Install vLLM +## Install vLLM ```shell pip install "vllm>=0.9.2" ``` -# Run xPyD +## Run xPyD -## Instructions +### Instructions - The following examples are run on an A800 (80GB) device, using the Meta-Llama-3.1-8B-Instruct model. - Pay attention to the setting of the `kv_buffer_size` (in bytes). The empirical value is 10% of the GPU memory size. This is related to the kvcache size. If it is too small, the GPU memory buffer for temporarily storing the received kvcache will overflow, causing the kvcache to be stored in the tensor memory pool, which increases latency. If it is too large, the kvcache available for inference will be reduced, leading to a smaller batch size and decreased throughput. - For Prefill instances, when using non-GET mode, the `kv_buffer_size` can be set to 1, as Prefill currently does not need to receive kvcache. However, when using GET mode, a larger `kv_buffer_size` is required because it needs to store the kvcache sent to the D instance. @@ -79,16 +81,16 @@ pip install "vllm>=0.9.2" - Supports multiple nodes; you just need to modify the `proxy_ip` and `proxy_port` in `--kv-transfer-config`. - In the following examples, it is assumed that **the proxy's IP is 10.0.1.1**. -## Run 1P3D +### Run 1P3D -### Proxy (e.g. 10.0.1.1) +#### Proxy (e.g. 10.0.1.1) ```shell cd {your vllm directory}/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/ python3 disagg_proxy_p2p_nccl_xpyd.py & ``` -### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1) +#### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1) ??? console "Command" @@ -110,7 +112,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20001"}}' > /var/vllm.log 2>&1 & ``` -### Decode1 (e.g. 10.0.1.3 or 10.0.1.1) +#### Decode1 (e.g. 
10.0.1.3 or 10.0.1.1) ??? console "Command" @@ -132,7 +134,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20002"}}' > /var/vllm.log 2>&1 & ``` -### Decode2 (e.g. 10.0.1.4 or 10.0.1.1) +#### Decode2 (e.g. 10.0.1.4 or 10.0.1.1) ??? console "Command" @@ -154,7 +156,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003"}}' > /var/vllm.log 2>&1 & ``` -### Decode3 (e.g. 10.0.1.5 or 10.0.1.1) +#### Decode3 (e.g. 10.0.1.5 or 10.0.1.1) ??? console "Command" @@ -176,16 +178,16 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20004"}}' > /var/vllm.log 2>&1 & ``` -## Run 3P1D +### Run 3P1D -### Proxy (e.g. 10.0.1.1) +#### Proxy (e.g. 10.0.1.1) ```shell cd {your vllm directory}/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/ python3 disagg_proxy_p2p_nccl_xpyd.py & ``` -### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1) +#### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1) ??? console "Command" @@ -207,7 +209,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20001"}}' > /var/vllm.log 2>&1 & ``` -### Prefill2 (e.g. 10.0.1.3 or 10.0.1.1) +#### Prefill2 (e.g. 10.0.1.3 or 10.0.1.1) ??? 
console "Command" @@ -229,7 +231,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20002"}}' > /var/vllm.log 2>&1 & ``` -### Prefill3 (e.g. 10.0.1.4 or 10.0.1.1) +#### Prefill3 (e.g. 10.0.1.4 or 10.0.1.1) ??? console "Command" @@ -251,7 +253,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003"}}' > /var/vllm.log 2>&1 & ``` -### Decode1 (e.g. 10.0.1.5 or 10.0.1.1) +#### Decode1 (e.g. 10.0.1.5 or 10.0.1.1) ??? console "Command" @@ -273,7 +275,7 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20004"}}' > /var/vllm.log 2>&1 & ``` -# Single request +## Single request ```shell curl -X POST -s http://10.0.1.1:10001/v1/completions \ @@ -286,7 +288,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \ }' ``` -# Benchmark +## Benchmark ??? 
console "Command" @@ -310,14 +312,14 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \ --num-prompts 1000 ``` -# Shut down +## Shut down ```shell pgrep python | xargs kill -9 && pkill -f python ``` -# Test data +## Test data -## **Scenario**: 1K input & 200 output tokens, E2E P99 latency ~2s +### **Scenario**: 1K input & 200 output tokens, E2E P99 latency ~2s ![testdata](https://github.com/user-attachments/assets/cef0953b-4567-4bf9-b940-405b92a28eb1) diff --git a/docs/design/kernel/paged_attention.md b/docs/design/paged_attention.md similarity index 99% rename from docs/design/kernel/paged_attention.md rename to docs/design/paged_attention.md index 94bfa97ee2217..ef525e8c60412 100644 --- a/docs/design/kernel/paged_attention.md +++ b/docs/design/paged_attention.md @@ -1,5 +1,9 @@ # vLLM Paged Attention +!!! warning + This document is being kept in the vLLM documentation for historical purposes. + It no longer describes the code used in vLLM today. + Currently, vLLM utilizes its own implementation of a multi-head query attention kernel (`csrc/attention/attention_kernels.cu`). 
This kernel is designed to be compatible with diff --git a/docs/design/v1/prefix_caching.md b/docs/design/prefix_caching.md similarity index 100% rename from docs/design/v1/prefix_caching.md rename to docs/design/prefix_caching.md diff --git a/docs/design/v1/torch_compile.md b/docs/design/torch_compile.md similarity index 100% rename from docs/design/v1/torch_compile.md rename to docs/design/torch_compile.md From 759b87ef3e29da09f36b37046e8ff51196c09679 Mon Sep 17 00:00:00 2001 From: Brittany <24945384+bvrockwell@users.noreply.github.com> Date: Tue, 29 Jul 2025 07:23:19 -0700 Subject: [PATCH 017/224] [TPU] Add an optimization doc on TPU (#21155) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/assets/design/v1/tpu/most_model_len.png | Bin 0 -> 12126 bytes docs/configuration/tpu.md | 104 +++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 docs/assets/design/v1/tpu/most_model_len.png create mode 100644 docs/configuration/tpu.md diff --git a/docs/assets/design/v1/tpu/most_model_len.png b/docs/assets/design/v1/tpu/most_model_len.png new file mode 100644 index 0000000000000000000000000000000000000000..344a81ed90801ee1a2ff1343f3609c8318c96f75 GIT binary patch literal 12126 zcmds73p~_m_aD~emZDhORwAoha+}F5>o&x=MZ>sX8fKVb#*8uUAw{}ucnc{ZMGRq+ zyO3@w6^WF4wGtUp?za4&-C?O25Nbqf5JL`WB8NcWXfzc(3Fk}4QG!$g2n-+s<2YYJ0B3?VHG)JY3k;7IVYBmwQF z8b~b_q$Uu{TcYi)t>h5K;5(V*PXNEn2;TlwXo)eAPNe{esX9_yMGg7`gmyR|9G$d6 z1+>!;0zHUC4Ol)50-6kTLjytHI5J^btiCMqqSEmM`mzY{c7awnFE1a;ziZ>_q=cE#aWtYe70XBbJqml{2sPhe5}pvm(YCyjL8X!zB-$^9 z-qe5qf;R_Bjz%1vP7VF#G#@INb2`o(8aVhbv_k^BUzrq=I+E}V;0us5X=`w{2g%U{ zk}r|_W_2~r2nENT%n<|;ad>Lz@_M#tB`V8y5`KwCM?okR&UCXgYA&r=zX~$*BR1 z6>=Z;50D2j=v05giV=i~mL`A;if$;7Xt$@Y_(V!1QNS*sTqjL_>4h95T*}K(?g@kvOT7zT9p81UhKUe=+_O 
z_*^$36MUcz{H~L%NXyE9TVCGDH6_OgJb%uUvxkyHG-A))}<>AyOpe=2=B@FBUe zy5d^>hlKQh(9`^rL;7!vXas^&jdCk39nSL#m*58P%1VnH_0`=v@^n@Qq1M(j>lG~GKQ#1c>F!h=cwHA7?12T-T~)Y z^k&bMF4oL`+lagqoH;SPIIpk2Wq9c0{Qil0L38kPCllbt4t#Y-Va8czZ=t<{YYjbbvo`Je!t`6; z#TnLB(vwThlP_;mo?dni8O{vtzISSSv2SZuW!R)wOSVDdG^?kpHiR|W4YpJ?J=)ol zIZzwyzi}6%fYM!l4~y9nJ(qK0v+mPRKYp9UOd-u%+_#!;Bs$h2bA30vr zoE0_W+~pf@ohy@PYj1dBlTJInGsl?UwZbn|*te!HxXWEL@TEzHK}>^nwtnBEYy&FL zYB^vuG%Da`d?48?0xutHi!OG$a-%-hf6MDLzFAkEz+Hl4FBV#)_EZPHa$8(T z-ENcBl9M|l?xGX)nt{7?Ks#hCy)Sx^RT=oov6p@L3 z+Zysa4yUAjD0D0xpUJTAyf*jp_)HW3mg=Yrt@?`(^sLSk^jRN&xcjf0EJtP;dc?Lf zxIWpXDj>6G{S$iso5vU3VSW=paiK?xM~A!wQBv;$%ki<&3YRO_%}>-05j`tfSbRMI!6n5fl@4Q%(bwH zEem0}%&NulVtDe5TaF5?#v&v38QFXqDZmz`QJ$CG7AFg(ZqEw4feqPZAFNP4HIHuG zV?i0^S;wjvAwIqk|1d+V6$>`57~Xk$!?5sd$|PKX`JKVx-X;yc^4{PcN>z;(Q7@#} zGa`BLzHw5bf8(~^z2(HcO7;b|g_*%8^de`RnhwHMC#>W5JdN&T+U563IlsTJte|jD zna87=)}*&DkbuoEJ%o`68_MYdDTHN^{WP@fS$$-c>w?{Yj_BRHym9uXQhEzgNS=D8 zs!O>%IkK?9ucJWhamAF=HK~}S&YaSV?NUzF$r$Y2SVJEP>0HFyyL8VH#q&aDonrjF z0w&u*8XInYE)WOC(!Ub8WqKg_*a_KU>33UrP{#yg4as~0%xBu|LcDR}L9kdDc!JrI z5DoAtcaP&^9`%So+$Tr zHQly5La5Y?J%W}dqp`&c21&K)l_lS7lTt@1QbNwfgZ<2HHAK%N)=hl@op(?XxQ02q zx*046+hn!1sYBZYQXR;U(x0>RI#wpNZC6vzqjvV5W9QZhY{6h(3CGs4(S2tt{rJ1& z_cXUR>(~IE5yM!0!9E-nE}%e7IGiX09Z?kX0I~#(=W~|ch_}ykTz`#4E%08dYF$)m zH-fet`;~y(^s84A7;Moe)m7H~PfWaCiuE863h8%9640TpEAgQ27ri)-WQWB)aO~G# z{Gl`dwu^rbQk?Pqo_^D3>d_DO$L+he37qbomGn(4hB<9%^{rQ&yOk6om~Xb2a<&Z(tJR|@) zcU2kA-vuQrLvc_ViHDtIU#6m$L6dq}wODp} z2@Q+ZAhuoC>vL?3{wROedcbZPpPl&NU5eL&1w9x#Wfc6z;S^p)*~X^zm=RrOHKbBh zDfWBM2St90va32rJBzG&QpFafig@xiLkp0CAoA*;D-|$SOMUwc3=iwikOXa`R&UO^ zZ=P)zgQQEOpV=lkdl1^Q673=Nc*>6tJtZxqRx@j8@*Su~rLmILwYJmi{ZuQus>?FJ zYe-7_+Q#D$n0_n6@cclXs<^iQ)5|q(sdA+cZm>4b+ZW&N!4F15fjYp)E5f8{73~v> zmy9%Pu;yQYmd(l0ZW>!ul};2j_y5ateQZDE+xkMeuIOTyiu*iaisT8M-6*BREv z+xyJzc{`~eUL6CSh6qvu5!cNw3eW)LRG(dYOrRVn-~#f4OZp5uKeX>iR*}gwYnzWK z*qO9p`&OVoP!@m8noiYi}Vd&^=%Ch zN*T_IeB^G{ga{dPDgu3uYj@ 
zldb<0Fek!T3^*s?iHaRdJuXfBPy&+?{%z?2$p1&UVDE`cv?)>^x4t%+ufYjY~I!3?m0X8HdPa(54 zz#U+*<$`W&6tFr%M?7Obc4Dm~zHope1A4Vx%kW1R0i_`&x!U^d&Ok>y;?RRLPds8i z20PYUozs52D`boq^W$Vfra??U>qD+jjjA|C$>FjZYw1CjUQbr|>kYb53*|+HfD`fo z{d*%Y>qEe#G__6^&t2M=Py`-u zU!$Nrudojs_J_3liJNr1AkIZ1rJpFlC!uKnJz$nx8}`p{>N{_Ai2Ul#iAu%@MsiKz zHAcb0Uc%l>R%ec&{JKiqOE8$ui`RPr@8e$zg2|y_f4HQ83ubkcM@+uon|s@{84}0S z_@=KUhW3XyH!OXMbd(s4F8Q|O(zwz|3k#Xz4<+B?rvNX}Lv8%{I!!*fz;6+_K*{{z zst;?SvSqNF_q1`LS&~#qyFh%$TSb_3bs7?a;3WC)i>dX zZdh(n;@b+?m)og3oDsK^{Qya%_x`TzxJzN8{XKc@hEEIrRX<)zK$d&v8e7H)*<)=B zXT)<0+u4LjW~rb2$z(-`%jp_`f+$zb6S8RrR(6+Qs&!JdwjUvTcWyz z6_j(B$DU-R&m=U=zuxHIS(xhT(Sdnowj~M=4s=ymQl&c2LeDlwj|p7jEU#(jd6bh8 za^H2>+YY9kiA>(2Bkk=%!~v5oMtC-3{TAlY0#=*<_F3Fm0i9JK7O08e1j;nlLC5)2-T`J^jOlHDHr!cGac}!I)h( zHmeRNf}By8FC+25@o)#Ajo*~CvOC9Q&DbM=Pf0pH@%(BtYi<0@CDiGshj=5(_C)Tz zw-&oALIBkyIc~fAQr@i&sR(foKou&RC2l%x^F(%N)b+)UcvaQr_)nOa{Crr&RKD5E zQ(OFO7I81u)K}W(pqA#9^qv~8+`(VlI0U@ z1J|TA`5bhKO4DRPO}#Pafd>0!?cW%p^NyZ4#eN8ggUJNTrqrxrUvOu^yq#A9+1=&! e6QXl5M|<@J;ajtk!GD;CneX0XQi9rd@P7b99exG? literal 0 HcmV?d00001 diff --git a/docs/configuration/tpu.md b/docs/configuration/tpu.md new file mode 100644 index 0000000000000..005b7f78f4407 --- /dev/null +++ b/docs/configuration/tpu.md @@ -0,0 +1,104 @@ +# TPU Optimization Tips + +This doc serves as a collection of handy tips for optimizing your vLLM on TPU workload. + +## Get started + +Looking for setup and installation instructions? Find them [here](../getting_started/installation/google_tpu.md). + +### TPU workload sizing + +When selecting the ideal number of chips for a single serving instance, it's important to account for both the model size and the average request context length. Adequate HBM for the KV cache is essential to ensure a sufficient number of concurrent requests can be processed. 
+ +The following colab [calculator](https://colab.research.google.com/github/ericehanley/rightsize-vllm/blob/main/HBM_Calculator.ipynb) will tell you: + +- KV cache size requirement per token and per request +- TPU/GPU memory consumed by the model weights +- TPU/GPU memory allocated for the KV cache +- Maximum \# of requests you can approximately set (--max-num-seqs) + +This approach serves as a general rule of thumb. + +#### Latency-throughput tradeoff + +As with rightsizing the number of chips for your workload, consider adjusting `--max-num-seqs` to fine-tune the latency-throughput balance. Decreasing `--max-num-seqs` and/or increasing the number of chips can help reduce latency. + +`--max-num-seqs` defines the number of concurrent decode slots, effectively limiting the number of requests the server can process tokens for simultaneously. Increasing this value allows the server to pre-allocate more HBM to handle a higher number of concurrent requests, which can maximize overall throughput. However, this often increases the end-to-end (e2e) latency per request. + +Therefore, carefully tuning `--max-num-seqs` is crucial to achieving the desired balance between latency and throughput for your specific workload. + +In a similar way, `--max-num-batched-tokens` can be adjusted down to improve latency, or adjusted up to improve throughput. + +#### Compilation and Caching + +Coming from a GPU background, one of the key differences you'll notice with TPUs is an initial compilation step. TPUs are specialized accelerators (ASICs) that achieve maximum performance by executing pre-compiled, static computation graphs via the XLA compiler. Unlike GPUs, which can handle dynamic input shapes more flexibly, TPUs require a specific compiled graph for each tensor shape (e.g., batch size and sequence length) they process. + +To manage this, vLLM performs a one-time "warmup" process when you first launch the server. 
During this phase, it pre-compiles the model for various common input shapes and saves these compiled graphs to a cache on disk or remote storage (located at `~/.cache/vllm/xla_cache` by default). This process can range significantly, anywhere from a few minutes to an hour depending on the size of the model and context length used. + +Although the first compilation can take some time, for all subsequent server launches, vLLM can load these graphs directly from the cache, eliminating the compilation time for future runs. + +Use the `VLLM_XLA_CACHE_PATH` environment variable to write to shareable storage for future deployed nodes (like when using autoscaling). + +#### Reducing compilation time +This initial compilation time ranges significantly and is impacted by many of the arguments discussed in this optimization doc. Factors that influence the length of time to compile are things like model size and `--max-num-batched-tokens`. Other arguments you can tune are things like `VLLM_TPU_MOST_MODEL_LEN`. + +### Optimize based on your data + +#### max model len vs. most model len + +![most_model_len](../assets/design/v1/tpu/most_model_len.png) + +If most of your requests are shorter than the maximum model length but you still need to accommodate occasional longer requests, setting a high maximum model length can negatively impact performance. In these cases, you can try introducing most model len by specifying the `VLLM_TPU_MOST_MODEL_LEN` environment variable. + +For example, suppose 1% of requests are 32k in length and 99% of requests are 2k in length. You can pass 32k into `--max-model-len 32768` and use `VLLM_TPU_MOST_MODEL_LEN=2048`. + +The requests get subdivided into max-model-len and most-model-len categories; for the latter category, we can gain better performance since the server can process more requests at a time. + +#### Padding + +For online serving with latency requirements, consider switching to bucket padding by setting the `VLLM_TPU_BUCKET_PADDING_GAP` environment variable. 
Because of the layout of the TPU, try using increments of 128: 128, 256, etc. + +The server pads the requests into fixed lengths before sending them to the model to avoid recompilation. To read more about TPU padding, see [here](https://cloud.google.com/tpu/docs/performance-guide#xla-efficiencies). Currently, there are 2 ways to pad the requests: + +1) the default exponential padding (pad to the nearest power of 2) +2) bucket padding (pad to the nearest linearly increasing bucket). + +When using bucket padding, the buckets start from 16, end at max_model_len, and increment by `VLLM_TPU_BUCKET_PADDING_GAP`. + +For example, max_model_len=512, padding_gap=64, the buckets will be [16, 32, 64, 128, 192, 256, 320, 384, 448, 512]. + +The fewer tokens we pad, the less unnecessary computation TPU does, the better performance we can get. For example, if num_tokens=300, with exponential padding, we pad to 512; with the bucket_padding above, we pad to 320. + +However, you need to be careful to choose the padding gap. If the gap is too small, it means the number of buckets is large, leading to increased warmup (precompile) time and higher memory to store the compiled graph. Too many compiled graphs may lead to HBM OOM. Conversely, an overly large gap yields no performance improvement compared to the default exponential padding. + +**If possible, use the precision that matches the chip’s hardware acceleration** + +- v5e has int4/int8 hardware acceleration in the MXU +- v6e has int4/int8 hardware acceleration in the MXU + +Supported quantized formats and features in vLLM on TPU [Jul '25] +- INT8 W8A8 +- INT8 W8A16 +- FP8 KV cache +- [WIP] FP8 W8A8 +- [WIP] AWQ +- [WIP] FP4 W4A8 + +**Don't set TP to be less than the number of chips on a single-host deployment** + +Although it’s common to do this with GPUs, don't try to fragment 2 or 8 different workloads across 8 chips on a single host. 
If you need 1 or 4 chips, just create an instance with 1 or 4 chips (these are partial-host machine types). + +### Tune your workloads! + +Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case. + +### Future Topics We'll Cover + +#### Profiling + +The auto-tuner provides a profile of optimized configurations as its final step. However, interpreting this profile can be challenging for new users. We plan to expand this section in the future with more detailed guidance. In the meantime, you can learn how to collect a TPU profile using vLLM's native profiling tools [here](../examples/offline_inference/profiling_tpu.md). This profile can provide valuable insights into your workload's performance. + +#### SPMD +More details to come. + +**Want us to cover something that isn't listed here? Open up an issue please and cite this doc. We'd love to hear your questions or tips.** From ad341c519457fa706c549c9b7edc8438c35fd8d1 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Tue, 29 Jul 2025 22:26:31 +0800 Subject: [PATCH 018/224] [Bugfix]fix mixed bits and visual language model quantization in AutoRound (#21802) Signed-off-by: Wenhua Cheng --- .../layers/quantization/auto_round.py | 155 +++++++++++++----- 1 file changed, 116 insertions(+), 39 deletions(-) diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py index ea17cd56c9855..a9e967e608e96 100644 --- a/vllm/model_executor/layers/quantization/auto_round.py +++ b/vllm/model_executor/layers/quantization/auto_round.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from fractions import Fraction -from typing import Any, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union import torch @@ -16,6 +16,9 @@ from vllm.model_executor.layers.vocab_parallel_embedding import 
ParallelLMHead from vllm.platforms import current_platform from vllm.scalar_type import scalar_types +if TYPE_CHECKING: + from vllm.model_executor.models.utils import WeightsMapper + logger = init_logger(__name__) @@ -28,7 +31,13 @@ class AutoRoundConfig(QuantizationConfig): SUPPORTED_DTYPES = {"int"} SUPPORTED_FORMATS = {"auto_round:auto_gptq", "auto_round:auto_awq"} SUPPORTED_BACKENDS = { - "auto", "gptq", "gptq:marlin", "awq", "awq:marlin", "marlin", "ipex" + "auto", + "gptq", + "gptq:marlin", + "awq", + "awq:marlin", + "marlin", + "ipex", } def __init__( @@ -109,26 +118,70 @@ class AutoRoundConfig(QuantizationConfig): ) def get_layer_config(self, layer, layer_name: str): - # Priority: extra_config > block_name_to_quantize > type fallback - if self.extra_config and layer_name in self.extra_config: - cfg = self.extra_config[layer_name] - return cfg.get("bits", self.weight_bits), cfg.get( - "group_size", self.group_size), cfg.get("sym", self.sym) - quantized = True + def get_config(name: str, quantized: bool = True): + cfg = self.extra_config.get(name, {}) if self.extra_config else {} + return ( + cfg.get("bits", self.weight_bits if quantized else 16), + cfg.get("group_size", self.group_size if quantized else -1), + cfg.get("sym", self.sym if quantized else True), + ) + + # 1. Exact match from config + if self.extra_config and layer_name in self.extra_config: + return get_config(layer_name) + + # 2. Determine whether layer should be quantized + quantized = not isinstance(layer, ParallelLMHead) if self.block_name_to_quantize: quantized = any( layer_name.startswith(name) for name in self.block_name_to_quantize) - elif isinstance(layer, ParallelLMHead): - quantized = False - return (self.weight_bits, self.group_size, - self.sym) if quantized else (16, -1, True) + # 3. 
Handle fused MoE + if self.extra_config and "fusedmoe" in layer.__class__.__name__.lower( + ): + moe_configs = [ + get_config(name, quantized) for name in self.extra_config + if name.startswith(layer_name) + ] + if moe_configs: + if len(set(moe_configs)) == 1: + return moe_configs[0] + raise ValueError(f"Fused MoE layer '{layer_name}' requires " + f"consistent quant config for all sub-layers") + + # 4. Handle fused QKV or other patterns + if self.extra_config: + for fusion_key, sub_keys in self.packed_modules_mapping.items(): + if fusion_key in layer_name and layer_name.count( + fusion_key) == 1: + sub_names = [ + layer_name.replace(fusion_key, sub_key) + for sub_key in sub_keys + ] + sub_configs = [ + get_config(name, quantized) for name in sub_names + ] + if len(set(sub_configs)) == 1: + return sub_configs[0] + raise ValueError( + f"Fused module '{layer_name}' requires " + f"consistent quant config for {sub_names}") + + # 5. Fallback + return get_config(layer_name, quantized) def check_quantized(self, weight_bits: int) -> bool: return weight_bits < 16 + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + if self.block_name_to_quantize is not None: + self.block_name_to_quantize = hf_to_vllm_mapper.apply_list( + self.block_name_to_quantize) + if self.extra_config is not None: + self.extra_config = hf_to_vllm_mapper.apply_dict(self.extra_config) + def apply_awq_quant_layer(self, layer, prefix: str, backend: str = "auto"): from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.quantization.utils.marlin_utils import ( @@ -141,9 +194,14 @@ class AutoRoundConfig(QuantizationConfig): else: return None - logger.debug("[%s] Type: %s, Bits: %s, Group Size: %s, Sym: %s", - prefix, layer.__class__.__name__, weight_bits, group_size, - sym) + logger.debug( + "[%s] Type: %s, Bits: %s, Group Size: %s, Sym: %s", + prefix, + layer.__class__.__name__, + weight_bits, + group_size, + sym, + ) if backend == "auto" or "marlin" in 
backend: AWQ_TYPE_MAP = { 4: scalar_types.uint4, @@ -162,15 +220,19 @@ class AutoRoundConfig(QuantizationConfig): if use_marlin: from vllm.model_executor.layers.quantization.awq_marlin import ( AWQMarlinConfig, AWQMarlinLinearMethod, AWQMoEMethod) - quant_args_marlin = AWQMarlinConfig(weight_bits=weight_bits, - group_size=group_size, - zero_point=not sym, - lm_head_quantized=False, - full_config={}, - modules_to_not_convert=[]) + + quant_args_marlin = AWQMarlinConfig( + weight_bits=weight_bits, + group_size=group_size, + zero_point=not sym, + lm_head_quantized=False, + full_config={}, + modules_to_not_convert=[], + ) else: from vllm.model_executor.layers.quantization.awq import ( AWQConfig, AWQLinearMethod) + quant_args = AWQConfig( weight_bits=weight_bits, group_size=group_size, @@ -182,6 +244,7 @@ class AutoRoundConfig(QuantizationConfig): return AWQMoEMethod(quant_args_marlin) from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) + config = { "quant_method": "awq", "bits": weight_bits, @@ -206,6 +269,7 @@ class AutoRoundConfig(QuantizationConfig): from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.quantization.utils.marlin_utils import ( check_marlin_supported, check_moe_marlin_supports_layer) + weight_bits, group_size, sym = self.get_layer_config(layer, prefix) if not self.check_quantized(weight_bits): if isinstance(layer, (LinearBase, ParallelLMHead)): @@ -213,19 +277,24 @@ class AutoRoundConfig(QuantizationConfig): else: return None - logger.debug("[%s] Type: %s, Bits: %s, Group Size: %s, Sym: %s", - prefix, layer.__class__.__name__, weight_bits, group_size, - sym) + logger.debug( + "[%s] Type: %s, Bits: %s, Group Size: %s, Sym: %s", + prefix, + layer.__class__.__name__, + weight_bits, + group_size, + sym, + ) if backend == "auto" or "marlin" in backend: GPTQ_TYPE_MAP = { (4, True): scalar_types.uint4b8, (8, True): scalar_types.uint8b128, } - use_marlin = ((weight_bits, sym) in GPTQ_TYPE_MAP - 
and check_marlin_supported( + use_marlin = (weight_bits, + sym) in GPTQ_TYPE_MAP and check_marlin_supported( GPTQ_TYPE_MAP[(weight_bits, sym)], group_size, - has_zp=not sym)) + has_zp=not sym) if isinstance(layer, FusedMoE): use_marlin = use_marlin and check_moe_marlin_supports_layer( layer, group_size) @@ -234,26 +303,33 @@ class AutoRoundConfig(QuantizationConfig): if use_marlin: from vllm.model_executor.layers.quantization.gptq_marlin import ( GPTQMarlinConfig, GPTQMarlinLinearMethod, GPTQMarlinMoEMethod) - quant_args_marlin = GPTQMarlinConfig(weight_bits=weight_bits, - group_size=group_size, - is_sym=sym, - lm_head_quantized=False, - desc_act=False, - dynamic={}, - full_config={}) + + quant_args_marlin = GPTQMarlinConfig( + weight_bits=weight_bits, + group_size=group_size, + is_sym=sym, + lm_head_quantized=False, + desc_act=False, + dynamic={}, + full_config={}, + ) else: from vllm.model_executor.layers.quantization.gptq import ( GPTQConfig, GPTQLinearMethod) - quant_args = GPTQConfig(weight_bits=weight_bits, - group_size=group_size, - lm_head_quantized=False, - desc_act=False, - dynamic={}) + + quant_args = GPTQConfig( + weight_bits=weight_bits, + group_size=group_size, + lm_head_quantized=False, + desc_act=False, + dynamic={}, + ) if isinstance(layer, FusedMoE): if use_marlin: from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) + config = { "quant_method": "gptq", "bits": weight_bits, @@ -282,6 +358,7 @@ class AutoRoundConfig(QuantizationConfig): return None from vllm.model_executor.layers.quantization.ipex_quant import ( IPEXAWQLinearMethod, IPEXConfig, IPEXGPTQLinearMethod) + if isinstance(layer, (LinearBase, ParallelLMHead)): if "awq" in self.packing_format: config = IPEXConfig(method="awq", From 58b11b24a69f0d5fc48f3a6ce8291e8d92af26e2 Mon Sep 17 00:00:00 2001 From: elvischenv <219235043+elvischenv@users.noreply.github.com> Date: Tue, 29 Jul 2025 22:34:00 +0800 Subject: [PATCH 019/224] [Bugfix] Fix workspace buffer None issue 
for Flashinfer TRTLLM Backend (#21525) Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com> --- .../kernels/benchmark_trtllm_attention.py | 42 ++++++++++++------- ...test_flashinfer_trtllm_decode_attention.py | 16 ++++--- vllm/attention/backends/flashinfer.py | 15 +++++-- vllm/v1/attention/backends/flashinfer.py | 28 ++++++------- 4 files changed, 60 insertions(+), 41 deletions(-) diff --git a/benchmarks/kernels/benchmark_trtllm_attention.py b/benchmarks/kernels/benchmark_trtllm_attention.py index 8c980f930366c..68c48858e61cc 100644 --- a/benchmarks/kernels/benchmark_trtllm_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_attention.py @@ -71,22 +71,20 @@ def benchmark_decode( if kv_cache_dtype.startswith("fp8"): kv_cache, _ = to_float8(kv_cache) + output_trtllm = torch.empty(q.shape, dtype=dtype) + # Benchmark TRT decode def trt_decode(): return flashinfer.decode.trtllm_batch_decode_with_kv_cache( q, kv_cache, workspace_buffer, - num_qo_heads, - num_kv_heads, - sm_scale, block_tables, kv_lens_tensor, - page_size, max_kv_len, - kv_cache_dtype, - k_scale, - v_scale, + bmm1_scale=k_scale * sm_scale, + bmm2_scale=v_scale, + out=output_trtllm, ) def time_fn(fn, warmup=10, trials=20): @@ -125,6 +123,8 @@ def benchmark_decode( kv_indices = torch.tensor(kv_indices, dtype=torch.int32) kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) + output_baseline = torch.empty(q.shape, dtype=dtype) + wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper( workspace_buffer, kv_layout, @@ -145,7 +145,7 @@ def benchmark_decode( ) def baseline_decode(): - return wrapper.run(q, kv_cache, sm_scale, k_scale, v_scale) + return wrapper.run(q, kv_cache, sm_scale, k_scale, v_scale, output_baseline) baseline_mean, baseline_std = time_fn(baseline_decode) @@ -214,25 +214,39 @@ if __name__ == "__main__": max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072] all_results = [] - print("Running benchmark for kv_cache_dtype: bfloat16") print( 
- "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\tbaseline_std\tspeedup_percent" + "Running benchmark for q_dtype = bfloat16, kv_cache_dtype: bfloat16, " + "output_dtype: bfloat16" + ) + print( + "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t" + "baseline_std\tspeedup_percent" ) for max_seq_len in max_seq_lens: for bs in num_seqs: result = benchmark_decode( - bs, max_seq_len, dtype=torch.bfloat16, kv_cache_dtype="auto" + bs, + max_seq_len, + dtype=torch.bfloat16, + kv_cache_dtype="auto", ) all_results.append(result) - print("Running benchmark for q_dtype = bfloat16, kv_cache_dtype: fp8") print( - "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\tbaseline_std\tspeedup_percent" + "Running benchmark for q_dtype = bfloat16, kv_cache_dtype: fp8, " + "output_dtype: bfloat16" + ) + print( + "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t" + "baseline_std\tspeedup_percent" ) for max_seq_len in max_seq_lens: for bs in num_seqs: result = benchmark_decode( - bs, max_seq_len, dtype=torch.bfloat16, kv_cache_dtype="fp8" + bs, + max_seq_len, + dtype=torch.bfloat16, + kv_cache_dtype="fp8", ) all_results.append(result) diff --git a/tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py index 96eee13695a9d..2e2130fab6a21 100644 --- a/tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py +++ b/tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py @@ -113,27 +113,25 @@ def test_flashinfer_trtllm_decode_with_baseline( kv_data_type=dtype, logits_soft_cap=soft_cap) - output = wrapper.run(query, key_value_cache, scale) + output = torch.empty(query.shape, dtype=dtype) + wrapper.run(query, key_value_cache, scale, out=output) # TRTLLM Decode max_kv_len = max(kv_lens) kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int, device=query.device) - output_trtllm = flashinfer.decode.trtllm_batch_decode_with_kv_cache( + output_trtllm = 
torch.empty(query.shape, dtype=dtype) + flashinfer.decode.trtllm_batch_decode_with_kv_cache( query.contiguous(), key_value_cache, workspace_buffer, - num_query_heads, - num_kv_heads, - scale, block_tables, kv_lens_tensor, - block_size, max_kv_len, - "auto", - k_scale, - v_scale, + bmm1_scale=k_scale * scale, + bmm2_scale=v_scale, + out=output_trtllm, ) torch.testing.assert_close(output, output_trtllm, atol=1e-2, rtol=1e-2), \ diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index e6e60e7562482..824ff8cca201a 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -1104,7 +1104,12 @@ class FlashInferImpl(AttentionImpl): window_left = window_size[0] if window_size is not None else -1 prefill_output: Optional[torch.Tensor] = None - decode_output: Optional[torch.Tensor] = None + if num_decode_tokens > 0: + decode_output = torch.empty(decode_query.shape, + dtype=decode_query.dtype, + device=decode_query.device) + else: + decode_output = None stride_order = FlashInferBackend.get_kv_cache_stride_order() if prefill_meta := attn_metadata.prefill_metadata: # We will use flash attention for prefill @@ -1155,17 +1160,18 @@ class FlashInferImpl(AttentionImpl): num_decode_tokens, attn_metadata.max_decode_seq_len, kv_cache_dtype, attn_metadata.num_qo_heads, attn_metadata.num_kv_heads, attn_metadata.head_dim): - decode_output = decode_meta.decode_wrapper.run( + decode_meta.decode_wrapper.run( decode_query, kv_cache.permute(*stride_order), k_scale=layer._k_scale_float, v_scale=layer._v_scale_float, + out=decode_output, ) else: workspace_buffer = ( - decode_meta.decode_wrapper._int_workspace_buffer) + decode_meta.decode_wrapper._float_workspace_buffer) assert FlashInferState.get_kv_cache_layout() == "HND" - decode_output = trtllm_batch_decode_with_kv_cache( + trtllm_batch_decode_with_kv_cache( query=decode_query, kv_cache=kv_cache.permute(*stride_order), workspace_buffer=workspace_buffer, @@ -1174,6 
+1180,7 @@ class FlashInferImpl(AttentionImpl): max_seq_len=attn_metadata.max_decode_seq_len, bmm1_scale=layer._k_scale_float * softmax_scale, bmm2_scale=layer._v_scale_float, + out=decode_output, ) if prefill_output is None and decode_output is not None: diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index b72745ef156eb..775780807eae2 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -194,7 +194,6 @@ class FlashInferMetadata: max_seq_len: int seq_lens: torch.Tensor block_table_tensor: torch.Tensor - workspace_buffer: torch.Tensor # For handling prefill decode split num_decodes: int @@ -473,7 +472,6 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): max_seq_len=max_seq_len, seq_lens=seq_lens, block_table_tensor=block_table_tensor, - workspace_buffer=self._get_workspace_buffer(), ) self._plan(num_prefills, num_decodes, attn_metadata) @@ -641,11 +639,11 @@ class FlashInferImpl(AttentionImpl): if decode_wrapper := attn_metadata.decode_wrapper: decode_query = query[:num_decode_tokens] assert decode_query.shape[0] == num_decode_tokens + assert decode_wrapper is not None if not FlashInferBackend.use_trtllm_decode_attention( attn_metadata.num_decodes, attn_metadata.max_seq_len, self.kv_cache_dtype, attn_metadata.num_qo_heads, attn_metadata.num_kv_heads, attn_metadata.head_dim): - assert decode_wrapper is not None assert decode_wrapper._window_left == window_left assert decode_wrapper._logits_soft_cap == (self.logits_soft_cap or 0.0) @@ -666,22 +664,24 @@ class FlashInferImpl(AttentionImpl): num_decode_tokens] seq_lens_decode = attn_metadata.seq_lens[: num_decode_tokens] + workspace_buffer = decode_wrapper._float_workspace_buffer assert get_kv_cache_layout() == "HND" assert decode_query.is_contiguous() assert kv_cache_permute.is_contiguous() assert block_tables_decode.is_contiguous() assert seq_lens_decode.is_contiguous() + assert 
workspace_buffer.is_contiguous() - output[:num_decode_tokens] = ( - trtllm_batch_decode_with_kv_cache( - query=decode_query, - kv_cache=kv_cache_permute, - workspace_buffer=attn_metadata.workspace_buffer, - block_tables=block_tables_decode, - seq_lens=seq_lens_decode, - max_seq_len=attn_metadata.max_seq_len, - bmm1_scale=layer._k_scale_float * self.scale, - bmm2_scale=layer._v_scale_float, - )) + trtllm_batch_decode_with_kv_cache( + query=decode_query, + kv_cache=kv_cache_permute, + workspace_buffer=workspace_buffer, + block_tables=block_tables_decode, + seq_lens=seq_lens_decode, + max_seq_len=attn_metadata.max_seq_len, + bmm1_scale=layer._k_scale_float * self.scale, + bmm2_scale=layer._v_scale_float, + out=output[:num_decode_tokens], + ) return output_padded From 37f86d90489dd47b3f9ac4dba8cd38d5907b016f Mon Sep 17 00:00:00 2001 From: David Xia Date: Tue, 29 Jul 2025 13:32:06 -0400 Subject: [PATCH 020/224] [Docs] use `uv` in GPU installation docs (#20277) Signed-off-by: David Xia --- .../installation/gpu/cuda.inc.md | 84 ++++++++++--------- 1 file changed, 44 insertions(+), 40 deletions(-) diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md index 5298c22c8435e..69a9842e4719b 100644 --- a/docs/getting_started/installation/gpu/cuda.inc.md +++ b/docs/getting_started/installation/gpu/cuda.inc.md @@ -20,16 +20,16 @@ Therefore, it is recommended to install vLLM with a **fresh new** environment. I # --8<-- [end:set-up-using-python] # --8<-- [start:pre-built-wheels] -You can install vLLM using either `pip` or `uv pip`: - ```bash -# Install vLLM with CUDA 12.8. -# If you are using pip. -pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128 -# If you are using uv. uv pip install vllm --torch-backend=auto ``` +??? console "pip" + ```bash + # Install vLLM with CUDA 12.8. 
+ pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128 + ``` + We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). If this doesn't work, try running `uv self update` to update `uv` first. !!! note @@ -50,36 +50,22 @@ uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VE LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`. -##### Install the latest code using `pip` - -```bash -pip install -U vllm \ - --pre \ - --extra-index-url https://wheels.vllm.ai/nightly -``` - -`--pre` is required for `pip` to consider pre-released versions. - -Another way to install the latest code is to use `uv`: - ```bash uv pip install -U vllm \ --torch-backend=auto \ --extra-index-url https://wheels.vllm.ai/nightly ``` -##### Install specific revisions using `pip` +??? console "pip" + ```bash + pip install -U vllm \ + --pre \ + --extra-index-url https://wheels.vllm.ai/nightly + ``` -If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL: + `--pre` is required for `pip` to consider pre-released versions. 
-```bash -export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl -``` - -Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. - -##### Install specific revisions using `uv` +##### Install specific revisions If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL: @@ -92,17 +78,35 @@ uv pip install vllm \ The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. +??? note "pip" + If you want to access the wheels for previous commits (e.g. 
to bisect the behavior change, + performance regression), due to the limitation of `pip`, you have to specify the full URL of the + wheel file by embedding the commit hash in the URL: + + ```bash + export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch + pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl + ``` + + Note that the wheels are built with Python 3.8 ABI (see [PEP + 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible + with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a + placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in + the wheel metadata (the wheels listed in the extra index url have correct versions). Although we + don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the + wheels are still built with Python 3.8 ABI to keep the same wheel name as before. + # --8<-- [end:pre-built-wheels] # --8<-- [start:build-wheel-from-source] #### Set up using Python-only build (without compilation) -If you only need to change Python code, you can build and install vLLM without compilation. Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM: +If you only need to change Python code, you can build and install vLLM without compilation. Using `uv pip`'s [`--editable` flag](https://docs.astral.sh/uv/pip/packages/#editable-packages), changes you make to the code will be reflected when you run vLLM: ```bash git clone https://github.com/vllm-project/vllm.git cd vllm -VLLM_USE_PRECOMPILED=1 pip install --editable . +VLLM_USE_PRECOMPILED=1 uv pip install --editable . 
``` This command will do the following: @@ -121,7 +125,7 @@ In case you see an error about wheel not found when running the above command, i ```bash export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl -pip install --editable . +uv pip install --editable . ``` You can find more information about vLLM's wheels in [install-the-latest-code][install-the-latest-code]. @@ -137,7 +141,7 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T ```bash git clone https://github.com/vllm-project/vllm.git cd vllm -pip install -e . +uv pip install -e . ``` !!! tip @@ -152,14 +156,14 @@ pip install -e . The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. !!! note "Faster Kernel Development" - For frequent C++/CUDA kernel changes, after the initial `pip install -e .` setup, consider using the [Incremental Compilation Workflow](../../contributing/incremental_build.md) for significantly faster rebuilds of only the modified kernel code. + For frequent C++/CUDA kernel changes, after the initial `uv pip install -e .` setup, consider using the [Incremental Compilation Workflow](../../contributing/incremental_build.md) for significantly faster rebuilds of only the modified kernel code. ##### Use an existing PyTorch installation -There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.: +There are scenarios where the PyTorch dependency cannot be easily installed with `uv`, e.g.: - Building vLLM with PyTorch nightly or a custom PyTorch build. -- Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. 
Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run `pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124` to [install PyTorch nightly](https://pytorch.org/get-started/locally/), and then build vLLM on top of it. +- Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run `uv pip install --index-url https://download.pytorch.org/whl/nightly/cu128 torch torchvision torchaudio` to [install PyTorch nightly](https://pytorch.org/get-started/locally/) and then build vLLM on top of it. To build vLLM using an existing PyTorch installation: @@ -167,8 +171,8 @@ To build vLLM using an existing PyTorch installation: git clone https://github.com/vllm-project/vllm.git cd vllm python use_existing_torch.py -pip install -r requirements/build.txt -pip install --no-build-isolation -e . +uv pip install -r requirements/build.txt +uv pip install --no-build-isolation -e . ``` ##### Use the local cutlass for compilation @@ -179,7 +183,7 @@ To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to po ```bash git clone https://github.com/vllm-project/vllm.git cd vllm -VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . +VLLM_CUTLASS_SRC_DIR=/path/to/cutlass uv pip install -e . ``` ##### Troubleshooting @@ -189,7 +193,7 @@ to be run simultaneously, via the environment variable `MAX_JOBS`. For example: ```bash export MAX_JOBS=6 -pip install -e . +uv pip install -e . ``` This is especially useful when you are building on less powerful machines. For example, when you use WSL it only [assigns 50% of the total memory by default](https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings), so using `export MAX_JOBS=1` can avoid compiling multiple files simultaneously and running out of memory. 
@@ -228,7 +232,7 @@ Simply disable the `VLLM_TARGET_DEVICE` environment variable before installing: ```bash export VLLM_TARGET_DEVICE=empty -pip install -e . +uv pip install -e . ``` # --8<-- [end:build-wheel-from-source] From f03e9cf2bbee0b18b83ffe2ed0e8ddbd589c9cc4 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Tue, 29 Jul 2025 23:02:30 +0530 Subject: [PATCH 021/224] [Doc] Add FusedMoE Modular Kernel Documentation (#21623) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- .../fused_experts_blocks.png | Bin 0 -> 191037 bytes .../fused_moe_batched.png | Bin 0 -> 193655 bytes .../fused_moe_non_batched.png | Bin 0 -> 232056 bytes .../prepare_and_finalize_blocks.png | Bin 0 -> 130810 bytes docs/design/fused_moe_modular_kernel.md | 236 ++++++++++++++++++ 5 files changed, 236 insertions(+) create mode 100644 docs/assets/design/fused_moe_modular_kernel/fused_experts_blocks.png create mode 100644 docs/assets/design/fused_moe_modular_kernel/fused_moe_batched.png create mode 100644 docs/assets/design/fused_moe_modular_kernel/fused_moe_non_batched.png create mode 100644 docs/assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png create mode 100644 docs/design/fused_moe_modular_kernel.md diff --git a/docs/assets/design/fused_moe_modular_kernel/fused_experts_blocks.png b/docs/assets/design/fused_moe_modular_kernel/fused_experts_blocks.png new file mode 100644 index 0000000000000000000000000000000000000000..5721d5582c7f14d89e1bcd7defc58fe1669442e0 GIT binary patch literal 191037 zcmeEv1zc3w`?n$(pjaRlimgZwjf5y*0E!3*5>kWYkTdiMiUA5X3?V8?hdA^QVt@hz zN)N3FNDPQ{zvs@(MO1Wm-`$_YLc=TiWqh8QX(P;J&_zu@$X>63W@s!a{$WfV>b7AGozsN#D@a${uBRdK*F( zd_QPqZ)^cR!DaCCh${GT0{r9Wu6 zZ)$6dL^*kXkX^$QSOXQ|KZ)!w+_QXLGZvKOErkZm559rIO8Xi(4h zqb0AX#tnEAVo$Wbg#(S`)C+d@PA~`Cq8zM@jG-Ok2f~6mZEA0KR`xTlBPsuZ>vs0GD0AZ(cMyAp1QF0cw22wp z+8e`+OTA>R|Ct^X68(5nK5vbJG0M`|-qs1Mm)0Xfgho)BAOKmXUBG0a_PS4~i 
zn5Fb-ylgVvIYiIjv)`Z%ja`1c07xf(#Z|LzC5>JFuWqHV2#*kzL1>#P3~%lidno|7 znz@?@c-#M$-IPUWD4U;lP(ENUuZM6vp`wa*M8SH0h8!9j0VPNiOO&nsX_N`dO5Z~6 z^Htf;k_t?U?B_O$C=?J=iujrt+uJ+QG`hZnJqom&A-zD)*gBm6P5A(o5VDl7n$TDH z%YI03K7WHK4SoBLU<&Zrp&V=}ngX<#b(u8%W3O*(0t`DvkI&kUX0ad|P+R@~O1`bJ zg}%KhdS)*GOZ*)|JnK4V)5pRVfHJa1rs!<~!Y0tK;FEF*QmR1NeC82h8wbcpQM4_$ z9mQ@#5QAUH8tfbN80|jX(f*@);6Al8ppK~heK`iWJj=+lT>8_K0Ta=;1bixMWnc$= z?38z~Gd5B}$$=G^T3I_lOWkV-_73bH%E}G^wLdg*NG*S+lmM{&)({f_Yoc#qVQhhd z;3XK<+St|s{AKR248ATDnFO8xA{C$QV}|2pD?piXGUzW*z=C`~*5rWU2S zC;xgaN~6aYQ0OmGnJNzYR`#aO6dm+k4fK_+`O+@`K?KdGFUBwON0f!Yei31SPJfcJ zfWa{hbP2*pYNoP)f!G(y;%^1ev)W!7e4K@A{ukzsYI{MfLeI#0o4`JxHV^+vlLia` zWc+8~IeiOL6DT}k2#_3P(PSYrVG11Sy|f#arbb4PpEs*T`;u*chUpQ7Ifcee)Bf2^ zZbHmrdf;onj^z>lzhQZ37UefB4?oR2{fVaM_aGn`2ko`B)raB&F#lMiK>W}S+QQ%8 z3*48b<5B2o&>JnvFddkpaka20je~zXNLcfpQRTN z5&kl~A_Q~sOx+B-3NtM|+zytlpBgEfm32g^x_K7X`Cl1&8^B9nS$fFg!)#?>3x4~; zwg20_aR{F1QS%sIOr3{^=a0e^D)OF%w6iYzi*{cD>3;#j`3f%!!N_H%6u}ZRQ;J~8 z`gNp;mNWWEQnZ&ULZ6ZHFD^ebg@-!n|0L&k!YK&iOv8$46gBM&z$iy-I^aK3c0fj# z5`&s4JFtxXI3s4fOeqfk*fIQ@wr8a?Vt6{J$PL;0GHA?cI<09Oi>R@3QHcnCQVMTOSnu zOh?pyd7JiIUqW=vp(V!MmK?Rxi zwx*z{l8>5=2Z4seRK=}49b=eP;e65kSDNKtj1WYGzEU~Dv_SGql>>+AzQ6~+Ua}4L z(|*i(Fq8dh`=zABuZNk3G{fl37O;jz{}&SJ@8^B~w0+sDG^=@`aE&|^XA7Q>1_@^ThG_Dbawf+pQ;fGn|7q+{F~oF2 z??(Zh`#V3+XwYoIZyODoKlSf78Z?T`qB3Tfgs($t5IFoF zh18V40HL1~Bhoe1I!D0WI`Uf_7zmVbjeqi@c8!ja!f!Pe# z56aiS(uDtf|3*kulm}*n9~mnlejb5;L_rEX%fHFV`FZb~(#+6r+bdeI;NNerXcYOe zy;8IKTsksck^?Q1D?XV{0x40Als(GY{6BLAS?}2^ z;y}R&{4zTnWf7b9t7e)kVOm6Urpbb}*DqwUXtVsgRM1b0v(PllZ>k_TDe~`ELEjI| z&QK=DQPv4rP3Zag;5gMxnHQW*=4mOMUr6Tv-q`9-gB`Fa|F+E2;s*bEnTILzW7u)b z*vJ8#A3_uFpX#grA_6b+)e9rwh$5P6H&fdAW^pn^X{_|?NIN*!>2C;%QOEq|)F@DN z^jnA1AZQf)ZK?ggk`aL^GDB)Vzbv^AWock)1*G*WcmL0rT|Pcha9YZ2W|xmo2%L%n z?m+R383%|>A1(IXeg7YxmGU|7Ia4}e=lTo${ma>6#P14Rg7*SH*HKx2?Vf$@5q`x9 z{~fyXJzq`)93Kh0WH7E1fK~rYUJ{&6lFa0$UoQX&pD{T@Q^02Y3y4mQLjUQQ1XIT0hqbqKqvft23rciFSN~MN*Ln(LI8vYlYSCYVkmXa#zz0~ 
zBn1-ug?3j+2=R@&oYf76GdRCEU;eHbEUZ8Ow6c`@Y{O?U;MDGa+S@$;Xx%6Z>pj?J z^TQWrD97n@@qTe8{I>4(8z^sVWdu%$p`4g(XrXUsXKD!S(u@PAzJ&V5j;8k1(?t0M z1VKaSyCyh}3Y@zQ3Tvp}KfiJeZtjQ32pLGIC3pvR`Xol^+IMZ;Z~Aexh5h-M%8X5g zvDwT$g|GeYGe&=W5)KsBga4o9*q`Ya&S<e;l9To&3ae#3CzCa%! z+1`!*QaryaP=BSNhB*Z0m|4K%Y-PZ(D8Y>N_a7RBnC-liulDPgRFD5QwI@syRT@VL z^9k|@(DnxAmjAuxn{Cv8tn!0PD!*`x{$p*RA7M50lPo&??k3FI{9-?Ywc+xBAB_6X zW$oDtxc?k$PoI$w;Z*30CI!+?ADI6c(Ly(;A?>Sqe=Y-mzvljbk@AwaML`F&p?-gKVfH*1 z_3A7${KMro|AKIZ2vibAv7@kp6@b&SGr$v!B4!5vU}FHY@!ub@fJ*?sKm>m*-zp&R zoxdCeN3aaQFO-9e;Xl>`qPEZpgL0H}{_LjHvVc)#El`G_o_m@Jryreh?aSQ5SIYm- zu-892%OD1;Wg1Tk!9aNiOA5l*X7VJg-+v*VoGps>T@CbmL$Baq%h~_H70g=Gy!M-H z`Ulu^y6G3X^DCzP9CC#+9m&Y!5!tWL=$Za&1T&av`mEiV%mgdFUx%55 zW^?X;A~PxJJ5i1~{g0`{U^g1JnzL1@{%u@ENiKco@3+EA(IF zDSM{rawx;BeS(MPf8Cza$HMJTeDSB0_jt`R+EszMp7@*GqGfWkjJ9yQjP}Feg`&Z^s?kStfG;2ksTD z3U1&0Ejbs`ujCMAYJILc+hlfudxuG$2%T@JDKp386rgo(up{GalQ9POcAc~i_nC!m z%M{NvaRsbAvT2sdJb>;QGZot}pJfX@7cC2BuzR;_w#nQC_a0j0;&>ps$$X-3Ejf9L ztg-q=<|Y2H{fwEysd3zTSz4Rh@*dE3gu@9;@zqIHo?;h34T=&9uj*go`ja9BM(-Fcg}wxHpDMN{&`K7m<WlGm2+B`&+ zNSVtFk&$bfUmSk(Iw}%({{=Fdx`wUL&gwH2*i#-V-6kVPw2#ARn;e!Srp2#smZ!0; zC&GOm^?~On&Ugp%>t?bn3GoixrDYNBP2nY(r$7i+q})|p0P!7%4&Cu`#VY4~hm_;x zmpcpit-f8t@0-vTFroffhhURKazxRD4TcKePUtu^A)U0FOM<|JrkVSz2}wc|D$&Y! z3I`MF>)=O!J0V4gLS*lD5Py4etEVn`>Q5&ofk~# zeS*yj<=cS;~0Krwh4_u6q;PSL;pPZe99@X;9Ck6 z(E|$M84kBw0}7RTnbgcip}%ZGBxpi|quY(q;IrV6z|wCgbQzjZ*o#BGN5O>Ny)fyR zbwb>LLPiOwV@L%#;)B?&eX~&L07N0JSeB)?0EKj2!+mF?&|fy8-B$;_12sy7PR5Vk z@02jVPkN3ac1yqNig<;~F|Vn#c;ezS#XLDq?5a&I>J}@u81+w|P+ptLziwE+HKf*$ z&&AHjswbk|l+!HSG#R&>Y9wE=0V_4R7O@W^lgGJ*53=sJeq_LT1&`Xah9<8ItcVUB z$-z!EbQSm_%klz~PopY_)bKfNIJY6q)*kb6p_nq!J}h?1i_|n=H>R?lug>BmTUwOc zh-^5oRt4CL_x>>2IJg1~Rca={Mj2{96BsJo6Hh`AmB)5)l^G;t-6AE1X`iWl()DZ# zcjZF{pXgY}HvH}sz2O_L@jIkYDF#dMKD9WdwJXQl1nkdi5}k2vQ+ImMp+dI$_5Dp? 
zX$OC6X-KtP+->xPGBPXlRTp1p8~V18Z9ZpmbF8g(xXA)Pk(gBU_EV~_S3R(Ff227i zq4Rn$o0KVy(64W93y|a&eOcS?a^5?z{iGd|CY{$A78YO6%qUtyQK%|(rU{Yh51JHx zTzv_6ivb7XdCN|^q?{P$V&Uo+UBf$yS_x<3a%z(Y^AmF2H)Mr6yscA4E*&7c5^5if z`?}2UUTDsQ4gP=!_s}<<6tdQ(0sj?Cz#F%_UJ$wsoiT*9z7Am*3 zLo86`j5tYQfdEE`+e$qVD2-|sd^$EDF-hXw+dWW(gcfE3v327fIPM(8b#f1(>#x zu6xgW|F|wr92spr`6-0dzJ$%K&tS+TCSD)iHy3HBJ(YBM4y(|}HX3M_DFv4DddmGG zs-?UJZEX7j6LEo&-pHaA3GdPVLL@;uEGhO9a-WFEn1ge#Cw@zhusP-~iAf_s()!bV z>6m%vgDhQwNjv<4bgIOzNvCISmy;uARy!`DfxgXL0Or1Zp?{jXm|vG>LgL;049(i zO)hB^$lqtWW-h1Mr^7UDC6<_U>G3sn(5*Xv1o;28g1IKQW z)C44TvDJ$k)TwYRTMFR0W9-VX{g5R;kO)ZQlQUc!db%>Iu{0;f8^>63Iv3KSpHVu8 zE`WBuTDFJKg(DrbHz3Zt+v0?_2;v*Fn;RTac}pS-IygnJ4H#w^_Wjz-Xe-Jp|Cqh&69 zVDR~$!R|$W0O2z$ecn(Y{DY%+>sW;}bSo^b3=p?xuC#Q$(&wiaCYiU@&(}Nfpsr3U z)p?R}2C`jB7%W;3bsXRU z6|R0z51(4^NYN!Jq#{&Nu~p)*WAXU&*thaXX(fJ%u!E0!bw|TUoDR3gyU6v#hD^)e zoE+zP(t{!5d8_5Ip|YXT&8=nj2z-Jrm(EpSo@F(z4ABUnIwYZS+DwR39)!wu&*a)qNz32`B5)(kR_yVLnG8D#$nzl1sYLhLdcSO^Qr<$Y8>|M1)Y{rRCmfdyP~)Phje{x%J?XD?U5fE-2ae zYV@s+)uv*e2u`!s!JsBnsw!M|(GP5@Yo+%!cvFK}g;~km?R-2`IwX*xWv1jV?WH@?kWymVJlhd-ljUdkMaNkwtZE`OBK7h^aYwK zcr^w5%215LMxABo{A;+}(OiNLo_ToY^j(SD)oE8qy%nn_C)RAl#kD_8 zZQ!|jOs1kaTk-V6IZ-%?r;&!YHm22Eil7VW&Bh4LcNnu+wpT=(s5dekV7zgDTwSt*3#vNlDonJV=}e`G-g{?8mz;|pasXCGMo^17JuTVpdauwD@S{E zcvzriMgV)Cb?d^8>K!^C-;ZkNd%4zZ3x9odVy9?K_Wd#Sh!9V?;9mdSX$OKWdY8A>@dTwTIb?Ws*-82DwUr|nT zY;^i=!J>wW+Y^ca-Z1Wryy}_f9?wKH6XKK5AMwh?bst*y*7s#J`+K^AseVD1iR~hx zs<$@;>onxXu2&F>id@r#E!yKr!LD=Y5?O%eZF$Upnxc6H7v3b?Z(WW_Hw#>VtACVA zQt&f+KGCE|NM&&~PF)j!{o$P$0Z}^zrm8v)d~<9vQ(+Pzrp6X&=Z+>kZ4ekMb?a#{ zN6am~n}E=p_~hlKr{}U8&#L}vy=l3XpdEoYnr=gSC6-& zeM8h6$s5Lm?e^H;Wg^Wd7+!I=Mz?H47N$2v4>2;0m5chZ-xAv{?wX242gfCSa5J4O z!<+DoY@Uqv#Lwj^2Nt=Nd_%8bJ)$C%+$Jkd7A+J;y=kbAX+_^?7wsHyk>eE&yN_Hu znX}xEV&h~uvVdxB*Kt8oK>RWr4_l4$J>84&SF}$(c+}dgfH<+YTQr7Kw2bGmntEYZ z^fA?CDajI(>33@PR^BykEy>HW7 zDIe!4j@`g{`mW(+#=A*&N)j)}n-t$xZtPAo)Dw}-o37wG@R-Pso(aEYOx)Wt)fTSWggSGYV<2ejCrlo?0Gah}qDB_DY{~5Oa0c__2lI 
z;S%NcuK4ICMK5~h9JjaP^=_9*FtUj2|u#`yfR-cMVW$V^eco*T6QO zSH@a>Qs@*tb)Cm|9WC|_T>DzMTlvwxx-2c!L?~&B{mvSLtL!F>Q7r21vJ*$@;*%?_ zQg3%}>^qB4dscpJznYKh)LhK|QZl2e(82nZXe0uEXDQC|u^-cg?E!8deC>s8{NHu% z4D;PLh*1zDq;r@rz_eBCt;ESEY z0Uv=&GaC*Nsw@K{`;?Q9<|RZg+i4=Fv_wH|7jGa^c;JSj+=1kfu^@-^YAzRr<1P0@ zjt?keNI6dhgtGK%$*o#Zw>3kt(R!nV(s2x{XFVB>$s)^&FlZ|fWGp5^R=I0R#9GkM z4?h$dJ=`whb-Lzl>ji!P$$+Rc=?_}jc1Mm6X`~kfFs7Z)uue!|%4Ydw-1rck8S^eZ zw;0Eco@6(P?v7%-W5&o5n*FkCTWX)8W#3t%eMHbx0R{_!H!3B(+6?6}!8r7A`*4JB zUiLdQvEHukdcOAMoO}0?{468$X<&1|8t@OIcRx#`I8(=XAzqQn#!6SeAQx><-x826 zNTyeP6jrYVHih^wL^UG~>nRc4uGilq9kV5vbLZW9z0t7(C2Y=liJGVojY+w&^vem+ zCTgO0FEK6hFE_c@s8>0r&=9hV`R?6ggy!R1`UfTQSguYq(LF1$BDo)atXC;&JK_R> z%l#L47+Q+}%bXUsx|tFYJ>VsizT0f{C1dLRv{N017_o_k7}x8o;$_g*!Y}G;R}IL^ z5hD|pHB*D2b39}LCS_!OPqR_7%OL#MwX1zH!M9sEQMI1`#Wu1PqX);W(0;$0X8D@y zA=DK9$)w-`6+j-i1++r+?Vwi_(DYmmLc+Tx+8g*~@gK30ww1j_$JQru5)&jSt0uPu0XKxh}C7!qu}hTS{^$=q=wu5()M^m+>%OSmX_(I^K6mp0E9Pqw+Tjj ziD2TF+5GNpfgFdesvMl4%P2zgE+nVGqb3v{5fn5bC^lRblp++9b)FP)1liC&7D&=M z88D!`X;|cTzfs6`q>>0?tm8!yyOj=)K&{gml0yhl$|>_>4=a*GIeq(~1+xYZpo)rhLD?Fv(1oSJ-`&G&u* zKF3`|qR{J-jXBWawch-?v^Bp}0NlG#s9`TP0KVTN5)5h|O%xH4Ad|iHaFe%4cLzMN z$sl&I#dmkC!C+^9Q{%$5!R_g48ob4+XD?gzY(j=8BM5yv@l`11PxAJ6fOyzfdR~K_ zTt~2~x1>+T)B!)OIG`m*JgX3pEM3!eTU~+t9(u7 z9D8BWrHb@R*D)0rCsmT?1B^zx=?4X!+?qM?^6PMt4*IP3uJMHK^ZGmU+ks7uCCH| zT@&~E!&9ZALD4FU8*T!Gn54o(mSt{4U-u^#4SF~}rM<^P@{lVV9X3@JBByNTRv$_( zl5)xGx>iPH&dz1foDyEwBpb~XVU+!*#7j>+FRI>PloH%{hYk-Btnorh7}3RCGOW8+ z!d;K(&Wg}!5`M;@P6#npuVTLu{Lc5fZ}w)p;G{4nqJ2+wRYE|BM6seo9ZSl@NDyf> zL}1e$Y!W_)DPM4v5QA|Hp>R?o%CNO3TH>P?o|$L=*rXjE*BF>={Wc&QGj$2cs~h5k z`f#~OVj}oj>oR|!P10gnnyH8fXcOR~F}`W0WvFDA9XKXOUGT6lpP4EmG(opxYlj~I}go4b`$p#4z5D@<~w|~ zr$z7!bb9BKSJ$TJ)Ow*4NkLnPRv3x=;^AtG$vVYGvq`V}OrVB1@CwGJ@D!(Zx!(o9bRzu?0%6nraaCGHA;Pi%ec?)P1URNWE1mMa9Y_ z=&3mQR2j1%gUwPxVsupXULVV%Lk*5gM}DevwtDsP(aYe>zb9XwKg^tx7yQ^L_?>HhyoQ}kR5baaQ0*Jh4qOvfH630Q{_iBYm=e|h`(YI^k z+NxDdejc5D9q7P79Rh&_g1H6PcT{N$ik9V!s0sQfH7k~UIE}Q#h}2x^ei+e?j3(K 
zmcMU;4>&5xc2hd}^_}5{_suePkX0~>$H_8mw;e=<@%xk|THd_|L81$42A5%pTMQ)5 zVCSbLRF&hs_38ufz>J%E{yo^?N2GR5NZR#6o4IwjdntjiI8uEwOJZjC-ys}ez|Wmy*8ej>`(-wYGzDU{q$l0hg# z7u3y-pj&8j8ffEZoc!`MZM^e5ut1-&h1?2ASo*Y2sil(MDX`16G@1{-JX7>ySb~5x z!;ls%4U9{4Z&gOHpFF;>wWsNrnzAIet{KNP;*gIE^wmpWE1U<{0(9smniU#H8w@%q z?5Q}eY4^khi8smc8P++GqO}FHwG=oH(P9<#ZsBxu#Dz1)NH~dn`E+(US_@$ z_D$r06Wg20y3?XUjiS$*EvRLe6v*fHXAL$LA%*J@rLa79`|;`PFcKP?<&C}JHqp`H z(U%6JR32I)_%!eE4);Xz6PsY*j z`jBzrfm!G8D6#I2!P#f4lvGS+zsbks%{A(lYy7hNNN016o|x?KVf zz7;5@&FNf=DK!loSLjURBXM~HzMa8fT~)UX>uBq`3dQZD%rYrR>Ws`aC;)79REo_T zfW&?JEn&1~%k+VwSjfw~kAi#;_<_4+_)42QyI6&qGo3H9Q)V-l{?l>}(L#kQ7%IQ+ zd<-GY#RYA$%kSPt?u>P1p*8z5)ZPx=b*>aIMu4RRYxpHnKqa7`6u?6xk-f~+h0Co1 z&93bo(cTHI)a#ZXywY>fN;5?lQNBy>fOVBA-K!lxqe9VmBq=j98h0sT`zkL9}S&r^XqGZfM{3~81+3_|*D3~Qum{PBJZD9Q0#+e1Z_ zN4J7zs}@}^ImZd2@kgCTL}|_Tp8&e$sn4_5l)d_1zSqMQ@bD*j)&o*2qakK% zX=$FF*|Ndi8##Fg12%!rN8UwL5&VYPJRxQ~^KyqQ8{m@<&+Vxxjj#8>hzaZgsWa#% z2{`S$cE8#~WmOrDul4|J6@KAG*Uk3O9w;j=n0;y~(A2WB58~Lt=i}=|^JbTWzjSJJ zkce_{B>3+KOt(!xeC4d1yHx|+eWg>om3SFsQM8m=;eg^-oVycjDBWDAtgVndEJB39 zzQ~tCl@9JIcGqk-MS@Q_XAIW*l~Bn*yteRV=Qc>94+gZ;&noY)SixP}D&d0&5c0dO zej^9&EZ2(}Su~EL+kpfN(VcH1rK2igW0UCyxc@ZJ{|n=Xt9Q`) zXOZJ5&LxR=#u_%~`{_NuXIYz48Kt^IqAl3Y;+C3v@1)gW7fC%vJ?=%Ox}KXuXX!&9 z9oOC$>Zw-s!F#Y%6POrHeE3jtM5w#jI;%AOyF*Jwo93KEWes#?_7&r^25Sm8xb>^l zqH-p>vd6klc1>)Mbgt%3tur8WJ@%0tC|C8!)~nd3)oh-SRp~s?acfBAv0khWA%&bX zHHj`$5>XrP)x%C0*Jn8NwU*^zv9+#!L~<9tBxR}e_-iIIE&t7N74i*?Vz=Po-1Qkc zo2A7p)$3il5*sW`h=Inu*~9m*tU=_}q@sGflo*TL$H*cl9V*l;W8zXC7l%l=lysEz zcLkxcO&4r*eN^w%7BcsRfM2~`i6r{<-UUumsdV@Eo}JI#H&LaBtzTDlbC+tYN|=1Z zi4E9sn;uEBY}x9Ga2mkL zcScNVS*-k(wc+(9E4M2?YPo&VDynDqPE}F#P(8+_#CU;8jZHXT0qTg<$OAIisfHYR zG0_|M(0TRfTFXkj$q~>+C+B(|LYKfgar;Q6J9|j#C-l9Zr0ls!ldm^`)qT2a_ylOe zvT!elX)f;Kf~J>1nuc6YNY$|;3JF`*2%ZF=^A>dm5d($#(1B5Y zh>|^Jy6z*!ov-gbThTbw_(nw@TNN}J<(@ni=g?U>_+;MFbtwUulKcPgZoWFQ=6B%TF z@JocNpGf#d)~g>j6TcpHS5d#T#yo*68IIMs8@{X~7GL~06gx>`IW&?lctpH?t4nu! 
zTV~>dMaA|;wl|c76B7;do%dE2>`v(N(lT?mZf%SSmmt~49-Gj!eqK=HF)N2If=b&xd^5ggP}!erVtpA{QNd@*q6yMvVjO1MU-#o-oyRlt&r%WDVUJ@_rhP${xwfo|+u<_N2d+ z^U&I?>4`^!74OtzS7}kgP=ia>N8-i{SjE1xOQk;2rQZb!#nojGC&YzE8mA_!)_I~m zcBMtH4>n%lSR)o;z|fp`r4N0PJ7$+?-@Jp8F}7*8qyKz2_qm{1CON& z8)4AB7#Kf;M+X+emdSg^)$%6Ad{#AX)fe-U&K`d2%W2h}RHn@8nBj`8wMbg5+jRWiOrO3nRng&-@BTki`tR&37#ljuEBAC>Oi z%D!h9n2zpS(bSEcQYMEbaQfwq{xZvwxl`}Ee z5T6)=X>h`=a<>lG!9)*7xx70rr)n7QdL-D_+jqTKDT`<+j0GkkTc@`V37bc;g}iRn#aDamu%Ijh;==VSP<^5;1J>OYZRZ_;Ovt*+2XI~d=KZD5FJP-PPQ%U zRn7r|?62z_rQvZ5;~NKL7TpmQ;f#rFSrN?cP_Cl;u&_&Zf%N2X80LY1OFj;L=G6;S zv{Xi|6+v@w!u&8^#X7v{EaZ45FgX|9Gl|fy$HQms|V<*{k+6 zUq*PZ3XXCP_jNPFR7R?Vtx_4>ZB!h<|E!}TCbgPzz3t0Q4aVbY>=os{2=;+;6@_~C z!KSlIo#zI1Zz|j-+SjA&LQbx-05v4Nd3SwA01x%ZdE(jhT;Cvxfoe8J$2Wy=oC$@cm#7XYV;vpd{_9`tK(V6{F^$J6AP65 z`^|^c1|T?Lk&8X(%Sm2QcwDh)H_2Tv)PKAg&JYIQZ&l~lVmO?!V$dWsj$z~OgCnv^ShVHb~ zve>s@G*pwD!y+0bJrL@)xU}c6{&l*nQ8P62;}`mu~D<%P3zhq%Oo@GHUgj z{p2JwB1eyv?X}9Qd;2apd(AIz19k=_WLD0{&a0;SGV)1+CLY;fQ#v}(=`s`>^+-ld zDHrFx?D*uEM0Xl6I9+@wg*$X&Qvs)WUa)=AS=Rs)zZ5`@TX{HLnATP z#JXV4a>vDGek$0W;2P8|raE^H`!-^7i+~h^S50C#hoZt^e2^zX?fLOX552<}StexZ zChanc^e}PlpQdDMmWb3f)z=Ce7qFmqtfe^A-viL|JAr0Py(8(uqt;C_2^|2OmO8>J zwnHh<2w0_8?T8m^_0ZPp6?2m8lvxEFE}R5_$9ApZ;IqC~VqZg*O-VzROWOpyE{wrW zgUUELbIH|9X#8{zD#sJBeps>*JKme}&U<$au*-*Pd!@6ucZ&u|W3!1kM1Jih^#hPf zwEWnV+rWRke%~~E8UYGzGi1lXp)`6>N5QmE?LmIh%FcwG$xq#@cPo^!BgF?vclVw> zN~l`T#&zP8z84S* zR;!n6u=C-O1g7!O_#qC)hER%Ud>MRuF1;r}o(#9%iP3sIa1M;P`S!Jg({Ohifc?8? 
z_+`g|dcOa7#bwyjTy`-VJknovaXB0gSq|{B_r+J!uj_c61C!e3>VexpqbwIE2q5uN z14xkj!4V01IJ)Z&%v}2+(iyD5(bL*<99;pB;B~&TMYIby58s8b5-{}DAKyNUJDF<5 zswTA2v1u;g+UBlD&zSLrJcL;7XJL9Uq2_!UAONDs!ovrc%)Io$rHUxk{6HadaHgz=5K(Ykye)TK&)L`v9Rs?muZVb;b0$F(#^fV0bBgR9(J!D zvPEWgUam(z)SKr_<8t7PLN8OLvKnT>b4WeYExwN5;2gWt`)hTUw^KLs(l&AqT|3zO z!zEe>RVo>e@iz5{8>VV@U>WC3^M?nRD;%dKnC;g)Z$N-gS6rG~alT-ehK|HyyBf3uj&*b$G-phUK|do)rVaSpmbKZ5N|Zx8flSI; zWE4QP$0+e>b#f6rE$8a7n!@xl8Q_`HtVtW3WUl7S2yHx|NV<3L9xC;=0Ufb>ldw(; zH*kYR*x`+kc@Cc5oGr7M5}4(v2hX(YPNfyyY2xD9{X!IW$TfhFXuojWK=J53R)7`X zS{4-l2~eak$d?Un1-;d`T#w^C#f7GM`B-h_5$jd{w+w<|ricNAi8t6oi$^R1;~%=H zq~QeC)S{H{Lu<9n28?)-ciA%vgCBs1nNcOpgWnHlLdSYiLiUNpP%AA7-I=pUlJ zAoRR0E-^$+&x&CTS-D`aFceXm6|m?IP_v$F{!!G|Vz1f)J#}+xPJk4&&oW62?^Xv_ z=tgCS^oLI2lRK;o3e`?#(>Yf5sFEu&V>LS(vyL9?zfx6~i<}w=V{3@2&ulab`q;cf zcsN~$X+wUFy6I`+R7H(c=w!wOe~*YnSaVqA{RU)Z^W=60tc3DLU_{Ey_|<8e?WG*h zMr_=7sBgU9eNJb?RJ^v_!YTseTFtU^LaYZfW}6ozjGEM|d_38j(s5{@tB0Yls}eI> z6VupmQsC9ep{e5S6M~h+?%~bSxg?gJ#uB#_)m+1B>wxUSd6 zTm^%W_%td1J164X*t&fTbE8<;aB%Co!GcY~1OB{{iML{^9#7Uve;~__nHFQvpTdG} zD?SEgMTwE77f#%Z1HYjX(#8v-ByJzFR{y|z>ASBZTvP?MNP+%6BiYW!6BnIg_}8LX@sP?a6f;v06XtI#cBMqw^7Ev~KhWWd%c4%LMjULXan-1uoycLLj%xB12r?Tv-pshL-T zdhh5A$(P)xcX>FvaXHAiK4_;V-saLD0=7zh#dTB{Tnk`kGfG8|z*7_jZ@C|i?McDZ z-VBvaMT9nF7iexER1BmTaypnjb>Pb2Wv1Zn$)@!^BgCNY_6d?O^^0WUtqkNZC_b ztA=^*W*+GIbdyVFJ2_?S-31ZtWx}=N28$-&C|{9N%5A7ZG~yjPlI@0`s-MKr}47Ivdg)cTO0MU~XBCTvh|sT;C% z%Sb(Jo>Z$cII=W>8zI`6IKrCJ8(2%Y8taq#Oc4`dsu6^(D_dHe=RWmV=TqsVZfJys zi+{2%K1^po3*&+2WvY%~fV8-|28&do3)DkPT9oTyu+ZR}g)LQlo z_HF1)ZxKpy=(9rS2Y58Z<3Wvf38}r^yQ%AS3R^k{d7+Ep(|N-0;#Ou3P2nr&RvP;3 z*0J&Q%aW)RTt(hrR-L@>h(z~%^m?`4A%>|lhx&mUTd}tcc2I8um=pW{y*(AZ%2-1T z!g!N~>&nC=D;qsoqh%{$YV25T{EK^oFC+YGZt7*9;1aF9->zF}-npC=1U#-u3X8sB zy4ew5`jXwtq8ojtLYqsJ&>PnQj9P)@rtc-uP zVQ>P~Os+FQ$BS-{Vm~brYnP@Wrg~3jB2Xzy%9Gw?9(p)5+3ZoW&PWHAd^W1&xMSVp z#^YmllRcH4_ii^-+aK@i-%+bEKA}{0j8T|BvOSKkeK6Jy;u8Ssw220JPn z*7A(Dqdd@#nuaW~{8$`jIjARH8|9ZAvoO^Dc%srX-@(BLDh*cr3`fNUqQ){gH&z_? 
zIz)c=$?wo${on>vJ}Edor&?ZhV3x=4uwA zU8PJ?*GxPxa*H=(xmkRg)(igU#I-$UJ5n~9qLf^#!yZ>V_0ISp+;6seeJ98cXJNc+Jv)fBP%S*s|&Nd zZ-X~Gh8CEd%prH%F)-ZjL*v{=o}FYhTrX{YIaR_UM;O#XP31ek#rgVJl}6qeDk9w2 zh!1M)HYCJ17Vg%)TR0_@a8^Vz;hisn;S3|4L+_MAnY@eKN_$1MzE=%P@_q4FC-a}d zX-rV<0HO`7bFUqtdbG=K%hY)y#A0$WUqpB8joEZAV{=UN9U^RvKEwJ z9qW_K71em8CnmS`tf{|L88;w9cuQ=~luBxjbL*SPJfk?|t%Dyj5q_hV>k5ofy=X$b zZF`=gQ|UUx9CAVK$2Mcls`!-5DGSI7Q-b=~)IB>u`qedRa zA*_b&8NH^u4;6)o6PHR7dO6Y2?NSK#=FC(5cUYU#;yXoy2F3&>nxmNYCt|%Pb-G1& zi|^Vvjs5^hEqcx`+Shh3MBlu^vKbv3ce>1dq<}!LCsB~J&wRL3L9K3E zc!*AhL{ZLySk4Pe)TjK0vb!gbm0cq?*1tKz-nh85Jm0%8Vg2w(7b-eIO(WVwkLXxC zX6P1Ubw$1*#KhhFDBkIXE2p1A$!@Z)n1K#)c3{E_7 zA8~AFx?wk05a`O+r$_jx$Q}-hJp~RvQ)9peTKwzo9^g2(*hFYhp&A`@CW^tkdji1C zz2!9b1J#qBOD_wZ>T>1wQkvIEE5zF53WoT2r5&yYDF9_J>Kic5w3AJs_iZ?~Z}@Lw z3ptMdN3s8Sj1oNkPb&XY7yo_*d&U#7md<7w6#f=ne1m^FoGu3JOUv^YG&sw#3_~jy zJGaM1Phq;UbKy-;bREl5&=0?Ebqz$}eTt6pEu#e#WH>$nvvo~x^L#jN{Q`;`hi`A3 zKCkRL6!YvFFh0vZwDd`EenSPn^8ZD(+a5 zk~@{DsK(%ZVNf{~sHsB5BxGkO;RLp|4f*$%i8h^?2bEah`-&w>25ZcDUHacBSc0ku z{8%>um8jv||3N>s&elgHHR6slXOCWOH=)N0B(S3&`?*R*i@h`{eQ?kx(;XCH)I!BQ z?Be#1KuuYNR*HowC{s&KuF{nVwW5_EgYyZt0yMSVF@uVx=6HNs2rww2?T$;=-LuRB zUJ8u5ttFwTQ1{U&$&vRr20==-W@4xj1=W-#MA=}n-zGn1}VK%N@qJ-z#D&0^xZz?xEOlFePrBYHJh8CxaT zrgP;VL8jxWNH}~F*2Swb!DsOrdTtf9BQ2?*1gHk&*-zQE7A1m$pJKD}NR{Qml7rQT zDft^*j_;8$%v)R*kB5q7n0PanIm)@y>hmDVho6ab8R~9ej+PDb6!0>{G`gh@MRkF9 z$giWpOG9(-(eWlnE-(>kbj3r3B9$qYCtW5-+TxT$#2eBY-N!6zZ40@{&3@7;S40}p zj!My9N0{TW0w}6m3j?iSR$5DUkg<5Hxa+XB#M^T#yFe+%qkYU`Mo{6GWkvL@)P;Jk zy>d-2K4J)_5hOV_9iuP>zZ5V4k*REa@QuPoN*x>6h-|h;yB6S_MX48)H`aVX~RNj_YZV`usF+@cgvIOH@G z8e&_pE4)`5l)f~yu}1EqkEyfYs=8NmFhp+@!(dipimuc&A%UcE7rRZrpXnX@(u0F_ zb|p@Pa?cGFdq9m{R^kU9QF1u})E;CuU!RmTo6Is3FOB}FGO>Go9oTHPMcucNE#MTQ z0>F40C|6KPm5fPBvQ@=Cj&~%!FuDiowX;IZp=RjfH-Oa&$ex~jpa~Ghy67%OX7mp{J zY~-$}8z2|tYNF3z21zC^x$n|!nk_;cDm7PK+a(&~iX!b>DrqzZYC#Qpm{;>XIlp+# zyEr*eLFPYVxdAQ%{RFvZ*d?ps!k?Q*p`t{dQa#J$icA^#{B09Wv4{P6Sm)weV+}nS zGm|8^Peb*My98BFwKNt!Y`#(!ab(C>Ek3<9Fa=aOxqvrw!E0v7C~LEWxwxm>8T3gC 
z@<4|?TPmHTBuskF6vrad9uVG*F2i^1{s5v1E})JtBGpb2oYzp9WUAUD!AMA&M=2A_ zK^*ZP?EzJ?g%3WRMbu}7*IU+*!X?@SG34ikRyB#3=gSqU1NaRs@I>L0t*K6wH+jL2 zf-g@o;5aJl@qod^{lG&){_&ls4k`%QZL18?=02Iy@2hGy6xev2OGY9b%ztf}%7OZz z@T;AdcHD_{l*jn*x~X-{I)$84TT_HXWEI?XFJvy}$?8AWS(OlqX{*lYj5i0`Iofo` z2_8_k=PCh;CAw~D=q65$w@I^3M2dRmbuKW0iXXDZKNYfKr^;22kgN#gl$TH;83^X< z*f1{D&~<8I#Hy8-W0KdsMoqfeaCY1cK~df?0U&>ta**pR!R;R&baRaTKla`-s>-cx z8>YLv1qA69Sad2V-Jp~xE!{1Rba#kIBOxH&T?-VEkWMM-M)>C1TfFb*$M^hv$NS^m zV+|eS+Sj_)yyiUnJdV??(u!uU>YAf6-HvG>Xq(7dDMr2xZdBuTH(4(+Y5*!sY_I$i z4E6QH|MRTlkVj0G-_6Cmn5hB=RI^($y2x>BBPT%!8xfy zk@Zg-2HiOVIe!8te>_;hQF86?*ee5n2&#D_siH_y4W*(~5(3SxvnQ~iEtvZg;oxbc z+ojBsd5@z=^u}IB3-vyZBzrLUOdwt?^_No?(uq~9O{U{a_}!&>tEb#;vQRx5v6%kR@4^Lc{v?sr0O#uPe6obKa z+;iXk3L1A%7pw64PGqY!L6m9f_I&MWnlD|z4r_++gT@|`=TpGSZ}^k|?9Y63>$|2O z0mEtLccq+q5EJ*DtXltxc)`FH@Pa)LCT!WMN%cV~YA8wj8B3up*fP1R(;s#%BBRb} zZ3so+Y%VLMh35EOw?Edce~Y03o}o=?Y5r^aFUs*}*5xy6`SyyxH%G4v7I#2cbmE%6 zS#J%xBV{kTLL$7iNiVm{`ETpZp@F|QkayEH{E>HGjsG^c0+`m6THwl@1}8)o9QN;! 
zPphzL#Tle3cUae95E=|OxmiCEDXM7(F0B-zoI2`w)ij?>0sxCemWA3N{BzuTET`dh zy3_E@E!*c*Z!fzhzjkM_tYpJkQ={n{z<+D0%6B)EXMvn0d5%(hxhz;^RsN8 zmB6uSc|Qg2HQ@3QF~(+~+g-hl3KYZa^)FcOoGFO2*{_4Y@I2M3{1$u$0#_rGP~peL zE8)VA^`Z+zhMjk+0rWoCwE|B0!CJa`QNqd}@)ZLNys?BieEjENjn!a?Df~Y#0gC8S zV9x@c9#H-HB?cHp6@RFo;m`eW{_&uCnG}?!d?)IxKr+Zr}ASX)x!xwZGZ6~JZrf0$+19B!S)D{xl9B5qBO zYN}me@oIqH{Sd$2@5-OQuM=JM~jCvH{GR4)6oM*K;L<-#WSFd^tkE zY?#b?p|u4@(l9gzYS1=aQd(~H+@?0k<=5=Am$qMl`GWV6lF0dD;4V~TPk)^XOlJ5K zbsyfyOAP@_;OHqa916g4Y8s|P06#qlBKR~n9T*(}MEir~&4*9DHnQAaM{s=JZ_RJ8 zf==(kpzEzZSDt=E8(=07@xu+@-`pg8(3EA|D-K|CD7o8 zb@)|*t9C@-$M6+!2z)z<8p>55;+lfNf&U@f1Q9B82Wwlw!u&a}y%qt_V+##1h)vZ$ zH#$ZeSmo)ZuFxL(pIJ`T@uu93q06rdSr(qrelWw%DlSv~ne(H&`=9Sl%$D8wNy-@-) z{qycCQSFU0wF1WNaCU;pu4y)v9Vqivt$@|Rxi1Knxb+sKt2t7FkBio3qp07j1{6~mTq-0{VCWCqu)7nwqYZC`*)VBMV9AQaluL{ zDvi7GVqQAN$mxQg9~aX0d`r{W%~aLPmRfZX00lDTcz-(EGtYDgAywZJ7v>Lsenai# z{{>OJ$RJ{ftF0N9V$R!z-H|eSKm~%WmBr|;w}OT#%KQAc~Y?ds0Ys3FDad~ zSw}vvVD34FcR&p?CW-rt=Ku@7({eUHaC|OthXpt?RO>$(5nPx8AnSYemmhj_uWE>b zU`XrvE5TX&z&b)jx@?4eu>5w+-fxeisVteY-QRC^`tBlmU2Yccet&Lv!YWw;Bm-n5 zO2P1-0;_@Z@Hm<@{oQz^q|_1Xn#B@U!pi-v#R1@u<4M&|_7)br`f#D(t~F%+MoOqe zd2y_g$TbnoFHkk`59Fh+)-nQV-bYeROZ}2XA#-RZ7+DwWa^vIZYvB&48Pk5ZR|~WX z$&-BPzzF`XnTq??EZ(c|$Cg*7!gLpZhv`c+{PTtv*}SB_VO&{6t4tvf z1^YF4+{gN3@%{i7?itVRk1h8DZ)`cJ1Hb)C^Qg;XlW$rKz-+#sO_9BSz8Pfb&G+-A0;>Tg{7Xh@^dPLNLM-!hy1I~b z*mMt)A?#8@T2#DARvztyLj@UoMW!vX9Cp9C@)K)L<>hMAzi<8l#v4^C9nt^eJllU< zXl}(yi6paIsPyx%-`mw5)Vt($Wuhs|v4IJrx22mJP#IRxBbDQJv5*9Q8*&Pk| z3@al5P7GagCs(iq#@mO=CsiK;kcGvI@`^AJ+-4o*_KJ$4`L~$OLE?Hj56V6p$UZaV zDRVVujXEO>wd=Q8B~e?@hjTpN^A$+EUQBQ{jkEqIwOOi-F-FBQnQ%R7!E z6l{diaa}-do9^XmpS-(&G%DfM^t+^Fhy0sZzf}s=VTV?_dH@o;{3@~dgr0%A5&@7Q3pbqhCPEEVQuqD6z+iZhVi-9RH za&;i7)1I%sQ5Kk!E659P{3&r2??cTrfl~`8F#~KU?;M%|QHeN7lsKJ*tduHqvp*k$ zTyuUpe@;`lyu_DPa4zlhQV6BzSqV$G21tfLhOHv9Rg||AOlr6<%^n;^kyDa^f(-@T zVPZDoPE(yn6LA`IyUEm==C*BayakEv}m!#=3WiOj-+K0CLpXyFpx{OFgg zLH}eRd5Mh+N`1@|DX#O_)WY9>JOQcM%vzF9hu^8Y%Va774>g?@nx2;Zdg!Z55n>8e 
zv{aC6oLz?(JmZ?{D3C@XHE{l7(gRP#d6^_gyvm|Gj;}@ivgM$cQfY$g#8yj7%_UQF zLPhJ9q^W5lA8a5#(5#7m7^jN;@8m}u`g zdLbM8YCsAQ;H?^wjNFPYw4_|HH0h$lF@9c!QaOgfF!>-#DU4Nuf8WVGmr`Kx0>&?k z{^ScNkMl>M?o|GB8|eF)v^X(ZQOV{7D6cfXhwB7n!EIt9RWcUT-Sly&aFj|bAi)$B zou;*D*jnNUyB7P_Xs9U3X`Srd;6G_ADTt0>rP8n-FNiYmSWB8Om*5XJgr%yaA2xrQ zudRoAU~J%s9gl@CNXO8Kv9gA2L51~LV>OQUjE3?px<7r-XuE^3{15W5WC_wBekL)Y zZc~|a%J;A7B}Uqs`Q~V;^dhN5-l{H@)oJSj5YL91^4Oa;A$hDiR%dAeZ94@A zzMybB&!f1DqoE;L0H;8Ds$;ul4T??k)K(a&*F&c%50!7T`TQoIm49a)xsANEoNKIglgo_GC&^=J-T^fNQ0?u( zJo^9l?(MhLZY2Banl@y9`Ay(It*5eT>T%rg#DrA|qnxjM5QvfRrJms-CC01bS$=5B zP%uol%MLUb6LE9fgU;1cj$Zw~_L-eaBfNDobK6K0d~X1ealbFx>P;?ve<+b<<%ldy z1g+)Rbl(#cVV#gF*4J`D8ue^2g{wleIg>*Aky_Sx23^e&$a0RS%UCj{ERJ3QHccaj z$0I3!W3uk2VNPf(L1#2kNb+y~`OrQ@V$KW>VSCO9PeA+6CrlOL3uopd=I@%+B%&Rl zKAkTu5JAL3a9VYKKXZ4X&1g+n0fcj^y@HOb^8RF@`i}G6jqjYhAKyI@imnQYIKpn< zOn^g=1s};~{CD;w8Enx3m}vg87MAtckEml&v2#n7c!PZDy&vkn$XTKgyrTPe0sa>E zC70M|v?Evkt_Up*F1Y&5{=ut1D+KEdV?!WHWQiVLpq3*fS>IP6;6V=B74 zYm#{2B{At01s>f!T@ryu`Aw%@Bo}w#-(Oq=SIFIv?!E+`|E3MB;X)z2FYfO5x{<)M zd8#5a|LeA(VDQ3~#%4@^7aAuGp8cT=sZZ)ZzZa(huB}$!%gNte!;=MX6M&YW^!Mqp zl97a9^H(s3H~(I%2UmKj|LZ{vi(Ft0H}T63e}9ih3Orkp{XWxw-S&bG91MHvH2U59 zNLt~$1FGdKK*&DqAvKZ*S)VMX`ODtOI1rZW0`4^D1u!)l*6uqgcj5+wjy3=$AB#Op zV}AtZ4H5!VrS&8yi*{M+>R^(>(?n)EHtn)lQ2yvY-TlExMlVI{fFGaOfWp!1flyWf}av7?ks7ZaZ?`Y{>}m^zZ3OB7Aua zcD?2!S)an)iN=~B^t+sbEMW*F#a**zECv09PyW?h*jD?|Qo!W>Gd&gV zpKC~O;ex6gT`*CK`2`5Kkx01anTVh!K1o6ne}AM9?2iy*@5V0LLCwj z8@$N<_q6H5fpmvG!@&vpv+DtJ;I;R4du0@WPQOSJlKVgT(16FrljR17pbIkqq?W8k zV0z3Cyo!S$1c>`EItR_)v%uqmN!!pT89O#6SvH4i&_nHsCR- zh)k|MiHJ3MbQ&-asHUfX6$sA;eErdnaEIo9zTShR*uif7z07g*E8_)d1%3eSGd0oD zern<1>VAU^c`EYYYI%L)=(m4Gvli%$!Y;A#E%-i<)FKeNPcOPqSPYx%i8!7;oEFNG z@H6BzY-T3seI?YRyfQG7?<~+7Z7&>TK{hkxV=ta zHvnd*DWHp%=e`1E9z&lKEf3&?4JvGA>(#(C0_G?Re)S4awo@oj&SK3Fu))|QGW6VJ z^f($3Y&ksu^wiQScad_?ZaV}VfBCD^T||lvP{_!Y?F2pC#+fHM;w}31_EKN-6`0%? 
zu5nSPjk*?!Pdu$ocb%c@y5<#W%ngnPBtviZ(hE5wEFKc<%L03QhWsBvW?%%`-4p=Z zLUnoL!_|SWpvULpy3p!_Oh5kp<8!U|S0~$(L!ha`VB~v|?|BQvASv#?+#smNd|Ox| zfG)0`fGW_?{TNYtypkQ*aE>SA1xiVRWYM6`9P?@*{*i~5)Hg~I*D@*zUw!RzeU^z5 zEd@Fjd^;L*Vph#qFux(c-XQyD2Am9kgG?qKa7jTVG5^_9QkWYv%@bLynB21pPAU=9?^VNz z*&e{!%R3+<07EY5VFdeT&@E!sJOZuWhpAy5hY04(rVFv27W5i4Y62_BSs*as3> z5@tb#gY`8*?z>9BPkss{WUc!xAulL*q=~qdC$ee{+OR*vQz*7a!EKNZ#(1(f-!efJ zU|R5|wTwhI#Hw$_Ne(NJNj;TIOHNf&h-2T{S>72be}qbXq@h*=lMLSLPSP z32XgvG9aMM6)mH62W=w7Ug)RqnivwOedgQ>Rzu}rGDj)S z$*M!h^-GWkmwVdH)L1(HKAA-I`N*>uBYr9Gkc*rHOy443)EWs(X3yc4qwQb6#R6M; z{;E3-h?n#UC2cFw2_K-9yabh;@Ao6X()z(hCDyOA^pOWu^ILzH&wrZq?E(}*)El~E zkG4T2Bmq<*l|j#+OUJ+^$gPjAXF!YnK8*DL z^b~ig3}_9n!?J8ea}Dh&Xrg)^LjvWfhS}-gEW>f5nXV0`q=0l+;d`0heP1O=`v;gY z*7@I!u@vJQyk1y8c6}s+38efk@t*}>PrVbZbv9M4O3PfuyNMq4Ua)VdiO!* zrt-5o+mGS6k`Nh2l~ws8Gyiokv*@7!i7R;B&S=pRkq`DH6GMF+f8Bi$EC|K(H?0Oo zR;6SNvZ_JKQV^5y0+#eZEM1^=XFoQuD1zkT-u3K8t@>+1POOI4@Y_q(aKF?tOJUM9@0vtSjd^_+yTCF?`UnY~qyj-=GJ^^hhpNw5?b6Dx! zvU4x>0e%2fwcF~^&GBX~&-V5Py+S^#nQA;wV%5^bc}PvM71NhSa@O22zML)ot3U33 zJey8=3$`#~BNv6G15-VSwGQkEmKMzMnEelvlFR`9+s(9<1 zy(U74^Uv4?`AVbq%+q`ReCE)it(}=lx2n)xGsoUq%ZgUzf;c7GhEe)5A>Z-BCkeeP zvisDDsi{TtAP3W!IwD^=y4TZhOwMO^TrYRu5*hgQuU7W#Kr+;xz}>t2@!2K;knUM4 zstFjMx@V&Lb}wRL%$wX$*g{l22nEr#O0}bk2b1xJ7nM+5PcxYOF7_b{FfkAPSIj@Q zM?cb9Gi>p^m_xG-+6{5R;_Jk9gf+nJ`U~Z|!OU~2;E;fyV$f1|B18%Jgfp04j53U|ZWZ)Uq^ryzZ z`1PYs1%sG%^{iMvjR!z!+G1k9A>voPW6L6CR&8hGE_~H|I!e0Z^nsKId)aQj*~5Sf z%(t>W1ZP|LK_q*{A5L}vG+ayF9M^|grazft3Ks(NiY9Iy&nIf%2Kwx)V6ciSVIhF5 zz@JFcggoR1ZbimX3HiiFV>kCpOKDd`$>L7+ zk5l=UMB)uJGhp;#@^he>trPk7>w1tQ=c5p5QRe=a&pu$9dzS470q%b-5t}xSY`2+-VZJ}eZ&%hO__f{3aoGVyg%XXa09GjZWZZ#JH>oZIky*MI zB8gKmb+)I1PzDs1*YfL!cB#kqnC@_=%Mn~9_#w|B5y-8sORs@V zwVP$O5Qr+DWC(m@3+jJsAJVaCR;gL6Zl&MfkpR3F!^`6bnNqXTGeOvt0%+Z3Pm@@S zo2{~ku_jGyCrf10;T%Y(OW6c7OT=0?Na1i%FW53Ks&jT|%!^c`xvT9d9KXNRznvHUG%^ zNo96@w1W~;U7w7HlBDd{t9fx8?+5gI4BxpMcrN)Zw`u)?t0mW8lR|f{P-bDn;b1i* 
z>#69S=G~6=YEeCJxLq};>IlJ866pp?~;1`2l&*_Oq|9dzE=+t;p$i#odxO6sb?-9NlR$CNjL5<5F9SiwxwF4-Md=PutR#EU|F9V z+K0cMfA!`koZ2Snm?{eNksltvo^RIUzW{L_>k_&d$#*nckWTQ2&)HmI$t&wXGJn>YMe_%ZdthE(B#qcJ>JWZe z=m%KfQx!l;+5B;s9{s$WTHtl+_3^k$xr^laHfWZ<0!G6T%++E&kfFpN7Q6hJ$5{op z&^N;eQU+lVWtdD>>FQK-&Bm^XP|=uyF2BQ%=di#U#2byR3t2mbi5TX_!B%TCwDq+% zKeT_pJvl#Li80uts~G=SH_r;|`zU4zD78cfDB>j_+zOvjBfSSz0A{Ew;s!wa3@Xv~ z9N@ih;7ENBSHHP!0>bRA#`f%PYgju*F%$BIV}Yien!!a5M{;$<#S#ygR) z*S!UiNdq?JB%XNQ0j)uX;1X8m@b(D+CaUIm726#D%_AA`E4D#2WOObRM5P@&YKI^Su?emABWx>55tg3R`l6{er#W zTDX1yq$b(IH1xRQ31Yz)i$1eRz(-Yl1#nRp(dIAq8XxQwebkP%so(?`!Nfkb!sUX` zMs2=Z3-`SD{n=0-$lgPf1TYMSQxFzG0O)4#K96WJl4(4;3{izx%w^T5j zaz@l$0vgaw--$bMZ@xPT*gdYbUyljJ`Yu=Zas%-I&IA|B0 z_we2FVdmL8AU9H{PaOZ9sJ?_X&p}o#fc6m&oo*%#{af4{sx}ckSL^VR*3*wI7RVT0 zi~M#Wzp)&Yz2efWgIcW1SHKArcPpzDBwUKLgdh?_6>~>PN%fELl6SoNCWW#@+>*g$ z#0DJnQ+{|ZMFSt2ti_qLV?C55A|EeacLXp@QmMuCR~#@;Ltt#=mN4>>U`xZ|Qt(0p z43RrkeyZ#uY;Gr-KnPuYBhNk957@?k?}|qvyO$-cArHFj=sprLcUPKFqc9ZT>saR#xuP`e(1fq()ad4Q{qtJ!$EH=a750R<_LtkM-*9a}QCzgi!tl z*<7i_?gZyjRWPNchP%IcFPI|ttzJ~CrF#UYxo2lx2@<%0V=GPP4B)vk$y`R|%8&S! zYH)E;-7PKE`iP*5{3oznps`blyz|5U8#9m`*jD5jP@9-llvsLtPWMH*jy)LGDS9w3 z$NUP!)^nmwu?27*TTheOH(d#F@!@knKHKlUz*7ff_q--jSTd*kR+S)K*K|c&*H70y z7DIdrTHfquv&0G!LurpY#?f{ydCFCUORmAaYEiQ;??)B(nG4awF~u?--5qq0$84dA(@uRtjT*jmJ7F z>S&d;xZfPOm}MHWdg+9X`W({L{QUFl;?&d&l#y3Q>+@f{a5k1^_|F9A1DU92x@rk_ ztUEa~b0xfK=$yyIahfMfwAhGSZ9PpohcI%qxC=YUXZ%xWL_Zz-w#%S+N1GVni-YRM zskPCq4JovJ<`{#2<3oiDn$9KPScZLwHA)WgOyw=zdDoGeMDy4l7+tF5@klzedv@3w z4%f4?Ze()*_YnxZ85%hEJ%Sz53@6UnGX(9jCu5!M^rcTgaY;4KveXN(3+uUzB%t?M4fTyk3TJwe@ip&o1^+Yk2D zrWm%ZL43p2ez!MP`1UeUl#=d{Tx##ST%{)akUfzgGs_r)*`i#D-_&S!tU=*cfrXg1 zhA&)~h9|eI<1qTBg89&7g|P@5-s}Vpj&yy+8nSO5bc z8}{tB&G^2-9_jC~g0Q@Er0siV!~qt*Tmawm3Y15R+JHCNbVZDub++g$wlC zf6B?!o)GmuuiWmX5IjZ0KW3QO62q;M%m7cs7PH-X;Yx>={dgbJX(zaAfDVt-30Dqs zX*{`_N~9_j!Qb5AOA(T40l|j|3u2DEq2s+f! 
z7**ZJvy!Q~I4QDTXdYlRX9Ca-N?qgbbTlUfmVD@oj>oD`AdUBG;_z4NKU8T&PPjTbJ->KMCZ95 zpBJW-0<;V0!AvZHDN+lA+@QLpBDZMMiW1K@J&|5Jh>qI9oZ3E42Uw=s&iZImz zIk?tPMJW{QG{F}@gM&=0IuHfd_octn)AYtriZ|H{*v>-y7js7px#24^Xr$M>ST!Jl z#;EvX882|2Ays;1dWfLrDl==pQGJxUnU+Fjj22wq@K8*G_L_DwW^On`1x+FyRT^_B zg=g=Vvn zk}3m(Q5qb17-5sgS~5*Q|5ZXv(i+-}@IORvNbkL-v zL2BPdCwaQW&X|KZUk|L&Dm9Xqjqc0+o{gf@Ro=u?z z;q6yrXie=JmJEWu8hTVQT$0z4oiU_9J49*hW4>$Uj^SoMUJ1Q-__3Xs+Q|1N@p;S- zmV>WwN$|=1#oFQ0&kv9zbA3q?VI=ZR-WSsQnwSeTjfYVNq<^~I$Pkj(ySp5Fy zuOS)7tJ3_y74PIElAv;X%*YrW&>$gs(a~!_s+olQgbMooEBRL8YxcN|w!WCuuH&o%)^qHOH-PEgf<7SV&-7zN=2QpiBt5(Aie;)0;XZ*!OOXW!b-G65kYP#$l z^!c;h!n_UI6kC1hmyl8O9WDbFG9jE=w~rOuT`BeUZ;D}l zE)_JWW`U|wbnuc?C2&y~cMpNV{ZTpLHM1-Y!n4<)V@;B=O;Mt!2Einr{;*ph3kgIG zB@eOijGMA+R$5X#i>_o?^$pmdkmf#7iuS#?fQc6OP3Rq-au_OiE0$omSrJofo4rOp zw$I{YwG6H$EL=zXK$KBlGTY4ktgukI{rW9MwLN) zO>)tKhW%#+rRN@m9XlK8ooOxabHoKwfp)Aq>n3k11-0%31W9y~cami7FdY*X*`xSl zwb|^1?6-sY&kezi;2U84&&D!*o?ar{#WLQOdHB<8S#Fi>J)5ZGDv+lrm%qrWP~EWV zYRDFq+7-ead5J&_-67WFL2*ynoOKdz<|3Mu~(i8aNA?Uyq22xkof~A0WvZMD`p{b$RUj z2D?=X1c}Mu@l{}Y&%cw$rg>*^w2HhH^$22E_+N@>v4kZ-@BGOmOy^RrnNuyMkpG(zU|v96-rEFSE_$K8&3XY~m|F8B`{iF3Eb)GDD{~Y`1^JWZaR@C9``!;f!h0Nk z;STiF#3BUR!G_l0>^#peLaNy)<4cpul1aDojp$l*7sYu9ut$l*qA|hCHf~&uw?%y2-Y$yTHc5M&ex($Fig(XG zioScY*xu|DNS9^abt39jFA~UFpoPyQptZ@+p`|Sg$v)z61c|gic@V*MNbq!5$kIoQ zy-rnqLf{ec9$<}XlZneX%)O6A6!)V2nt|1VZoVEEnK(TCW4;d49EKZzUi6@x(Yrh& z7i#v%1Vn9$c0&5k-R1LS-XCxy2S2OXyuv^WE70MCEQuqTEp%DnP6l}xEDCqFV4l9y z!~Rwluze&$i)9Cj%ZRUBL4y{r4GEm^J?lmYT=wP45Vnz-*?W`OO39y{UhapuFE9mJ zi&z6iDeGTI84`HoTXm5V`wn;P&iqP`!6OwV{#ww6cF+@3Nw8|rfy*?DHfE9_Srk+) zN9o5@8&MPgo9T1EQ(L{nv|qtAbFt>4RfC(e+NSCnOGEJt-{Iyp%_rwcP2>ibe7O&= z=NseizZ4g;cgz@`=GFiH>-PQt0@Bye>8XfGW$oW-u7|G!UaRObN>tp)e&1=+KPj{7 zFgtXvj^nlY4R_%%T}ror*T6@b9N$n{+K?meM?Zao;xfDb^aX)Dhs)198Ej=a#D{Lh7)` zG(X)6H4fpCq#836^t4fQ=rfdR29?LbF0|}p<+FtGWjL~!)`rK+ZtlUBf#!@zYSmwh z_4LLJwFq}yq9J_Wuqy70p>!iQy+hgaWw;LA|3Fxf=f}sCrLlx{MLJux0E#!8KPf_( 
z@Q05Rzlw5ehl%5TIri=HI(_x40}l!Y=C`j9=1`mYHL5;hE`Mtbf&d<*Vf0RJjIwuQ zK^GojXTb2+e^--5*bb*>GdsZW6GxpD$BU9sSEp(puG&z7RQutqIKeHYxLX8_+Bs2edV*lerK8*E;MbM&wRQ$KI2IxJpFzD2;me}Xd5z7?2@%$jLZ;9X|% z_eo0La#8>8Mv)+JAPnSE4>Da2eGFQqTAMon)?4Rvo_ikgn8P5s&aEI(Ek=j?YX+ymftEP;YgQ^ubKMiaMevGTJ{L1#q z$J{0M1eK(5zr}t8sW`SRSM*_dW=;8ZSWW45{qgOQ?}c@r>=%>NflE-=`)X$>sMl)V zxV%XlFl*MW@uRNi@lJGRd=E|`Hhk(*~Hn6Zai|>XB>ncj-+7!`lR!j#*ni@>2?p*)Nupilhc`| zVZtII4yv7=)n32dDg8mG^*$;t;t_glq0{h~)Cc%>q1IvVpMe=Hv+@rnQD+pCA=Ns( z-9o};{U9CK(+EOgoNOamJZ|pf$lSchpwDCKEhtn5@rZ%J*uXR3t_BZT%hcW`YooRJqG0mUQk|xYT@;!u6r76{IelBwe5p*Yj z68eR{X6v`HT7w=FI$4j7mSyjZJ;9SPFU53NMXA5@fn{&!o{a>XepG47$^q#uUT1 zy4IecRxiK(X~RC@9>o6L;9h{YIo{g-H`KRYF7*}=5`+NqZt+N5X&R0iBx$+KmtK2yN_;)*y>LAE>Op>Z#^<=agKufH_Jk#=DdaVzUUfcu zx?|$;)2lZSb3u;GmTPOw>A1>gE!cK-eHmB@Up=gzNG}_DEN?t3pNz>8BAa=1Q3o=X z7adepS?yhD3-B|djJ0PE-B+chPo6XhVzEloOQb<_P1i#@HaKiAU4B(EOokDYgpHc| z@CBb8wCsb-nDKx&fc6bNpP4Rmw2HjEDqZZU_+l2K9bqK4*#qOgD7?)HO-7fW8XaQ=a{b7;cJFSx%&7vkYT#VZtrV5 zWy*Ks;Hzh(9bqS7jfg(FBm?Or(cEH#;pf+SI?v6NMZNgHf{s^{=mBm0B7COaE*=(- zN?95SUQH;PaQm5%Z8|D8#p;#{ra3TGEi-u%`f3pL_i*cFBW3g@8U{bloX;y*GM6aY z1D^!prLVI{kgZ3wynY}*xv}U*Dta>_jWPede+PA$m@M5;@m!EKuV9tp`{WuR?{l5+ z{(N%Y(WMb4bV{#ACPGu*xq~dhZO1|HSz7Y8ww)itPNLdqKR}Sad0r87o&UwC11g>N z=xF3{<;H&31~rm;Y6~lKv$Yy+N3&T6&J_lIi~xt^irvnmpbzQ9QA&H7Oirh+*v3Yw z0fmJtixZHF|dGsthzb7Or!-{+hyGuJM1BhOVK36-L`^)zRS z)$Nvgw_~k%CLl`l|WlWjKerTlrP2;TU0iEUZ z3m#bG%4Ob`P4t-UV&bww?QOKS#rAkSn}*a7nAcu5E27X_@k#tX!5Nw;bpIA@5~+@j zA*!qnsSn;y{HIP)p@c{jjromgc;w5)e5@C6CZx8R&Nxa&d0l7dodeM{VkNfO_A;``wtc!8Np^H}rHpNP3xcST zd?ew5>(F|`YbC=7MI*em(vIqOjj2OJ?aA(aC-B&Drr|Q$UAdyZ!a8cl{^XGmd6fC9 z(0HWhxd@06mq-M!&#{A1lcyx6j#^JeqI(Rxx}IG3cKSN0yNru-qT%9ke&IZQwA2|q zcd0VKA1OY@8AOG)+~0zthR`s3t*7BP*?x>RF=36LTn`#S6B0qzM#KbpOX+XW>U^$T zlO&)-k9~LoNIAY^U!j>~4bW1oNA6MV*m;u}<4mobL!bO=d{amhWHd~^G;=zN;CH6Q zKrOe#9kS_1dj>1M5TJ8FrhsuG|)$3VHd}vn+cAEm`EL1rj<)d{5fk{9K% zM!;Bt#H4lw^4CzpB`lx0=eLBG`j^^im6tNaC>OdbxmMm?$<4I=6WaO48pI*t8SS4u 
zw&l~=z7Ak&lqU}&Qz+Cm?v)(S2-j-hWKNa$W{KN8zVID5;&ocV^Z0zByIo&8G5eWD zkIwYyTO3u|;m|674r^Sw&jgcJ|4$qR)hn0ZlgZ~U>YfJts?g_4d6YOejKQB$@K>eA zm(uW{#%MNkF0ZprS{aQ$3@kp7MQO|fc-W`08vr09O9-#hsrN8SBEEO)2_)jffde}O z9y5S&_!F#Nz)H;kmyD(+DNed-*r6dMy6G=RmTil?^5mgxEof7*aKRnp4>C#0V?-7z zxm$h2{=JM|QeHzS>->5U(Z4O`hW_I{4zh;=&nyDWCn_0zR?CI=_J(X| zPsCJEnXgNhvn^-{Yx@~`_$nD*rA6vFG`!d6zLNCHmgYZ!%;*`uq%IX>auT?uRd7>r zB81kco^07o4t&1Pwzj2i@Zr%kO>+OKKchRbPUp0L!jH8zOOC`ooh|~wfxsw~{_GXX z{L#eTL$lK9r$}8F++iX6^)ljo?)B3${Gf|fKs2iLD1_4&dgy+ooOR@Iwe~dC+g;yd zbHO2xc->QKKFy_?;CpNxwo})X^F4pghoEJQe&`K+xCN}G>t6Z@{J}o0{t8wANZamI z`=`M0jc|X}28Glm2!*Z>nd2DA6WjLwhzP0*wj2EQ za(gbQI^~&i9IuDf5aYI=SF}w{Cq1cdl2l=TxR~}@;itJIdjX#=+KU&PxB7EFE<+?k zY#bkF1q*@>No3?7baP*e_?A88ETm|OxT7>>AahZP~|vI&w7W0>1s7jDdY2A(Wl1&I%Lx9)}$>Rr2vvxS4~Q==3kXR z7sPiBA0wcm#gFPTUtAIl(#ejVt_J`gWs)rk_;mW4T&iKYpfMY5+e zUsoP+!9tYFW zPksjzU=lGdCh7f#wgrM?A5WVeE$?w7^#78RT{nPbcZt7Fz$QnFUC&rS87Taw>S_w;>UpWl)LNe%BFviM|})Exb! zxLA(K{#x`(4wkQ%eqGPubu~3Aw#-^6P`B$jwHuzTHVgHQ`Rf54F<-`sojBsu6)m25?d7*e|X` zzL?Ehn`25b+g3om;Nv}72~IC9t?M2ABgTO^o}6vXRaY;k$CNtJe9it;7CO4OA}_Ui z^y_~!kzb!_Y`JS^hRFBHMgnYn5MwM}lQ(MyRj45;89VXxa!n-aqnl{63U zpxB9qx#G<(K0%SGdV|RHx#=cEFehyTdt;GI6N*&_s-%Jrr+?Wtc-XMcR2Qp5B+Tc4 zPb>oXS#`LP7!5_BEHA4}z#>c;g>dhHmj1>td#jlK`VQqt55stlu|GcIb$fE=xmOyi z9OT*?t_F!nN?H~Di!u&)0=l8xmY?o`(_ysWXQb=MPVt7uPKa8QLGe9?-;4e)f|dsS z@Au{(A?kmA*B+qM)9&j#_;CkU*DoL-{kvH3=OXCJ^Vwqlxh+f$d=CJ$Wgc>1nA0H) znf+l_yg_=O4whMDD2|i;N7R~OjGnorAn#38EU5={ezn&2onMOw`@Crm=5;&zu9B{ zWH7AtW76dDyEB4O$CSa#I7vtJ{pIL%V}c(z`1CcC>7Q)^6&bKcZVEJGK`j512-FP= zb$A*pGfDpJrz&KH#vTy1(g>f4Cf`{DT0_`wr6jG3{<}@f(O{42pC7(?pt!_N>n-}% zY|uW2@uR*3uhra19N;v(Ap|dzIksGNw*v?VFh*E&*~5(||7;VU8*Go};!z*Iz%Ykj zS{e9zBhOQCV$4c^xiv1h*yz0(zdFwX+oA#X$%9dmb$)&!u@LH=S%4?l2c!$WTfGDMyG#BM*h-*+aD^&=P0Szlaq9f5H~+=RxMKmzG67Bh z(qK!)|JQP|0u!5zF*eP3cLmV>=)qmZXgLOdxdXDKuAU%kr^uODKH%Qljt~jisDjNBpOC~_~s>$^Fzh(B$BYM5M`en1# zM(y=^FytfU>d%Awj+euhLN1qynN>Hp!_$1lLb9hyf8JV(Zoe-z-%+b@M#?e*7M8<4 zl9gy56f%@~&H9gbW+_FF}p%oLxjXULIRMfw;;JCNeWc 
zSs60AluRVdXl)f=2~YrHtsOY{dYNnbO4Ggv7dG>nX~O287@irv5^`Eke7N~l9{iKe zjDVWzTiX%%p9KJkIwf8`hgoD=Uvx=*5r^d-Vn*hsDmB zl9d1mRY3e1;3{f@f{=s~+_j?_sFHh6(x3Ec8+NGsx!GaAuAvk3 zUp9CzjdgA}{zWwSx{u79!CdMiUt(#k;k;nTl^0e77@3b)?xGwJTUG~Cc zifOQ#T~2wOW99R|{(%-=1t_J1(ZAx=8D5=6Q;DcvEB zv^1NRE-C4fu5Y&Z`=6V0bH3YiF5K4UeP?FPteW*a>xxP(4H69i1Y9)j|DnfyU0{Oc zy_eaq@CQvpH8!))`E52|ojH1wFXRqgMEimft;jLoPO= zp^h@i6iXh2Dvug*cpNETQZt--`b@&{Tx+7$z_avdSq;_#Pt$Kb6n3q+P}9$8FKPwc z9ahmpU;GEW$HqH18{xSy>eI6a{8Z9;T0Am#U~?&0On+R$dy619xi{7(D<=nRPy%P}o!2MfU&6>9EJ7WOoP>6bJbZnll6i{o{P zQz34idy8Q{mt8?D=%AeJ^yfNqrKjzMePDJ)XO%*>o!@iHH}EVlzjc72;3|N6DEZ~X zARk)sQ2Z#sP}OyA_0`24IR7rYT=w^mGDF~CSfu8i+>2Y2QRCu`lDOYh+0Xovt1Tr* zqyw|H2ct>5%iWGmd(#^;(7jz;3&W#Gz=bkUx1c5j<-N=F){oJvYUFGHjEn}D3Y4cf z!As%5!tG()ZxUB>Uy%o21yaZXuF25Vl#iZk16J9`hJal&o%Mr#G{*%w0`Vd9Th}!F zSBemW6qc$jNC-91RBz7OZ$+-&iFV3ept`d>1 zr_~k-agKoK{=}m&`K6uB)1{o1Bv`1a9L*f2>3HO5+z=c|auT4U>~EFVD#F4G>4t)- z)A^x!#)Q&3a&CK;JPlgLkBd5FQ(nWgn9M`8n07IX8wW!7>x3J^0gs1#L#yX?Tm8*q zf_5eddI&R%LUVYD9M;KvN9P^(MrQ=N8X5YnH4L-alR6PHVDoqKv%!a80ErtKk<6PO zwqT{lTQ}B9rd_)k+BsKn=R9nrU$$zZg7Awh({rdLR^0~x5Bvl`0>yOtS7X{fD0wlSz=+Kw@xT9V2K|5iy zhxxtg8If8Fr7JxErIA@D#m=414qH5!P@aC1`%NBt1 z=Ko!FxjFl)(x`*6eto@5sl@NhI8vTR;M-X_`aY-pp2sF5x9OwzFIvu}QoXN?NT_Tn z&Y0h!Scxu@6)f!k(dT`oU8!HGQ=(a?YI^fpp+RtVqU5L_I_P11%^5@GP%|((ZDBb& zte0cZ-=I|LzG*y*J^=>;Z1MQrd&fu&7ucz&Z_kL)U=9lx&5{_G==Rm=PJ(KD)x9g4!s8cF%|pudV1=;=^BMD z^m7eXV_BwDvORJ))Ya%}8P2{YJpySg8GRXsAx@CmoO)~ZFB>)rlinZvU*|A0*2)Q& z*REk`nAPXPu~qd+>7|dsFDk!$n6p5eFz7X0#911>P`1E?Pp1;=xam)JRCj*1cGNvC zu2gJ=GhJ_9>*&!CwmLwhUJdo)rO({ye`Mc{mm2E|>dUa}$(@rnrWS7W z*I~Is;lhX7nfl{bYB0MPxZPcNi%9{1BT#nAj44+c9^XgbeM#ea?skNeQ`r6=;8AX3wp7Z+%5yEyd1#VBQolVg>}V})kru0_VbG%Ck(qU) z4%4=pRkYXaSz|jpMdeyg>LBXZVtTyd=ZNzBP+8{c(o$-<84l@oi`fPe)(<_esi1MS zeCzGjpjuoG@=rk7NU#v;zHm8G=l~^KnX3b(#%hL^>C9VnYf{o(ke~^tk_s2)9}u{P zAffnDOZRq_`_((f*CG+no4B8-^;W~b>?>~{He&T|wM>2@$Z)orqY}hO&f@DID2!e7 z;nF<9hpvpQ>>M<|IO?D=$~hsKq-A1H#GDAQLlVvD*Ya2_jy}h_(T}yD5zR$^g$u1* 
z8=xiGtAD#Ld-+ZU$$EB|=atU%yy*V#Ort`w$F%uwDih@;nR_V5ck@SF+aeKF*^7iSd4hc%jt{v8;@V3mS&t@t8B4J`<{g!Ea-dpS5yYQls zN>AX-Q>Sgqts``h_BC#MjD|I?K_d3IrNA1u6OJTS&mfh|%I}Y9bz3J+LcCWf)ovjG zZm*Ma)>CIoEtAq%8pEo>h^C%vr<&#YmhgO?xxm~l*yLL$t7|>~xu#3e^Yz=u0D!%O zO@O*zXFwLNdNY4}&R|24MDryrwr&4&Bmqf^pNIY1}VQbCz3Ppvd8r~(dCOS75uzgqtJDLkSwH=3wrW( z6&g+pZM`nZgt7!yjmF9Je6DaXnOvWbPtbVNYyHTs$E#-kh(JavpPJS&ONqv+KY6z= zrNpk+zfe*La8QhIH`eZ{1Y+?B8b9)-y2X1SiZuBXB>(K9Oa^#ey zCz!BMyJlxxHb~1L!K|1z*?P~)d`-BY(q~XRi`K}!|ARA((&e%`lU5ebKZRylHq6*n z!6SD%u*!#^M1|CL(%H&F$ns<)7D1P-w@4mT_cYy;Cd8cIufYzKdx zu+GzxAG8+D$8pZ<(sj>dX~^WLu52kaSw-nH=|!J{)CVyO@4XKOI-LMFS09H90j9}O ze1$UFr>CpTOgb(xSdFI0;6D53MGLvY++C*qjFX=^E*`zfFT#w6yr*Mmr*gmdIpRF< zxH3YzwhENrdq)x59RZ2#?Z_>XPA@E4Eg@R#LeBEvjm@=k}{AX;sQN_mvB2 zY?R(LQx0uhWSAPXKxpo)owMd(G-GdU7}V|DKA_&`|4+(t!kc>liKf>Gl>G1E(J3g(Wu+#)!-6>-Js?^k%avS zRuc{*X8~js3uP1_=kU&AybiKOF}e%rSK`23V1WVPq57^=Whw_l*8%^n)A~1n$mT1T zYESQ00}ee60ODP8F9Fc#(MFg3$8v6fZ{`S~bkCh^0E1380DS%2FGedA6hob2@Vfk? zRH*z4hP-@j?auA)8y_nK_#sz+{Rr#LdZkth;R8Izy`)dVkYRXSQz)b$S2a(bq#n?d z%;hE}Ccabk1h|l|-@4eFJpL~I_S!(+jE!g7IV%Zfz z#>5Gjh>iG1OF*jvc!LKV02H}7Rq+*_h(i{}ELNKdn99440M@rrm)HGIJ|GV3-%SJz zx$@qCa^`H}UBD?q17k+2X#s(20e28B>ho#1fF)l*VKhXw z4kH;*s@+=~CzVI{{|e1XnRAm%ejNppN0ct=V&7W;^MYRp#>d}b{1DuaFLQ@>K?{`6 z&)EA)l3)ZqX?$+lWeggXViuF8N^U+H$VN=FSFnY=y>`>6Fbo{HH1HD&$%_)ohLN#t z(ybbJ&ZpO_luG(Y0S>*SJXNmTod`SUKL;N!4#z&&Z^WdkOclJ7;fu--#ars&%?To7 z?+|$QLanBdW~~@vIfMN?nR*c40B3M`V}az#WN!#9R#D==tdlyp-598{a|R?!{h)h< zcE14Nbugc<%$el~QC>#>==nP{coiT7KPOpxbd$LB94vQCc;?C`J%?eXSL%N>*wr}i zY9=HY0}hEbKp0{_fIm(i{F=9=H7`7D{3D~|D}*DS`Y_)%iN z0V=2+h4-mWs(^#0=7LkT`zUUR{^QRT=N1pMe&VbC=mRN;%N^86P#nEf9PA|SY z_&il*7V!+ryXu@ZV2uJn%46ta`va0KgKN^}qVs{hJLsb`-*5!z1~R5AjDpFRIofH) z0PNp5B)6OJQnSIaVA8y`kp?(LQozGpr_g@vpxx|Qk9sps;rt$*)Z(-`wS4R$kHh+B za=xLx4|$7%AT?+V_-=YN$>*Ms>Vn)Tsw?jm9fy9&2#xFXAR`1&ILHACpzRMaU2Q{L6T1Qb$zfOU=ua7ahY4zhJpt)3?f`ZR}$ z0A_H6ELD(?nd>1I@Kh(c0=TvQ!|$^Xs!RM6EwI;?+1;M z)_F_;yNm@%1PO00A2!8Y>t6GfO&lKf#TtN{ttD>&BJ3;?5*FtW2#gM@0ZF!FMzna; 
z1Am_a+!f=N!dZhd(geUpEp+pV*^s$=3Y+$m;(}BRb%Bp%$4SVs$DiL6*h$#W&jF0S zaRK>-wi%#%iU!K;=ZU*?U=^UjmOR^=SK*x^!Y-=Vb0;HzO-PY(5LC7Q(JamF4fz&e zmQ^N+sHc#9Y*?;?a06Jp)&L7UPB9t!qype;1o#~VvD)9?1umPjlyL(G)m>>ku znILcbvI0IRMHdBahP-KMo!Auk(SXTh6)>eAqCR;ZdV1uaKivHNY~Q%44Dpd`9l6vk zAZ+qtgya#M9Y~IGr`q8~%$e0?PX0o>y#+Wwl}3P^y%7C2+8&T$g%RDU3I2XU(aHQ-3r*r2)>OkKurhh0<#MY?T6ue>x=4ok^tGNVlgLtFjxyKB z^5ByrlXELTrup$Pz51|`jqp|dxojKhKr(xgJK8VGP6-91w?xlN7nmxV5_XTV;sF}m z_@Z)+b+?f_nmguBH$vb^s%^AT2rN^GGo{ErdsTh4_Wl=6+1_^M7U$?JzZ2iv(QG+^ z*RP3>TN8xZ+l-=l_2h8`Unqi#1tXKGd%{y83%mn0z?ZZ7Y8*X{3DE!J0f>4#8qQ@N z;=_>l2djRGCnu8$MGec;M9-hL7fB~D?&4A`g@4SxJt!aXfAl3#($9{Wax7Rs1H*8B zb1W}T0<>-lU{jvy4CiUmCG|WIUi%Ti+yfxW1@8`PPt(`{svA$!u0tJ0#unE@_A7$m zfSfebIw0Xvi@5dAn`0_S+X~hM#^6>)A{*@o9s{#b3~D-N@n3F@c1RSDNR6F8l3&7@ zramPAW`sQ4C#fiAC?f&EIGxnHa)C7z9BnD69*7HeGY7T@h{)e-L2C$6DkSdS52o(h z(}%jUp<^e<6|2fHYjF4B^Kk_#s9<+S%j{u znQLdd5zeANWSFY|9<80N6n{t56=iF(BG{mA-SK_dtM|w6gbYgItc-MV3F=TKPIjY5Z2W6(=_a}Xr|&AMIsAR>F72rtto(zVvypYi8$z$z!z9*REok6cyenr(C` z(^8)Z!FdrYLDf^eap&mox6Iw?O> zJOu~}937K)Y{L#6PunmSROP3V3?FudUx*Yxutj+WN{xka;0Z2+^s}vpGlZqbMNp^; zc`cTQQ-af(BG?#|ECkB#{;LH5(OHn9=B`@~KLQKpwN@~ONO^!u5-7OUeQWxjp^HVo z)t52>uALm93wPY-A46Lo(CY)Y+ujoEIb9Rh;I($HdhoCUgTDymO~_XOlH;TnVZgvDrE<-b+%$&F|DWVuGOumZ;41ffF(@(GR4H~ zx+WDLepG-J#r%MgtK;r>)aVju|qPD365_M+bzgTQ_^KUu=5M~Fvy-j=5Dc! 
z?pEfQ__C9IRgEp?NYr(dBBEzH;1mE^Fx#s35h=YU7kHc_*le%fJRuVLFbf!o&6 zk3JDl7ee_b4c)&S+8cBh)sHSxNa+5_3#p=SpbaEYd-_e?C52y(U0g)QU|h_WFIFd| zIbc(rRAAE_)&ka%j?9rDK&FF;S01s0v|l`H(BV~ z_4^*mzy}>@vYfgCP~~{Q7b(a7=|G#UC0#?90G>qbTz#Sc^aNljo%Qd8&2?LH`0TWp zY{^(WQxTr~gVP?Oj&z4qVLhqp_Iu4ETib02-;J-j(ri|MkZsBAm1<4M&fGUGH)oa7 zZSEF-pRXu;Z~Xx67zfdxpIG)z+aSFwOvTQ{09jsiX|YQ&?*G880JjrS5QZQ^L?7Fp zWU0-mxnkk!rrrFxUBf=${Jsc5h#niLewk7(A?W}%z4}^mbaF8?!$eM7GF~6o0`hT; zW%HSX$gyv$g%%ndoC0A0;qzcM=Fo>i zJ;Hm8Pwj=;t}rRQ4pc|WcnyN>B;Q$qr>3Qp8=;yxFLyouyo8f~L6J>BDqjv-w9sWz ztG5%L8h^s2x^6(PwVusw`X#$OkpAm|=?XRhg*)IkIM|$&#l)T^e`&#n_zI;Qu!$-2 zMYFly8kFYRt(&KVSbkXG^mW&EFZmmz-M2drkg()-Kjk_!ByCXq8GVfJ6otx`J;4}| zIsEbT{V^qEbJ9@|W#kQ%%lh{*;5U`!u+6L=GIV2_J@aA)=IKuSh#=uC?TWI#r1wmU zTm?kKGAh?lo;%A6c(?r&&KyaWp*0A9HOF?|MWw~^Gb(b7JzkMV?>Wz`cJo(sv`#fi zb+%1b`WT|AJP7Dkt3HpnfN^n@xY=WlhJ-w;lh8&pDm-rE`il)3e%4mXl&6fU>6TOe_+Z;MI0O{hw|!R z<|tN~H10Wt?q-U&Yj5juXAH#vTBdE!s~phGDHl1JaAsq01zp!sQWtaoIol?W2=nrq z@e`=5@Ix;H`58R46=wc$#Wfriv5ypx=^_hvLvVU~D>|BFE8u#s2Mr&B{fyEr9w_^c zOLR>pAO^6(ko`o=?eIOcRme;RITm}W@NSkVGXPZaBcRqHM8&R52H=gUkhblDyM9sa z`oY|ANDji+#`M~za{Jqjz_lWh09Q;epX}HYRn0yXI;sPGEszm3AFYWRZ)ixy9n3{f zr!Xmsk5-D4?pdGEG6C7o(MxFjRyCR6m6SX&@2lz(qblPet55HX)>`*Ij>0mq4nJjW z+6i^L55WypWz#p>B0gi8_2Nj)@FJ-gjy;pblf(^$3mNh>Ad!QwP4Z*UDu+<}n_QQ( z9NiBppH>gyuDtQ$77pJ{Fc=eMgfOJENUzIllJloEs#IQ0`UIr94N z;`R+Z06+rv=l3d4hxSE$hq*}Pjj5rN?Sx`4UCG-bUCA<&utl)55X+bt6ovTO_k5y>;T(&Q8ir4@orc| zb ztwrp0?9n8^876W7@@gtt*mLDRZBIQa)XMJK4_^!Jc#Pp;iW5c#=*@!Aw8Z{_h3FyX z>I|9~*TZ1IiP(S|DWv}s-^u92S?cgbxh0MAVcSIy13;8vT2+f!Rx*2W9MI zV_o1ZQ?8;^jOX&(F5#O2M$8^^J^W6gr=8@hb|QfauKT}H{hp$UXtEGAN`w>YS}ZDI z8<8iU7Zz?>6ZwC|b6&>UPqxobNjYbkXBn{{S9AREaA#A3%u9LA=;QnBGPoUa6m9tcHtlx@<6>7_dTO(s0-_^u0X`y$Y{wzy zLo6PNlrs$=b)am3;_8`%FSs0a0or=J^z=h9<7xppoWhKU>t$%}OHSpj)nU(Z57ftl z3^At4Ty>scFqc)p{EM4sAfyrby_ed!wqctqaZyVYOtuXQoWw^^Ba)83eoo`iMkP2> z;QEMN?R6t>K9`$WD35^k8s+W5?}n8}bWzP@&Reco=y#3~$~4K?b}BSY6}m8~PpINK ziihy_dpmA~2YuM-qz2fdHJb3Fr3@aciEH13_$*PTcaF-bAo!dCY~T*c%nONDfvXC4 
zZTaSr1!nDOv&PYr-W20bz3-Q)sA$$Rd(x90np~nk2K;O>8;MKJ13j{mO*i|f*0zX< z@;?N#b`v5w{268`V(xg!UX!e*aw3(|oxL?!4Uzc1WP;C;axfRKlCqbsDf~3=4?y zDbs?!Dyx2HBm;|u$y}-%b<&^gf{s(m(Q>9THXP9X-pb?xFn< z+W^q~?dV$nCTb^X*B&g4N$WBmGBG{lId;ShkU&{#u&~N#Ps$T$CD_hDV@~`*%|}ue z7Ps^rCeeCeWjaJ7F}68-Y(+ONuK89{C_w?qUr;#LZ=c`iwkD(df*OqDy=Z1A4#d|` zlwJr!l@|UWTiy^Y&Q6U4uXJV_a~>QFU+aI;5~qD(PrJ|6I8j$w zxH{laZy!~Irh3KIP1;ugIGdWn-5t`4OO5^8FA2xF+wH`U51>5CLW{7uFD^+`o zr_;$T53~igir!xWWW|F^fC1f4Sj&+IT!$tH%Dx%C%&)CcSVTv?U87qE=49(A-h||C znq&`&?&X2I3$afG%*u9R?`e*Nrp0kX9`+E$3wXJ!wH#Q8JWa)1=_KNJ|D(vc1ZbWr zcrbNs>TrU)vH8sO4~m8HWl&gotT4+u`jIYbmxPh`S=9l*pD|z(D;0>kder{3P&ye2 z4qKOdrUP@o%>lU=dc#6eXnM9)wLyOTGak=5cS>KiTWl6aNV>Lj1RzD*g)s>!3qHAa zA;xD1^sfWo1^gEA^e;#FD1mc|jBYLRU^I^U@L2#Giu7!)Z30Nqby+Vid+m@EGw_Y+ z!x+HAzMlZj*1}^JGgm(f1i}RkP3Crwg8&gd5a2SsRAy&CBkxpml}hG5AkS)V#G#1F zHMlcm+KMz|qaXhBhjobMIA+=DGZ{WZg>6M`wM&W{W~l-suV`xP3?%onIxQ47S^*n} zr7676SlBozffZ_cohnYKc@WOZj2H`|jwHO1uM2T0F9|es5-@DP0_1;bO@PAFrnsC# zokj=aiEY=Bu5(J$i3D(owQ2Hy@oaR9pW2*rJ`#HjVp5!H;$~#F_v31;PPao_G6LM7 zZBbKy!)Ze=Zs;xDqwpp@*v;=jSHo7&-Q3Ub)wNz7jKV*~_o7i1@dUhAPrj zMZ8hesEW&FNoXHcSnW*4WWP^pr+ti9-#Ihqlp0%LU#~enlS8yt`?e~^9ilNIfOF~= zh`;AH`7TwbWBR*OnN>AHqkBAa0>z4tpyCPK0T-RITe}-7xbd%nZtbfT39h~u)jCrb z$7O`HJSA90zX`vwh<5bO2bb)JdIjzZpk)$O?bnO{6%SEE5Lr1xPhn>xTzc{xsMkEs+CrO}$vg-o2?| zl$Rin9R+$?&<=49{E(GSpS#F(UwqW47~ZzP*bYyjDExj*SG@T$=soHog4@oH2im^D zcZ#JfMKOC3+n4L_QO$UU$WGx)AECIJ-tt%ZgrqF_ZX=1KGkg2Ex+XtFPz~VwD!1dN zV}S1?dj9amQ+r&I*wJN_-_!@TB-G$eVfe@`ID^C&}KM^3g{pWmYPBDji~VH1=t2Z0P=pP+=s4BF_`n zO4CDkVI|L|VA=%iFLpL-JZmO*r4Lz+=J?hQf7Z;ND9zAPGq6STetvj|=)D$=1(=t$d6rI_STbq3$-8tJN;jZa6@-2zf?4bM#+_i5p>?Y-k$~xs! 
z2egS-#cS@XCH%Tmz52#P(d9IAq}{<~aN07-I89EZY7=kC%b_BUt*_7GH9+jPG_gl2 zE?9*H(kcz_r|z*+LSi^5fp!FM@mBRE$v-3TJu)-c5h!dpqJ)S~AUuCx4zAkYvv9?h z!|FLW#{FDY0eSfMz9xLQbIUGV4KA-ws9FJL>h?$x46ZijqUSVxO_2`h&&LN29aO62 zpSHg$(z@t$JcM&iXzNEuil{IpDOJr(Mlout9U@_Elin)>%Cl%N{W5_*rM7S43178M z(~V}CiqPA9{amUp{sU>w-pE%Uc@)$p#2DG0HZ5RVpdMnl%{W%Sb)9QFw&_o{Rvigg zT)ku*%*mp{bgvw_p($k9-by`fd2(H&PB5rVJDR}cU4zYbxBQ6L+kuB`%nRSgo$$LV}9ECrPdWnk4R%HfSVtjs7>bwUrY zXTNF-d}bKO*%53w<7hrH+l?pRQ}KdEx%nHg5_N^TTMi28JYVfUt$l0i*a;#wJ7;pa zh0uD!d!_MtOIL0Tm?8qOJV`e-F&}MGm8nvib+fETtj=V8l5%C!jFQEe#2upwk+M>| zCNsLj_0K*QwT-)-RAg6WmCG(c^2{?`5hj1fSE&BJFU6HXffuqiQ+@nsN$MSQS;#oeQB2r5g$Y9U5^}p2g<{N4u%S! z{MqiR`?e(!&D>sPmq~nuM}=y?F3LnQ>)$AiIc8;~1vrDmzqsnZ zPji!NnwNbp!ryXdU!m=BHnH)b?Rw!l7esgh?Yy?PV@|kw*K>1LX*aB7GrtJ?;h?1h z`&1`u366XF{2)D1nRP(dD|1LoWYMl<4x=p$;NegWTYG(tOvf&L0 zcFI}V(t!AJ>C(WNam5VHjwz@w7!hIl8t`GE(D(8g#J@^}#hM4@ehz-jUE z0q_1Zh#dxg$YG?I4ty9Raw#L4!J*~`7s2EO2CD=fT?*kK`3C-y5Bokp_SoO=|M~a* zasJu2#QM~s59cy}z`@7LW2wQ`SP0)P>aQ?0c$EK+x#?xb7d9`g&aA)xy8Zuqc;jMR z(D?Qb9AYG9aI|^>_?ezAq5mU<3G&Q}q>R{y7G)Z&#~7z5J$UJWcomSKc+6V2 zVgxl=j6HsT@(X;(1w|rh*1r?r4*FWc8m$2y2dA6wzcK!93T#VM2)~CatV7TlKQE7Q zZL{GIR1Gczlb*}5FUo#)V1)FW39rp3m_@#h^%vgDF&{otO@9Cl&tx~*NeSeFhFwbe|NU2h3Jly~ zK3@|zchGGq;!J{QMiHaLiwZl8?JsTe8&&NHEW}^TF8*|yHCaEeS3*#jFZx+b@0elo zAS0DKm4d22Hdc$2lwcYTaBcltRX=RHg9#6|wQ#jt8gOlo@nL8t^{;3Oh3fDVS&V|_ z&X)K`_j7eA8e{#dBBcsQ0^ha3`J+Z`XlgpR9-#K^e1b3^;BRr61D zq%;Y7v!e5+QR$>4e*c~qg0Eit{_E?)5!nT|t9k6R;`xI6IR|L@jnqa&TVHX?Y?ls- zI=}kgPi&D)$IOA6A@UC%O%?LH3Jb*gL#)X+-gZ3j|2BvUc_(T%sP$gaOL`7mlBh7o zhL@e+EnWT{s%0g8K!R_-ne%_T9ZC*|bAlRM5^npUNVoRcrxrmANs^vgfZM-D+9xbciq zeyO1W8Xu8C``<7Adp?EY66gybWt8_WXSP+I;x~iY6I7@OV}bKiB%rbbg~#oj3YO@# z``$F`t@Fr8NyiCHo+;OAtmcVI!1Mxm3brZY=$E|yLJh&*&2Q)zi$u2OQ>KY0KllD_ ziJk~}^r5EXQeMhtWyrVvc_rygUyb|}|0+eg!ka)*Ao^uRc3Cf4F0`2PzznShX#9`{ zL&qMGHp-i?qJOnEYznAv{7(8g`?GB0dl^8lf`?_KFH1D5qruf^QWp2jU$q0ir0NHx z)CZX3O+Zx1>a?ZocK7it=PK?=uiGwBur)c@*3>1=#3HbXDGu1Vs7$p^(|I5VT*X)W zsE{wBYOC8+IWjgXxFBxZhWgQ|T}x 
z2YkqnzJP77|2e0B-}q1M)+7>rgDEf2yZ(t>JSKY<&@^HB*X|$w+nN95+R6mxazPbC z`q%*cLEIG{zP`!7N@#r4$OvW{D)_G4_y5i0-)}Z=!JxEEX=}O*U}gtqdt!1#!(kf! zIcZ(F*8uUZW3yt^35oR=C z`#}rr%=P%B(A^KTz^+{XH5QwSFA9DAt3vw9p8PWs|0CIJB`_X6)=gm?*z*Et$Rqo~ z78hgS=ZMD#TfD!j?ChTp@o&2=5rCQ2o8%gLKw!4y?yo`N84m$Um_F-h`^SHZ=szvr z0vk_)th-_W?0L(NP*#snq3~~IBEn6;gMJ+CY=8e}^8W5-VEM$sOl^`1rbzQ(Mx^Ec ziQxY#5wz3YfpTCfs?B`mLRE^UC-#55PX{!jo<9eM8{;yW_^zxa^f4WQr(QkA`kw-H zDZwblHQIO#_klE$rb-u~o~v<86+eCay0V{>ZO)k+|MpgAA~;}%R_!|Ojj#>-;eU3m z;?vgY(0WVbk!v2MBiy3cp9(1ehL**oVI zZhgxYP;}*Rt<4s2l}2d2q~<*{j%6z8dC0XXnC0RvKlA}Q_bO~PVX$#ES5PN?zd6^3 zN9%NNj;Z$6#Py57PSd1~;ycF1b=-^Jfrs|T>7ZOIu5ay1b3vYxt7t4!RPt&ZI*s1? z>AzY4Wk3DLlG+s_k0FR(Gt!D+=Um7NcHlTR=B-^yfDU2H{U1ewgKTEU=JltYON!fb z{I^~A3a8Yk=pQc%rSciQX>Vei?&k9Br>pt1KlCG2TC{0u9GobP-7L)_ONxBelxhZE|9G*Q4${KU#nw)ZbRVWX`83Q&2M(x3 zOZ3Y7?{{gGfto~%i=K)2tBWdffXAs4_oTPk*6;qt|uk5Cq)$VCQ2{taR z7Ee6#Y0m=HM%o9z@TW5z?@B+i-k!W-{pFXbl=CJ>QSKucN>c31+Yp$geoj$M^jCEo z$^zxW!m-TwT?f&OF_{bYK~G{Kh9B~pD}~K1vskM^*alJC{cc(TLx_m)GSv4qg`?*6 zuKmM>M48V^*8LHJGDXlqIiG>C>aDBlN|RmZPhm{d;Y+tX4m_gLv9uJ{4UqlGYS(bY zMfx}$Exa7V3Dn&E^as8(4}95qTAC*X1TH7J%;yR-wdY!@+~)nOOn0C4f5B*3MVb6& zknCxFyk+gp3w5)QYfS3T4*=19@cQvt;uM4cZbTXyg z9OQqUdR5{qXKQ6DY})7Gu+B>IoLDf4M@Z26AlQA+~hFzSZWOK*7I+*<}j;LTUTKh|Q( z`MdERzXBUCl$tHnR=h1Kj9Lx$Art)UE#mV>P}T=!3+J=Q)AfUUyo01#^-w|A2o|?3WOz}UTRCDEp#hBBqmCy*p3rwEG9?x*Hlxf*H|G-G- z!9du4B<|hUQmke(_ z^*$fJss*KTOuVm`87s8Ma5bh#F~=Xh(Khl?RJCuJR1p&O6T*{C?=|t!y4R+y;@Pf<3?oSQh&pzatAQ*)dfzT9Kdaj)FI*ywrvDIuPVag-PM*0lbzb`l>P z+q5(yOknQA;1I;J?YYpBVI7jxDs*euqFa zUs^XT#O(7-M`=tTiArs8Mq!InhB*)W8ao~bf+NNfz_YKoIy8=6u~h^S#7mmqx-IXT zZ(U8lU{9xePrVcmeH?Yno0fn9wQ>FZtnPv~+0v8EIlg?6=+3T0@SX8`qQkCLKyZ`K zP|f>GtV+X$XDdSVS_^`%$&=)dA!|92&AP)tgV4SIapL~hT@nihT06^#^y>{H@HyTG z;e(G#$9x+^gYxQK^SAo0 z!>=@}-^~OLA3@dS9H6U+tXm^dC_eNi(2Av<)4RsE@k2Tcf()}Sug^?{s8q)&>3_N1 z`d5(<`S`ut)v7l%w#CD>AToFsvo~9qO!GEde|si=$*D&fr}QKcwu>JmNoD{<;30Trb1Ev6zu-6U_#par)ab;%8d5596m zkxb~V;Tsb+ifo`wov6BbAnh7D7q2I|hHDu@2!_`9!`Na5daD 
z$TKxgvxEC6Y=rR^4iJ`aqa5vW>p5r$klnd&{qKeAF3?)%W6yF4>p1^>p_5}|pDm7w zSc-kX{#dR}Y+jg8pIYyHoCu|ESm2L^WQl?totw{`+`d1m2;VKf`HeVKN z$kvNFE>`Gu}P=fZfI4g5a65jeV&IG1zjnUf|N4+3T z(f`&ngF6(veC>f5J#!?!Y2V}9;*Z(EY ztU5+)*+@?J8)S9b(1)LWyh3~Lx~{F!6HK6Nq`XUA{WtcIktmmE$y0omX%1_}h30xM zGG36Iv<3Rro=AkeCpjK{IQje60pte=!$Rt zx}Mmnux$o)3$JNZvRlt0RIwwd@6~kiBDq*o5ArNOOnfY=tku0mwR%Zyvf3fny;_kY z0ai9$S65Ej5F1nyy&T?d*aWFxfzT9)8xpY*|*TT-E0G_+(>M^w?~{^|L>lPu%DV zsx)xaGS!l?aFwFJWQ9lt4!i7q?baK*r8TSg`NSb0W#jA@v0wwLl%89-9~RH3LblnH z&%N8OUOKRJMFg-Ls6*PnB|SmDc0ha|6VWUxh2Fmk@oIF<3{;{>90Rh@TY=WO?QNyC0yH|Q!1FBoMgM_q{_u2}Wlz#8#n~gJ$0YJI-Xf%$ zuIi*6>`3EGCZ!d)H-b&)<;1+Oi1AFz2*IRQeg8R($@It^pFv6a_ACMmW7xvU01YoKQ7bGg46oVzcbBw4 ztOY7U=a25*ln9?CJV=G$I`>zjt$0DKnCG4^kfvO}&pkt?K7Q^`a}P-S1?^woXg zE5@?3zo*AQ#rR6)L5hieuZm8k14oe7i&??RCqWsrkuG|ZCI)dBbet(C9Rk@)XKlN# zip!hY3n9pe^gZqC3IbAklg52;1P+Isi$*rdZv8Zb`CT>GXe6D*l)F|O`_vsv8jpj- zjs4xoaehzzaRq)qN;udLB459r;_AYDFtx?V)6ymSY~}{fb0$k_qgC>+ zE+q0oS@G>YXIS(`&WI?p)3E5c)5ONw`|7I6%`&;v(j(`qXnsi(pxd|MqGo^_VN`fY zwT+q1x}5f8lHR=QZ~}@qnx*uiK#x)WnP-zs*Pl}AIO)f6aEm*fWf9+tlViIA_0&^o znH)Q1cvvsp7XCvHAMcEf zI1;mxn@gaU!)_e^fzp~iaXZban;#nikG_*aL*8#iwCuw<;A!`r!~$QnUUKzESh&k`;iC{hG$t9lU6)NrDjgor zu*{a*-CDH~9t*JEy&w|C%8a)5bZwmr(Z(W--h0$mg2siL5tzAF%*Xd^d8hdq83Hy! z#T444=Ee=y8}T&+t+(7o5Az5{P@t|Uj*oa-YIlfduA>lgAhudw+>m{jT`h2^Me=}N z9X2oE4^#lEVR?*Wl^a)FFB<)mj$or>jvLW$;^gLF+lgCd~qRmuud$Q9bWZ<>U zVlGt@CYOjUA7KA1(>1UEeav5Z)=B7gH(F><*s5`L=pk$KZJu)??!sftvhG2I3rJ^Ik6(mp3;PJ8C5o))l?S*`?D!*0hZ z4h1ZcZ%t0#TgSN{T_UCTVm_BlE56x~;oER-%xtu}NJrgIjDZ%nLGe?TP6~5_@{%mi zB|TjeeQdJX9b$$BpIToM*S%$7onb^6(?zMHOR;wdZ|3N-Zyh}A07h8W=6O;k!JiZVDPY&TbE!lBslcbjW0W(46@|)k$gkSK>`m?HvmHnh>}eBS&b&b zZRVz-D|=qHlvv(Sft(E8j#4|SMfs8nLp<05s02La+>4GkoS;xKa?Sy)T#r~%t^tR! 
zOQ_$br>NST>F?h4lke$nH_X+mY0Z+)D5nLwg)$?T891NIZ0pB(j(pEju1uc8wM}b@ z!AZQd$9e1U9~3YU!fRWByjlo@s8Id1ImbkS9!vED7LsUEYR>M*>BoxKB3cQJ)YF{H zFXmVGSU+-Ip{$SFBSUm^l;F!^pCQstZ$PhaUmKUz@Y(&t0Gu>1X4F;S*$aTV`iJzX zFs@yf!@P>p_mpUfm?oMaU@Wb1v*kyl;P2d1`+JzM5BFfJ&7Y-IiuH(OKKq$=`E@JA z|4=mdY;Z9WEwKI*^`TTxG&fv-kTN%!gXALG$(R!Ab5qh;_WUfCd2@&Lcqevm7`=>uHlLLVuzF{gN#w&^y ziUmEVnR;vOCxD)lMn-=1OYU+q#3_lg|4sZ~6mZ|yII4V#sQ8(uPda}XE`jmrCrN)%qKINd-T2DFXATG34w(7CG ztiU{bwwWq64Qh={^)1PFlUec6C+`Fmb|C3klM|zq4z-?CB1s%)9~{8n%8KQMi`W@4 zsLIi|Brs$cf0>Sst+iZNzO`t^`WpFoIqXs+t6JG_qTi`+c|KhnVT!nT7iMbFDs6^^ z(wtK#1S@MP#6JmiNVqRZv+L%k_^IEfp|5#mIp{k9Lj>Q;8AbkmO%yg(3Xs^JwjAAj z5z7*9;L0Wh8W1@zO=54kjGP_pe0CzH$@cwm!d^t>lyz(nB`}myfpB~7j?0$ zX)Q>S+Nv~Uu4vbGsH>G37w2>eqvXKOcqvq+N}+>Tt7W~i+*qRr)NvD-nEG*VKT{hQ zgNkoI==yr|3+Bl)YB1-03X3jML+fmA}hu5}TdI#-p)1 zPX+Pru-vr&6CUtm0kJM5^iaQNBd(yyiC4a(i6LOA5Jt>O*ucHhAgU!(16d?Q4>hGDQxwTDxxbkJRch4{9T zSa2rK`kZ?Qo7p;eZpsBMWn7s-Z&j7J?{O(7O?7f-It$Mx;B!W-K-dHD?qI|tHM8-J z+qc-(Ijd#tMc3W<(`Y-UZh^iB@b}3Be`6Mis z@=_UJ<^i?d>65PducQjtnJ)01mxu-5lbK90e81EkBdcHK?hL7ZeW;eKq|Hk1`El{Z zsYwB|ySo4wwNVcZ3?9u)^JYSz~xo%)Kz5c6kuJMKkkkO7mA&5Eq+ z^iOfi!F#{*(nX7ULl1LM?^V#saJbddMgONgBs90vQq#8CAYNFXc<7(G^K_vxj3y7K zc}CB>eWujQjrUrdi;20?wm?~CQPV@PC1;sK$qc$hl%-1YYZmQ0gdQ3ji=5uP*nIG* z$U0pqD+9|HKLJ;F>)hS8C*Ugn-@x_HN_0oWB&$BnmxF4+b7Jk%@Jk6B6*S+zn+*^Y zr3>pRAnvBjZ7XD#4coXu)U3y&fbZE7$Ruald8Zgi;m9&sxee#k5%)i690k5+=E|@% z4+|GSTk?Qv!xa@Q8XF%?A5mr&!h74SPKs_8z@yV+iN`CP*`Ug;FJVp@*yo4)c-t58 zv&-@lk<1*Uof4fGF%A_R6>k>Uv%oLSr6ZQJBSeyraO8jb4;lI**)H^KJKdeW7=%F6 zT%usZKw+9YRL1x02ptLLwbW7=rOXpZ%24oGedxSjt&V!Fst1uR^m}#bb6OvQ-^UNL zTj9B`DZ4(&Z5cm=BFT?#zX*+bv2701^-=UeZnVM&65yrY!mwfPF|R@N3B(PLbh?a{ zy=KhQXw8ErUo@Y6H6WBsefu9n$$xO;t4@ln7GP{E^Nws339nie#WT%jsgaJjLq7HW3zLfa3Wfv zl;{yX{shrkxR-fZ397=*a18z_N+p(YItRN7KfgfJPTh)+8cg;#r3Jnx+7RmO4UUz zK!y$j*}I}aPM8HCGBQ(~^iGx8oHY4_xKwYm6NsZ_HEw0A~_SsaRMXguggq(d|^Y=^8tEZq6xy1j9qd-|UissLdZx zLVie{`Y>g!%&P_1i|2caIY)UzI2qtPxnYYJsOB+eL z_{zMIo%WO0rUJT}xNf7a5p^fmy;5X`!e3TR_~_l9 
zj&`BDP0X%8`Yom>Ofzju&|zMBbJ{EStD(;TeD$4DhjXuOX4dm|@^Xhmc6CTE*AT@T zydN)Azlt!0-)b@XvD^_0_5I~_Hj`8K4&Lq5`0c(R zG9x=z>`6<;sR*hCul6^-b==e9ZivAa2?;Dsi#RS2AW~kn_h*EE^AG zNyr<$H~FZDr0MOeUxw|vRfziBNpqxe@)V_Lq2mx{$if1YfXUd)8~xNR@Z~|2x1E=% z2k#(3K2s?rxmVu7fDzcbF1RRLY26HT(e3n71M@wo}g1qI)&p8wpTSk z*{thk;dbDUC@sLrmH*qk2P)_01esXp1XY_w0kC!n1KFE%O02U@2oL_+Hf}*txvAB9 z+-HrvvN5#H>()-l`|poNPo!cx=knSeZNUS~d5POq-yqD}Y5I0`Zu>2vxK&#L(GshC zIL70SHAilGJEXAm9Jk})M7)vmh*ypphzlzpPs(I`qOU(8v zgaw)z6NaW2JW+iVAr|WmBd@YyH9-^RbPL>=id>b7ryyG}RK8K8*F*L0-n$nXRZ&#p zmwk7Y(J8l*6nvXI6cdC0!mE5(9-w5`L6PkhheDd9gW2;p=sr=K)N0@>J6+=dY!QcDu4I`f!sAVOk9qZ&G8DOtD&*E;HwwIFeRkg7mfVf{7iVZL8q4vPTW(j0eu2fbuq0R$<|pTEqmtpx-aKAY z7Q;Iu`CCj1eaKC+DTyYXVhbSmv@|3*yRUUCOBw~G1*rty1U24nwz?1o<)VUN3$jzI ze}@sjO@mHPh4}$X9*-g8;@yV4>=QJ3(;c3MC(uP^3P3Jg<-fNfA4;qT+%xcH{KVpq z`0+AD4iXg`@v80GVZGke+XCkwDVQXvWSl=blwhQJh>nPE9ZkCJ;UlLH&oXzevkbeJ;+$)Kx^_;sc9=i)FN{a5{R z1vMutudmm>ccXgix0j!9Z%qDpQRIP_nMqN$s^+>gP?W`si0+i>uVB4@LDzQeO5@ZA zl)Ho=?%@b?KaaI8f*vu9@u3mp4QirAV<+*4P9e-A@beZcW~h}L%P}R2Ee>LBj9JvG zB#=m_M};4-baCtC6;Jaq$a7GG!vxe8F{&^|x2oP_5E$#Z^#jyp3BUUzx#}Qcs7L6P)cx5Q01x}Q~s?=KL?80>)-BNi_pxo&tnqL1PzUX80uvGY+uZLf$hvdsC zOYcyb1;xoqSNG3HxsA2;>nN7Di!MEu)8o9qa}1oX-`{?A?U{{0%fMHFix1a+9#;ITC89&XV;bP{DMm+8yr zD0xBhH)_xy`|I#wR_`daNSuFxgSYvcz8BHQ3=7>B4ZT4(dBglV-yb8$=#nh*xnJw= z|4n=8y5%i#^8&P{SCq)Y)DW#KW%}6ff(D;7&8C}B53X*+pwkCvQt&HSFW1`;zOL$) zxzD*aCkMP1F>iDH>!)TTcy&jZ)-Rrea*Kmj*+aGJ^qIf4uS9PWZV|2%yr-G-oRNH` z<8490yVSR+w#d{|^>s+3uhnr@E|u2?QTlc9^G#hKTM55*8`NQQ#RG{X`AD~U?Xn^h z6;BmG*Dj(~XgdAOGO1eLfa=JRY>50$Uoe9h*L^n&xqp+cCg7Z#r@#*%oT~>AH-Y(aA7HoFgVR$clXoeX=b&jI?PRmhPDUuqA|&C$QV(F zSIiZzoT&;i7WwGQyuB$3x<9i&eBx=I;s2YbS?~e%+4NM>4#MlteWU#eL9Ya}bajMv zI5~Jd@)V<#dh)mtCU{vj*YmscE%&S@ag-Uiwl%6l5#=+O4!DB`g<(_1rhOe5^@NT> z_#7M;Lv3h?NrVCCx3=x}6kq6lRiWJK2UVe8e`g!D;H0&gX6|VXoZYgOU9SGjt+G^b zs4lerVD3b%i&wK*WOHkUyAoIJy<5Ka>y3AwFG-_l&bN}D8O*J+5>tb=kMp*FsI^uo z)n#B2*mk5h@W)*wm_OPKd>b_HuW1dM1+~Z($dg}zC5sBlo6C!uZc193 zS5s6-$G 
zyp95PGbHp9>Ld*wRnpQpLx&P#aJN&)7~4LlBTIK5I;ZyHb{V(K=IZ!O-Po1#7}bLj zzTlkacd=vn`lAtl_W>Q$nEe|nAx5t#4cN`(m|qP{wy3_{kJ}r9lo_MS7aqW4O6eL= z>8&&NWEfYx^Nz!eD%C0t;kryME!tFzISLBH$ZrM-^q4#5 zKmC|;dnAN%R|*n^*YAekfrl4AHW~^&$|KWa_Qp~xb2F3!La|#H_hme8+~kO9c5w(T zo#WGPbPTI~RDYFi2Zpfh{0|!t5CYa9Ebdt?y0(cNOcF+#hSo@3qLD_X#~8TK-|bD# zsrO)&B1EB4goT!eiFdE5Oy5l5zK!41Wug*H1YiKtizf=mRC5%8%#2&pgv(K#9&fR` zdO@<0ZmQb(qtk$2%Kj$X0e$P8wr+Vi;q|*IVjhC4ROjk`# z^V=oGi*f)p-gq5{v&Mqn?Y*=wlP$RGqJ@$*ikpm-)s^QguELszMPlwRLY)@^Hge6* zg7nuF(lsG_z%xTDpW-RXj|(YEC_GXSq`EbOal;RM4rkS(b4%W4;r(2mg`_0Du@ca! ze(^0Q%5kjVA;{-_k*bRyPT%^Whr^=9N+u0EQ(zRcpyYacNR9AGMUileO^qNhh)QXiEo0Uk60TH!JfHH@xGS zOWnuY3XZbPYhI$rK;8=x&HZW`YOlIh`MX^b?&&_!0r>-A~Z4vJG}Tss4~od7q-wkmwYuNQV=e z+7HmmXeqr-Q_*Kvw>xnw(OmI>u1(7Q0X2CcaOy{YF(mii(d4VMe`RT7SuA4s5`a_@ z1@5scKJRB_YOCmNhC1~5$HpBMzn13lR8(D6-5-gZdW5?Ya z_C*SH%>HVdMCGBp+Cl+~X0>11k4P+LFd9uzfvChT2@%3!cZY{`O8aR1uIn#P`Q>Qa305O;K3f)lQO7lpx7r|}k5u{8PB}@D zj8t%>r^8|K>J?(r#NxMfv|A6FAXnQ=M6){XRhRP4RBdv7p#(?Pi_zW(%=oPXB@~of zj|J+r;i~dVBql`s3@e;GOVD>zFpfk<@S4MhTBZ!aq@99ic}})pMjxF}{8wYk)s6!h zo`c!}NX>aLRe7f3!sA5(UHb9j`7d^+gP{>AZjgbdd?sJNW=v0# zeW>PJi~sBGv_ff2P^SP+Wd=2)kL?jc)#7V$Vrv_o%UW7i^YGBbJ8?@BJ5bFPM7R7I z@eM=)PjahL-2f>nam6-MjY$AbF_bw#zLD%h@cMSI_PEfye`b}kWxSxLkks) zFmi(>89E~{gM6{P*YCDDdWTPT1XOZ6(>e}#U-BG6LS`|>sDllqn+0(DbV z3d$n@dL^?aG#U+Lh1^+qvqo+<(k&l!->$>@oRWAkWGs7>O=H4P$$R_8x{yCdD1Wd9 zI;wDj0B;8G&ljDQ`Z+bf$VO9nBg!fOyKJsVjCb!0M*rW)KJU@;k9K)XbYVmLp@nVw zXxcKab^@cR?RdhYZ7IIPrnnk-`IJYp@m6lfABLTKA2D#FR;ovYgaXb&zZI)bt%KK~ z2k0{v;via0B$#NRAr{;R&t*-hTlQXTdfIt6$?;9YmWJdX!dcgx^ma4t$y1rqAjblk z(t-;zJk+OJOdH_9`B|C!PwkyIFvv%s4p`uNt8WqQ0qLFpamqqt`N^@M$tr5#-hrhb zMe@0DH3lq2FAJ~4jw)GhsZzt`@b1NST#ISuO7y@1#f73LrPRXV_~`4iw-~bOy${p0 z;xR|RBANQ8`w#Kb;zuPIS`r?uJ44D5jxk1-WZU?VT7vr%dUsXi@c8HL74qvJ9BoXd zSV6nN75y(|>@bD;i^-WYJoc7>S54t8jFuMSRn5S}*aY@ItGy%(@xIMJAJ@*vpo_t6 z5zp-tg+_q{Txmd^950hQd1T>d?X;?)%8BEx z#zq6QVQD_5@63LwH=WUY)D}1imR3stmc^!Ft;d-xIGck6CHy!@v#)0GU+ 
zXv%)t^b1~G;SX3d-Au8+e3acX5qi-DHpEvKK>2(t(lNP?vu61PfvF!{K$l^J!hKKp zOp$DkOhNn+MOs9k;DGiHl&k?ekEV9dU|auHRsFZKB3Y z+*g)=@6`~yMY|;(oDqCSv|lN6`N0v1o^9Ltsfu|m&0`$8#jgFA@P-mUe1Bmb$HLU7 zPtvgF^=QRau$}-`kySA{{6}*;0yMXPEk4pP6iR7V0Uhj(LyTEyjQ4O!W-H>PwT!3q zfOkQr?P(Kt-)!0o;((*vr*r}p6?R8Mlwp{a`z(()x=N_)%!d&r-jTblvyGNlQ%`ZC zu85yC9k6J9BxPscIC^)zx52p{9#O`B(EPz3hm1XQV+OnD-r@QftS?}fHy{gMV%1j> z49rp2y~W5wYRbXOOPkYeARepb1zDZ{=l1>M{C~zy6mZ)*c*dG?Weomp*iTvDk)oUS z`c{mY$7OxuWD>udJ~ci|n?0kHq9b+xW8b$o3Zz7v)yLqF*SAi>t#xaDw@vkS8&jr^ z7IxuAmaO>jlHc2+z=`U?XanXg3xsD2Q}ykPULkp>6G{8*0A{&B$6(v+9w`#B;%Fa} zp+d0Z+EKkk;6YwRFt3O;eEhlSdkh9vkdJ`y4Boy9k9iVKYIJWemqK_sDI(icQauvR zJF6Zv9bR_<0PU1!(7vkxfI*Yl`Bd?S8X}--XXTO+&zKJOu9vNbP-pd0I635g|*f62c`67Ia#M6>OvP~IYO}aPOwFI?j%R(by1T1}&ZLSd^G3dDJ9@nP3L{_`5`l4o zi)^Z1=4B<~3(_P6mMdb`6j#uW;x2}-V!Ch^nW{>WXxg4?)DL!jN3#zNuzN>xN~=Z1 zu6(4{sjs(8I_J%xo}DciXoPLDZz2nL_IMH_XFZUv9%)FvfWy5U@O4ZK*&D9&n{h>1 z_I9V9K+5?*{>JaQmnvbK0R;pjy(|-l7e$eS6E-fl3o>v|&ssL-IQN2-HIZ`)t$1;d zn-bBF(meJhB}R>J-f5YMlb3u$%qXYY30nXCk@1J5RCxwr(@2G}j>M*Ug1}D^pm%#> z;xH{;7XnW!eCEsuTC?8mtRul8#^TrWt^nRj@NA9qycUZR1@lXeLVvksZLU9kjUZ&Y zo}ZhJ_5lIEKukz~^US+lWvZ*Xl)mG$i?XfNTj`9k;l!NQ6RzxCV&e_9h;8#YsjIx! 
z`SHoG_B8am@guUu>xR*RzP%#!?c2(_J=31;XUBt6jpXLJHicr@tgds1C$$qgO5aLW>HBKhm`q06~trnai4*x-3{`=+i{6Uh^e#7E#tisEFMccv5D{l@1yPAj|^0?ZG- z$Yxu(r1JV9&~j-G5ma|-63N^$dbh&>jXL-&I7P$nSle#lI)g`b7+wATz~Vnf=J&9f zrDZQ=6Vv3KOXo&^wPr774hWiof{{7{8s_3`HTRU>}D1KE_2>26YaZJ57kXv8a{ zx3GtsVtoq(gdHNN*jQbQ>F(5+oYne)&c=`DZJG@M65;M%u7fu1(~N=Bf};bOK<#AT zUvS?)a-rdih)MHGWVKto6lUpDNV!Qv^lu%r)BRu@mwa7{9 zB$Tm9*P$aUl=WE`za4pY!q%lpdZ~LozRec7pWe8s7FnujrU|r_V81In(&)s_-q}p-(8Yu79J0 z4U%7W@T+r}2&*a&e*wb*C~IYuWqZOUkceVukK$iP6LuS&BgL4KY4@4YF*ofO7 zq$&WBSTK9-;rp8yWjwF0Z2V4}c@vHB%LilAEtD}h-VPgq#Mr^~@7*owi-Az_$vlhm zzi9Mz1zJXw$m%@9o4G3TR_uJj!Q{h$L3ZP}a2~=^BsoOjyFP{DDi zd6NZ^z1$L0hyg4Sy2Kb&tiEQmJ& zN4%zDTm3XoQp)i`Z}BofG-ZqLSP-1L+0=8s<10yD#~%b%D^W;N<1B`|@3#;zTm(vG zVbWPOg&oQ)2ihiY(`GKv*(D^q?|#XP@GoPZJYEUj?ZpeaEe%q4Gp{6s-s9s-pBU6P zl)0)bgdgVN%B!y4=GXi{P_3oO+Pd`lw(M@V8RNcTsyAec`mLrA7vq%#P|%`F@`MaZ zAUw1Jpz9iESt{zt|5pUgV?7&@a@B~GaOf`!a1{;sF2h|w6IuKl%cq}IJ0a@&<%A_j zsErUswUUNFWuO?5h^U01KSi@4z`81|i|%#9#=HLO`VDj?8gOg`&9ou^ zne!Kwc{sHwQsa7)pMpDC6N>>4@g^zDOPl$1{8}t0x*CvY;Ka9r3(qpR$^cR}WQ^dT zg7C$@y&ZjTHQe!)^{Tbysho+-!r67Dp9eFoDW=uofMy;dsJsnCtrLnQ^am9Ob z+LJrPU2!_e1uQb>hAH-2`mLwVkoMw~2-mkRXeVhI1OfSOvKV(m{j387-s1M&h^niG z$70*Vtedz{+&-~8?7pQ>M73z$4M}x?$ z0XzxJwt(vvTS*hTyGMs$n6O&mBA^D5tEV#7Jx55ndXW`WK-)#d4zidTtDOm&f%5Lz z@j|V55w^A#Tm-@cOm-BYBh#XtvoA5sZ%`Qcrnn^|lH5(k8PrJ+IIv;X?~UAkI?MF^ zSRl2Y^>v6o8WO2oJ;}qT0lz9?k^x(WhsHX-gN4%E6Ec)6_<7fjD5$nl=SVW9b?GN< zpj6^v?6|CXTPc44v#w9&u*OQ3Q%XKJ4h#Inlz@i;J6we9c?&e%*sj!~jWz0$dgV@p zz#_@GHZN!krx|HjuH2zS3O*))S{g+vKx5lR{HlM1uABX^b2ANQ-sID0QrFKy(53Ff z464RQbukp7NCWtqk;K-A>e#1M2JX)E+JX5wX>_X5%ABlFpHN>E@Zp(sS+(dCd$`x{ z4fyO8eHgxESFI!A#XY+E?{M>z8>fqc=VErrJh9f{e@y}0;-3;}hGu3!7uqL4oq&nq zaEb;iX`Y42w=G#BW&#-s0_sFJ<9A7^vJAq;PA*d|Ah#n0$u;XjNOOq}tLAlpI|DNc z4bmH*hPZ60g9Ct9;BAwQatM3n_`9$zg+@9^q}f)icf@*nhRrPOg&L$+l(UqjZ>Z~@ zS!F3>*?jHSsR%fXWX|RS1uXDuoD9@VbU%jJlm9ThEX(z#o|J)bHQ8Y}uG=_ck6F{79 z>Dyi>Ow&0livj)~6`ZaZo;xQ8M#E-NCxUCPF{MKsDFgzXA&Sv&K5e>5O2YYEU=p3P 
zLxA=$4g{*eSWV!|^Z-raE~r>8r}AEBF7+%-CHI{AjYD2CWL<^lLezPx5bH(F#?lNc zg0_7W@Eu13mR&q5cxkG1bLIVcZQa6RRSOh9XtXvVMw=;h?aRuZd5OGtsUeQyVVF^@ zQ8{lsV~qK&TWV^AkTGkR!(x>z&fd*PrRxH<#C(2GSfKPE(ylQ4FonuxtY-Pd;>2?a zs9V#Q>v1yJ|No;k9xP>;yb*T2KVs*5#t?&I<)&6D z=4X%@R2{V{r<$DNUp!mEEWF4>Us5Opr9yGSUFCp;+ZsT6Pq)H9=14z$toY4RV0|gr z(ISDUI=_ORV$cgc zi%R4sBurR92la4k)6W6@v**21x^&p$XUGN7GJ0&C6j8K(sITNdTpWbwqV>P41}xyI zzZM@5xP@VNLaI;alat1_pX8~45Dhi|?cKw5zH#MOeZwd*&j$aC3q5A|SYBJrO3QX6 zg*=_YU%;WP9{Q3+K9+aXJ*vmVuatebyZ`U&JD3@sY}qn3S*N~@=(m2m#VYzZd~rB~ zAY*%=!E>C}wNNpj_;&lo-IC}%ywzq^jSs$ZVphMk(W6OtOu(nQYTdF21e>MpUcj>rb}KcA{38Yd&uF+gErssN z5TywKrsnid@_($$+7IJ(R zXqY`|=eywusXyHUKRM0DsneY%t^>?+(%+O!)?031Y}l1|_w-SJTU<%vD0oV}Gz`iU z(veVWJ*vitWWHY$Oc>_5k8&@~wUl@_kVhk*k%7*Sp}*fetN7mHaMxlQ(>_X`*s+CT z8gp%#)w}%F?Gj!O$nx!j+un`9a0}&Sy8j*>1=ODpgKr_dJ1Hga2YRt$V7&zL0#1{i z4K{z}B4W}-y7J5(xGJaJFNStq6Z#_UE)13qVzMElpK)AqMmhCM3PvNX{W1=k_Je{S zw8X0W4$%-!<>?$H<1%kZ#&+W|#ate~SoOiy|JaZh=le#{f4LmSDp;b-c!raIa_*CT z2|v93wfwXu?MmCuVcNl-nY64neZPO~_ps-6zjPQy{D6uJD^WF~(mU&}E~?<9aRsi* z=Yiz-WKcU>lU*H0JM8JlwIGNe!Iv!esI@{wc~_HNzgNq=I1)5rcM1FIoxJsjNs?uX zF*?SE@=Kv2W2E0O*YECVi>Y?*Sn@Z+G=f9)L>lopI+0by>5Xa*e;7qXG zg{huQTs1C%L_483XhnPCF@rsI$CrK?2yyhIx{NIru5*S{#;?3vKGBL@Z4?MKUw^BxS_kg zFbGO?#*>U924UC#u555DmGruMBg*%;J6D~rtAPRI5v&v(V&9a*p-_8n0Av!MY7@-I zO88(3!FZ4hPf5Lce*`7)tBh0aFFGIb*-XQWGaF4(o1Sm{(NB2Zoys`9gx)#*_S3JG z-6dK#VSwgkS3@w;-Sh2H`VhGXv#s;zuvjX)1OGep>vV?Hh(}2b>7Xvr- zr`9Ofal_|nowHelI#-Jae5WtV+4-=z7%miF;JyD~Tfz^F0uxvp_iq1p3Y}Hgx376` zm2N(}*^K8Dx@}J=E{Oz`{@EU5*l`pkoi0^?zttW_sO-c-&s-}3bYSSC#eUs+0CpkLs)2|D|7swcT2ehB z-pKp6Y)NhuC$VBd->HXtKwdY9$bDOA&A75jjv=R6s}qf3wsS_!bv%`vKnvsZ5)GK2 zb)Y=$!UzVURSt;`vo1tOVL4j3L==z%lk(sANHPF(-}D($Py!wVlf$C)WLbCJ4e92n0&~M?KB0tylet%L~WdJtz1T|d}#BSq4-GSy^rnADb zSl)X*Vp~AoCfV5@@3aCU%CM{xDk`{9jr6g=3|l+Iima8tOF#?WYc^CGVl~4-E7|9# zvCEbM%+*0SJ{yxRL^wSget6#o74|j=knMLb#x_*gd^-;&!~8S_IQLw5u*S2=vEap5 zQU&1y@s(%1VB`l5V6Z?D#aKYdJ79ZkhOfG>Z+^qM#^KX zkX-j&3GIg|wE{|KB04G4wHuVszOq7V_{9ULaC57D%p=Iyss 
za&*j;|0Z%Iz3u31W8O+@tataW{!AAORR{(~+6EWo^@YC;RxiuQv}h@FrpQ^_%L$7Z z5jLCu%TW8nYy7#(%ZH15#NGWa|7|9}lS@GLeC;=N)%advmCcIEYr* zh*@muR-c9iHjv<%KRN=VhA9)1uzI3=O38^!hyjv1a`#2@Z4=ng+reQrHs9A1rXmcM zA>DI#6?v3eul6FCOx)S1`f}zT5T$g?b6S+v=&02#-^^{nPbGePCdOe_R@gIkRQ~t1i;BkqWJB+t{aSM%9(hvXb2V=7QKCHm_1!+ z^-#2luT>rQmkdgbUY{!MeVJCajQD9AnalM7-0UwL!y)OV4m3ErhRHRS+9`)f6q;K1 zk6rq@inFD-yQ>J5zi)X>;Cf!OOxqH@tEs|KrmrE|oz$|u!%7_4=Td5dOD11||9Yx( zvJ=;}0)a!~5Ze((b|_&Zu)mB`VBSH|j=wEUg`Z)Y7@N?}A@<&dZez-0S!=pwI7GA_ z;ak=0pr7BCJ8|M89Khjar$TMWcMe#vJ8h*n?0lpgBN4H_0rx8T(yzbI5*CF_ZhZ42 zyU?-k?g+{VuXxe^CR5ty79twniJ&^1N%e=uSs1kfl$Emg^^(NrKx>m3x>lD)F zYf>lPwagf*x%Od-JlDPdv9NwL(!-6?3DMz9HKXc)gR76 zW2AQ0=d4FRPDi8S$oH8p%Q;NDU-rLfve$GTO~^hyLC*JU!x!!QC4omLbtR+Hz=CH{ z{YaTv=h$`Ef*rwFn@eRyFEH=X^56WF9vI~`AQ#}N4qtU0jfA)#QN;CyC@IM5$?8%{ z7#j6${qX``#LODfXUUz`;!mf_q$7 z#ts z7DL2oo<$+dW8{0`iU(T;x=kPB^CXFE!lM|9Xo|@M^F|Hn_3bU}Kj6A-;{K8WgHfyn;=2*B$ShhU zXoOzY(@Sv(jx9txQfJa<*$XA@kIkU4FUx;m%^+%9LWlWT`AM+eYmj-rg~q@MnZ&kBIn;CViBB{N{w> z7DjPdripBC!~Y1VqK0HF=^YN5BQ{_K?LRMB#hFx={AJgjVjjtZ&XRV;%0d4N;LjEjNZ&41GUmzHyQ-P!iMim@9N6kqbnXwy@Dj; zd(PwRzPV=j;c7$H%kI)3-6g8Fin@ z{GP>t*{=8om7>*v3rIIdv*aKbOz@1$o+-lRw}7VtU;d!iua>I4?=Ys`vEcrlY24=; zq~MZ_FSK^%E&p%f0Vrb^j&Ozs6S`DTWgb7gWajclll))Iy=7e0UAHz$cS<)1f~0ge zg0xD5bT^Aq=?0M&>5}f2Zj^?Fv@}SIgmm-H<$XV~pLg$b_W5?cto0LQt@$5w&Jov$ z>wOykF3Vf$Ggr(?(Jr#`ZQy8g9}JUhUn{|J3a8#7K9Xja9IRi8LcmhJhFE#TYX&B;i9nQ0G*k7{a{gU7%x!H10DZ?bDlEKR(E=q3ye z?tZD}H2GyN_-2Qh*|SwxGX*Nx<8f|Ci^WC~7DqtgzKmkcMCMDX}boVM!o-Z|VmKk4p;?6D4{aVyPa-u!Neu^35yRrjQdc4mL*c@4v65pV_jxMHRUw-W2J>N~c8=Mp`dJrM zAyCuix$QibpwoDhwJ7*obg0X#zVz%=sQU*5*H$DeBxU{QjTKdUoqDs_dV&-)PJuka zJDI)b4-D+7Rwg%6M|`KhnApzg8b3Iv_n^KkiQeZ@v*F+ueZacJf2XxOFZajC43I+q3GjNkase= zKYCBztNE+}B?ATP)&o_wR0OYgfZ=*x?KuU6pWHZ_bIehkY*X>@>-CxaLe&wqWp|xq zm23mi!D#K+R3GO{E3>)+FNj~yST%e0K05U64bJ+v?`Sj0Dhx{$=OZtQ?P42(_S=-I zJo2?v@G+FI?em$07aC_RYpWtSq&EkhK|{CoHBD!-d%A}g4X1ZMe=}ZX4c;{4 zPfN*Knrd`SAR1ZaFmi5WgvA^D_a|irioS5(n|iKX#`vsV_)kaRl+CDRDtyY&p7pQI 
zZZoV2%Rzcaw_b^rq^1kGrPScl{>}~tpZ{tm`GDyOq#jLGBh&)7D|Phrz60{19)-4M zg{{GT7>w?8|FGs7$aB|<9bTto%$_$92{6{)-33&K#Evnubc{)P;xrTY3M}EcV+0a8Yjn^ z@5u*cugV{|bOnp`f@)zpfmCJFkyuWx;F^`;`D_2nW&5>sNUh^g%c~@+40`9P0vH_6z-BN6U;Jvem&ZSUV+@v6~7TH1w>ocTXse4&CC%@YdZyBV!ktL19z5 z8EB-hh^NdVNaw=JG>(kA>~2gHp_myeCI9$A7j5EKa9xtBd2>tiYIc`^j#I|69dSB7 z?bTH#tzZjgbA4wJ?xxA-gv;dD{`)^#r=6msh>}NXHI^!xpkOq{+pg2}1Wr_Gdf{Fd zJ5@;U=|?+p8og>=2tQthU;$~B#PxCS6n32i%MMqZ7A+_*9A9I%p8?g`CTVfoq%ncD z+sS>_9hk6iV$~YaO1bF%LAPv0PeEV>V|Tuz{M_ixlVKYzY*`eY{C7C$fjot-!#t}c z*i#H>50ks@5C-nm^D$gXZ?V%j0!c$JH&D~LvU#P*jtk3Wsq2_th=mJvH+tBkLS!X3 z-5O^bKDw$8igMvT%)(nlUWgo5i30`rkqbNq@WpRc7;_4E7>Moq_)BhjcbI((!;3rJwn+;g6S@i__d`p^p1&EcCV(jnLD%oj zW(`;s(a3PF2eet_0&Z9d`DuHKH?Lr66)uogAw+XKwhG`uH8$b9zn!QOF6bN>@}Ip& zN@j|@$AG-soos>hKlpmg(|4MZ9S9PjqZg8a*-1_hy{jd}ul1h;8v-iJ&vFZ~i`JU& zE6ZMZvZ8_6s2azix~L;u`VwS6pKtAY72!RD^j38(OmhjpOnDVIaMd*Jg=j_dWrGi- zHEQs8=NtS3^j|q}C~bK*!EL!xd6Ia1bL?prL(1L)IrUIQa9$co?Su@x-GoPboBMxo zp5h~Tt!{aRoM+5jjW}~zy?QSA1v*r+xrJoqoq?>8Tsxsf^N7+u5S8!DB=k56{P?CV zB>$!X*}@H5BKZB-DPj?6ImORdpVqouP8K{+*Wa7nSYvma-r_Qcar=oVo#={r93D0= zhr^Si5gueLSiG^bzG$ZW-& z);gQ@{lIM#U6W<6lPcP7Bt(@e>&h=$1*fuy;dJy9Ig`wTS+rrt-q2EReWYO9Ts+SK zq!E7oUg;;oc-xM)iFI6;I8eS&^(FMMIepPRs~^6hf;Fb*N(`%3bwbf8Ya0_+&D0+C zZykiIqgTX})3z|yWz60Zz(0_&6|Hny)B#oQu@lvg4WJKx*tg*QaFeKzeJ4*seN0%b z_L!oy4__nU%Vtv4<(tq?kbv5UjDFA0JEy+GsS{~ovB%)4HD9BBN*}oJ_?1OgkHKCR znjSH_N=|_sJFs^DeuO83VR-gS?++ZBvkKb}&b3@j_z^UC@423aS72{OgU%0n(E@L%kjX^_-lVP#t4rMq%@OvGZpPSg&R9`!%d0GP zrH*jAq{+l~&?`UG-+C~}7c=FH4!xiaT8YdO2t`@Kgqy_3@ht_qNE`7e{ zAZ_Jdbjm`ojY24GhXs0f`TIPV9?VKDUF*i<4KGC2l&Y@J%GWAgZfHNtWfj>-9DT+Q(5m-K}rL$ZyDp;bho>u;s0rOBRalU-ZNzu&4} zsu)<)F>w*_k7pu0vWi@arrWMDv$%g0K{Qgqzj@Gz(8e#o9jxsUvQ%XxEL+4R6XVV2 zsG}>nLITIe?`XT%(Uu`Y`&x9ZY^LeZT~xp|)9jNtQWDhe@>m>V7__R66iyNA$6k33 zuvN)r8_(8xLbUVE(TeP0yUjA+j-z#nf7AllxIZkjI$i9!o*1$65~tLQVH1k<4($ke zcRl_w%fiX$N(2>Hi>|XhO2kTaTqviw_wNRe>&(Wro0@(ZE#mCd@J%|zfDEf4yW34~ zwvK{mWw_2YnIO>ChBL8hAA!E^-g@kAXg5{o9+~!yc=q>}_u`A|+z-?QHpty=ujQ`p 
zx_x|Z)5Bh^e*b0ErE3puncvp~O3spvyemaQvR_s`PX=2Oo=Fh%^cuv~a0J<5&1wPi zPIrvCo=+AnZ+?wq_x?=xS<%9n3c}DYTDON^rrr0j2x|wmYruPIa6b?YNy)-oA*&%+#m{EK!mU>##i!`d5eTXMY~? zo(>{92~el~VTrEe7U6G zT|vN)g=iTvs5%v~g%&k@V1YK;OCM&z8Qs=e;X@>SMaDk_xsptEEJB3QntVivGZdkB zU(Rl>@P)!k$8%xD;*RRDVsT40z0Q;fhMDYU*ibNz3_+^;Wl#`4>QsEviJ%4=DUcaH z8!|qIm6|JxkNvaFsqeqkum5u>50K;2jhP4%e}v{sR=Bv+2k^|GEc~y6WYBHbgB+GQjsGen1j|e7q2uEAE+4HrM9iiTBwh}`Q84Umy$Ad+^7gczxu`bV31Y5 z=NKu#xQ?UyOWEHm)m_}UoD6#~YYU=WrzaFvQ)QSKu#kK}eKGmTw-}*DL`c)HhtzUU z+~LfxP*b5e%Hx-i$I|fb5ca?i@nz3P$3xG*`>s{Hf;nA3$j((1+N{q!-l)u77+I_7 zH)}F8kQO+HiPdh)hd#tm=DoGKR#PjJG}Mg|GKf)pJ)_OWP!_C(b1dqz8R(6~E=L>i z+3wn@MQyvGib1NfL7Qnn9nMKVhG*+Z>TM!D{>dkgu#xI*Yp;9{=VE;ZIh;)KCs~sB zzjbKNHx-ttnAAcB@((IsJ+&nna{5fs&NUmkm2cUT?<@_k&_-*ym_#E=|LTKUKI-(l zsAiv5s0dZf)lV?p{ac7iSSl;>$?6mBnjKYm4N00J#K#q{_XLnn=62FPUWuKnNGt-E zEE5q5>*)xZhVHD`TP#0iR)3uJ3q{|Ph@-HOf+V5D-BeoCN1kDc8c=%bF%AUdxh((LSYuNSx3l$2`yAT8T7{HCxwlD5b=G zfx6N^x}=lBVCE~qjOobygn2t=YWm{Z6V9lufV%$$(cW3A^ARx|4gBox_XDNQ?jOj< zsdr}C5~to|yW9wPPR^B21Gu2Wb?QAj;iFmF&Ch3@q(y5dlJv%A>rt=pLk+{gTy#19 zTgL|PZ}lGoymzD1`QT#u%-2A2@TFvlv7_c{j#r6uwY@v2$o>?Ogfn%3vHc}$`Oy@P z4{YCj2w%LxeUIR|ccNl>n|WQ3wGVo6M&V(^gTqpGB%l&S>7dpA? zFa0GHAWA-PePb(U=;QZ&K}V(&KYCL>)8G{xoFJme-P&~Fd#nt;Ek^o&@{rP22`P{o#&u48cWapT!E|e=os@hB1Nv*BX_OnT zYi8BBrA1Y=_wl&HxUZ|X@X;J2f@o8Sd2^`k9sZ}iQ8<-0_`xDsX?tC|=@&}W%i7D;D#%?I8OU{a_U4R8+ zbF$Ragdp`wKi(8Y&XCVGhB7eTVa6cAd}(RI1@+Sg9Nf47M1Rwl!r6zc(yjm3ZfP_R zssVt*n$#p|?pz+F?Gs8Op9Bc||FE6$RhO*HXjdp+c3nQ1XcJ@oq{(+W3<(4&-}EZQ zq+U|=75iQ&p@-$~muugz@b>AutA)_!&2uqJ<*G@WHm}=iPNDA^Z;m)R2@#Uh@J!|L z+$5wQ?fSG}a+K?MRJtoL!}n>hbc@c0m9h?TuwL7v9)5j|SHUA(Qs14hi2>dIM!wjj z+`RUS*LZXfPRKrHGD095u(_O(N`qiWftD8w%#525#N7qW9U0qIwZ|I=i(Rwq9MM%q zj+Mr6YBPgwH3q(F(`41GGK~eSf?k-A@+S-C6%Yq{+V8w@U-~v_px;tN1SwFTeK$&? 
z|(h?p!Y+ z4;jQhzikcZleTl!J?#6YNRZ=k;W<*HwF zCB>?x9d7bOn=M^R__rQQNNTmPh0MCJ8GgVM_;A5OIy8#>`3Sm{d*pR9RW1>u=sv1+ z=VcqXaM`1+xYIWpywSCN@SCscKAvVPX^N5Y4~kkVZywC)cg@Nq&n&MD*JhVYcbzrl zKU0Bn&LnC&!(l?}A2N|0=W{j8s|kKL#rrVV@j6|(M~8$ic(Q)@#D~DlhsIH4llK~l zqb?DxFE3~$u$KE<(fm5E4?6S?8zMK8%_>L{^ALM!I~GjI0Xm$+UU*WU`4g^R(+8C| z7@rWXB&j5w>+Cw1Kh3Bv=S?0oIbH26Wx`s>twIT4Q;mZ)n|XWX5vOVLJx;9}mOz}wYy5urh238eOaibEc5q|DA7(=`7kU3P=|md1i4C|MIx>C8=OT^BH*<`Gcnon@oH;8AeH?6XRhi`j*#aQh->ND~D9JKvz z`MUzCZhoqB6&8FoCxTDQT<*e15kJ+jloL$YNSM}tqyJ`9)LQyMIeipfl;p4bszot> zP5#!n=*lFxlt$KUB05r6@1}-gfrOCVGXG0HSKGGshXb8CUGkC3$Q-=FbM^30&7jf< zp~+0Sd)=?>|TW($`Zpo`35vTJOa5T(~v)!tZV2xTUDQ$ji3l-2GJ^C#Hh?ekG61z~}!s!qO=nVYcSF$3Qh!jwoq61cb)5tf#G6CHDGxB{c- z%#qmpAs-28!ET|M_puA1*R}XXO#;!S4?g^;e0JnL)63aEcp*lYb<5~V=aFzSf2 zu!V;4urY?NmdlHrNh&u3H(>tC{$!<9Y-Cn_S)U@*(X!6R(rjblEM1Y$PDae^c;obE zW5wE`7V3F5V4K+=o*2!j0FNN#gG}#q18$>-9eFsjh7b*3vA1}H-@-a*-<5^+#{pH6LEq%IhWAI61ke1XNGPQUF_^ zmS=z$cCFY@Pp6EBHAmlOhgd>Ub`DLvNCCxf3$HMz1WY6PK2U45{lX`QC@L!jGJD@wo%$1|<)EX519u=BZ{%z&3Lk2<&X?5EKm zfoA@90@c0X&VxlkXBoD)jcM)w^atxJ=kvE7VOn7MQbvwPReRT5jZxk;F&l zM`wi0uO4Sf^^-9Qq%_=T!&N z;LOmgFU=>h^Ea#nhxL+T5)1~43r6#5N!ItK6pl7R*AGGeAH6z+*u3i$BBc#nn?Q~x zYEBB^+UPyU2SyjjK}+GU+w@;g66R>n!)?6<)|VWnZ);y?C8`pcJregb+!43vz$($! 
za4b`kzx%xM4y%Ob$p%KBeAsWVjriC3c{V$NHNYIqe*DPtW}4$6K||(;KDh64Z#d{x zh6rPAMAY*f!x7YSts`68xfKfb1s}P3B?>$&gBMK;9L;JfNoJFP^m?~r-GljhV0Xm` z9vT4k8*xJPdg&@U@T8V0CZ@v$hq1j93_%EFt+lt zB&M|$aEiA~Pf<@ik5eC@7)IUR=4k})+S?~_AB{Dy9d*sE)vP2wWhP3S!Mbbed1kTQ zYfd&(7pV~*6IUv?$JKFUdq9H0_1LAJut18LaRCY@0u17+%Rqm!C0Y(!cw;=^6t#_qdspNf8oxf`bXB%urf zCT}CW9WgKv-NShCtw*u0vj_QVsuR8PM4N-?YgH7exOwO7r=~Cq(QW~vbPHc`Fq(1n z;C=LTMbw>bSM-ziG4 z$>zv6#m!8Ytm!h(xvY9D{Pp9FRyL_+BrDueAvOtnZAjwOEZCd>QAptKkG0*6344L} z<|Ak-xZ|khfMDuSzf2Atc^$QrLz+jckzxDi%etx?lO>a;b~U=y7XZg^@x+^ogj{ zVH{$_)**F%elV|ox2P*L@~13tG}mo?7X{q$uHFGvWp%s3FhP1cAQ`WH~vF9PoD`TVhfO9ilL zf2&qd(#M$pny?2fBjQI-%#UIqy&pYqzD)w@SyvOnUTVlnkgX6csUy=BLdKNA{Twsh zeu0n!r0U;al5qu_RzdT1!ay@X3HB`NbmTBGN0sik3IR&zZ>fK*n+4CLh;8ox**ZVP zv6tQlKZiRQeBpOGxzzoKBd39JO#C4@LB1&4H@Df~6a9*cd<0d%I#pj)Z&Sj;lJXXq z#Q+r7dEjd>i*Gy+X<>uFE3z+11VFry{d(p!clmmR?`)@22s0IGZ z%z;{LU?a_7>&Zkc>3js6_Eo%s1m1*95;&q>{@?Z$Jl@~Nxl1>CeLPANEYlMQMvM&% zYwRDxs`K9%*1+1otuhAG1(@_Qn6z9aVe8%U4}#tDPd%W=`#&w}|FBM&mPMt^18+bk z{~N);7WV@7Y(guvuxI;wbKQ#QU}a?T8#6(uQt*L14keO}%!fLc3ak7KR52_okC8u@ z6yzo&-d6}U>n|KSIVP~{5RS8Lzqb7nrnnB z%fN#bY=!+ZZjpm2Przd<5BnRn%hQ#Ak&ee=f^}_3^Dg%M2BLAjc_7z=1Y2mzpQHG1 z3;h=hwS+u!>{DakuI{3zKDn`F7=6fg9NZ||a96BoqdFTyla>Q@^ z#yJpqxZElsM}095Clp@E&8=Zn&)%>_v%s52eZQxfkeNKG^lqAc zmbSKcBuTN3MSk?U*>c-#=D^G1msNb;xr~5<>6A;#S%^qKWw2U}rpE(>X1dO~Vho-p4Z`R@$ zZLPmTuZtYx=Ufc3>?V+1LR5Mbdxs4C+S!BNF8gw-;_M3%Ca&1ut4emn^ideA*C_wS zT49TIMYn>;X|MVhUMWh!vGY51&W2!;tk-IXDu%t9li!o$`+93NLlBq+=)OsM+vL#F zNLEYST1TimBvU`zF_bs+60~sKW^S|R4<`0cCB5v@tuUX+2rL7dh8A;wXcZjOoXmX+ z6`&X*3Z|x;N`Mmjb3fy<;ktAqb}RTf9)^D1`0SuxD_h_rG9($jQ9DB#dvrK-E`Nt$ zbJ+=qX%`zq?e*Ja_az%GyYv50S4j+oul+hVY8X1wEmgS>*3sr~ldfp4&mO>hee{(% zy~s`oGKQyVtZFJOafs$89sC#fmK1ThQ0XUqx`%*l^OXI40Jc8skAV zo5_+)=M_-kGQHcCmM6_OnhmPk%D-no!#@T@=Rn2$5gwZI-=N&pv)ouq_Hgk{r z$IgBmb#Trta39LWMLaZW7m~IGJ+KbVb%(Y%`xMNALkafH>aj4zY~BUf>;id=CMSGy zVa~$X=Q&~Re`*45LLFh(e0%-l7I;NsjdgQNEDO=bt*QZsdfy(#OPgWQrqwf?Ufp$! 
zvFWWZ)5eB~o$*^M?L~C>H;zkD?&XZtA(*K#(bUQxOb(*?cYVG%Pz@jWUU)VO z9qjv@MW?4%?98+LuLbNvn9S=I8o)1vY{mbw1v}fU=CfO4+D^%fZ(jRY^k8c@H z7d+G#a5ulQBxiNpMWsJO1MSvSI!cxm zAL`+}>pw7b5ghxAo%KozAy=IVb*9?AV>2N(aG%j%W}r%U|23ucldq+Fcx_9zV=x^Z z6JF%n!)>l9Yqnx#xUnrZy~E8ka6x$6a<3mHbp1`AXsW6HYC4gC7gZ~H9tAxD0R6J7_ ztx@_)=$ROv2EGoy_pH9U(&|l)($BlCYo*l-X|hf-k*#V+zhA^_MLCF(=u0VZORpfk zNXPe=*LY1O4u}NN*7MV+^JC{&h>&E-C2h!S^ZS=%cT49kX7{RUxxRHnohcHVE{q$I z-tQNTDzamKofmQ)TM*FZmX~;Jp6C*UdKjOZ2o6RMK~-8t-h!deXccHf?{nY5XIpIO z5#>L0fmVCwyQj~JIxJ*Hb5#TtvJ_^j!f8W#2h-~Z85e!94n{E2slG>*l+Z*gq3Z8T zSiSDf2n~vMln>(8 zNb{>n5Zbc+Ow?y8!zlVoZ9WY~XhN`p;P~90$rF&vSD8>mAb$&=-rv+3ui8ieOp_Mt zOF8uAFL$pPn?%3x()eCF&bHk8&#bUrUMO51Md34kJu2bH9*=49Sky}4_AV&iCq->s z!vPdbqO`dZna0w5ZGfg+RKKFBUVqW`E*bO7gRmRdywATDG*Vw%r;C$x%XhPY%Hi_R z`&|)4M09?oL}ohF*cpf0L6w~qz|`gd7dG!ZA0lckg4OuBLIM*?YN@c@Q;$8bO!>hJ z7@CFT=S(RtW?qZw4Mo1J<#A8P_jOoqRJsB~eTC|;M>E?cmG30*jbkcAgzqYlCN1`b zXDnTJn-IFvy~&8uH=eRD@@v4th=iji!HU3I{dWZ|PxFCqm*X<)KK@UabdBf&9xvYz zQxN*|?SbHYSnSYPma-(;D&Kgc%B>rAa!>?hcyk4qNF)iKj;yftL$4KXR;B@^8H4Pi}I`|4H6GBAe z$LoV0HmZ`sgUK}Zg91D}!GPcoD)=JpO(e9d)(UEY0VUNMTCmstLf5}}S9_fl7Trzf z6!k&6-|6*Gr(+_2;uj-k=Bi1ILSORA0Zbpnr2_5bwk7t z27ZyKzoyk%a!qWcsjh&wYA4ybnJ;(fAy=3#aGqir?qZpVC6jxX@u_dmH|YF1La@KD zJ<&<__wNg=4Uwu4TvNd-?ha zHAMwYT^dYvfF6m~bV^P5T1VyMu@%L|jEKsq+NNM>D8@(a1_$dEvaNb#98u}7dys4k zRH$I6Yf-Pw){gJPmN477(PEmIo=xVyln}c&;$_zWDmuMyfF|4AQq55X8QsP4%4QjElQ+d{(ZGLRCrvPylEDKfx z?qVQY%+NrH_-3l#9n&fbQUFCvLp7bzAYWu3Z@Ii}Zs z+h^7ecj~>?^AIBuTu(m9T8W??f{xbtg_?8EO1NfJXCOnAdTx~8GLpLv?ST=%9GuN9 z7{n~ei-+S9R?D@$Px|pVex5?h2`xTJhJ91PoH(^oS4s@R&;5VuvA-f& zNm5qM)uG~oaSNj0ZkwSZxYAJH>jSQCQ3Ge$CvWOx0}b6Oi6j%$nLZQreb^`!=tyX} zcS$2;$IsdY^G%29+?vhsV9w#wY9FzWz&RwEy!}U51pn7|1RTP!glW4koc-RHM;=(` zSj(DG!*SK7KQE8BR2)T(QvdQPgYtoiGG^WjTLUr*dN&vHuTcAsg~BVqh`cvtc>tYB z8myaiCN5CQ`*KtZj>HXH(Q6VsSgiVVy3?&>g^jb>w(>t8Q}8k8T?BK1ni(*&-6n#< zD4Mh@Fv&E2^@NjvA%D0JQA$CrcbA%8OzuBS;@_4L$q0D6T{HSIq7f?4*1#0MYzq5zq^uuO6 zQ@ZK>ofi5}c~Qb(s<6PF&1l%@*R>>07Dli>si89P5|J;X`5!06rKq#9D?VQ#tXHjA 
z&J^c-azqo}3RX{&vP+nE>4SrKc%c?hv?j`*`P_(gDCx*?1z z4 z$!a&^bJE4vBOJAtVdU5t>ECO$e=>rp(!|&iRQaxe!8RY027Utk8$O~){jdeWEo0=y zH#1$#M+<{+_(|~=bO=Psc~5stUAE$VZhb}fUi{nF0QjL(2%HfF;+KVZgs_7zYbhp= z1{1OL@6&JMu&f2srP~&omhrYo?#B1*yc4osVvfgDjyAcDwP=Q}8G5PU1fD2^x|g4V zuH7sKfDie9f>Hm8>B{-TRtYS)YL>8>rMhT#GS*wgMN9mu&%z=0FhH;H-QR#746xJ$ zDL~jU?=~TF!wP^wD6~^k?TE$TBuSQ(`iK|=tRDD0d3HvVkc`0P;CR042fV~H@6AG74lY~)MV z7nHQBu4$J_8hcE?V8S?4%0mqPbJfWIzH0xTO$D}uwubs4Ek>}2ppsY;OV~M80zx%% z-rM4w4@^#({@W{+s?Rgz<619|Fpf=-^9=}v{O(%Uhmvrgk;q@7_0^MN5-va^ZKzFcg}UWtYnv1|e5spg8-1tODUH+zo56UqYUjZij|2 zt%Dio7GI?SLyvlX@zp@)xQiEyLKh%jVOYodVtyIJX`jDqFnX`_z<&^MTEfb5(y1eh z&YazR6VoQnNp;7-l=0DK8TvtN{>yu zTRBFlzc2VG!_$a0!A>(4?bO=UR`{9uc_yhodTuc)^fiAs}3Ib*_Q3I^)2MMe(T1eJun^?SYf2PE@LD;*va zl_VqqZ+6hBR3Ob^2n6?Hs7uzy`kdqj=2;!$`V}2@ZIewUmIYz@Tvk!`I!y{!5@UUE zl9`7aHI8^1mh=z)h)Ku|GJwJzDf*RO}%Sudy4VZmJlqlh#gls8}iM;?pK? zNOmaGx|l7MneeM$u?Jv>1pqzr`g{zcT!-pm0PQbyOpC=2r{neWf8q&$>mN(S0xOY@ zwWCH99OX{-bf;-ETn0LjUccq*1uC^XYkkid%t8)=Kz@*Xl+u+GLnHil%_WB23d>mj zPu{xIWRB-$uIUj7ZA??>T|Tr2*6OYilg7$&o@`LiV@b9kJ92`4`9I*;s}6clX@)(q zZbhN(#1&n;)~DJZ>laOUw;wvs9qLmvT3a|em~>A{7xKe&qe5bNb1F4mL5TjC=}AI4 z#p`|#xC$4|LB(B;kw@9yde&fyc;dme~G@<-c5N0FmBzJ3+>s~*QJzV zZ2X0rY1|O8JFKDf)1$xbDyVh%*IB>~m%`J<#m7VThPQIl^*Q>Xz0)*O5S4Br`K@iV z+dz|T#K59!HtLO`6&#@q8!LQ8HJ5UHMXe$qYky6a78&FWU|ifb+gLkumGP7IFGkkV z3+b8Wocg2A`dQD-({_J#hhTCa1v$IiY@@HC48<&1m&U%?Tgj9`X~hr44plF+ye(lQ zNpZL_$mr$E2bi<@;y5-89W3^8XJ0EQT_JWH^tljPCEIeO=pvI*L0?Q}DhGi4$g}G= z50#JCew*_B22+-TPj?%hrQl7W_(OnVRKRtu6M`n*G$I%~Bn0?l+1%dF?4c~UcU+?s z>lJ<&^Y|S}G@fl`*93>UTsb75IKIk2hPLwNYTYJ&EwWs1aj+Y2!YJ{Uu@7!#-}6d~ zIrInC5`P<`XQ>9HVdA=a<>%gHE=URf9F(1Dge7*`{*~Qvn*3XHY|EJ9J)M?8dJ1pU zNft0Xi7$~alTr@(czby2wr1Q6&>0>>OejkGH>1!F2|3%zQrk~a*b{jKyBaQvpp>f? 
zk}(_-RC16jpwy&jAqw>vJK3o5p0;h;N|}Gu{x*)yJ-hb(K9@8Lbs9UezxClgcD!8q zPZ$_)33`#;?C6$)ZKJPE-xoifU0dSVS)&uo#et;{vU-P=P~T%s?RWRpL~Y%RKe<>nc7l4^&PfC&&@$Zwt87T9G#U1Gt}AtwAR~EdVgo> zeB`nIR;-wU6!#k%v(?T?c!R)PJ>QnhJuWd%`GGRr7|0XK$&uBj=NsS^aleGN-D{hQ z^_!uwI=U9Slz$)+OeGPVSYQ%AzHaJe-iTTomgb+qQ#^Y{s!;`z+6f6n))j)bB3s@2 z*t~Pj#s4~M*+s@``Cu{yqO$G*O08_&OXYVfMm8k;?g#B4y!@ead_J&?XukK<%W4{5M|GeX|**Ay{dczwNHi*I)4 zCd;imX=5He5VcJ+S5zxD>BCjbEQ+v1puEklFDi|a59Nv3A*fctS_JK>jKg)W_f*(b zk{JW2jw3dZ(qA6@K=0vRk2Onp8QQAXM|NDsW&Y7)gtSDrL?0c1X_hD?E13@0!TXZZ z&xHrk#`eA1vvhHVq0I|pW9yb?YsowBL19pArO`K$sq5-bO8fnYpmAWmMs^W&BMMm$ z{QG?hq?d5LWS|lcsQP5uB}5zDjjB69Pev{8>II3Wu|yZtzB`Ac&9~Rg#SBzG&l1jX zc&DuzUB1G`WV=(pnkVVlH9OcOLGfnRA;=Q}?orE`l5 z4SCL{2H}PAUX4#~$X2a5SYB^FqZhUkKua1gx(g023_K`KU1TxO`^X}2-eOEK2%4$#Z(8OKMXdZ37u&A#VCp6;&H`RbK;ui1s@>Sg8|(CxB%g_W zVTJ3e&Sfq-9@ZGHa~zu1WJv0z-a;DoeX)Lv>D8A}kKi6op`-Q;)WcX;!$Af{`P}K# z8Ws6HFeMl-71#4jimA4pZ2zg&xg)E+#ZlsIPR;vL|V5@>kEnSV0 zVh59_QgIE$oDP+IQJamrGYcKbgQ|KZGC}nI`eK&H~^*1#m!6%2S;aNO4 zA-H57c-(}M{BM3djK}Ri4H&*{q@?)rwqkc4%Fy{boc z3`L3&tE$j{Bdu1E&*Yz)#y1uAb)l%>IyeZS3Et}LYF7`b)15=fs^a4#34jZ}Lp`VD z$Gew;^JXlU_)Idec`qiXZ}g%ui04dJ@{7Jz+4lCF&X|h;9FZ%e?qGJ1wSl$r8kR5d z-g0ooezs29=Y1t}we`#E;LL|{A+t-sr_80ouGoM6*qC_T8k|a(;pFf{TVIoBBKs#F z7iIfmQVbp5x!J0aI#t3G5F8xKiY*H0z@mi2abLmcKpu&A$+9Fd(@$7nRduO84bRRp zg|pf>(la+Kv|UP0?tdE_qppN4MXZm~BifS0jlEJG$ey@Q3!-U$tw93rFPH`fQq@MA zYFB4L&ToxH%-xQVpbMFaHd+WcfXC!f0P+x;P3ffswX6(WhcHD*J z%!q-Ca`*I$=Gd#)ZCI&R5n(^JAm*k{%9A}bX(qFyEgn^#O?7(joQU_G1Hlni<2sMV zLwS#`M(~u|4<85k{yNyJ7`E%hh1In(>vev zB7W^ze7^bY56I2k0dlNZQ3h)S?{|-a5q1uCLTb2uz=!vkH`Z3a!JF%2{29H!rC2Aw za1)Q6r^;}6PS-p?5PKIz8#VLa()Oe@0zv#{gs0S!c9*U(vR|3hOOMSG({bBhhO^EO z`EHFdo!|vUJPaK(d}*r?4`mKU1_?h_8w|>KkfB=aFHQJvkEIwO{g4B#4RcX;{c^#n z3jv%7!7h)YB>9ymd_nUIX%4JrfkBzWW~qH%4s|?QrA4Jk5lb3(N6<{#hB1*EVDkAd z^vh-^5t8hL1^DPhmUuUhbfXn5DS7CY6I3c=OYv;KppsI@D^xqR9ygzu*+1*YYdG*7 z#bz81zyB>2ZW8LI@D_u>6o(>oOUEh**=o|5qFdeI3!g)B!Bihe<)Mm%igGU`l-p(u 
z)_Cn2sz;3oH(YQk>C_S7#0Ul(YnG*0m-G{XMEtK>uU+w3e~4N$4F>(2mwR$k*=w2~ zNc6tH7%83O=YGPYlCyf4N=%w!m@i zwLl?lsx%~Ut-1Pt->OlZbg{PWcUU{gv>nu9Oe%^bu{Vth8LjlPcl9Sl&K{?wR7|KV z8g>|O>m)W>Tbpl^hvza-Dh>bYfHWV^EfaOq%9JvyehZ;-m$A7dlW5?eVZGOjXTa4b zd(uJkJOS25Nk-b+>mZhC2$arL@i}6QNsc$iOIzHTmX6>p00K*aAue z(7IAWfNb$=JnB-|7qy3XrTd@8{=k5Kh{@m;f|8P(!#2?p7*va`&BYHi+1`LUfk@c% zFqxCLuihxhiPyxJ%BxPc=o`^&iINr4|6%Mb!?KFHZcTT0w;-6h@9jgk)? z(%l`>@X+1TT_PZLw(s|T*LBXn^UoiOy7$^^%{9k8#vF77fo%2;c`sG0es<96!d}AIHI3!?+0rs%RusYEaMq zo@P@Ppkr99N-`P)XB&va=-cDJGvLmm=fK(P;G!3(@K}@Vl2y*LfBWtPb2^?>U{(?{ zc9D3m1fK=t&wZxNoq!pE+bqqJJ3O=XXPO}ZRc}E&F!wXuz~WcGFv_wmEb%(N$pM;tp!^9Pb~r6%&| z_+@`Bo++d(*Dc_m6MS4J>T7%m6k#!jnyi1Vd;QkLg3tH?>5x01X5grM z=7_Er#FnT0&k~BtHXKOw0f$X=-g2p1&$PYB3KaMB`mGBUgO{*3a&`!!>;EFs`HdDa zoJ9VY1)yma+bGj@@tQZxNq)DFJ%`_?qA)eju$sk^RuW0OF;;3v0ePM*W8%*DU`w>F z#oI-p&UfwI7;)Hdym+`A_RxCBV`i}jDF@MfZ}D+x(CaL?isL~ie9b#f%^l5({&Awz zcJ&kp;pvy;Y}H}`2R=?2XVFN7IPH8_Qp%$nF$ZXmLe!w3xDc;!c^;)kpb@c;BZg0z z%bUQNZL-eYi(WwG*|c-cMoI2=)dag!M(MZ) z*7@C-NEwRW#v-W;w)*I-W0v%`o z3Jx)3a_(R4giyb#;b-Atx=n76EP*63gRwRYT4A+M1bx5Mglw;2{;;gFL1*mA#Jr>V z&&A0Z64b&(?+CdmkRbaA+4m2MKNy3R1wj87xXFOdr(z*g-qp$}#nEohm{I1>YHFu_ zgt}{jYD**{jN~Jz||ML`&o-w9coL|wEP*ISJJ|jH$hHS6OjI426 z&JWBbtMn@o0 zsYo=_g^*9sZ70LE4zrG3*JH%Uachq#XkNG+TaPNu8cj^Z0Y*8XpD@V- zW8GW5W&sLrQTvjxdH+S%)(KcT3!Y-I-LnueN6G6^_LoNDsg#HrJ^ylD#IYmFdpUfu7Qm1HckN z`a2@eCXWB2w$_iqwmwq^{kqw=u&9&n&K?uOosTwuYu|%6oTh*2I6M>B#r#6}q7F12iHfgU{=ph3YNI3GEVGoQec#ioV1Kh@gT_1aU zIHOA|7ps{*Ongg%9zAOzEBFCp?NIeLcCC3Rwhl$QQYFFzkWgs1oE>*d%({8hE;4;% zT^>p?QzUNTKf?XGAsI3ir&{9eTQbPz`I+}3|nvbT<(1pY4|!I4yeVDM$f;FKmNVPw#h&EMn~a!mk&@cDa`N;b>LSOClbbX?&&7e4v>#hEd#I) zwu3dXAzKl^4pGH|e10%V6_-(lDgKv*8+z2Awj2%T#9AJ%gYfXZZ9eIN>q|?Jv$`xi zvSwZXT!RLO4=npLFY}x&cyOLs=O0fBEG$UW*NZYT9tT{yfsg24y%O5L4vG@mr zN20uS7gjWXTOEP>=P>55sxqw}ZI>&m3Qay$*)7m>+_(W?sgf&dUShGIK)Gm!jLQ{$ zJpr^5c2fZ9iuyh|jEN4QUcwqWK32wM-QDfmLNhs*m7-2`U9|_Ez@i)UaB5oV5c|ySX(7~IqqV&@Xd$qRg*OHPAJ}A 
zXSYrSP&V94Z~l&1Ay0=};e-g<@fl)6^C}+QlSDnUAC>1*gFgYf)hc~)UgUrEur89Y zP)^pb;|Es$s=BMKp&-d9GZXwA(cFNO)~JYeKQZVAZ^) zSpD_f4F_rtWjqJ`U~cEu&VNYj?RbDZzm>2aut( z?Cx4SaqE~VnENx1QZ80O*0IntL(;tXcOkQ5hih2l>0ICW8|^1mh5?J0MxVtC`~yTC zm94NTQP9YX;WmXslU6 z=9ir-sWN@8kjD2OuAewu^{k%_n&01gMG0h2nLie%XzqC?v3{V8_FBA(RJb z@r`!M^$+!ilm4~pZwwNw2 zc!viAvle`jf5xwc7)Dy`%kC9FZ{|SeFd$cE2K+fyy;J=S=pPv|##KrDmv`^D@FrwU z*a-3aVMxS`0$4wrPNDEMJGt<2O^Mu#ZYZPB)%{v7?%ITEo-LXKNzm_XR_mmk^Z`7o zeYXXT7A3WY3yL~|y5R%F@CPkM>tU<@Oi#a?e`M6+zLmS<`QF`fZ!)9WYwDhvJ|M%& zPrm;{s3}OSs}9HPoP}Bf`X&(h%=4Jw*lsFqO9g0&E3JozhqAid!h@N3#4uzC#$+hQ zFmyV=2qPJi;Et2?M_uvMJyU?_gdJpRL|WXJJUjWunW;5E!&hW<`JKa&5_*{-8-9M)fAlvdKH5bJ%Ey?qedvh@Oc>8ds={cZc~{r1#s3y$4= z(*`}+6J4KgeT~pr<&KFC@7t$Vx_ej>Rh~H=`zEGhvVU>#&{-`O2rB1!;Llt7pHIS; zi*hjMk-w99z;$)qt{XPgX&2%!CTSmD2{UV6xBg&wGrycl^)BbAl=Vqfx%1oaLa~em zQa|mnWBc18oErO(42OUeku#0U$X{##oX-|CdUEKvR8UVZv$y6g{DKyw;@!P}f3;H3 zWKjB|KX}GvEQ2C>LjcA15FqIz@_4qg>!SR7*?_7%4>%j8Q$z&Pxd2)GB=i4vSm|El z#TLW@7Chy`Am_C^glLlB$ea#puOd}3$i(N_b}a~)31eJlLf^qd`frhWtK0Lq>a9|c zpL6DoO+~kCu*My!?q%W=%B%Ex{GVvh8SEO^w+RajXH#yl@QoiEHZwHdQG%2Fp!_;1 zkx`3t1@mlm&~sc-R=bW^TYr#q(%D~L7(CX=rfUTTkC}adycXI=Rub^v5DMm4CM3Sg zDjB$!PA4WtrebR)CM6icjv*R7aWwyjeI@0(Bs|R}^p)_Eum~?-YZ?DVzL)$JuZc7U zgs~`rrF&@V;uFexq5pP8LNSOIWL=dJD@;t6{aPnt{gB)HO+T4({{Z445#r51{8cue z0%>qPlkQdkN5gb)8v4fF`sD==sRHm7qqBavC8ArbMX2=0qCED9xbb~fxcj9}h-NI% zD5CpifG=qExa%}TVGtj}w?974FJtFCT1NVS%%IjeJ5UwxToKvWe6LMs<-n`U!(_X= za)eUFq6-H;$}HmrSutUJxY&JEUl*96NzgFOxT90_{6$f{f*)@^ojB#d9LMy=OYq^& zgiPWwXwVgI;Jd@7#hP(UNi`x&x+}$qLBlAyx;(_uqKyQ0RqDE#HZ@mlB+P|Ql$n+21hmrx5(Nm^VSN5Cyu{0z|Y{qZMoL0_*v7Fk;)dl$*faATn>={9Q_W@WPt zn{Xvqf80R!P+2U~=9Y~_X>GUQenfQXVizTrbPcebL`-+%7z&U2TlDoWJwiZb$=JIzQX(buRwE8-E;ow@MGLj9bOC3D= z@<-3C#+Xm5qJy6Lgr6#14JR@-lEMwLA<8bp_w1-P8mF20sbY8Q3bVcNC=7qZ)MF)L z4R1nz8{awTPbbB;Ack165-U5~88xHkdv*pgtJo6?e%oGh_89C0wz!J493vq&REhd( zb!f0K$p?Q1qLg@EjS}CMc83(OHioJvhO!Lj-8Av!+#pwk@Mbn>PRCAE%BpUmT5_i>$Phe97P9!0V*U@>(J1!MYk5F!F#$ zQ|-@$#%D%~{0|OS^Y3-WqJIKeV+3<8aE;5)a 
zMKj&Qs(Qvmq|V0OPt_Z*>FXSxj|WZfs-iR;U(Ctd1{}b3y}ZmU=zWw{#FTTQkN}>q zg8&T13v$SS8}x<##w>m*EnuEmdoypWLNKBM)JSZn6^f>5?_zCK?mTbMuf5ShoOA~| z;pAHXHK${471fnpDy5^q z0S0@jZ5#jrRRh;P5q;bGtx=G!B(7SzF4VIqrhCfcD8B`6ePbB@WvoBHDEj{ z9?UW8qQo;o|HtEc{g2-gQ6%U>1AsLZd*PJE|GcVda`y`M#PpU)+%^O`_qUfPlCM!# zKdj5hu?Uy=5cvDysKj42Xez8(Mmo~5E;xOCYK&ca9Bo=^#F1ORjxJ}bN+S_>#o6a@6>&8fzcCoyRB;yj; z@Pa8F%==*^+K;ku?(763g4SeK9$3ZXi6h3g8_UF>e@Zl<)FtnxS3`oC2(sZamI!z2 z7b8jz28z$Fh{G+G7}B=;x<(F3<-ergz)p{XONMG>y^eJ9MG|FPmU?9Ev1R1pSCqea zMyyx5wInnGiraW-`NV#p2C5Z;$l!2qpr3}X^kLP~kV zNn!9-8NWq=M@G9A#R7VCxa_;rX9a}v-0p9kajCYDKb9U`0@_14&vMP`u{-LHGtAJ~ zCCDPkur!7!HRb?8gmQ_rh65R#;ePv9@o9PsR)@;uFFeQ!##@DYo3Ijoq%UOgUmntC zW@ysSLbnfMI@sYU4c$PX&c@d+D;Vo(6Y`E)gnnA1@yKDvG1CBN!G z_o*jSS7U8BcqG=F-RM^>4FN^3DuI8X0 znYw9=hhTcSE2M_|Q_lxY>|S*ba_b8RH9$ZhFR-$j8TTTYrPCU+=V$n*$#%sh_Xs#=d z@8!9ZBuOpx-Y5kd=FTXSQ@39)M1aR0QO!n4ma&!i)f zn4r|1$XWVn*f^VEM+Nq`%v7G8o$+p&7@rUgSe$@1M;oQt0L^8QhkYqrvVozV&P7JG z1t&iammI!Ug=ZbBk6lHNv8^B@(VoW5tD4_`DVU1jo@1ZST+WIia4cy$U{rD#3R0A! 
zDtYGIegE3*k@nfb&3}MetGJjkzI~K{YTOkQgbQbq72wXFYJWkB!_F6lG_t9B9{fGV zeCOO1)`!V}4c;_fY0emZ7o;qEh4_)L5%Cp}3t4!0kO`igwe$Z>T;M9fDoY4P3^mmC z(?KM7WGTZ7H92l3yAM~(Z0fKTjF%wg6bRG;Pgc!Y-=|&vS*F4ItVgMzFsJJuKJr^k zEoHo!A*001#u^~y+@|i8ISe~+8<$4`2wnB5J*DY#Z9s#_%eirwISFtfOh~eBl2&}C zpcp}2;7;D1r*Bj7gx!G56btwIx6O3a1?B>VcTwbuO;{;ln}JN`X3LSm;-luuUTy?+ z1@k@wxh4pyXPvZ}rJ?>G*S~`ps%Y^pM<(FOep7Eio5)l8USX!Q>} zS)kLf_KEwy{g`i!A|Ziw`N0 z&l9l8oAGN_`=Qa%;!w^)1~#8zL1kk3WLWNnu#&`tS~}#2Z>xO;F5UdpWylop5Jhi@ z@OaqE+li-1^DH4G15C@;3bf61Wxu#B6D@rD@KmsWmY}ZDOJ8f^cZl zavnR`5M1Jxc_Nq^k|2Vnwb}O6xPpN>Gyz{EGk@3D>p&tytbiQCm8yovFqB*7kNu%G ztVto>x8Rh_FY8Q!o10%7c;zwl=>GeNQs~?C8lBvUgiriL?8U7BHds((^BsLVa<*s!| zO#bWqJebwK;&!V4OsBqox14-+KAT-)P{Khr()wS2b5m#gcP_W& zo=$pi*C{#}aC7Hqiv}Ygpk&GWvV0%;+JKjU9K_>%G~rTMQzmmHz^k&i5-ud`5y!UL zDynD2YX@WX&-X5?wpn#|L0W9}x}%Z*dj!;kuGyyv86-;cw~&QYO=(g^JC!${^%KIV zPW9ZT>!g9LY?90N{R50O9oR_9=718LTv)n*;YOo6m>ixoF zLx^d9rXe+dZbKQ`scqHPBscFm+sp63E)SbHq}(?*i_b-x0gR(TJ7^YiwjZ;BZOEj> z?2<^gaR)y-KK48Oo3Hi9$m3RXKPc-TDmlrN?=M=%43&Ua97t;~euL=;q<-gNi5VHX z)z@j@9FRy6<`$SkyAn)+oLjv()??J5scv9GMl)D#^71YD)T_+%ll18>Yg8zHlC%wq z6)F(8_WFG)$x1JBoEXVcXhA`L_+p;%n0Ngvb@yazOVE-BZrP8a!`H`nG##hF2I#{T zblfGB?@azDSm+<0shNo5G8{?UqZWjlf6Te@(;6O}DLm*X{Q{t#loD+|i#B64C^vkSEecMwoWH^O(Wc~TQft!K2Ewwe+;3YuQx^7Qv; zkB#k9l(_kN?rXIGdcQudPn$H`r!R`rujKUiea2 z(TCp|e;%a15{w9!IStG3N?Ho-lm3+4p*#Czkmfv$3vwKhB*hkK)E$9(zd!>wtT~}n zjK^OZ47V{4_j{U321e|l(pUbpT+FCEUi%q3L!a2L_A*ge#GmDCg`VT#$@cSPSpHd= z!}(Mffl@=8BAU;TI!L0Vo6e1jotKgUf* z;TwyU+vLCfsv=mF9Ba@J8X2sE%AvQ06y6jVY5$iPLFHbSI_^wU7|hoxxoOa!$;cK< zk74c*w!FQ@VrFDl9MY(F;xk_L=OvrV@=*!f*Jq-Sn3HMjzlQ&D&w!fmMCnGCN<;#Z zzhd!|?K@k4xA~VJsUM``_l|qmJ_~Iaj%(91T3(4LJ}H?dTkn2khQmKIzngFILESy*>NSNm-Fn_gJnd|DogVa4 z6wvVz9U+Ng@@;rq;kPxc$%ou~kNE7#YlIcF*7FV0z@NSE+TQPq3kSR$BmI_MlrZ9p z{Gc!yGI~0U;!dYnV*VZt8+kyg0ru?vp#X&Ca=7%oxxQhxllc(&Fn^3pC#|x0!AwY9 zY}3J9T`t|3)P^lG2QYuYoA4>G1UT%gO^Sm#-*(@yC?;t?g3&;)qzRRTn-iJe&8SYj zAoM&)*=*hOh`t`EX?PXQ2N#HS-&n;{@-mN{E35qOa@i(?2J8KzwLGlBF>lO^WmDzZ 
zvgwM<#!HBd=T)c@8aCiNcgGbuS{D!)cwD8~pKm8NiF;+cf(5XyQO^w75?^ZDTShFX{vX%61_xCWL zNSJ#puX*=P(oc{$D~q62)q7@ht81#K;BXIuz$+B|sIcd+o2&WZ*Mlj4a$ZP88Zs9Q-by ze`K9P#nx}j2OtU=J08{6%vRr1uiYQPy<}K}&Q|G(QOir5qmIL8_Dx(Jz9$}g+oVBU zBBPwFpw=3(AQH|-5d`pMEHQIzDXV14U*$I1kt};I{hM$zjve}da?Fpf%gn13$fZol zern~Ma}|`&XIZ284bBFisa}nyJ(&?oyc6f}lQF_ngTo(UcwpQfJxi3GhV)CxFa&!3 z>^u+uqF&x5tF!YDwEes3bYff?C2!o1Ujbv>r6L$>zpngoQ8sK8u5qyxG$}R<12SZH%0Pd00}>hjX?m={ctDFALRjd?Kz8FD+A8F&A z;Rwei*{m-fYI#^GC(-+}$we%fE^=&AG=rjuQzpf*oTVJftUiSQ>59%~k;t!aD=z-eFNJ%WSx?i}P$Nb%ZvaxBNTKC!g=&yB3uq0Iw>$Ujxd+ z?u`~S##)lj;r($~Hh-vy^I^88O$8t~B}Df7ZkKLG#kcivZG|4De!~8fnXN=zk-6$k z(6i&X7q2&)*ZaM@Xe<5hquB1+Ek*gpt11Zezbt^X73p#*5}$6!$cVyM+DtmcFSiRG z=gHL&{h2S%c&VLd{tQf8=|p!{#?Qsr{2mtG!D_v}dkePf z>NVezfohrTBfO%fyW+ES}%LNqZH-~t{RO0(`eEct?6zka7$1tGrhmsWPSVA=bMrb{i`@qc6D+wIrv4Q zT7%*vzgXhZMjZ@#B4Qc&?#^_{>T~G%VpmGy%cB29V7URWlsaRV{AF#s_yd0rvGa_L z>zC=FA<^RrbddC7BMo$mL#XW`$aKJV4P5(?%sf9;IYUYRMMZc#lMS4n^_>l|Xj315 z|0N5FSOlB(E@Cj0lla%DzdiciVewiC&1B=uHeOdhb(w_CcqjSw@ajqrW(y6-AAY&xB769SsT<(g z=n2|{8p%B5b)Q*Oi9D?d?%lwxsA9lyfCb!~4ny+^z!dXs%ah6PZX@J=@>^Zrl?0|! 
z#>-Ly1}7_KTu}3}Z9I4lzg|g=!9BsM%0MQ98BN#G{wwiSj=+HFl`rxs5U)&Q-gwEQ z^-LPvM4cJJ*@I){n)u-((VVrO-q!Y~<9EyAJ`)!GbpoGcLO%W2P_+Q|)(RF@%BMm% zqF^FwqN>5E00Vw`puYH~?t*7(+ytR4+pilOzUcG{#9MLpjuWncsbE1jj-=abx_~~M zH0#Lys4hndIl-~-ZJGT`v+PTZj_W!YM~sa~n({uOSLz>7Bd+V=|zeM)|U zf2qksJoO>=6Q=+;qchZ$4G*;X28=3csr`G+RR=uo8Ht3iJLlrRWn;U$cY&XUs@fdK zPm>|!MZcvrOz|WYw1I&c%1$!J13ZD&fT~ZeXBHLvL)BV$R}Z6lG=IOq187QC;#0qL;uHCDZUg2S4(~zRK_a0L&Vl~PVfTBREB8C^an}~_ zzi#}=9_s4Tr3zyWeDHb47x6+J@eCMgNRMZf>`Zh&!Z?twoeiy|Q3o`sI&?k_a;JS) z91%@KmSuB=n5o^$2|-9)fE#dn zf!W;WUsaui7Owea;+N5PjQIFj#7%FUL>9!DU0?MGn{XbD>Vb z<=a~ql_<>ff_&qON@qZbcer`oQk4IUoW&b4;!-qtJ+$}u9snT$kh;?prai9xls<3E zP2+<+)2fo8jTUh}P)-h8BgEq;qwDw35X1$9gXo#X&G1`zda^(n{f9DtS`|1h$#xV4baFgMRGdhE?x{MhfI=T zWoqO;xmPvb!TGo9{tP!2K%e~N7u{Dp&jd0v#`Dfv7Yj$d5wB3dqnumXohW{7p@==Z zk7vb~apw2TJo}FB?~&b*G!cYJYq4(VLf)Hj_3kVOn!>oOB6?4#+d;n5=-{QjJ_H0+ z)1D^oRxLJGy@pDp^9gp2_4FqSQOSnu((McGOLMK&Mpnmj5kWW&4i?M99S`R}u&Jns zd$BR$D_sCR5!X)jj_%7lFp(Sfv1+jW(~wV!rg<4*>UYkO)BsEi$#tHcf>+)bRQl)* zMcsqq$*PJ{Sevg&cMdz-R7Dk56kybg`9`QWDC_f_BfjbT)zQy_a6^UX!t|nho?mz4 zC($RmI-F}o5=bn3zAh_4Rb9DUP=4I&KGNF=>W0!Ls02L#qUVCRY4Icku5sPm#Z}Md zfYE_+0lg7p(BPN2_!I63svkteSInInEEYzO98KQ}GHM<)$SGp_5Bc<@_qG9#Z8omLKXGZ*6NeViWrnts6rlJoj@yUk zxQU9u54+}rR?2?Zpv%S!8$1kgC7(#b^(U<@i?Sc6uO3%>6uEHIZ)2$RPOVA8R!{^#bBv>62C^ zsd^PB4{QcqGxoC}V#kWkj1_)QUU^yvXELf0vT#~sh$}CTe{BqxxpXqr4yj7bSeAS^ z8=TCZ$Q;BNaF^tIpyWfxr0et7M=o!}FuMa^0wI5}|wRTD;>&NikE zkv_lY_AKqKV-#M;bhr!JsIKCI*)Vf^FHNs7(;MNqv%2j#*@D~sI!As@Rel-O1WM`M zFcOE`>wh-(Za2^FzI;Qe&d3MVoG+_g{540X0AHHNKCgJbB9pAWbPlSrMkL)_A>>Sx z@TR8u+2!_SZYF5*Xiw=g0=34I}h&zDJ2YWC+G(d@K~?SGt73vgnSL zh}RpVKbBG^E}boHXt#lj-8UbYubv1tRWW$8Jo@WT1!}RC)?KNMROM~0_A^p z(;I7XE<5+?xt?sU;x}KiDj1BE@Pi8<6>*Epjl;3Mrn0y>9@Z_z1q#Vza7X_|-ManI zm;3YAX}p!gca2wlX?G{?SNxgYa9d>+;{cLE8D69ywwBTFQughi5N$Y*E@BY-wxS{% zxPA3P{McRm5zjm*(x!Lc9zp=M_HHB$nk6@xb3!U9-(4yCrVK8EH7tjpT$2;WbD^mSSqH<<1IdDK zLU8JsTC?B1V9*Uw*nd5|0a4BIMIq>30B~$d(luE=*z*`XdUl(={OxC$nMj-Z=PBr4 zG-3fAQYUbw&mo%II&oiF5p~+V5eL$|4IV9{xL@bwkwrWe+ 
zbQXtk8g=HuX0#KD9kC7b5VL+Fn42CS^tin>O|C)fI6Y>liH4VQ`iw62M#m@Z?XP7T zh=iU3;xz_eOCxTo)+}3eaA#z6ijWC>*@K;^43-TC2Hc*fFAuqSUt1g)cbGj<$Ol~3 z>i_&bqWZS$E#5!FBrzW7l0ZVSFTsw}Q&s>>lwOUz87|9GisYWhxRy@zYs5_^TpK#8 zhqc}S)4~Bn_$*}igGPnVt|Y3?&6S=|xO-I#rbxn&@2jPno++U~9C~SS4t^%Lwu_9| zgnKv(aR=H3bc_8izRIfwEO_8a@P|3FGwcf@*uZ)`9Z=f4BKIF&#X)yUL#D*Fi8b4d z&wfsw@Y~{ntd9Zl(l^M?m$2>Ct#vW$4dX(_9t}4K%(XvqFU!x{ylaQjd(PCPH`n(b z8*Y6?0f)hCy|bNNg~M8#u9d$qs{n0Pqem66xJa5jQms|K)eZ;J9*hgFyUL6a7yRaS zK8Np@7pVX@sXY?YlCs2rs|MJ|yn{v_FuHZfTZ zB)JKgB3kE4l<)~e>j-C{!9m^eYLli8bz{XdhMN5NH{AvP)l7QR{<_Xa@{0~k31!Wy zD8z3AWSxOX&JJJQ>+qep_=03qg#%s|WT-s&YZv+Dfeb~;)Ee|S7;6(XT9SA2bdt|J z;d$T^32z^X5klj-x*5cgNmaUzfD;b|7!31Ur%4-hPig0h#^j1Ol}kIJ^xX~F`-l#>!IDy-016P9Sq@)T?AOs zLGIKUTXZs2#NL`PC(XUPLDZIY1N%&oU{)X3r?L}I8WU4%&oFY595(>e8aFOKEtI!8eb>)Ttb1wQY<6qjsV6Z76JSpGR zA)k|9Rjj=q6u2uBVS;X9KZ}FrT=vC5(pNa(47deK?QFYhWsgrh$6C{iKq*1mj^mYA~}02}fvTIk5(Sl+l|h+S12A%(QEptA zeuq{4%Uz@Phk7$Jowzl?Gl`cvBQ8}8nAIeEv{N7|J5n*u+BRBbKNoozP<`Mivz^w61 z?IwRIHfrv*<3Q==%`U6LO4Im;wsYRiE$>p@AMJ?fN7CPD_x;82VGu#~lzB!#$M+OQF$$b%O zKI*Ol=or-)C+-%gxGmj9DU?$yLO)?TP)2S? 
zaUwBme;pX;NyuluID^ju$pV}mZBlH*!~Uk>`Ze*(wU?U6V@lz+JYn;3PRz6`PS6i) z78G+UDJu2d`iiNqxMGX#P5#T5 zbPryr&|r!(3JMB5%J`2Ap)nybkaE$*6!dZm)KS3*>T{Hvh7)u3$M0Q{?}S%qoPv$) zjbE&`uYCMBa@rq2S6xS!mY2>w1jPcl51O9y7_dBsIp#4>MW5ECHK9@WbR54jFri&r z+QsYk6DCOdI+y~JnQV?wz)CQiBgD1Ghl1pc!g*?U6!#0OGb9k@t;b6wYF{kQe85wX zaXnFdk~q{Tm=0xhM`%l!AlOsGk*z>rxV3xGqt9gSvl(42i*XOr>fF({x9*6QG3LJ# zuw?4NJ50eWNFJo+TZLWU=s(MlsiE`LJBC1k*D9UgMb*)n+G+EmaL4e8&nye=m>gATT3g&A+$YkcrzcJ!$w8Y+oWf^*f20ki;M3G@BADK?ZN7WCFkQL7s zG@99cPk5+?`tYh5Lv5$Qtnol6##t;3wGZK&56z|y3kg>JfkaA{?8~oE!n&kAYsbAP z+@Do(0)$Yud#T6@6g+2heiM)9EHaen zLP>?28ZAIe+MNXA-bWn5d|X1$KSTm4YpEIamOXD8dA=K#noty8&Qz~vHAG!b*e^Vl z!SZ}s6t^!b9LU;NB%|b$JOa==F4`5Jr(Az6%J`aoxKSN$MKe;|2)Rn!3L_ou#Z``0 zg^+5l=QxwivM&*LZWS`f&fk=nIdQq5SDJ|IS_;y13(9xxL7Cswj=5D%fWN<>zE2GH zH0Hh6{FL0z(f->faL#wgl|N-xC^j9zXZ-}HM!&Z6F_HW!iNNhBp`3h!K5Y>SmG)zO zn#IGvwj(#3-ljhA-7OkYIwdN)SWUkk9kKge#^!W&xV#lERH@1ew4B?~wH_nmddoqE zHbQiM%<`yVJ>e`#FP2Xo?l=M#fB+Pz9KMev^#vWAlyz6`TP9&@>K1QQKb!<3O)ZY{1JRfZp8#ClF6PpzNeS!H$Tgw zs@s+rvdrq!_m%!f@hP-UF>?R%;2)7vIKI=Rb{U!9#2=wt646qk4%e>VzX|oxh|zx^ zE1RdB#l_yK$~onTex}`lr%E%@hA3-;x`MDdX5-fiX(@kpMkp*&f>#my$VJ9x43WVK z13CjiTA7k~taCkBFUU!rlFv!p{rE)DtofI!FpY>I30}-Y4Dzo=g{!y~_m{7_KRo;W z!~&$8xd9{Hviq}?=SFyeAkAfww5EoZB`)fps)~XLsK<=fI)-uMgZCVvski(?eNJ3L zI8>){jj0m`RNRwH0_)*|?AwwUys(1L$K#({_l0o-S{15^k&5EQQ8Rs{43lg)bQgjEW-)8@=Q5pDi3A#8fr7hjsHz^30RT{H+t z4wV}$0fs?624{H(I|&2PKi z6)U8BQ7-0^E;4KiivWglxa$m96PIxQDwoA&Yqsew7}^N=9+aW3ylQcOB1j~a3D4=l z0bM*0Ho0@5O_~Ic&XHIi$p~{(peFv%oW2kyE0JRqCctTl<1hdXN#?HV_dC>tZ6WKQlm zgY+pZF7siHatAI*Q`=TJM{|l&jJHoTH8(m-0K&$5ry&g3en`DSL>|>cGW_MFeq(kB z71m@{K0{n4$7(FBb-1^F$O&a{tL^T6YdF8R!h%a^6#c!P>wBlU40m@V z%ShLE1hptCNF>7k8gN|?)i~qYKj#bl^N@a4K*3noGX(9^k+WNtRc$p$a9EDxJf)gH zqI8CE-Z2aJ1Wcz1^?c$VhnxH&N?>1YyJos7TmB2otDsZ~{((X%IU8u6mGXgI z6B)r)p^VbJkC3X_tpdKuOjKBIS!$C+ih&}Fn#v^temDdvsvm!$S^Ov5>3Mz`XCaB6 zNS1`O=u2hqL0+LhYwhWMl_tFayhbaZ6fKi#GL5+=0~YYpFghwd6g?9Cv>~|{Zd|S5 zK#&Z-LeTHhv;}l7puZ!nOG9!OdrBWs@|Uux1**ovqh)TGze=~*tSon%*n>F3@37YED 
z6rdsG?2+sojPhUd@v=|doq|5@Oh-ydnR9Eg6QX(xe@K&?izbQlD%go`g;UNmS?2TTN6%9|#*(>+N^ z?BhAHGb4^a_AxI|OcYV1vKM<0xh$DwGN@J;wwRRRHr{L)j(uawO*nMpIU0B?%jDE4 z;e)KB449Jw?LI^|bBtdx{6+P2$(_p4FjAE+fo!6J2CJGM#Xy-HC%|(W-a!mzF(!Sj zaZuJoe^tzuzg;K7K<-ap%R{1*fx8*R^zAQUa~mFM?*~B*-(*1- zo)T~vkuikUV}$=|qprs*8#Gu+R^YRRPuAZ*qOlbRumsxnTe^s!DTn1ctBU4CXNAE; zguxnW3(@T61TKGm|8fs4g+$eeISTL;nUhT(f2zz9+63> zXqd;`Uf~=eV8ix;mh=#pMI3zD*jqEI$Pj>?OapwKm?Z}MyzpXSP*N+DzfjcN*&4N~ zP)`i%9f8%eF4*@JPz^zj6;RDK^6(wlt$NgXoDwJe3WXo?@Du4br+ z!?#{-7t;=H9h)1NPuVd_*^C_zqe}8uAGSXU2{8(eVu>k;qr7^DAYr0FY}K!#k%nUJ z>W7oPn6{E92BRpae*miok$!cIO;OPrn_@g?-v=(-zQ;PRN|sYv7&uV%=kmg10$$5^ zPx0T`i!&%YhOpH$LufO|n>(~+O>>rZIXcwp3bXy#+|d}gkx>n7u-L9-bJjByeO&Nj z7p>(LW$y~jaK$=>*lU4{YK~EQwTqyB!Y%!>9erEe>SaaG^k=oF=$mAySTuhxwm{X! zL6#YKV}^f=r(&W$pd45{tc&rT=6As2>HeQmjd9=;@81_>_*QI!WA~+BwW~K(L0O83GCw6mptek|l2vo6 zma1ZS6&Y3?A%>7n3xi&xgdED`y9UO&-y$YTJXBnPe4qdK(8p4R*Kp&g`9*yY`(90>EHUo#n2R)hVm!rug%L9;)*KVd zNNto@Q+8#x5d-gISY<5U>;FawhIL`~fR>y9cx+RWeN)blgnY=yRXGhk4oYnEMGD z^TT(f?&tN-K!KMrmoL*ogXV&_nPr%rH%;4!ww+X~u~y6rM-lmLb}Jc1XQ>{QPZFuL zHKG(UKy!b?71tA=5SqlHP~cKC<34{W>;Lb`W33x)cKd0gor_ZRDnD_y81BAk^w|Sd zXII*|5qxg9i;U6vmQ+moM>6{aSWZ641zloX`ac&5s8U*P@cXvypbR#JZqJVUFccX-KiPD_{QqtYh z@Qt?NNji6`Bs; zLMNrf>PI>>Vsfv>_jzNs**yzwi%UZej{)aZh{+T8p-I>SrEN#t?{R15?E>=C^QY_fW3gygxck7QCS0ZbfM%q3XjaidW5w zic;i(^5BKk0!JE?IA};$`>?DzrFdMJub4J{7>S*JW#FW=RRDjRy-}bdz-)}SEdHj$ z-XPsQeVKXK2wn)=|3V_GmuO$~%7c0CDbfN&8iO<;s@aDHPfiP?sqU?4Cv2$xhlfoS zHtV~pS#k$0jdyKNfzCjF_(`jVKlx+}ZfG!KIhbFxpA})JkvQo;uH-xD;Eyu5aCGtn z>4l>t^}b~jE%zrd#i~5*YpC)=D|LO#_EsGDbc&yw!{l`dqk5FFl!1QywolU$JAdHh z%Y&JvSa`)UkORVm2iqf9N*mDR{O3>Fg@o4urf}4(v(cM@vb!6M-e3da1nYI1?}@1C zNhlIJ;GXpI7FjqKB)@1Du3e(VKBCILfAr*b189$y@;4-6U<0ONc!r4yMZAoD=BOtX z`~dO{LRLg0`VOyLR*i`JodY@i)TY99Z32?0GcL3s?BNVv*7H)X=&wV3NL@}1LC{jLdRTV1bj=~bz+sz6Dx`PBJ6J!1!^wJL685n?!O1(Pds^OS%UAIBbjgm0i5-zyO* zh`%X#6N>Q1CDQOpKR_iRH|3@88^W7))2xvnPm29n@hG6D1x;UG{wXf$TAQhBm-&-E z>n+6JnKDAN@kkYTQ0CBY$F=^d;2Yzwp^;D1kBPG3HT+S&%OX|=872%QiJ6K$SjDm^ 
zUPDXZ2uKCMrz4IS@@b8G8DsLgU5DfkBo7yrfh4@7g+zb|{8mz2Ad?6ScG-9CFO^Mr$wGn|O`USISL z(qT^`Q-6J1;5=Tq85Yatj3k@T&vOJ`P$|QjN%0CF%(pF75>^iEP9pG{z$8T$c&`-^ z!A*~p0yv6>(mLC?}syiAv4dO#&-P#Ng>C)Jv!-WHq30X8AT9_MUEJ;oY*9)&H5ScJTWj9C7sOP zs)6e%mi|aGZT1~8h|^e9lEgya{(ffg0^F~^uk#Hoqk4uLCq4@<^#j zrUXl)5&+Ry9?cPtrF4}_xg0u!oMp182ZVr?4nxrHy_{VdnDoG*@rvAIOa1!9@vCCL z>~CC#RsT@^Hxvwxo7Y?nFVbf31){kl+N9*b=g9Q^2(-%wktTvWuu=sM4Ec==oJ6W1 z)v-qYELLACgC1H$|LbCZAFvT>Mhq_}2h2eGanymNY=BXn*BNj1O(;q=Q=^V7Mef0#77YSw? z9b_ok*d0GklcAFP$o8CmRgYyo|S)pEMwl? z>`i-CeivQU++X-;vA-gifST>@(HBwRBRt|mkpFrXc(H)pnW}-=35x>Jp`erNGTTR3 zSZPvpdF5c|#TTFs;y5fc(g?O6un>q1%LXZI(-^kA3uug{I89wkcPe{rvPxyUY8UpGydy|K}cm-_YL%(SD(cW^=?K zT49@g*VBA^JikaogJPH-1S*TKbLV0gq>D^}zL&Zz`a)H^nFTU2RFO1@#VHShg(Q^AqLq96U@gAe1#*L1u zG&=8Ae$a0$e?E{pX;Z1rVFfO{EdrjS3aJVg>hwDr$x{B|6*H)?V@>B7q_LFB4Mv%bxu85by6Uj2eNT)o*rFJowf2JV{(E-}cyPyx2{>M7@G6 zArI`fzuhQRMaS_(kLhma(8=b;RJF+=cjLo|58_y??CqmtYSmncSZdj)ve6U)-@^!$ zfA>TkUi#(Jg^JUG4V*Kyq4qy#OazSyLhtr)?{l{#{BWE15Op>%U+*CQv-M@8`(dWO zW(Tu-q=1@gfw30LU2;5wuHjQ!g=bxvj!;G{Z&wTN8 z-|DLgSgZ>B$@;b7R{rZkMdvD!pRG65X2Ji8cr^kwy@Gp?^1-|FWN-|Lw6uZD5|Rj&#%We@@ijCV&Mq z%?}2-QKBHf(HV7MBar!zrG#Av2YWpR#Gc3iLkJco9h4iX{$X&By}>Wx{V*HfQ>cKE zJ?4&)|2I?quW_jaWUGT;ZhehMMUjAA|8{1J;~&QVs2K#t&bBTg!=>383mTXeuo&=> zkKjVP9!^Juot8V`7+!tO7C7mBZpf?TxRIbB5b~cp5;jJEBfQV(&@^GcHcsnFnM4)? 
zM%UeGbh>Zg?Blh@%}nR3OrxojPE%w|viL9pR=M^51fQ3llR+)OSZ=R2v($=JDEne* z!sbQo7h6iA2zl(&7da3Gt^m6#VVw* z_s!MWoY(y-QYN>WE>F>R1`&h?2lM~7FJKVM+K9FY!)PRW)ux{W&L`DZB$D~vD>?0d zLFqeZi5*%fCyJDloHvKBTvwl`qN$qd)R-p#J*rjj+VJ)HZc)^1olR1ZIa0c2m7#%F zl_BW?C**S|9@EZD+w~5U$6@P5i!Q6b4j30_df1aR0gvjI>zy+Att1dbWH5QHC+X1< zF-iHTuTIt{HWAphDs(l~3KbYYkW0#y3`5!29?J_R;dPRTrIFuwOZJ{4>gh4$AB-j& z-WQnh;9+cV)=QO6!Va9=v9_xX;_cC#K*M;h7uYdoVENsd@w3sg(eV|)dleI3sMVM& zwpMc4Ov`|kIUfnJ97yM_L3{R6gwJhn>zEPbfb`gy|6qAw%G?p*pQclypp-;}jEmK3 zy?T`#DI<P3%D(u|YfFRC*r ztILPLT|ww^4()eZ!&$-HE zMOoB(Uu``VyMi&3d~W{8eni1rI|mjxD^J;lfX2HT)++d3{F@W= zN4Ycz@PA|h-kLPSRfx3^SldYMdnp=7mVa7jH?Kv+ZCjj!t+oNlP)dj9yjGkjROF)P zII`RrOo4BJgMMt%mnoZXaxF9V7AYjC@;;VCZ-}-I@qm+o>~u$wSNR2ZMxAX> z*r-fb=<)cp1aMAoG39fEk;=_*bzSEX9{*JhTg#u50oH*YazEzc)sIA`M>W8w zM20Z#{(47_g*Hco07dxRWe8EAf=B)Oc2%lGqa@Z4VdES>T{H~Ly3ESAk6dfwtuK{r zPHQi;9No^gR9m?vhT)vV5R}b3W`P0Kar$~B;FFfd-Gx$gW?Q0Gt^MfBjqS6t*;X{ ziC}6rn5-&vdnE3Y4rc)o50QEgSNz%#Lhlm$=mWyw_F`*ukorpqu1FquHjhUZ5W8s& z#kl(rdRmFX&d=Qbf`h?qf`owPZY$3oDj$NhhI9}@GI_X$N6{!i=87S_LIB8^BS;fg ziQDbv+rAWzqR3YW`TTBs$&pH@`01||037cyyK?td!YS8tWbe_U^FIn44V95k>#T$jwYd}gO05o z@)aDG(>PgapohSWYa)G?$R4{9> zbKTjRl=J71c#<<;#1HK96tg+iT?EM&Hs44P^U z(-`uj*j7$61-)GQ^XPULn(LJi5t+L#yFZhOFuC}JKXD9FN(yNZGtFQ_7{-bSI*9d8 z4#^bfQQVV#xW98PwbcN9|1e{aIYfkV{JFbW+O5@h81m=n(2yu;z1m3gd>H!$@2rO& z{bx*?e2NH8SIShz1CjcB+t@$uEfqK~dN3d6d^k<-(E^ds$72-IcpRp!Vj-{2ccxI2 ztRIiY(GDr%Kn0rOiL8L1%RRm!@bwOm@dSm)H?G@@$f|BW!J1lR1f9FHT_A)ugzIh~!ZS~R9R6H1C$|@D&7Lp!@0X;Rh z^Vi1S7xadQiWBXna9EgL94u739~v6y4HI)LqdC@s80mC*{L3gr#My31 zOpZadD*gm8~SRF`Y@8d{i$|>I0cXF5+;D4snN$gjZCP4aCCo$bV9`gHhjTy7yZM=T~lI zk<&-djF~qi^C>pCXGo?aCJmTgx$)LpZxj!gp){leVfj6+OohY{E5dL#rgb;vH`MKz%G@t1HK@^# zOQO2Vm(ijpBu-OWyL@$@4?`{>OvgfMjs_{RI+AltM5GXAsakIJ)xm{De)lN}0wv3^ znK2T$vDY%q5_2r6NYFH}CMJ$UxpBoMF~Kw09%N=Dfy>lE-wvm5i+XeRGM_ZJ@aNG=7ilou=e40U?Ly_O$~-?v2}JkL;Zju;oNZ?V zZ30FH88-Sy7rs~h@eHnWIVJAvDCxW-HdLYIV+O3PXR$Ef(bP?da$_Rs|KgDG-L^`vxQA!lQQ=JB!rP&6^J`I=+-qU 
z7plGFz7z>~FVT_6N4ape%cos5IVd3N!5E|@s{Zd5T&5nK1?I4j?o7P4l_<)&f}6HT zWvA#bQ_YjYuw1vc`)Sf4HK~A*e_G3}?BzhU0+P}KV)lHW6#padyj2EU9gc;(wuU8AAn=bCJ_ zgidNpqR{kCO|S2cdSX^;r7(<1%g$sOoHSG>i>O}M4t?+2%{Z+ z%NLpx-&sC`FUgZZKmU54TBZ4UVDY);GuP7=uF2ox?#lTX6aQX;)#60%lc_@yv|1f+J#65tb zq}=_XmJ;N1KQ1p{xDCE`s7r5tC&gma$ec4%`F)Fog!vjpzu8+n(w?7&#u_jD-k2od z|C)$pX+*Umg%?)pMM;B%)iQM_T=~|GJJ%LOc#o056uo~{IJFlug>qY@sn4l`CXpC(v{AgSML>5EVp`U zhb~rY!GEj*g%YFV;i6z8$Sdy;hxePEu1;wB(m2{{ZN_vM6H1@bajoPAN(1zEc~-tP z)}$!1x8tqRz{)BzjX}^^EUn@Wh2HBUW{cyff%VJ#IJHK5kG^=pz;eCoB!PQdyxD5c zV3MD2UlG+iP|Tj1qh}&I)BNW&ekVk%)PIsNK)@>7rjNLmF|!!=YxcA|rOr906G$X< zu=Y2z^rmI9C=cbk2MK9>uFphktC#irQ&u-k4}Q`-74{Xl(td&cfZ6PBbP&9dG|0=* zf0{uik@XL`Q$T44s7Tgh;VrY=9(2i=^j@eWiMFqP238_keKmovkoaXnT5edMmqxQx zL$tbd*{HjG40ZimNf=7kd!8rxn*yOR3FabqP5->akV0I>q7Le z-)#t#$a$GiX<6c$-4Ax0pVBJPr#PF&ckdIu6GXxUBco`*gFuOA>n$8v**mmnuDb5U z9Pxc|+#TlV{8(n`L0MWYuvD9Z%W9}!Z9c?!{QC#$(L&)3YfE&A=zc38+g_4dCKE86!Hz3RW2p7pY+ zfgZyf2MR$jdO!@ZAx>8rv6Aw;M}LsTkx7~RL2Y$H?W+zs+MDcCXFu3wDEHXlnc`J- zfe{jw@4tBaj`%LDPySdfx56K|!OGZHCgTIgEiflFm)AQ=H$fT9<` zVl@vz;9RbHX(sc^m`T}*zBw+N{r0zK(XhRV)q_{8)tai`jKj#4dU+v_F)1*RC=`sg zBPbAg2uUKRhe3(`N{vU(U4L9pu0CVZ7zKF4wV*U%Aeu+%(pxB{$0aTh$zL;X)1%;J zzu442`fn`&Fqs^f%V0%%UxV!BVtK>H+jvoKyVTTmpC*O*w^I^fQYlrC?K6ai;nF9} zSA~)Yy?Z5eFs;p)So(X$gO?C5BotVzs_GE!rpw-fu6>X}5`Z7(zCUlm)!u* zY_}fi`#Z={T%#}>G-((fE=umym*U-ZQn7ve=uA*xwSa%(eMdm`79X^DdU|6LY8b7Mo5u!&XVF5p^Ku>!O9ZtFYaJ}LvmXCuOIVanA`Je^rC%xp z_}yPg1N5AAq0M@;hF2fmd$k+A(PRIs0x^3})u|IZBbE+W7pk&k#mK6q!osz_Bx!(+ zTjE5)I8SWRWs3h|EUjQyy+~P7rhs}uhG8&&yUoax)y%(0mBm)CV7+&D*XISVyMk5> z)7Z^I%MF^R0hxOMa%dmMXB)p7-sWZI88o}8Rh!CY4fk}Gh$t`~^*)cfBu!svc2|xh z6JnI_t_tldaAao0T7`kv4f*$kpRH@E)hhLQ9~AJ1bbw+>5zeCtXahJ5(cY?p|J@vd z9x1SGs>YX(kG970V6T{UMD_eLYhebCVO%jRd-Ww31Z;+1HX)^OBu5FY0Lx_eI58Ex z{nhAP0Mg^O)X-J6fM?ni+#Q7YUp!9Db@-j-!>fW3fY2AlPSZ}Et-g9>;ftH5J}U)2 zaNZV?^f(V}GhbZ(P<8Bd>~zWF@f9mol@2_pd6kDqIFN0ZzV79o$Eo z3~#|-)Z4!$C3^$hKCmPcHPH4O^Ri;g&n951U1LsRo zz-c#EOBS^bc&$R1DOpMK8omT 
zgR~5j2p2)*h^7#Etm5%9gkP6XE5wWJw7=r-H<8$oaJa*0CZLT&9*&<=Bu~-0K=5mcxgJY+P0f<`^1CzQx=7< zsnS%>!Q5wUtzm=b6`o(CHLTS6hJ`&Y!*6fzsAN;AN;r4V%9+z^6&R-&=n&;TRCVEO zKbd;+m#1&v!~!OGbrBjiUjsUUjB$4|D8HeNq8*_HNx`AwGe-jo;s;#yQ4^fSbE)uX z5Vf1|PWJ1U5&;IfvjIOkBni_KqssO%<}0(WyJ=eW?aZNhYqU1`$~3uPU%RVeAM z_vLleGM7s{_luEVY`T_E(kl|Qm+UA@CIlfC7LgfoI(hgqx!<1@=p;$SvZxbv3fSN~ zL{Fo9mNX+P6Mkf|c%ZaLD(xuci0`Fs^eb(aFda^d@(rr0E~u}^9nq{}LQceR335Ba z2_3acH5dS<4ybVRleIWy>q~Q#o9EYP+X+x10M$koa{ z8;Ni<_y%=!0dTmZ5qa}tY8^8m><#}ga}2Vqd24h_uSu*C`x_6{W*ZIKHDb5uz`&qJ z<1TZ($(r`)FV1A|+jbbc#UNE{Wg+7@vv?wM%wr6n9VQ<}!BM(W@T{8ax9(&x@A**4 zJ-miCj3pTKnB!-OdXY;UM1A)!#^tN6POv59bqJ9-fzqo#d$X<*1h!yX!d!UImz=dgEI3^ z@5e-k0hN|C5`#`b@|xCm;%9Z~Ve1C<&8r@o6uHJ`pVP>4b!>y@Jh-JyV4C(u7lJc#Ou9_t4>C6 zmCTJC@>4~OazW*o*%;%hH5>iR1!Z{dd&q_ZnLK4{OsVq)OT)zVem9gl*AsG94`WP` z*X9t5sKmh{uF|n1wgS5`7}jEZcY}pxRcc*03&*ck6SRG)TxGtP*v9^^WacF% ze>OUAayGD=SG;vw`G%-dYZ((MFyHK61-42z_WH+FxMj5>rHUo#4@77thJaE-V4LuU zU|SacNx@6^xma*t85afHjS=|tnN^B87%8x_lLXiQ+WT4Vkir?UZ*UbM`-cafpdFkl7KuR| zs)>=N$!+UZXZ<&bjlU(tA}6WJ%2@5Qh7j-9hx{PhcjT-4a}C}LzLH~n{khRoK_ZYw zW7%>y)c}Rsz$Kp`QZwcaV(8`*?KJc%JQf2H#%Y$-afi)_t|rLHT+7V(F@A^6Y>&M4 ztH&ajEukInGx&4HIi&|D_z;3TaNfFI{88VR8=IP+r>pVs%`W7I-1ntTk zR(0)*Ws;fOs=Fpi^khdEhP?61sMEDKXfMc~T$3^3q7r6 z`tl$I+=R}a(Z{zv2o{cRzrR0$@j6OHAfOjT(u-Hr0AhInBvf6JL4=OnprMFV5I=?x zps`q;=BbY%pI?BqTIk`{*@rLm;kg}-IidwB0ZV*QmhT)JVs<7k219z8Rz)mGojmH^ zf+%!){i+ISO;n@fBgnH_n2Y^k`aAfu9hf`(+Y@S*ZYZNXrBP1M$h6Z`mJZ%m88d?@ z`V;CnTf`m;r4n0abqA!2ZAS`P1{`FRiM}@W3}~J5l+Epc(afOr^+$cKbIaUB~7ZKqeghOp~lq^I`UFU*w9F25{gumVpS;M0`1|4ZrnU!bJOO@Pq2bl62 z(qqyjy1$=^Fo|^65>E7n^7>g8BSAiKDt3r)8a&JU(*=mERRnL;qS z$2${(Q`L!)-@{5lcB;wykO2#A8|`Hg9F6c6KxCTu35RaSMaG|F5=UIRpG?qG;L$La zs8*AURd6(G+hs1N<){dwW+`=)&~@>uD-#ri{H&IXMX&2tOyOIVjghRw;utqjMM!I; z;ic!)Y&IXZ?LHfEx&Dp{R0HEfsIhbpRr#GD?_aR5#8-l}vrr+(!fpulqUHG%Z{78K2`&xkCv8StjT}I#b-F&G82l0f~|VRjZ5; z3}+-^q!zebYUOjIo!(5v8)}PyJw}v@Y$YzcBQ?7|quX}-aDOYENr6Sc&yd~MIq5av z%C)+!Yrjc2n9aYNV1pl)x=1O8%aZ-{g9Ir($As26TWLmvA$F@UrC!FqM7KRR{MuA3 
z9>(PbV#;9u_|^S6YS(XpIYY>YdjryK%~b+kdMajUlmzY z%yU!;mCRUj#RP-@$e$2?U2B zcm{F?An}|f>Cz3LP&gX_P){7-iJVtU{6}CpNaOc(3je=ykfHyRgWy3q7KpI9K5UV> zIR7fzu{7ZG(m7!a`KnnkP};{*aU=>2B!bbV7fG<#?!7uA^?gY?iLOnSNn5#}zQ+5MScj22S5sQez{PK@Ba0v;>AlOhNs;;s&<~qzg00sm|JpE@u)=v=ocWoq?o1DV3>P zxljC&n1K4pUd*4qRx}W)ud4mN)W`u8J$V6etU%`?4;-a;2}=)#3(RA>AOq~fWp&G~ z7S|DR7~<3I%`F>HFxeDIV4X&)DS1rraG7TR)c^fih!8tgS3VPwv`Rgi2pU7sG?7K{ z1=3;`**Pmwx`G=(Q zS7e*%O_`ng{uCaAw^%Gk7cy3|v_*S~6Y(sX&)PcO^}4oBzuC#amqb8bGIU)H+vGv; zQoS0hC{2c^=^nT zy)m_mXdS5`csO-)B+Gp_lI!VE+z9BSm4IR2Tg5%jfd=3k$ii#b7ibgd3q<_@4zz|h z!$cXGpK>2U1URxEAL`h|!+BCkq;JFFb>c$>3>f56HRpgGdB*f8&RktPzi! zn}lUn=X9WJfVP-gN~rps5an0Yy-RGo%62GTCjE;iI5g3+j3-Pbaw!rS91{L^56)5c zd`IbwN>Lkd{sQ&5y+Z$#8Rm?L?kpkH78K(=A&~% zH{dQwKi-d!@NYsahJE!W^?cyQ(V`jU^>^@{CcH>(7Fwd zC{ZiMo>!APyN35Bz|%}dFT&EUkhcS*klXL?{2X|*o8xnX^w;SI1Sd5o@~@}u4AoH* z7_zILZy7Xy+9^*xSuuyJ>>#|A+UL*zo$7H!#A#GX^w?!@LQWb-YU=TDZfW|%FgVf! zdb%{#of1hd%+lyAEE6#j^E^147c!WN@=0S5AWMBfau1ZD4^%CR&Jdecwsh3Tf2OpJ zfF4ZVrAYA^5qkF5lrJ?FBYaKnzv}?W*;l{hOKdMbS8MegtmtSJI#7$XJ=}ZEwm*oP zP(Ir!S>duNQZ7fX8&$ASa<(1Il{5p_!atVDgTIFrwd9P>ks>g-QN%s+Uq!9^GFrCD z*qV`RDrIh=F-OqI!~!_ZVp{iU@o?PEpEEU&ENU{Qu82o&`j$y#3 zecvMGKK$rXWS=?zHs#HsjGu&Eb!x~O8VX%8`T`c&GEKLSF;d<`-(nB>w zG{2>srs#vo3{CX{A1qb#Is@WAjQ&jLfT4AJtTPwI&?TE72qObWqVdS|4% z9TUl%*G8_Z?WX6)~$2+CA}$4qc=|CcmDG{w+=3yvbtP< zNKIFKD_BO@0o_Z`wb#}|b^l9^`C??wgt5p&Z~S~(Y%PmF8gur;|ruBcXE zh}frnCfb_P`%H~twIrk=`Mr5Ef73D=*IqOcF&>uXbfm7WI?|9@Q+yJ9sj8?8h!#yK z*Z1DAXt88umQMK<*AdBr{-E7)MNBrjGK;{PY&g z#|?{4-i#NxA?S|gVIL%tW7{c1N~OI+3ZCpDf?Q;ysACk62Lk2mLFdT5}HcyciZQ*9Ek z=sUJ(0qMy$NU`D*UA9Ikfy&OIh3^9B!xAkrJ){7GQne|LY-}M98l{0WN4h_Wl@%cG z4ce>SKLTO2KhRi2^%M}$tBAmazEM4ht8jo1jR(}(RzboGJuRTMCAUwhw_k__v1k5R z_i(Y5MmmzDlU}l;{L|k}Zr$%;^GV^Upa9@_97d;b5tuoRx`p3&e*nK>%gJ@u{v7nMgIf784VrHfVB*oB>K>DzpChPyHHiPtubc5;H!6tsj^4lwiLAJfR_q zS$!9vd%r$kyb8qBy9|2zV#q26Fl)>wsmI|R+7v%KX4`kjE~ACH;!Mf*YcRV>J-Fv%mQs($OAs?}rGJT`6SnX+C^4?R)C6Q&L3L z`qdcO!t3I|RYF#)SVffE{?{|B2uFC(er0-koeu{01oRT8A*;br0(>D1e6~(m;raSt 
zNgRv2$<|Nu0-5990(-RE-zG2_zl+Z%g5ZW79>)D~^TOKNaUT1-LF&GAnlxWQ1)try zY)>7J&90GxMGnli^?TQ2Z^9!T2STXR`*`}(<+NVeCCsEHUmbcJ3)Z(@KU$_%G)tC> zOD27qV&+Z0+Ul|{dEe(KWVGMrnA|VciHp|VsFmm6ILvo@-$V?i@~O5FKg8iOt1;_p zJB-ZC%T_d@uRS?ms^Mbl{h$*@trw{uLpHI)GktO1(`wosyYyTzk_U9lc;r2edcuh~ z%{)h8R+<#ORubFjvH&}rj)$)|F=F@Im3Fb^TdRpHMFzbbr+jK7s6~+N;$@Nd$$7L) zhQ($em>y({khZ?W{m%Bz;Y2PQE!o7u^QD0L8^85_yiPw5zVRiLB}`wxe(eO#Ox%y6 ziwos1ec%uS1EwXXRFWrEU*qv04dlTG=m9rt>Zd%KMvC!x&9i6#J^44Op9W)wI zE2f_1w2CybQneg4vNUbHu>UoWFr_&4mL*nPdEq696Z(%efNH@-n5iBT^SgkA{*<2~udF%!XsU3gWxxSbz znUI6mbbii`1Wka-s%`(VcR(K!@+>M#7j~CwBV;pVHy?g#PcOiMw3qbXS^zBb{#eb$ z`(^bz&vv0h8<{_H&BHtGEZ24J&9w=8py~1ALg?*ocjuj7872cEwE@xv0|RydhX~uP zI9iqgp-?_|g`Q&jxkB8=`2G3Aj+lfLu`_eWERO5$fcgk$qQQKcBeT0jPG=?1ZAFk17gjjnxn&h_HO zLD^~$X5x~c9wVyngZxuErM)`vmPA8&JQZXNFTe7ERGs0HREQ|{6x%lh;qiH6!2V$p z?7w%Bk`USUL)NUQ%n=XX$vQrY!JtFEFz%pVP)M>W?09k2)uUbkwZRsugNI6IB|5&FxnINl=bgP&i-0sOH7MYpNhaXeZU zMSgao#lJTKxWaU< z_|~uBA5`Yi{XsXuU*u&!P#>MY0v$DRg^FoAHV8e*yF;m^jeZZl9%p0H&*Sd^!WIY0 zAKpk9tiIS&rd_r3Si+z96_B{PD2hxb@w@KCn`8h(9xI7FB8{#wYWKZ%=&=6j)xqdo z@mp{bzpL&;JDkZt5^GW>L?X)U;ZR~e}6Wie9}jk_4p5FPzUgL+eZ5|D>GFAEZmV>B-ldg~)6z_}#_I0NIT|rOzAcYjywW*3Up0&~Y z)@qAQw~}lTy_x!@IMY7$tzs7c8n>!;8uIqjCjBRk%y(_XOxxoY?<{+Yl0zC z`i|^uR{5ezioXO^yI&{~{^{bB)|f-ix=2b@o=)=da_hH5Y*buQtJBu_F4#_T^5HInZw2vfT$v3R6Xu}XZ{{{-(iD5SiIh&yIo9$zN4cAGQ2 zIeU8ANV2ih&~`#px8-CBnPl~-(ACC;T0d>rq4zRoslWCk(^-A>t9gsVcA6>c>6nks zg}YsB+6C5Jrn`mc)h9DQcIF%Nt9NE-V)-3>J!)opVxPN{u`QP?-B}c9*Z(=`w79FQ z^ecZwyDII&G&2=klj@oo>jqA=ty6hAkX5G5I7GMQnpAd{+%kRsVz!90`r4q-nS_+HKG;S*w^FUX2qi&F(}P@x3EAQlghgu zJXY;AOLa~o$=w&t7Uy}{#L_BsCAYXgr*(Zs&bZBCsun}fQ(!SOlYf|v9m}VvTYo|2 ze)@f5n&bE2O$dkb@*WWR5@~fDjiP=wMq5V2+ln*-R20)k5!JnrP2{u(je-w0L zZWk}+MII4&Y!>=hzssYQ$E>W{T*Fu~=GJ^Glw@@yMaCjg4#u1suyF!h~S2 zuXqcxL76&h%r+U&&4Ki_fp0`2!UNvI>(5@yn;-XK);!Cs>~kyL;Rxbg$Q``TM4l1R znvP@&iYdaq1m5U;l+9nN*=>KEVp3Nf!&5oyewhOc&H0%M;$!59h0Yfeoub4LDiLDs zAWw%H^PyNE1=;ed!%N>^Y}JFTP>qw!uyL}z#LIR#_|-`C>)B&OM}(gtGGEepo%tev 
z>{eDHi%epW@@-i~kqMT%etoZ1CQR;g7Gpv-7>4`i*)8|-U}*SIz_{GlA>irYtrXOh!h-_XA+2| z*NiHh;sLdoxR3jp#M|#RGHvc-Iqi?Dkt|-nzTmW-ud@rgg&xiXQwT+qKVz4AM+Z{ zakTi;dksYhTfVoOXJZL2y@o`ohMyRDAaA!~R#mq!-64aFW}Uem={DC*QQ!OE{tMw< zy64Cj>Xeu61zp5~`h^Os-iVDsxo@5P8aG*NpgeMXVv~{Y_$BtFt2ZZ>&Ir!x-YLz|H%n6&cD!2i z!{NdlokWp$f&YrOxU~GSn%TKGuf`*CH7o2dCj+;<)sE*zM5B_Xpwjj1%AF^}OBQe} z)Nj0`_)Q^OP3?5EVaK3$pjx6<|Ngd(p)GuGeq!}mn*b!cRiyJ<0SIO6E z%RG(vo>S|s#kM~f?Kh9tuLUDO%b6e)0oN(PI;b}Z)?P)0I(+nzK zna$*EdWDi03r%$aXhd8^9!soda}!bl@V4`dKdub2IGYay!NlJc;H}fxf?Cdq!=qL> zm^&e!+mj`6k#EF^OhI~fUUIo0F#_1Pjt_A5N9&*mYUblDoF7^Tg21<*mkPh_0M5s$ zD*>2)zb$*qeT-19JRA$t+L~`jWm==SA2%~P2}Pb#^`@hXxB&iQWqBQF4mkD0(V*g? zv{d1cGzCt)ZX{bVPP|Gyi7vvfkx>G^WE-31jsPMg$-?dwF5MBr$372t7b|>`JV zSCElQy-wGV$D*fvm9L-OZzxz-;aT?JLJ`is-Bacc-zu6I`ddik^YX0wvovHnzR**T zRCGqW-A{HkCLswcd;}VGTs!3U5NiDqPwVztKpuv<@ct4W?=SsQHi5W4Xw>7J=W)QD z7+P73EGALEvf>yCZ*WOyjQ-F-ab0Q` zX0cu*$+UiYu{6D0p6>bCYyYDZyPhadelitbuHHOBS3u1b6OY_e%_2BeQy;I3Xb+}o zd5MzLjezmV|Bt;t4W}~v{)b^xh7cu$45^g4%o(Bx86rxNslqm89x~G)88XwZEtzM^ zM&==vd7FudGS4#4?sbvA-{1eZ@B4Z2ym_AE_`T6V*L9uKI@h_*wLa^!M5ea2P<5|4 z101||X-LEb^9SdBvX>Vrmt-fB0>*^J?k&}rEb}_EGxrd)6&iMOtf$sYiQjHNjEmx1 z({`Hqo_>XmA}xs?_pY_o%a8BMt$X8<)Mx>##FKrW?sNxWHy)WoIIPz`Kf_ercUfc6*Ts5+bHlQlc(>GI*6MStGPo`MPy$vg>;W1{hHL*M6!CrAbCsdkpiNSKLF z6QCB4Z#g^if2OCMovq4d_uqEBbQI%4h3lIktE86EFskZ-eC6t-+mHdSst|#X!^b=I zhSb-ONv?XZip4kenA`;3E4;&TzOP>rU%luhH=L*xS&z?29rNV4>UY3g$F~y)0Vf@v zt2eLTm9<=y`Qb=fEb$Y98>hdvG60;5CZr)zpxEMwVbIza;ir6s;0-z6A!?Ek<@MW1mR*nVlECt??jZ@2vY1j)H1uadC zk$zLhIp0>f^GX=JfUU4?ws44R5WMj`GW@=9G?lBXbCxNwPA_?lKerBu&AO@YtM-~t zd9+!}(J@Sn<(tazcYZIc=qu@$@bJD5UJN&$Ngpus^Ou*6V2qE^Wso_}UJ$bIQqd3f zHm~L8(Z`A->M%yNf|Nq6&GxWySMT5S~> zp5K=`s}$XS^Q5%uN$!N?#;%;x#aTZeCJFh;i6;w+y}e-mI`JkaL2}v??epxvJh$!# zZb^Fh6YD8X$eAOz9V`my*G9z!L zmJ45rF$ot|G`QWATO6+2HRwMV{G?_cA6##6<6n`9&@!rGD98n1#xHY$U&0+%t`afH zL~_dSLR#AtDc=c}_$+2x2#n3)p^(J$%SUV~Ukl=W#7!K*;9H_QG~_;(E#f4^m57h> zd7n^|{PT?ECx8yh1kB>SEmCt#cjE^ 
zI;3tgxfsX6H2LY5tEfp3hA={?hTz6;XKpe38!x_{n5|@Bp1;Xdm`xUEL(o#&@(+@!ew?p?m>z)nHMW_;lbsu>hsIBafEGuk~qYl08hc%hllGT2_XYe@grC*+vlytb1DAw@oi5YWf?dDc85AaQ06rN!>5-=tkV-nVnd{|*;B ztwp4*5ZMQdAS|WDovCl!ir9DJJDa3)LBZYG8SUfuWpDMGn40<#V$iS3QxCno<>HM! zyEs}{O3$pmYU;c?SSu3x@@1LeLvVvp;oNTy@x=Gu!y{B|C7`gUpBZxJWm7toW~Hbm z=<9VaYcwr`gN@DnYFx=V3NwQfG&gVi;jYXGfz)+~5=!Yh_U6ggvo%o*mp+KUgeP+Z zp~xgW-0G}2A)6QGA|=p!in;AD6Zr3Gj!fUJLJJmtBseWc#JC-z5kPwGdqJ|N&l~w| zRqJF8Hg-CW9>Lsq*KMEgj>MOLIM+Ij=S!+e^U}n;Si4zvGKrS!4F_#kgcge0SDuO^ zz6pRDIcS%V_@Fno-!7m4T_D-To2QT3Q>%dJd$;4i=BcaYeH4H=o3@8A_(o;zMM3Tt zA3u9Vw|V~y^`Z15b*KS~aO+L4>|z7Hu#1{w;*tAG5Z(#F_Cz@6*84ik>s7w|HCPM=e}LK`PyU;h{_ zZSjV$V?cHt666Vbvtqq;=(ls?{HLnN4yBt`q}jqiPkp9LODU^Zwq*xu}%X%|atI;8D?`(|`9a7NF7N_+} zoqe==tDbiD;#Z}}(BKNj%u`+sT{FvNE?)+7mOY3Er^EawZ5;WPa*F9n-jcMIe>4U! zKM4*`Sw)9#rTuVbrYc@$euT|Q4H0||>im)PR7vq`1p4R~o^o|;3tu8nHIb)Tr*ay5 z3>L`0*Vf$ZgOqyzBVQYi>KXACzdEks{%4c|mnlPNe_xg37`f-&(?KmGNA=_$5@0Sl zcRD1!q<#Ik?Z{N_XX4|FpG27a_{3zz`NWNM0P$wP9@BAxeftCdg>d4)SiK~OEV#7a z6k@3v6t_qD$rGvsa)}YvRyi62NK{hpO)4!rSn6#KS!u761-aej4qwE};Y_A_Z2a7= z%Mq_Uj^p*RxV=@n_Yl}ik6F9PSMVl2a}cyOKF!CwarX9z;vka@9R(Yma95`9)Mv&S z<1Pkf8{&D(e(5nkm973?bG02pMCzZC+jO>N6B&1Kpfg26BTf2=P*%H3>%_E%U$o!r z6N0e*A{6wmCnP>4d=YpB7S=20Zs{14QR39?NrW2>&wmSEZHZ}Ov^7~d;XPGief*ZD zV=i83rb!zoez^(}(@%v&dX2F1SdnhqwedJY2T{cnGEC-M?-ixr5O!F1+;SYYOx(Wa zKm6eX9# zWdLE}Xl2jW0Q$3l*nTx5XO6jkE|(4`I!Hl%_^9HA<3AP5Ib=vR@~D@M@eU1=u^hv{ zY=|4AG;Hcv<$@r-uk+fc_SZfS*ZLV|uK~P}>{Wf_E*6))+#x2r6h9$HKIlJ1GkDl{ z!|>iGt4?Xc##2dx=>U>@yg1tA%7Z7WThVf&`vIhtuohd{_TM}Gt0e;e+FH=^c<T zo{+B{Pij6rh$u6#AWWFb5L8-O8wAt0pq&K zPyKSd@2GE2oqBt?+j*AAWxMxwPrA2$Cu9ByYox|YEBo`c6-F~6FXm=XUX#jyHYrPZ zc-_q+Ib*cBp}Q_!QoAf!JB7?+Hg z61UUkn94G?G6Ht*O|9xOqNuIiv()*eB1Ll6@_g1OZF_G*kkWi})t zNgQ^4I7d-><(F$XBk=`ahs$z+oVRCYW zuoSfnff>qF+&fj+B2g8t``2niADA`D{bb|9F<^cTXm?}~P&~sYj`O8al3{bLaUgxh zpL&SI0CU@4?i>pwwo)S&BCtY7WcpBj>&%6LMYjVBZ-bd)8@ z91vQnoN6KPDlujE@w@iWw99*nHs@_Jepkgu#BH8)2UwT4j4D4qQQZqUwH6YCY=z^a 
z9+gOrx0ujZ6F%6C3Uaum`!USeXySA1S+yO7k(i&c>tC)B;zoB0ZiJonQcw8u8Xr~d z`M5C7M-B7m&wC=&WxChZKQw$~2v7ybTc~aTs^p|twUdjcS_AOmymL5GVoX6^!r(J8*3+4STUa_&p))SM=Uu51GW;O&*2b~aOzX+& z6EMj2H+OqWXdcJZ=LZi(H`#rr5sPxObhF-MSi#P^WeqG$Xif|3uCD6DH2>Hwn{zvg zFW#G^4NL-ux-=>s{eby(n-jRB9)=a!XbyrpwMKRIH zAmv~Zf# zVjGR?Yi+^_+CPg*PkPrG?MjuqG)wzZ@9}r)9Y>Y2Rfs)lu1wdD58?H8w~UVb!tQ>4 zD%(G;kD~J>Nhy}QQ+_c+J@(N+-e8j`K6@m+i*P>Qt$n6|0#swrt(&AcxBJM zTG2u!PaHL~NnY)Ao&qqGO-}d4DV_^@PcpICt3IJSDyXXy*w0S|V-6)52vd5I2)lP` zhy8LJH1kWxGC5v1T8o>@DG!J{+x)A6pc>`++<>NB8og5l8RvN&t6BpyEz`9HrdE+k|bIG16rjbv`15ZXZqf!4pjlCmqEpD)^*($a0StSep7kWzLYXHAOJ2$**xx4IhHHDOXRDzdiX9AbfDwb!`U zt-I9S^QV?>f9n{V%H)pmNl<2yWhEe`D*OzDxT@kbiqW1ECZoT0Ls=Wva86lP@w)@N zJKgKQXKy->zI}DpU2_Ag7kgVe_zPc#xuS=G9$hMaB=Ms-?Q0BD_hdCG)f`G*cFaZX zsBC88XM3LiZF$97EAXPE82QX%%G{G3iMT8JYddbgn!9Z@nU@Bbw*7p%Vi|(-zP-oK z;B)D6o0MSP_?*6@|5RGIO?LH6hdS5s#-y>h%_XzSwEM;7&BaT1q-SZTjYmmp{ozboyxE-fiBSlXG+WlKhO7AHd z8xw_CTx!^0W81fzPAUi(=a2121hO(J3INbcV(n;cjz&L$-JrOuL;`edVhvkNS8bwP zG6HcO=w};$a?%3y(yDuQCc_2dK4L>yG z8Jv1y2fqAAG-#EX3@67y70b7)UC42gBpxH~*GHfC~P*bJYEpsAnGvZ>npoiM_37PR~vf^f)G>xPO)l}o18@7?z z!IycOX00M@pa;*w7c0+0Gx0KlzkuVNrdWpZB}{^Ho>;Ts6%fm3d`7y2Rp{D<9IvqY zr1V;goN_DA>QricB5IjIA)ifg$B5WZIZ--yZYdX1RB3;%b1^&)-Msl8Z9`UtCewTG zD@RU^Ije&iL$kpQU8N_*;kurCl&m&%&`U?GFG@#4zm%!nj`Y`mZn;Qmf&d4;BT;}^qr;E7o^lL> z?w~Zk9s!WL_(Ydi*I|-59G?#IVEgZ`_nWV~t~@aFnk#EtNRC2CH32|IVL;Nz!!~xx z8;`*DxR}dABJk(jj^a1qL*kb&t#3qRyfxsvz3c9IGWJ-17D#~oQ2G}*fzEbGSRus7 zOlL)N6+r^xO>Ua^!GM0tqm4x$eADjD+onH=?SM_>@i+}X9-TCf^_)z)ZC-}Sjmow!VsdmouzWVKmCzWUzP*gdsvxD{7^~0l~TkDJW#0l!< zk#P_oGBcz^2#bNA9;$KwWen#Bw^gf4q6RlzKJF4droI5{)i}PAXwbK$ zmQov7G^Y$yF1-KGFF)Lzvzs!oj_&II^qd4g0kl-0mDcTUaGLx(RzkCmM$kRA@F@@owA-xqtbbIocXqdqrrvaZhvFInkM6&`bg003fG;T^Dw6&kBMA+G7W}t%Yw}WnZpAwN#M+mEMBQ~ejY6CI z4GOaBdz0)5pI>x)!2l*IvS1VB)BPoPM67#xa->DIxuPAJzHxnm0C7S*zf_sYR2hZnRRo48dIHc3%M_~-A+zh1qV?ZDKUh+fxU6C=C;_!<9>jM*FJf%h<5)w z#N?1=eaC6LT$Afgp^U48`xB`qidotuV;5i2>?t0jwdBmluA`G&LkQs3zJ%2aZJ-ImdN?c-i2;fKs1Y)XRN 
z!+850m-&rYWT>X_|1|Ib@WyUZT(O8f7+p9+br(uQ17W5`#V5Z~S$*o^`_tg_A+Eze!`?BYo-2Jq5! z04V={mvR&j!nSc8h+YKUM*%m_2Ml(hkSEZfD%yVrft3M6m^?6_yMA*7&YcL+pO0nV zh&UA8-)$y8ViCY-T!7GaitQ|Mwk}#~Q%(YqX|83DKpmt{CeJop zKbQ;e85nzkbI8WtIYIqFk&XK)7E%oA06;o|lbBP{Oa#$GIGxC1x`Cv7wG=3RO76Fi zu(}W8#&ybE^MP(v^i(?IYcs>ujj5cJ%~D8n)yXbhQ0ggl!A3xi#T!v;zF7B#mu}^_ ztL-iP4hY{o!tS}bIv35M`lc4B2n29vWYqZjq8(t*MF*#V zqeybTk`P_Ye{OG&AWK_-$jQ$ulYixe-6!)14tCR71J8A1Uv@@}8EGCc<+S6b1}Z8j z7=;w+cxE9b&6Tb8y+Qm7w#hzs#@7G>JL1b^W#h&4$R=4t++rr{ z)6Xy<&o%f4TsvbJ`&VwwM^DQaVi!an^`5x|G)aC(ss3&We4S!|Z(N1(pI`k&M{1=SA1 zk$avpJ20(`;OyGf5%L-{z7m9y>sG3Il^$Q3kM?;aTRG#Tn2iiV@!Z8EE;Qe!Z2~xs z2$0Tel0zy}uH$hpjNqL%@M3<#A=D3jUj4<6c7Snic){v9&c`saFZ_tKgTfhIJtBB{ zUt%ea)lLqopp?sYcQ#RhcV*A$jixH-2;uY{Z}np>Wn}TFSdH$Qmu)One??b{Wb>2w4A=6b%V5N=x>A4#3Gs5a0UzZhh`p zkCh$GT-+|a}Y~J3_3AC9p6PX#1kF-g_K>6hAsSuplCS*JfSCO!) zT;d19fQ2#X-JY0!$X_bJ)=#SNZJQ_g+A`KZKeCPIgZthlzO?g^7_}j+O8Eh{6muM` zW!A)G|vyZv%QTq_X@0eMJd1Khpn)cc=mt3l+_4qX^NHB}`g@eG>( z_o;laE+%s(Q2d87%7s_I^gHG2s|p|%BT3QvVSwH*#+GZ4po(oPa9Zsx%E8zsvG(BO z(|tvnTA6o4O>9Km*KAg~qY-w0a{f|v*Oh5bP>=y!V(la7-L)F&zOiwjzhvP-Qn|+U zAN_hk{A8|l@zv%vwZ94(On^DW1em)pBnwjrzp~+15TT0mG*Q_THTXiAhnd9*RiZs7 zGjlrd!#!2r11LrVD4bMCP|iPHQc5>xOU(E>JLYE~OTZS@HnC%b(3ug6^jq`3VL%n5 zdJ)|6k!#Vu`Yt1pv&XssE!F@inplBvQ`sfB?f6kZ|5CVO&ma1G$Whla!YP(%JCglZ zuZH-_Xpx{c@~_A6Ms~$1Q!kT8nt7aLYcnw;O?LB`ctNz3fXApm!6O8!#~=}A23vCH zjn&QoXlv2$ZTz<;eK+V0+-;R?lJm#o41RX>pwg(yqKR!b&fd@qOH)|WP~9%`H3DvQ zJ3?$rW8OThu`j5N8^G+wgYfg&r*7E#z+ctcP?J0&dJF4_!;o@&>HO{V2uP&6;;<|H z|2&mGnH0FI_!_R&lueknMF-9ipptwT)~aosrIQ~P$$clePeC>p;e+X~^@0Spcx6H1 zjPx(a=0li_ji>P8MSU?`CuVriCI#W1x^ z#69HEE&L$CtrG1@$FJYlPpDd5l6NF)s^Eu}#L(ErNOQ?-u`-mJI1gN=MG_p_3nt@F zi7w*XUHo_6`oMX6NHtz=Uwdt_Bj>GT5NNB2Uo#H;`Jnq6fI_7Yt-TRh2kNru{_V+! zP@rVC?c1@lN{KGbPf4i!afZrMI%P7em)d(wfSWqd)vp*`G5a+6?QV@-AglDxeofU? 
zvv#!U$$#hO02mcpKkt8gvA=C=2pCVohV05>ceJ6~5e#lnaVhmRET#}lkVS!F`6hQ# z{Glo;c*_z{9_{ ztJC-~=*0WMd^8;`D$$2I&Yryv+#rcwyF0TcOORQj)Fa)HAQ0p=4S8EU;th{LA0O>b z$NqL8*7G4a#9l74MbKaU2ohapkKNSsulMRE0Bj&&m?evWRK6=z^MoX>6(2x zs`^en##U!R5oIwEO_b{tU%agFjR5xisxcf`v8Y=c?vzrH&Nrh?DS6^xCn&;gO*{Up z3(Nlc;M0&ciMQV$YkJ$w>|siE(d6x+X=B5}f756TgNzamwEJs*+kVxkh$-R3aTO~nHuL=45I8>oc zgNHQ0sFx$V$Z*3SmDit>CE>!8OTQ9lf(|rfycI7|AgEdK8dgTo(O5$ zXArsm<%|IpaAmTf(cj*D(1QW59MO~aK_(f@bIa?beEJF3*BD^7}Gp6utpiS zGYI@0_jlwfpWS@w1_Q5kml3?U7;>9TJ-$THMQi*u*y;V`Qf2G8* z1lpNlPW0THOm6sQrsX}$YfvL~w5Id#Gv3}LF!BFR#nr&uU@L8kwd&mLtmMbbbagu-1;pUDlAJadn-6ciWTRgi_cjP%opTk zwBE=?HO%gqcK_0A^YC;==RMHS?fo?W>Ur*V@J!oAbyrG%j{HZ)-1zFlW|tF++;eBc z3njUe-$K0XOU>7RTLJNA>ZdNM?CGpZa_FbXpN*GoY;lvTR{M?u*pLUYr+wcGz5;R^mg--rT^NXNpdHB$ zdAW-|&phU!A}W?~J3eZus5CgjBPnxhrN(+;+<+xAG)TTUPUk#Pz_919Ds9tm-Hn}s zBVAP1suLwRL$|V?DS?HOP=WhBL@v2I;dTRGSFgwEJ~x_qm)|yH&~MZlbxmNh_ohaU zgOq4(hIVPPI~1(Y&bNrn%j)VX>sFR7J=5&9S!!zIbU&2bckf8-J`GD zj_v2)44w_1D^aqcGf8L*TZb4(@n{u;mw}wJWbAnSdrM5I2rB>#on9tdJoc-Wrg!5-LIoW ziMY4z%$!AZHF6RodRYpG@fm&?W#010N_Q(KHiP=r<3ArR1cfJQCz;sIY<9mpiqTu1 zy}i&mn$!1Qg%Ep{|9IHQDql3F$6l%I`f}gYIiVOfRW1=**`h)gB9!gOu*tzDlwn8Q zV{wgw`v#Qa*$~<)?e!-kBi*=);5k+|Rq~^6P6|!6WEv&58rhJ=ADgY%Fm6_KDWp<+)QWd_@cW-2Jfz z`YU$pqd4onq%dAS(^K>qH=mBj3BH@yA*tY!)yY+3X|g7hcC>Adh{x?C%+hCqQbe4Z zr*7v&mnh0`E!AKl0Co3N*zlLmnAz$|yu^}&#{gJ>@ zO~KGIEf@~%9d~P2*tnlyTf$^1qa3jAQsNzxgSc;PXJ3K|zFdRv*}s_~kzu)AzHGNT zU843QQ~J&3bNUEmbBo>c7%+F<+Gr(`a_C9nvz@_yP;)mBeu7d!)T9Cdmul)T?n#5C zP4fM2?;PUVy2S8>?@7URr3PblgVw|-E-|+$-%p7*pUcmxi1aHz8oW(tJQrG~vAsO` z$HSHk_j{_Y|6nnEh``i1i-%U&cff>nR%wruN2Ntyqieja?m4$m(l~IzU`ZBFJUqQA zQ)IzDC3a+#%F1c=N_*0qv|yIl)sUk>@|cf}3znIT?Ta-(O$M~jw&01MD0+pU~*9A*>22RrH7|c!=Jxo7UO#*Gp*7#WMaN-gV6jONJ*tQoc&We@>P3M;)$57vh=4<5+YGN%&PbAMSh2aN)pSF+_r= zP5lW@`$~a_=7gBd`+K&)1jUMbVmSOL@?cxAz^yg0EZD-Twjs^1=VQJjzr)0BscTR$*fQ%=!*46q9>3 zsq_c8SkeNxxqR4c9^b#YC?XY5t`@nR_}73;AT)51!0yuDzZmI*;kl^w-@cSG80@Ol 
zdxF1RI1OERttF_w|0_6t_aPIfpCTiMUyneKBsSzC_j`nB=Q?PjlCAs4{bdvHAHvQNss)Oa-6*ix#>=r$*mu z3HPIFa2c7h`hF-h3D-rMezr5F>kE5%8hyS@+_TZ_1 zL=Fb|8Lo zs4X42`4{GJ`PgHmPz`Leczi~Zek< z1ywWv!2^i@TpA}8Wb0q^}KEmxe&Qfh9?c!ptiQ>yrP$QTL228%Sm_F;h!`>3t5BqqU)Gl*RKLu1-87HlUsy$^QVm}B zoaA`8A7`zU@qzHOGDOn*-}7vp*GTl6?#*wg^yeh|kF@w0!Q}nibCkj#YwdhIw>Rnv zZHM~!%_pHiJsGeG0*}UUxb}`E%fEX+pak-urPg{|<^xrvGYE`=vzb^V9NB&02T!{L zW)H%+_s$>yr_e9fB=;S5K5S$ZOd^c{dycl69O(JMqu7ti{I@j^u)Tk`gT{OHxT{wE z04BiZdiwu!KT?xMAQ{e8Pj92e**Vka0Y4$AB4*vD!I3v zw2TpVL?}9{ybUAXk@M_~U7lF!7Jk=#1^*{mAlfai!y-&lHuyigs*(w&$-2)4T+Ao9 z)S5=q+8!`41GXSzwyX4aEyllnDquEDt6{< znCl`XnDh?@3hqYm^SbyoCmfZjl*FgRL0RysMu9~gnnM5C>>{Nwt%%U8IR00LWYggu zVKebr3o8rhRg7W@{}ciRxG@Dka7KV;{7GiSA=chw3LN!egd|65VF8VxB)3MnyGtuS zF%+MFn+pY6T8(3XQz!JEby+8KW9rQWqn=#jwh#AZ8KgZ+K{AwqgOLTPFS!IMV&&T_ z_tI`A_y)2_>6mwC&oJYuiCi5V4?SZRGUk>xv{gU4^_w)R5=U@v*#8xKzr5g&3wL8s zd4ES|7!M2nQZJ4Jz)LDJPZ6o8KNst6Cr=5kaXJ zh!d}>W$5ImkC!<#3sLq!sP>n>plJ)+1mGv82PzB>A3rZ+K)_SEV^a0#^5)w^pH>4; z{@r$s$iiXyZ#@SYATzZoK(yLej598Ekl$-P0IEw>Q{uR^GS34(*{&Ha;hYUMuR69p zAbzQl-G279cmP5mwJ@F#T%UM79AOupZ4L%jX~0-SAiKBHhp=DzEfj8cnC=b%G zM4FAa*Mn(my1t)+2LYK2z|7g$SgsCWtjA|U(0k4dsYZ?)m~pN3oy|FA2sj%9@60R@ zwxtycnj&>dhoKBvTbY|P5HGz#01GLHP*CeS5)=^-hICldur}mZ-DxHAbDkA9(7z!>y4;asHzWw?I zwf%M4+QfsL;o#U`Q-R*1D2>Gchxt}Us0z(jl6mh__GXgG_IVj7Yu~SJDn6uAQJmJm z?-&f8hVQO45g9!LJm+IB+hhKZfg^S9`hKc|PwO$;dxK9MNs-FYe5$JxY0>7BFE{YL zL73|5Be^xRk-F!b`KtZ&su^<&OeYrAV%ks&SV@G6LxC zRr=C2pY5hn+p)BRZ7w4Zin5dyewD!x!4^Md(VZO$g{gs|t1}Upcx;OWKau$(%ahb@ z7976a^rYWQelOSol4$}miteFL-@EjZzDJ(=FmYej#0zV<8rymh+dxpGCM(a=_s$H0 z^B{5fmK(dijUah5QvbqdSg;BjXQ?{(HUN>L_jVIO$Mh{(V=wIB%Dd2mZ}lMGT75CO z|1EapTV@X8)kgob#UglWbj7!R|EV*`Q|3;L$hZExU;lZk@|X{twaVGLMZ*|-+{eTR z^+GqH11i%>56mBUVZVqi9zU=!*$`-eX-6S3J`n-84H!4;0Fz3BKr{dn%s^iSL~!4^ z0er6PfcS7T)I0Hb8YZ5uBreO@&`a>1ghKz{iKm8i2rQce`XalxOdk;dO*nwEH56XD zH(NZ#_XJy5N4zNyZ%*%EEB?#qmZ`J^MUUN-bp!+HH7K$Blz*;YnzkMdmzTdpu_799 
z>{Y}5*$jcj|2T2H^$1u!*WxG`1mvQ4wAmr*9|bT()cwIAwVouq%5}V9urC@{?lvM!)1LW+8CfScCgW)II>JJNqGEs5=kNasg3uhk@~0%+adw0`@9d+ zD+KOfRWw4UU0{ip++J#5XpyC=Lo7M45Tc>NVT06a-)S(0qMCkxTLTP`JYV4;f#?ahwO=p#4OQll6VU%M>RTYgELO;f>EeTDZovRhD?EkqwK!RI z)DQ$~NqkumwqRiyYWUz__z&DqF~r+yBlfhy5$$sKBBO?=CXdOXU}?T;UrPH`!FFk7 zi(o;@UWA8ub&J(&>n0pRyii**5?GI0V|<*x(%z{BSV>CCBz!4> z&Omd~mTRo6{lU;jqM9db(DL&$vUp|DRs0q6MU7-sGpn2uCx5r2r2nr~w|wfS~EXBEwSbn~sTmjB|xHo>2R+fylatPXb6`^wY7lD~;{8J`JT z_1*Pc`$ghR=tbGS)bJWOj3eP3@7}&E0|#3FQPT@y`ZgNyXTN|2C!Fk zfXYfzi@n?iwHN_N2Rqbf?uw)m7eHsG0b;ro)2990?2su;H1 z80f943E|V%@O)HA0UE3(?l3;+`A5J3Ibi$O3|4vDhXBPH;XL5cy`7%yiLXg`ber0n zZ>;0A(2ym3f1#3)3Zdifvw%uLv36?PTq&~w$*iO;!BnNoUH5o88|Az##ji!LO_n4I zGElKdzBcjxwqqjVD72HGTcN$BT6Fs0NTjv~1@$<a2Z zWu@^#M6LB#a~e->9K|b(^+XGnxtWY!#ypJn_DphIp6s-0Wh|Xs(4|zB_2J%p+tb`R$0fIS$;@H!(lbKtQS<2W!3h0>GRjwk$@g#1hm-*ha+GI=^|ZwLe2`MfNC!vs&ZwF*q=YMfvfQz)BzpaUuFQCI9nc zuR;E-JRL25fjp)Ldgg(G#{OHA#6n{pQV9?CwiG!LP}!aemRh-6Pl4??3;OVoYGeE{ zP(A^MMZ5jMKEX+WoE6lJ|sdGy9C*b)gP;#$8&fVHjCFf%CLiU zMe$D+KeZYU*~`xUgUYalASfJd`Gz1c+^q=!(Gv;Gw#gEi_pFMpncy&-6ESz7sB;nE zL9e0MN5Go1mJA&8B484+v>A9h3)JKm$AY(;igz>V-d_s$IyR%eZ#&XK(bgS@$T7B zhG73ZMm~Y`xW&Mn_!6(R+i(4|!pi3&LtRx5fAtic{m)e|1j!K9u5W09(W()|$z%e3 zfBdPtlL;nGZ`wePA_k!tu9HTBJmqR0oF5Hf=CevIM$J5qb+vK4FxT)o;A@(A2KixK^;m;Of)yY1$!=8SOlep%z!=5{MHq<^h;8wXt z;!YX+?y7(7CRNRVSI3pmW&>(;=UAhgGlolJCv3R4_Z4l?hR<*hs(&#?eDxbXyXcbI zjtA8_O@AYk-f(VAi&bdt@T=CKj8*5lsvf@#IiX57BTu`>$yWwG+p_C;p#$XJIqmq5 zfTF&;R7iJlH&!f#Q}lO}3p)YnT`uawIK&Omt>>&)oIo103Ye<~NWp$>_j9gdcb3=+ zTmg~5DBP!Us2&$61BZ8?w-8`YXQw}xdu+a=d;j&J7o>C_j-+F_HFN9Q9dQ@S5;FE( zMos~&f}Kslqyc-*fYT)AsN1z8v&SaJ8WZcs3sGT;%N71`FiM6`>5ctl{I+akqPu-T z&uAxgIKwfyE6h1ewnX|9U90vZR;boJb@c^K#LX!~Q?q$^>S`5w$32W@nAcTsT_r|X zt3Yc(I+j_(i^U@zou`eMQdSG zP=&rOJaPr~4UH8qlot%`N{AU8^9U84RrC__a_dKxx74@!<(Z3ZF z%w2iuzHDxHA*DtU<@;h5T^Le`QPBl7 zmp)XZgC=+e+((FyQhwf$*wwMzx-wiU9V;a^HPe@(`v|zY!guwj)z}A0e-Y{3NbNL! 
zCFnM7YWMg~Qk;E-_>J?<@1Wq?M6mW#>}TUJFa7U5u9iBVhHam-^0UxrOJ2G(9k61v zDr#@?)sgAzj!7${-GQrwEkqGCznu9Z4-xeT9k$~bRgBop?BaUu=EzAg0PdcFTC1GO z!kW!u_rL%Vviz?8&VAuJgXCkI&%DGGC)hxZtE|~S-KYeVZe44_4Obc`q?kjx!CeL1 zk1gMbfyk5b@@|Q=n2EA?Js3$s<>-^_T=FSK53vTsBaV`XE=X}hJc|-jC;wAWW5*NdV5;NQt(w9IZl4})WXaU z9LGA2wu|Hx~0@;^lCxkef#0^YG=#%&ZixcbA8z zpP$CHwH&S6e3L|vF`L&)3BnS!^#rH}tJ{ba$JNr{NGKg-cja-Pj<1Nt4b-c}1g*tv z8t2{8L}lbVM;T8Z>zGS9JFw`2Vo39}WCI%_5c|e8603%05htG;d(NICu6(^+LP(5T zuYdinkD$aXSIv#77%9c!@ZFiB z;LvCTCn$IrH*0e~yLh*e0mb-=^6u#{sT9SVV+Q1U z8gClb?^gCa5!7={^LgtzD6`5>-Xzmr)&^S2*J$_MFZbI^UZu8|ui)WP-@Pt(`*dtt}>3J)J>?0E!# zsqFC%fR|ciUmgYvW)Q17Cgu9&`84z-&%&;MYO#q86i&F3)%HE8TD&8OJ6G3ZCdDvQssUf+jsmVDcl zRWJw4yMqSdqC4F^4jV@aw=Y~-(`ahKy&xWyoZ-3XD6*C_;hr$0I_9E)$??=5X|!_d zCi|l?=S;_fnb?-y%zflckk>+T!htNq^S-9E9d#2Jxc?o48bqq)eABW&N_K@YNL6$h09@_p5O7(Zy~*fwF=ew*dI}) zy0-Dyyo+J7vV|nlm-DrEW`o!tz6~%v;n@F>{gas6b7P5f>kW}#o9Wj(w7;bpw7J*z zr0%k#M527PS$DRf@FvslRG7?9hdilKd~*QG{~84MG>WHkjm+YLuN+Qm&JR8I!O(g=#8t8|`x_h8c>a$h2L@9^=wRlAQG& znd&*uKF|IGwjVqm&oke7-|xHL^{(~#tk1jF8031%Api58Bs?Ihy`;eUn@fFoq zvRxtE(<=M2-F(Zg6N4c)L0*N4YRf)YNc9bzA|*z8UeGa+qO2rlpNy3AK0O$6Lvd=) z@q2Wvez14KBH;C1S%UUT%-d(e(5XWWa4K~S?G^=l;3$aJ`tCndNe2;Ef$?S^0L}8F z=J5U}p_C+FXgtPlNJa5y&Rw1Eb=*w>kGnckY?y0MTWSnOXn$NLGIFy0gVrKNcVQ1& zVo&5;T_(F3HBon+wsY^n6z zPOyq{N<@~0BqO4ekxZ+t z7T*BLY3;Y7;xRRb+;L&4O#AVRSVi znxf`HXZMPcV+RyCZ_7p%h#6aaeQ@RPmnZ;|Bdg({n&wp)$AieCzoe3fpRR0#&Pd+8 zhW$-H0S0`*c8>rnD$o0(7k_9RPzREeQUl|Khh)w8iXKnp)JtcFom7Mp%@0e*!ToANBzJMtmxiKR+AS2 zr)~Xz5>K`%fOhkf87tSvn~8}~+5G zEn5pnpv@tI@~BnnC15Xnl91?wNJ@F~09B}mytRhPl80%6LP#l)<5-3}&}vG6CMyLX?8GHa z!Ts|LKqYU*jQ@s{C|r*PD#afzi=XI@?rA{U94Cf^(dVXuo_2yHbu-d^VpK1h=dB4E zDE;8Jy-_1?)T>WUra`9D0IQ}A+$E9lxp$pU%UBZzF|iy~RF%{$3Hw^#MnKa!fd!1) zz*fo=H-Cx+?#F2dw`0skRCni@&*N6NUGHQm!&R{$Zj|P{zMo$?nysQ3*o|kP9>k0v zLUg#m!~m9R1As^OanCXwYJP@AI06u<=F?7h-_z>Ar>0-lsd(OPS`7UXADo_zLhlag zsj1$VgK4dHoA#ImiozdA^0U0u=7a!(9;tFo+=$Wg&{QqMOOZ;=9WIAE`lJQt1R6Of z%0IY3NVOZziAi3!QGc({r*?+;a=GWfo?qVGf1|?NNk)pLKlFmaN!}{;7$IU2y$@i3 
zEGcH8J%yL(NA111i5wn4<-B0FI5oGheBch{C!;m9PRwdRp1b0LlZs~V%uSXOP6SZ< znYnLQpc<20z>978=|jg1=~78rjdSDP>VKZv-|n01um5#$D>fo%_6gGc4e_-KA{mguH&p?rkmG;mFW?HS*J&F5;(?ggYL8HxPVSjRR3Sc9^O;f))?0|FSTe*jj z$P%dTP-qde0jhpfO!pI}HbKT@2=|6C(PtfvJqoSP``{nfNMSXdD2#rB_NjX8Fk8DA zRD$F5GZ-^saIgeAP-mmOHf5mb|T*J zdp!1+ckzkAS`^9Ra$%GR$+RMMivRsVIiK8W8N+lX9#}5SEZ{bO}ma#_S-}HHssH z4oOaONF{(i|E1EU*wdA1FsK|os+YUcjcP95kcIMUQhVWW3=ZftEq9%1g-><+rH#a2 z_cSqU>hgo(Sii7gT2OS+1oe@6`7$p2dIYwf@D%z3)7$m`jp}73ZSrC1Kw9K&mUfzy;vlH zh9ey4O>fIda(+0-jkKsf{!Cip81i6AEjIg6uMjCuL1 z2&AXN8~H{-rLW*qaL#>iRo^g5{$GE8;oqpI`_J>PRa>-{;O&*yszRaceA$31|%W5*7BMFm;S z9Xl|`ckI|Tg1rZ{T<0pjvt!4p^Dc7wE_NOk*0zWpOnfq%pO|>Lt&q+xOnkCTyu5G+ z2TpTqxTO=^&Y9C5;R2e#b+{$Meyf8D@{F~uEu4w>m;fg?xWu9YH?y{PK{{J8@kxX4 ziuNuDTktPv2EWy`z%PC9mxr5^M_{vulN%GC6c4X3CpRm&AdfJ!L4whsF5rg;G##^c zLRcU@K;uTQ%_+DcoSdzZ_M5YWZsru)yx?pGw?%9=gPWn*nj)Rd5l&k@!5n#+_~e+l zrN9;Fhfii}ec+2d+-~bub+AN^t^2LbH}6$a7jTi6Q8$p*(Du-EQ`9q%mIKTAdNkCS zb{_Iha0e?Dr1{3`%{{he#LLIOIVp3`%@!dMkFPPp!FS1 zp;kR>a~Hs6NKAwUwx$QQD;wp&-ha-2N%q5C-@{oD9zur=Jq7g(>n6AzV867zsWbEs%Q07Hgt-b*4pIht2UlRV zxuwhiNdQ@p_Rhe}{&--BB52YG6x0ElAz(V())rxlgirts?|^W!2DIKVsb5`F-@YL2 zZe{I)&~kuph=x0`h2TasF|@GuK%lG}B(wm6Y-NzP$W1fNCkW@}wy=QOk@hZV?G_t< zK)bVx6Ve8Oc7;!fn_ET(T=}V>0>p4ZpzJWz>$lnYL($s0U66YVY_^Oz5C4{NaNjnG zJR)0dR_Nvs8Wke*-$xwzHYbF}_kW}n+OjUcYlXHL`;WImTl7G)LK?1cdl&078#dxQ znfMxaepk7_R%ULv2#?UeqRhOqLW00=_=}Vo%xyy_x0IQW=LgCx^dn_Pxi)_vWfs_6 z2^v>Xw*G6l*nk_zwti*t1sz=>KWoG2ayW0eo{*Q!Bj5n+9x$w^>);a_NCB7M#N2_2 zR|a%dM#q-T@i)7ortvM7eFHhbKtmHi0}8mUwI$@`n*nzOLL6zxj9UYLRBG#jowd0+ zP7G+}`kYIQ13~}byM?sBc=b;YW4MyAYV}!mjeFDN;LiqcB@#dK5zajtk zoALPswqRzbHt*Woyd|{(?2gT01+yUCgY$Q|Pbl<39`5`2hDI z25xbhA0^x9h8V?tbVH2lhmwf@{*F<2b0KJ~MZxKR3e_C}-M)c!2;NbTHFZL{0H_wA zCBFjq-|cfz_x~xrh{lBm`G+w_Cv8EvQb4{(>hK_GsV8-%W< zLO*bqP@4Pq(Ng{`WI^N4U*IoonuZ_vw7(mdLphbJdtwwte%sJWLr4lj zuu!hT&JLy0zySWh7^5}mZH9rMOo)q^M|2O zMTN%%xe@ebv3IpIu|VP^cF?XAR1)0fJtNG1*pWW;tICA z-{|`v+2j7rF;bZK>tLhc4-5g%k1!I|?{8!VzK>s^4F4YnL@AYj6%RKA>KGLI;pOK2 
z;+}oC$^Ays{`a?I0x0Kc3t5C9#e!Cejx4Ca%HJGUwxT9~?RM-7u53U8RBHj@0~*L| zLj-j3&%lT+)Bn2|0r3%H>&=CLw>IX`-e93!!NiKc6OA$f^*3krt4;9Ay zdt(Xz|B+NE3L<_NOZc{m|Nikjwsg2L(^)21Z~D#{Nc-wUy!cftdYa$Wl?^{TCtYH>!C4aXTKqZ(?a*D%sId z3guCt3(?;%l7^bmU%TVkv^oDy0ty8pzmGQmS9UHaiu}L~pjr9tx(*qnovF1w0=2vO z-5k_5PHm52h8lz090^`50fqWjU_fqdqytD#al50kR^UY-TTpff6_A5wxIL8r(nOfM zg0ga`Wpn@Zm*NvQw=k#@2a(tBYQed=K^z0R0@+Tq8u2gH<3E+zKd}(-dwdc`nHn_H z@Y|Af z?~hRG?R5X7nq(fP! z%{11(Q8Kyt;2%6v;M+%{EGO!bn=Zq@@yP#nl_uZ!`70E7{$WR74IsA@fMq##2%F*d zHUQ1P#5FfZM-}6Lqi+BEi3@&#uQOp^gcMDcPyx*!sS--C{`&VTcFOr*A7F!{wr{r|U5{+=)Eq5uxHuSC;iKE59mIiNKDZ=}jU zs+ap?J`?I>m_Ms@=`W1yqw(**zjF{qZ8X0C#``z4rH)W0*^Se9$@T$N`na(yF?~#H@fYog$=Vjo|h@Zc7@av2D#>V`S zt+*2coCgFq-8iTa%B(wV9zz07^AnI}5|9Ng&{2b%X9jJ&4)t|X-X0066hJ;6?Uc8z zroSQ;&x5L4-g1tXeBj_xegSaqBzVgTdfs=161?2sl>q%p z*73&)ZxPfHTqtiAwf{n67$3Lrm&07pdB*px+5PV`4G?UDX4@w|9!G!^bU_KtH>~?3 z8u}wfZ;Jw?vwsce1UY%XPMq+dGz*P+sLjb&jN5wE2nEsqeZKutv~2UFPC#Gis5G>L zHx<7>coTHksGf2LN0oxlvZy>Q_>6*?U&kXSaLZRP`y01%qfSu+_iUZ`16^8{Djk*?rSXmHTXhKb^KvxaWotP0#N8{9rpirWU^ zWWRCBH5_^o7in*X-~|6_TUkR--$W{2ZgYeM+|_n_LcE}+30(eaF`#?_(hGOsC4hfS zq0gH$&$^soFYJ!6xd_ld8K( zwWGDwj}m54WocDbWmQ$l5@yBk8Hpvd&XOgx-fbP7NVwYcxye1KDk-@q8Sa++DP!uf zn%hvXbF*ht=UR|u-(q#lx|T|Z!dc9p{dk`{%7`nPp^rsy?x%hBVvxGM!Dc*)@#CMj zGMFsm-c~%qq{MVPfBbW~XyD+_uF>tJg&l*%v4gdOX`!PT>;8Ef~mV~x;a+V@tl zTQ(WT@(pu68JSBOe3d~l_^yr0>2n5_liFIv=VqCT&u{(3JZE1?9(t$|dzO|sg!sKq zDA9yKD5ZGHLv_FDzEaj$g=K!$!AR}KR~dt`oUTru9H&VoQ`5X;t`|99peQ`_g(3cD znH*-5{Ic2}XEXKObl>F`m3r|r$tU^s7%En^r;EcCYHNN1CX;*Z`|dJV?W581^NmZx z@{5~GDpU+(#9^||#EfO}6;n**u6cNY$|&|@)EnI!Dso(ei{@2C~qqcGfLSDlCni@%3oL9dA06Mu`ULXCmc`>fdb z`uX5ur#Vi88tdS3Nhi(T#_7r|Zhz*<`Um<9mmhn9<~d>)0vNVZ373a|)S$pDwghFf>*@h`U$z zfnsnV5ylbZm=VN0e00ofZt{Gw`I?(StzF@uW{a>vpRZPN?IoTU*Bh6jkE#{&8X2JF%d35yS}mUzcj<~3`qcCub%4$f#g z_W-B@_CAaof>joC1WQZP$W|%Es$3?#pvPv_UCe=LdXB|>+lGBw4I`g$EMtt^_Nql- z$0;Mn*F#>T&3LW8)8>kVk_GXf1{p$WnQz+&&9V{G`Cn#m(9}sZzKf`;PCgVEv7DK# z)|^q-l${iOo?%-7L+J0}-XeOCtmerB2468<#h9_z_6y8s*AmVc)U#~66$8xwS$0hF 
z!cKGTc$)6J6l0Or*XGZL&JK?FX@(HN|!tIl1 zW8(Z2F8Px%%^8hc)4o!TxuL)(8P2j5_i-4p-!hX;DqJp0P>6wf-bAX2W){uGQ%|As zV+X{KoyE~RCGd4a@TYeZn4NK->`79(xvWH-ux?mx+W0o;erMbcoB%5S-Fd#z7h5u) zT(p1wV1ly1?KpgxjrFAb_Vjitf$8mXf$y?41k*c(H#+1p>cey5-J{S44M&73W}E!DeJ3z+)!?X-FvpX#6C-nw5tbf%t#+i_?W zk)stF<9fdePuQ6_L>MFF1C6*{L1~Vbh6H<}tw}?JFzMmhgw&}A zYF>lQcoc~Au+s1k$KVt#50}KyB#j=DG{~(IW&+H-UoN5~G(s=JYZ<#rYmQgb=)5tC z+^OEM`B-jyTgvtvjew}|I!?@v+_lm6UBv43BQe8+yEM0*8jWw%A3Mjlckq>6CnFiw%Q?eutEeFIva;t?2NRaZ8yhQHhyT4qh^QP{?ig6+m4<01wfYN-V_Jf zW|C8Z9dA3y<|_%};v3@>?~93?vusUe1%|?e@Zw$bvOy2#^QAY9c%5rQYH0J2EvXwMtzAHx7!$4IKw8Sq-|^MkRD@@k36ru(!vCpJ1kf4)yn*8fHPs|iM zqbh+q__0g?+@Z% zOC>lZ!6I}0SzJ`4&F*z`5gG8nT4sl}DWg{RH~gJ7U;a8kWj}v|LDB;CxZ1v>Lm$XhNa!}jP zT`9s7DjmOw#`7eI=aXFy=FLDmY~M}=t?WCfwBV#-uJD?inElwtdl}r83K)!pgqQ?t zZqjbFpLEEhCMbVtq@}#_hS}#;S-qCmcA@4&^g(b{Ia#tYI7>NA?8J~9oPtxo6UY_H zfPBBI`Gxb0UPBSFYdsPEk_JuDz!A8N?Oi)5ub6Qpx%6|j=ACCiwN4ZjYoW2q4q{dJ z*n^f%XjP}C0=nI&6w}#mB5bGJ{KYkzqQEG0IH_V@Cv}m%oa~1d_a!Z9bOhq*x+$S) zmI|aVrjVW{j_WhhFp~rf%q;H*@JvK^;A_^VOYLQQ2^}=cMk=K<4uZQK~!>(E5fLtg8 z=zR%d^}@1)iy)v5RflN*P+FDdWnC=FmYX`L)@*;z!&Ja*NcM{l^$<@HCuTlNC3ktGnEFCIueghA8xr&@=8VJrwq+|YIb&ylTZcM=@dUa zna2K|v*ZIYdBx@-QL(h`E&-rCJJnA0wD$lc*t?YJ=vvs6P}h_v+)4Rc z{Eca@QkV#1V)TraMe+AC&gz|&v$D%mp&yZ2&3SWKMRRPbZfKjAnyOt5um zE;p9$Wa_<)!y9V+0@IB~7$#)@Cc333#nUyN)a~S>#-=YaqHwPqfJi>FaG$&-WaRv7 zNs5z@FrO~L*!?Y6x9pSu*+UM~9~?z|NAr)qpCwM{WWC-N{zVzFRv96+vYp}k^@u4I%OwM)U=`;=@6GV}J!v;yL9Aaqcrs1!J8nTnd}XyOew;}&cdFzyt)zO3 zvAmq_BFPdWd1C%pxLmQ0x7+jD3$ng_EXBS>4UHKmjHcR0mm>mYYwTEx%~EU@F~@X( z{!p{4!6~T6o*Bd*W;d9+jz2r!4NI0FYt+*w6Iu5Pq3y2b(A8Ud<{emq z@T5c83gz+5lgJ9y!HG;*k5>2_G1}$Kgi!{~;_`+ZI~daY#d6SOsW9Y9ks>MYHu${VwA z%dU>IM_$k~c#r2|r+y6c1-Uc1R#im@mMcG(H?T2Hq!f}H6lBh|cSuFM(TTqnd#jfE&D2`gNmqHYJ+7pbu2_(^eQGKT0!*9oL+b@aR?NpA;P|N0rtg+ z`5$Ej3Q~Yq6(uT{ne(A?XEIpugU7Iv(XL5(cvzpdQa=051tFN{XlH@8ffL(-h6VRj zpT*Z)1s@5$UjtNf$oKTXtD<^c*qC%RsNu?*7gNtA%4_haZn6D0%irfH8KjAt<=UR8 zl5VX|SM5E+Tq3KBbzC>^<l>F%7m{4Q{<+EzYV_ 
zQ+zmG^y#Nc!=50`Oj{-?-0{>Alg6R>JVwQ0(pVNp;tKaEPKNPwW>n6JjHiI`4je-d zAx>We#5{5=J6!c*bCL2&jY4)0%&V3x*`Rxd(yf5=VS^DY$&aE_q{mG@{~^p{$Ra_r z>c-8yyO_XwvH@B@vH}gl@p&?90ZU-oIOA-kWb-Z2o{?Y5s)-<(s@bq6GKJ z3w%MB*|qbtTqZ=dud7hoHB<(1bTyvG+Gp-jJIU+a>eBd8m7G;;+D9?r(9O~hFI0Tr z+)-`kG=Z^~uOAlmeZzPaYcZiRGUe%j-)Dj42BRYGK;vm!1n_mg4M~bYR{jc}XeyWc z=j#gIIPBS-#>j+|8Ly9h~RWuY(HmCPZMKt7T$dKQf_D1X)Qw6}E zKfl*bu%C5Ym0FaIV^k-hQ~9-G>maMP%b1^LnYom8TSYLn`Yb16v{Wg_u_|AJfib--IYVpbZ1fRrXHhtoV`YW%)(9rW-CD z_zyC!-4!&nW#@01b6uPkI(a&ev76JPVMRjAvePGXUF8Cd{D$n)g$M|9o9(gH;N`9Z zXDDCXec;S(LwR%8fg|<=MmBoA5uD;5bkFy z`&{RSrq;qlr*&41>Y9DdVNE%Prt~B9Jy;Rkvc`{x=HEY9-5tR<-o>BKsu+7HNzmBq z>Aa2J-Vwb#Uo8f?_T(bbPjmK5@Cl_69)|!i-;WAM%B<+7ES{^vhii@E7?^jKgQ#^y zWQ5Qc7ZH500?79C?%RD32GHs@ikf{)EHbJ+-RZo`uzVm|L(KHlzU~SqtNBZ1JZx3H zciePzT+H;bOU|*}j;5jOpGi+7*$>J=4z$$SI!?O8n7C~#<{ttH z6H#-jno9-*4b9Iorb4jc!y(Oy_%`NGKE-I>GPJAk8jd=gCMjR{X8m}>nMw!t za4z7$6nese(4GBZy((<>6KxEZJN5<4YGpc&BQZAFG>eV6w|Ws1V|VW!hA;NL%r3Tk ztmb6(`D7>=bJUUANfJ+nwM#Wo;RC?soQ6Xrsurd`fJP$To$hQ8rQ7EvIEyb|Q;5KX zB|MgpEb$VOJ;J=VkJA0K+~7ymmkc6)0Hr#&%i+q*wDMv2Ihj76ss(|Hk`|7y8j*4?(L$vbgy?tSaN8TEkL`^Eq zCMu{sa&wU?yWO2pdkz5R<1X4WXAJ6^XgeGt01wvXK%%C$J^U)u7lhoILH|}YuwxIN ztR{kFByUg&&Sb+DMQ9+TFxdXc@};MXiTUuy>9m7cot?(8@UF0kYCEs?eGlXk-Fz2L zR3GHH-4JCB>l1X5I4FLhZB2IaB)(^luRx!uUi#gjb-HJ_Rh!KqTs5H%zrDJP9c7b_zAfRZ<+2zp(FW9Ao%f`Jbky?>t z?$(O38aN++XiB6se5g>w?Yc2 z@*lp5je=b|CQ0IU_oXCPAt6WGpjV8@yj&B@t?RYvSF&0jRKeZ{hX6NkFvAGeoy{^L z)FGIK`#`qX@A^7%i21Nw($jp`{l>^3~cfm83lE2f8qkSHf{Q zAyqecY-2~34PrXyZ8dz;7CU5V2Mv#t*?DM(-b{8)cPw)9$9~XZ`mns`f|Znwa+026 zr5%jfkGL_que~q5^LF07?Ax(f`gRdu^xT zZP<|}dU;T7!P4yfsWal`D%WW=-#O^el9736Vz&5IS#&HP%X-VgR5R@@i)&AfAknPs z(QxjP5McZiKNadC%E)^yvmrR6+q|t}4W_4^;^4-F<7dn|tK%6H`nhwmdnq#3fh`yR z9PaGQs?_5X7C9AHB_nJ)DFlnQ&tvK~OTFFRD@gARO{E0EsZmj6DW5Gv;4YiGjBbvFHjw4sCNs=p0-zYUE)9GM{ zwBsD0?bh_QWItGWOWIa=ZO3%KJ?pr)hodk*K*w_6fL8>DQ*4KQci@~oD@tY>o}=AR zyh|rukJ{+fC2}|2*pji(6Jnz$I&`#i@^DV-)!m5XFx(qsghk$&SxuMpk1tD2{*$)Y->`B}R%bM;S(lBiy%7~#2-K*1FJ6mNZPOnJB6KZ*RUQ3So#T52@ 
ziaqe2y(J4-#?K+2i9|qu2NE)s`R+4Dq&*%?PYJCNcFqSvUTNm!z0AGAuSg~mEU10W z>EOwk!*vA1x%3GBz*xl71ee_GR)nLy(MyCNoLTAe`<(2o6ykH0QltIivcpmx6|uxN zWNwCMuc#>$YqLEmG&yvs$Vl|4xtv<}OTv~ttO6Cb3UJFI;*_EX9rtM`B-?~wge}qO z-bUA0&1#}>;b1qxhgX%t=0xZNo4;K}?5?1AvPC8RYy(WO&2iF1q1sq zoiAXC7QH*k0D;e_QtqIZby_=i)r){0Z8Gs-+-pmh!nCz4v!pthbYo6f>YTAn&0@pX zO?wcet)lHHcjQp!i|8kziK=p@{kDR#eXcr0CUZs|Vd9skcfls3yK=Qm>Bk9K^^EQY zNjTV3X&xq0+`}5|+)elXqdtA?p4^gN@?6|og)IIba*AZkQ=+g-aYeIMGgCMZXzByc z=}AFIk@G_)e)#KSa_#hi6mzfeEuZ&`)gpDmZYA%_#LCN4ctES2!k+u&NLbO!;r_Ly z5+$alocipht_Pjx&aZ2ZG>w)$VpAe&^YzMOx$`1`21b$Q_&h6L*+Q4q!*611i$aGnW1yAbjq% z`l}t*u_1M5F1H?C$>D2TX6M1|4=m|%n4~EX$bKkoPOoGeapWpzSZ-LD^St_gX*|)a zZ~}GW4~vnv0p)Ns1jsa8qvfMjg)dCh#A*-iF~APZx+GA?$qx@!Du|A_Gpwbfy>qyq zJ`6wSPSl;Z&&0EmR9Ol#30{kP9U6oUUBDn2a_$+;E#wKVq0~VbxQe%W-lw~C#DvQX z9+Pe@=b)lFFLB2#YA8e~i;Us!elyCPhm`BR@@aFy&KN>0BM-EXw&wwU-@|c2-QC83>KHOF8XhT24lmfwFd58bUbu$xkW`;TzsGWbbgys zT%IXG$PSt1k>Fem&Hz$#iUWiDaZUEB&%KmmcFC}GXSco%YjRk~;Nw)-XP+~4VYGll zC*ZPupmr%$Q!G_BBXv{JOBITp{`n6poG;R9YSW849;6OaCe=Bi?n54dWG81ic!dFj z=t?l(*pNWEq7J_-&X76=32S+XlI%?M9adYLh;>K%gITOYQFVk0X0G*RF=6>G^;oP; zEWGE$&q~t?BMDnVmc8pb=e04fDP3xhBs1+th!6v)F>9oDSCja85)Wac+7ueFRpkE$tq4sZtiBv}R6k?Pv#_}j2OfO7Fxt>cC_%lXsgH=FI>__P5kYJ@($yj-=WI#Y z{{B=c8X%k3KNI-2mpv>I>kk`dJo7*oQ&vbjPp<5Qal@{{66t&)_v@cO%E5;vZ63vd z2w8wCs1Onj*bY`H+~dHQ)DCH*xq1w$1oJ}jo`^to0GmHY zg^3-q0qhf;H;y}c1KI%#3td#Vokt3z2x z2YW)`yPa@vO2Nd4fqb_DwqdlKWxoRCyY&n2byLB(vavaQLr3a%zhW{~00$Gc6PWcA zI!&S56srq8%S>kaz?x+|5i@n8e-Ab2U$u}bf`fSg>Tjgcgcc60i`nS!C_J8mNxj*B z|L^qA2ID&_fd1(l$-OF86rHj8AsD`$nYs@}9>7utC?O<&XS~Y>+HKpgtQ9ceiq;zO z5`fg&JsilW;-eeYBq*r<^oB>jG88l9B3f7WSJKzXT#D-v`rpNC^d;cK93*Y3B1S4`B|28 zU{4)EH3dXwS{#gDQ+|jVt^X%RC{oaf=)Zgz61F?#m_XPHB`wu3{6YC2XS8md^Y(7s z-vOdwKX#I_7nD3YgMZD$O%^_!^t4_I_@lxgH2v5Pt;U3r5DG)!lMr!D?}YM2yTYeq zgO9=&V=2yFU*-fFKq%s)dT{XAR8JMfyA-RIOvPsxc`~Y|mgmQs?nKvz*H#waXZWo< zR$RE_IPfH|ux502UVo*}@3XOLsxbZI3$#K*DH1+sJm;PZz3*}Fa}`!g5?w68`@FVj za^ByblGmJj0huN0t&4o&}7M2FS;F$k&Ecz23|OAkR=;$~gdCu)Ba-zB+Xi|3=W 
zcLb5Whj-(VyDXhCsVss&2;#Nt;x|(uW7CdiQceswQcW)t)Iee4P4q71Ndo)RbGz}R ztKBBL1Qb(+XPkX{DxBr%gvIUbx=Ldl2kI-lSJWie7?)<>JC7`DWveQdSUfj>5QI-q zs20l@>b>+nN%NMeT+;GfQaI;{J9LBzRu|3USu`SQ&toyB*#{HQ#4h%FHJe<2uuF z=sS?Blb7W((%S4gm{({2^16!V9S3-lo6plk4#euG8BXJ|F$(%HRlFk+u4C;tVqCgjdqaWC_*yqOuo`$CfQ&6n-n8{2P_tJ&Q zM;!(sv7}cyT2y}S?dcLeY?e)_L|GZ{yan_$Rbtk$r4m@R4h`4}A#tiYODvUd8kUXA ztbH1|uzU7#SE^ixaixo*@5+P_Ll@rcvrYr|p?G}-0flQ6cCMqZh$2jco(%wLjol9d zt|kvw?x+f7MM-Pchmt%V8JMSZux`!5akdjPP+mKgKQi$60uvAv`ucNodu~Y}wN0V~2y8B|&p|Giy6 zL0DKj*Cfj8F@1{F==Qj?%@Jg;nE!aIhPi^-#H*Y*`lFzu{fVhbO1J%>i043)?XzFZ=NMPr z{y5pw9kM?5rlQ10k`7h`bnl!`K2-)G+e8wGM~A^ucK-OES))vd3-kOG<-X`@b0 z#+t6HXgay01x_efiBBbXoKCv#^dzn*Dc#wQy6WCDV;nLrSETlj@W0xFf1o4W@U3`B z(O6g6m~(5(k$N1GLki*K9OGPGMjZyPmoFh}V$RHuipaXokLp}yId4y>eD};eED|$sYC!r}wi~9P*I|m}ri;a8K?=j`OThbBv!53Ewx1SGU*ZD&nSVYP40z zq>HW#hDLS=*A)R$hGa><8so0Ed;_hTmC352+gIpI=vQCRX@`@sAG(-SvWL9NZ6fy5 z%s_Cb5EtD}Bmif9BniT=Ukc#igY&tc=M~|@em3<)5C9zl0F)+$A$9y7o1^+pL7(N1 z1feeD9W5a#v)Ua#)AzA3LeDP%q@ikYQE&{%dZ}RTO<1+A@0~fAKKGc`PmH2%D6~lCN>IyHd8#!A0g9Y_v&T^hx9z$Q6w>ED zqo%(R*wZZe+4h;9kV3&|Ti(++W;JPSiRuftU^hvh8-l=vh)uNqk=Uo50)#H}W3S6= zeLhXY8&gF}UP|28mJgt>w3&QU(TpSN!#5P{%KcPSwu}A%NsQ%W!8L;p1FRc?X##gC zh#UwFLiW5`QOqW0s3gk7TN~Ze81zaTyZ4e1@(8J$`&6$$4ec|H7!QJWQve-K>#Lq# zet?OwN(pR~CZfb{3If6VPgJ^Chp^ID?u%&8H(&sgR9YJxqNN}!X4zScSa?nJ+)CBJ zt@5n*dCY3a<+!^yz9$e5f^mv`-`Ai5qmP3U>o}-04cHM#Q@#fWlV_)UnCe?PUj=rx zfHV_&5bm2-RP0Gl^vMyKISd+QE4d*vcT?*#g=QxzUwD(Q|yWRoTA}fo~ zu)_Hx0d7G(Ia=A&xT0gi(Gn{o*%=3#cS_3YozB6z65;zIzpm-TTMV1aixLHaR`scb zppWVz4o8cIXnatM*ve z<%%$7%!H`+m#n!KJQf_SI}FZ;@c0muUOaX z&YV^mFs}8z8`GXcWqv3i3!)h!x>BJ`P>7UNN`?qm7O}oGoN?~PeIB#=fRK;}t|8TP zvK`j+;lp&xq{UrtqAkK*jC%OBN{XJwFa(cxmpA+3H#xp)-N!YG-7+)qm}*!wg-Rb5 zDdxAnI@=#iJA2;ghG;4>NC=WSWZifl73Ahqk*?M5QwuP@I{NzLgN=IXAV)dXDn1lW zFy6yqY#1muMLA_gJ&@h(S|#aj&!We2A!7aX_3GTP>jL?@s=Yp3RT1gD?6MR+U4jW2 z?siFy&g9CKdyyg&y`5qX58fnDF~Kt}9~ZwZJAjY`sc=JuTfA3jHQqIOe=O@Rw~ZI- z^;Xu*IzI7n;d&Wc<0Cx3#!Ocuot8V6J9kIfd#rwnu{Eqx!X;;gR~uh{yTWVM!loGW 
zBvEQWwaRlq|>ek6iPp|$T+Obzwov0%d!u7?XVU+x^hP& zkD%FjitXb_Tzvycg7wQAh!u6@6_90NDP-{9!FY!%WNyThXZ0@Pr(NpEl^>E2oA^5OKjdzG+S4~c!xTa&wmaLbsdmyjE; z(c-NKwCA%XVeLB@pKnld{5@xV13)f~CLA-=sc0u@GN5 z8h%5j#QRkpft<#(K(2he^WxlAFX$LvD6;iLXCf#G+ z^ER|7&*WMuDXT#fp@Vzd@DmQn{Dk-6Lr+#{-zTYxV>QSmDDGIoXh|IDS&T5joP1NM zSig^!<>e4p(ktr>vO=dl@JH`FXXhXC%r>ui6+2AmhEg@(<+G-@tGs=hk&nDEu^|T5 zO-sUbI)0cjshmO@uSzezI^d1$iv6Qwit!zXUfcJq8nIE7lU>78?ilfWW>~yX0Y9Xe zxLRQ{E7Q?t-Ekr;>E?$CYaH_vk<6Y!GC>@YorkQ&ztU)+S9~M}2Jeij>roFC-y3s@@T4k?Nwv#oMRbo?M&4KPijR3j=O- zFRanG4l?{f%*LNbyS3nYHA*V1D+aDB=8F@(^d_a&Fhb=S5=Q<2n#?DJ#z6}iVNtA~ zN90?N%1vIJaz2*l*~1n@m@4iX`^u(ONh#5tI$6lF%baQMono@m?W_9Z8T9L1>VIj4v_?Q4-y9%@PiRRWa7mWgMEK<0g~V#j4UBZepY?ACG-qm>4u z&f`9;C#p+d<@de%^k}A2iy6`Sf!=B22@`|iUb7TXT4Dvqa{Oo=%DcdmnW3Xh(SWR7 zH>GeB_3JwPk~uPgJ&8+k;RLP1)2N=^F^YyAH~U;Wu0>u8G`{lgT9(YqGn%>H2!s6k zRbYg2bjz>AvluB7mX&)xIPGe#0eizqLG6ZiXArD>9^685?mqDHs zbhAr&@%Hn<7wdPA1@xcQh%;9&Y-+vP$4x0(nsJ{fqD8R4nAOXDozZxOAc4k6=xOGZ z#r{?nOZ&16!_qjvwpuJg70;Q%x!BKfFijmf-{)x}I%oD#G6;Y6NMBxgat$wX3^zxD zKI&CWgX+@TVBvbDY$fwG=Y&5G8^|mld{ZuuOUij@%DnM( zk(^EI^^(CRo4ET;3wfUXr9~x^g@X7t2L^)U&lkyLbVydtf371R37VpGepzS#=z`=^ z8>Nns_8MzrMNq(0x|Ug~2*L01Rq&fr>BknA7U6bL2jy$%1w7|rR!{hu?s~LNBi_L z^6NM)o?WE(RSqfDu^}@e%{{wX64l*Z*HkGO(s9IjD8lhzYMB(a)Q6Gl%^@YYq9IJF zM%kAHdPvR@2a>!oR<=pdH%`!#d~8ThuHzANC{p0|_4lJC%VLV5kKUA$%AOhRDx>!! 
ziNIh9x%^TkLv%Q&=;hkw8#pzR(^wJD&vmCv(p;2G7G^hrLW?!GVbm8GpvaET6&w%c z4k_$}b>XDF)N9wOlf;wg$Acm=*rqHREwD2tB<}rM@UoM905xwR!)igYr$k5&+7bT>kR5u&kHWd*T+-bKh^F39I`GR@v+wsKSilW ziHQ-aX{(M{KFseI6(eB)8aY&RF~HCIJ`$l_YY$LFcV3?)`>r*6hA8xblik_qE&-!bxpMaQQWdKR3m+{psI2INNQVxw^RM_$I!lOdXIHm-9LF$ zc+YT^=`BAg!qG}N@x;_*SJqtnO>NEjrRzij3v&fD_4_SyLsYSu@(p^=1>!5>jZiDj zw2c*tint9@1UHA0iwC`2<6aq43?kgKt3k_K&EITp!5Gn(o;Dj!e7WPXm%F`lrO9)u z#$-^iIG+USjdU4&sOvZ%W6>&sI#BkV%d)C0{p}?(%1l_I5=rk00(;7z3hpOzQ`3HX z?-oPwHMJzcR-bzgte4F>U@m=HUKA_)>h_zZ&Gu-noOFMvf+zas<-57l_4=pV%M}sM zSBvo8W3l(a{EspN-=m~=*L~eQU5;I3xLlVpa_tQ}`d5V%czd2<5VR^};0G@{o}|f4 ze#V*(ia5y?D0>L6Wg{rf331iwTe2dTUO_1HU zRlrv2-Sl43)Ml*v0eckp+{d{UU=evHPp>fjateEj02BEe8Jp)j zMkue4&RkK3t)&v8Y5^p75uFafe)ZfSAdNHgofQqYiR>7hD`x_iP|%dKnA&svlT;h@jw6iQHsV$D>yzQvA8MbJIo>aUEvgVM!)ADT{B_>} z8S%x(l-BBH`E)m@S1Tm5%H4B)ew4a=6-J$<#k$+2X#^fM?XFR1r_< zJuXw18GBx{k?|)vo-`99v%^u$r84J)VJ6viAyQX@_SD!_EocrLKb7BE{Jg5oa&mvr zCz=;GuR7kWuRL(jXz-4T5ynfHX*hw7>u(C7_}pA>9p12}%h{ zw+Kjgr}TSx)-RstUGE?7TCOF!hMD`m&)Mhfv-jt^W_9)P6s*Q`CHY**1dzevGnNI` z`sXdi&H!fAMBUJ3AfoR>x8=G#C+RyDK{7HwSdCI?JL&PXWLk|!-=1y% zV%lqSyL{n9?)#j8*mC}5^@qtWv%GJ!_4w*j{TbquMJXk;a7K@)`6iz&%(d0j4?LKN z_$qmQ<86WMQuTK{qGcraf1QDHXS5Mhx-e?9>1b{6^G`7eqK&zt)fiC55};(1Wqhw( zVb+K_?3-9*f}eK(HRksx>;of|cNJjXGW72dio4@?tk!zZfCRCwkxd5oE?+mG~Mm1S<5s1(4#x(q;c~T?BR^Y z%W-h@gA+nZP*?JpOD(CEw1Z0WO`Z>h>~K2hIY>t65W@c3?o3~VxDdi+Zo=n?e!K{3 zni1VUw|rT|9->u&%JK}bk-?ch`>E7O`WTZjE=xU{QX%YETQw_boza|v+a$bE;ofm1n*z~vrdCPkY5^hf1;vU#i=R7^gG!t{vf%Iaq( zouYw+%gj&nZ-#b36;`2$!QSCWmW*@2<|z}9AcM?~SoOo;;hhjAK#gaA_b8>RL%N+f z_yyf|7%U_(WZYFIeA>u)7NIKlowkqW8e#q<&_6}2yW3gHx zvP%BKu7-?#<&)a|+3e+>yrhVD!MP-i0Gi^~dKy6i27;v1vlC~}y8@!7b4L0LZ?BZujOSZ;-0B!@5Jv$`ZUapZ*rAGZYr&F}C04JC=Mh%V$v z7T{RyOUl!_;XVqCt8EvTk%wpTTiv2Q4#8rdQ;=$3qgc1Sj~S3=)50zr1E{DpK-~9P zeo@Sr{H&$ePA{bMY&bC)|KWv2f*B#xcA`a`_atY`%9mz8{Z2b6`LxZL0!Zq{pC3=t zF&Z3aH?<<^Pf3{5q=Y>pV`zRCV{*e~tCi^wUKXQMLvW+Y-d_j$_9BTf@-1|I5^{>$ z4;C;k^P<%*%`Z9#muerzaK7Qa%AWrGI^Js!I_v$T9-Ml=EUt?RlZV=0qS?2Z>k{XP 
zjgYoiAYxCI>T-;y5OWEqTOU$gvW|QJ#B4eEhTAxCtQudrq&{wn4t3dXBhx zlIQ+%MCVC9iQlG>R{nPifjszf4u$VPgX|O{YeIj`SWXRoVUWe`$Z$W>+(?@GTnZPH z4~^`(%+us&Ki~2C70djS^OiILwK; zQAGlI`~@lpTtf4pHs5`nundiXew5Bgcq!#Ep>TV19lw@FtAn3(T0Bb2sb9hjvJ5;Q zTjGVh!4LAAYpubTaa$e+Y5s(iJZ6cY0!y0|UX)k*+x#T7Ta1S2HaeP3I`5GQZ3B#- z@U2llf#WyO2=nzn6D^6`+O|svtx;3q!V&UCG8;MMyGG+k`iXn&!x!5!X0bSO3MS#H zjg7S_tF`FmYs8yUNp|r?f?h~C9PQyV&=d4}Ymi-2DV(4sK+AGoC01?Gh$5X`tV4enQ1@(If!@@L;jp!$+MBoEZ>0>@?5{`XEz?1!yBKIkd3 z<;U>7r;U+|)9s?9sQhDh8yAgEG(S>hE8hFp*KT7hCrr$8S3OtGeR3lHo}#+cMMI*; zvspc#5aSxM;%)Zo_wO{hyS)Xo^cget01(VwP9KOhmQfWtjpd$Y2g$WsX)dN)#$RpfT`8=d0`&1SM zRl}Q<&s(qvPDslQguUOV>#0887*z*79eN7&p|cz@&Q6>V9MGH9UF?qEA}z(Xc<>}s za6}pQd@khD)n>OMPZ#+e6RQ1?;YxM^U(fsApFGvqVnONpQyV2ki5@65`8PBrKTS9_ zQ^mdfxaB_xKczn%>Hi?i;v)FCe>hQn!(@iz38z=RsxL!D;C5hONj5liK> z>v*FwYSL#j7tp^!tBPp2DijJW*&LiXuc0IEDRj%pn&UWb4*;(21Cjv7hVa< ztj3F1=~iT-Lr3m)7#_SFWof4G+cjs83QfexbiR5erU0fG_1&e>v$W__?8qk%S0;4A zN+vO)E*j`I|=S+K`KIwcSt zstk{i%aQv&86ON&;{HtwZ?tR7IW&$J8@!ePYLZXaNv4UW3j7GG8cCKWEe3lpLHqb)D^It7#W5Z`bwvhQUsb6;tS<>73bpCONV z3sgj%+vIrtbQekpt4lomphI5EcRNCE|94jRjKwgVz0R^b>d_;d;!t2Svs|q6do;{E zBUXG;79VaxkmUGIk9=&oSGR8%MlIug8E<$bp$#Hw8ei6@Q zL;Pa9XAr)L#d`x6oit`wTj?A!$s<2VFU6sopP2ad&khQn<8`a;}9+;TWcF+_F{b@ zckMLt!C>M4h_J$-cZ4D`c7PpQ^6o%z!97p;4`|2nrm(kTQ4hBhhFfp6h!UImFo_{x1xG)4AQr$>M$H*}x0`>(AGw~vlR@`2HgLSyY16~XS7nVj}0n}TSTQ~ z)i!xmu|{)2Ky?N*+CV)n{b@V(zpn&1pQL~*07~zB<6n(okBU-$=lAMNH6mY90=V!m z8n|_7BWA*+|9w3?4ZL1S&_inQUv5jd^@yIQ6&D_}NA`M9*d5la$H@7%23 z>8(2J*86q*SHX$hoK@g<4kV&_XznAyYCXd*_M z!5A|3|P#c z*`O<*MEXFJ>hoI;{U1Ax&6g4ujDPPynEgo+h+PlxKd6eC%fN`t@bj+`2JG=WmbBOvt3I5gXbS*2SU11p2(u$V8&H!e+@`E6Ki?4@ zOamZ*H}G`(WupHTXwHYPZD9G^zl~WCg{aV}XwIJQ{ow_`mfuDaC9lOBGIkxR$<_W3 zhJfu@-EWN|p#4$jI4ZXz0FZ>kgYS&$e_DfC((Ue8ni8!f9ARi1>h7xs*|f~GINsW%8%al z;anC_BIh|dlIN*=^W{(BA3%!qB`18epTlIg#T zcBgC<1xTmVXJ$GGE|Ms$?e@lWq(H-2VQi)MwX*N-jH5(PQdOyTZd4H9=U`YP_0ERW zV%`p@Nx06`&TO-x)ZclG%Om1`@+56vUeLb{nEveM7Dq2v5psP&NcoE#QRt?SHw$uR 
z7SOn5aC;9ZFSRiAd)sI7^ItADOBqi5wx=H6v3WaZnc{w$P)epIfB5#z-z6e;=1DdDsEhFLRxY9-JghWj3O9j`i14*f>P z>T-$h@JDIV6{3jF1`~vE>_0t0m>`6ZcRxe=f^cwKg3uvK5M>AT6s{yZat^9cKjv+S zwl`p2AQ3nF9YnU173MID+$t~b_FVvZ*8ltn2pP4R1`qmYIHgq`V(SG=3w^={6=ud1 zTt@Q;s~3Rmh;xvfO(eXyLovp(r0+M6qmk%Kqg_0|%Yy~C zhcOPPJ#ys$^k;sv`F;Rc;25DVitOp`eFHMNc_|RQXQ>B-;J^QX1$6DhR=al63q#y_ z7tZvq$f%+R3b7k19Lnv4lM88pBb#l&wl5V`yXSqUuPVq2tJApZV5*B+mo^JM+cj6s zlxT#i{RqICj#ZkPZ4EX}{6sAs5~P|X75v^tE{3^6In5zQ;bq)`KhOwiu9O3)tuIaJ zCp%tk6{lWNqRULfaLCSR;gc0`M^LFEZ!z@Th6K~h7)DFkQ zjngySq7E8B(km>1fLQ4l{v$XQJ@FN=o=e@*I>hB`>*>-**H+N z!K2pb^xg5&caJkAeo0ZxM1XXZ*ZR8Oo8U$b+h zUdAUOJdSpdm4`*S{HlqBC{0{K7opFGKv@YT*+VkRHvqV5YD+mWgG!6pjWpR#7`cui zTGxAb%}jkw(ao>q2<%jhB+Oz=$0(KBAstQ7dq7oPZr2WOdyddVVz(wLXA5x?z&_ID z|4JLp?yniv&PvpQcvV|>k(5?>J;K7#NvAGkVJfL!9mu#8_0Qk?4p6;H;Q@06`D6NJ zULENUCj_pzp7Nut2!}bhiPi%Nh0bi)be%!DN%tR^+-J~9@n)O%-=CKae8Yq^JKLR= zrx!a70EN~#=b4p=z`j4R^M2PCCw#f&9oD^PfZ&Ix0Km5MaCH{%s=n-$i1V^oL&Y12&`pr7gb+Q4Et2938)_TsWphuCjpJ=_V zG9}9fPkStkqfF+5q7$uoAM3JyM(h)f!{IeX*%{05`8-{ECc-28kUi7YwF0NPE0&Tr zoPc0B9MLPdMeV!Sa+Gjgt>RH117t~SYV}Uzan6Z&FUJh~X>xG(VEt4{_DwMKLqIn3 z!yBQ=6g*}Qp+raD(_~9b{3Gt=aFyq{{M}y|skTv$ZYR9N;3rHrQhv6GSj8)tc;lil z4$Bw|Dxh0E1dAlK<4i&cxlO3)U|NS)JBjbak53|d1jGRqkJ4?Uixg(7ENp+gc?hTM zU%K6G-oy8ZJG$RLoCrZCuAv?asoQVc1TlVW{E2;L)!(0#3waKSb-7c{%Y)e)IuXNk z@^8|J7xo};m00R-R{83N1I>pdD+#0Mk5WAi5Yb1Q$Fe7h7Ksl4?jZpG2wg~=vnTfz z3G53>m_Yjy>4#PHS3>q1VgI+ZHERiDyM^n)2 z05j+dq66#vc=SoU=;q=BD#{5ov+7D{S6>bn7s5Um>$x>!<2000+;7*#EWx24Y?0KV z%0GBclQk#1{QX4E27Z_h;-1M&ZRGH&P-E#n0j^{p+P-%z&Tj_Yl^Fg ztF;jn$@i1(fer+w@J`CBpDW3o5g^riAW6Z8Jx$>0xHLf6TgWJg(u=suSa+fjq33|k z{9Jgp_?TZ|4&;G$DeqDI5pi;2WzQhtVmri@3tYlSlOW2|#iQiV9Z)m^S>S+r)X20w zgq=slO!}0anK);E2r8L07n?)M7Lcur1kao+l6AYUi?(cyVNt4wS+K_}?spoAbG*UN zRDnXt8e+#jkx;^oNuvj|&;TC~(mYT3p^d1c{i=6;>|Ke0i9t=UaRW-VZ_bF9B_M;G zv*4zll%FT(wV3&JC~+nxvP1+5JfGv(WE7A&j9w=eDqphYqiF)kuKEcqZRlZD#P{rZ zb{BaVgAL#p%6cUUPN3>0H9D};dmwk0IIDrgIe&4j<2oHI&sn_C=qKRt@&FuO`hk6i 
z=2xyTI(?sdppu0MfWmPFu!x}226OkP5UmDeTT9uHQ#_%YXZ3s1r%tlZzN=O+) zjP{j(Ih31tp^$dKG~$J<9wpwB^-umscJl5y3&9FRX|_ox#uYf?&YgLpk>kIn%G(JY zv)R@^nt~I<&PH7m1YHN|v75=Yz*M2U{GpzvJof;p?~Q&Yqa>@gfmMuA9YbcMqIP8L(SS7^tin67w%-V9 zTzOFK9EDT+c?qHPQVJ7**QD#C)*sK(fen=lN5zolsZ(3X_4aUP1Ge=Dm+`@Z?w)xI zBj}zqb;i?9-aJpYEFAq-ZqqK}u}V)ffQNLb%OrYv=B>XBj+E0(>h)C}mn!T^b@VqP z&i^HJqT2YhyZGdrwGs}>|g&k z3$QV(2x_I=9jrmzzdTnyJvm)ph!_DK>N^S+81dwpLVl3{V9ISvW8av=H~uwOu6YxB z&=dCM>1<5?;Yo{a^6H8ADbIc!sCo5VGOixw{f3;y(N0({K2Twj0N_a-L3|o`11HVVFWJgCyope}T4?4a;df zvdt6;TMqU!W1nKJa$gkF`x#@*3O5xKGT+^81`Boc5S4$d|GCF7lt>ekbAQzi%o%GA z(@$l@s?k^*;nvtqaQ3{H`cuZ7os0g71R_?9_M;2w0<%yqsD2-TBN9a>FU_WdcqX

Y(_b>QE;0~wGnDe?fk0!&i3hr zoyKo`S9}m^WCX+ZquIO)2Z;W@;=$+ij;$iE2=ukUN!d{$EE`N8_l$FVL9%Q<<#t4Q z-9HLX;j8N}t3L4C#(+hllj>dW7f0L3Bl3Rs+zCB%=T0-DcEalqx}gU`r%jr+6NAci zEZv14C<0f#s9=6cc(HFhuRcxFEo`u{kP{b+bP$8Ho6FmukV+L*zvx$~?~IL_b@rL- zf_u+E{RN(Rmlp}Fkzz#47_~ps8XTyx?Vw@TEl#W3JTCLR9K+3hx{&V1eEUE#Z*5-t z8QMX?zx?CMEbIFcU&?=fQXk5CWRI9E_7CpOSFjnMKG`ulh@EJ2GZSc)3SMlugRWQy zjk!E-Z}*VMmzf!%E*5dCKBX!dO8qQPTEm}}YgbuHIF3Qiw`LeeiYH~3*fJ!=Km9n2 z^cp|lIU^9I1Aa;JQ?|XCx*Qw}QAb|}Z#@je1WO<6G@ra(w3WA0>d$zn@9Q$le66f0 zI0UlmN?3bg0wa6wo0freq2-5s6^CmVQOrQ9yg9Jo!#rPzsijJ`fpmbFU8D<0(kr(Q zaGf9fsEkGwfesq0W;z?U>8CoCm}p%y-p~ckcfL$-Poz)$i=B#}U=qu{BzZufJCz~nqr)1{0H3TdOAc35>q&QfZF{(_ zSQEO(bG~*XDe80eo=!4IW$u=(t!#XItt>0Syqesyq@<`U zc}0qp{^h+-@g_crLHDvUw9DM2JhqGKiVXEBDrQTN=0QuwM|PTT->Os6EV1jg)CJAe z)gQz<)DHb>@Xar~2+0l__A82^dc5vF17vM)2sO4;ix1!09?TerSC)>uva$RyNN)Cf z_Oq<`HGWv^+9QYIiENy!qisiOX_psthD(qC3x)MR`?6i57QZ(={`1a(tj6t(&|hFw ze5COVn#=3ub(%i3Dhhgys+sV&bbtLnR}YqqyE4DywMZ2DNvS^als!*(FDw~u>r}X9 zJL?YdHPn~>5*=6F=CgLnU92aV<-H+bgvZ$p@ttCv&o#Z}ou&<~S98H>mJ|<39@e${ zRoS(6%74#Ob+T1HPpewT^YH8s<|MsT!xR42*T%TNvARQu;y=`$Xrv3KhTd8ox*wDI zGK4R=GSBm3mSNB4S{I7W4kuMbq<=;GyZR~M#3`2z$%L>^32(seLaTY&Yn}ehj_3J{ zGpK-;>?ju*qzgMN>gW)HhU1Y>J8&$=2%=K(iSsel?zeB35W7U!1(#F{p)5KsHn5)9 zIu^B0e9s06`%yn3M}@*y=p=)mc`KwcyaR3a9`-(hYa0tT z%nQs15axz^M7z7Y24*qp*$uj8c=pnts(J;|#TyO)qE z6?FJEm#67d9Rs#m<@hncZw0HyaqM#9W`50iQTQXW@3<;PO`;=#=IiOdiav+lh z3~N;0yyq`k8_bY;X~%6|sbC@pSTgP>AtkzvnEmFBsUvynm3gD;C8qhz+$L=iia5&% zfx-l&cXxFw2ADgFsvS$A3^@W15=PR=KNw9RGP!>@9|#mA(5mKh>J=S& zZt`8J;b6k{AMNR_6|pV}D83>Swg@WuQ%7FO-g>cT?+P9Ndl=bf^y7=}ZT#Br$_M8O zAASrn&TnY4{M64gIiab3D#*~?{%k32#Yr<{d3jOOxMqdf^28<2?$hSH9mMyqTZ8PE zrV&2!NSvsG)$1)c|026{_#2q6o#)AZ4%p)$I^L6el1k2c$X}`~#xRUm40BJ=Zb%(E zT9@ct3kb&Mz+;r}XTlbGHlC0`FVzz$W#6S_(i}v5^lW(GEx&!Q8mSVYwzD34b4*(Z zmnj6Y9r(%B%-J);B!DC5H!wHs@rv9=zl^Lctfs?vY&#_9B5@%62Ed`O+CH0&u4Ya9 zmG>#Qza^g@t=+E2kUR^PoO!dZU)-nGx*1J@z}AjCFt}$f^HJ{9$-Y(tBk)J*YFHpl z@fE}LlTnI_9Df1U56cC(;!D@{$+q!-jnzO7)5wYKLUAtc*u`^-;xvr{2SHs*BlcI>+4jT 
z%~gba9hO)TzwOIk^D&b=O`=exoQbG6asJCC(N1`l9m~CJq#{ph(SUX9i7iSo(o$!i z%Czmyox}T0lQgZa80Y3Y>K{ z86>Sy0Zv+9=n8Lg3^G`|9_>o^Cb^@~>4cY@J4$>wDMsT|Z5)Ar{8K~zEZ8DmRy@^u z{tZGL=A-5HqH1eyQn^jr_8$^NofO&=QxqIQBaC9bXFHU!lESK;BXHkghr!M^*R8S7(v9{n>I}DwjPm z8g7cfyR!Uob;^-QufpEiKUzQ1g=k|Z>DsO+iXLJqWcpFso75ih{ZzxB0qnT+#o8)C z!c)2(>FCgev}Z(nfAop%!ko$Fm+zfr?(&ptR2-E`a8{fSMDETumggaSVQW zC~xP+n#?5vu&rIyreGta#DO9<-9p>@FVZ-2YU;i06I#cJaKyPib#J}K*g7+v*V>9= z_`7bie6{-h{=S&wQX}?pj4^e;bl&y*PEkpUu#au#a3(=l!mQIXxT13O&sDmvOne4P zYWTA%CtU|8LRH5bfklx~WAgY4JJ0}Z3~gm@lR6ycdZ;y|II@VxRb~V>PR0a={3u!l8N^3+CE%Grkmcx@_V?R#y|ZXNNqz4zhI6<= z*wru*3W{R;(h_z&1v&*Eor&`cEZ?2t=eaud3&)A~NBQzqW0ikXcAu2{1k*gAd*$T@}Kk@H%SLjnwHg7G)YBk*nO?+Ll^`Vq@5Lr!N@RiOJV!-o{~kuG-{vzs-%`qOq3L_XA#VKF(g~Ug2ha_ zSuW{+l}Pg&+tn(}-^LQQjh^E65qw9$L9l)-@HuNYi$31?mP5$i?4uu3BY*w%b{2P` zDPRLVt<|A^;paaFwj2fvZ>FPTgy+JlpU!d32?S5Ghur*Kh_L!YIwE$Eut}J}rCfi) zJzE#e#5!4SCMPcJ_EAdy`fPfh!=2)@rN(2e(A2>KPbW#*PRLfZMo#9oc|}bDpswT z+c~}l6C#Tw)5pi5?jgg8L@^{vCEkZ{I5d5H@{j!KMdJA?6S+M6ORznrYkujZ8){r{B%_i9X2Amp1g-9n0 z0ZTo^tzv_9e_I=fB#g+Mgz)48eT}4|^|}K0d1qAYyPM_23XM3OC>Hx<__kOKjpj<* zDDbP!tZ;Y|T|_W023ooYqsm%5SMaxdu0zI>;X`m2sHP&&a) zMI4}V{uYLbeBnb3LD-vLveeddp|9dK8*mWRJbSA%R_7CF@=*uhknLBU%DBdT(A9iX z!oIpOk@ARMr0drGTt-n(sM6IQZ0>uSc@M3j{4S?&4lswUcZYzcM8F__e6ArN)p>U# zGK2MF9gZ|N3&pHq{h9n!z0XefMp1V=;h>Tl3s#z$i^b>UsDy(;6Y`jvr<_`UolACs z(KoxvvBMiN*8m_vdi=9!B=`iQ21^HDgQ>=Du$S@td0FWM+LD>Z6@Uf(S}(6K~C5ijr*+m%hWx z+O4w4(|p#_a1CllzNR(DlJ=< z)NK^KRbx?l9qajaN6)qE!xfLlRO%wd%uRs`br1%LSDWrcPzI<%&Pk>=LvS$Af`cC| z)lKn9DDjY^%Y6;UT^~gZpHQ{EyBn>OyU_{`WEJ>C2S!UgPYte5tS#d!w3Qfr2CXP3 zPpgtA#Z+$QAM5^QK3-Lt>*S%a(8Iyq{wz8sDD9Iw%PmIcD?`5oE;%OjrRsoRavWxL zc_lt)ULRk9B?G>0GFCoNoG#z}IJ><|y_!!>vVQvHgZt5ah3iOtkU;9{En9dC4I-`s zw5vx}ijthj!cbrJibs>1)tr%Rn>%x1Y4saDagTAuxvXB;+pFcJaeJ@NM67L&^@LsN zaqu~ngSUa{IIpD;EDJqn&euya$>eAvEUrWqj5?*0vuj7`Mvayhi7LEKO%q1GV+S?X zuUu+;kMy@X(uZH54afcRTIJuy#pDW^vlw5MFB~H+q4@a9&+nO0ja5pUaXX<)cp%Z! 
zkHtP|pmx6hZLBFNt%MQc8(ON;Gaz{}w8op&Xv2kk;56%BY?AQz)~{86%8}oDzUSZ# zmp@*;w8mdEc(O-w7H{^Wg{E2M@iW^3)n^5#lb$!o^J)QN#F>r%&)-OyJmMkcUod_0 ze)PwR?-i$+SGwe6aDqtDyfG)5sF~ttL{=_8Fz^W9M}zHm;DBZzqG(0&kqQ*J8%{ex zbl7St^GygDgkCn=$W6_*gsFVGeftWRkoYNWPBq^UHyLQkJlN7_rh@N7G>~CM`QLI` zv3@(CXCx2q3<-DP{NJ)E@LBYG3rq zg;BitJ=|mb@_IB#`ldR$%zMId&QsZe!5Ed0vv|0(bB4#4Lb}~>Y z$Cw*Dj(3J;8u;vn2R>2R0#xYEbW@V^TIw|KQXEm9Z0>2sBWSYvo8pfnIpW185gzB? ziDBDmV1iJdiZYbDvxTb1twxr1r_FLjy zTcqe>=tZIvg-;l7aGjd7m-6vpP)wJIucte|akYJ@ifp2>9`Yl>rmP7zWw`GjEmkP- zIQo;HF7GaI4Z0#Yx3MT4g4JXhRm?Z`GStyUH$*krxZ+;#;RIy-EDcGb>W?VsP(gv^ zY1}|;`|T^;tYK2-FFl{o;OtG`Y}%bOno50kYW(2BiK^DEj_9-F$8V*+Nt~=#Myf?+8?|mvd1l$Ds8bFyS zUp;FWS`q$1U_=u35Y}Y<{UzzDZEC=i=M#=shx=vXefM$J2>$Z0=$XGd$oO{tcmD>% zPcj9;fCdU$9?7N17a9;56=52w`P-aFlxdVb5U{*F!l4s!f8o5Bok^JH*%?PY`iO$H z6oRgu_#z*#8CbPI#4NY4({}XAjFOsscWBnzIFAY3k0+T9Df_FA>9_i47eRmA!-k>! z@{{OUhGbyMR%{LfXJ^JlDxHvD{YC4v*S_dxq4oUWSTakSZ!`WDm8d5(0fWT*55kI_ zG8n9aF6~!CL29(Xa>x3#=t#}JGtbXdNgTG6!89r-~K8%9?~>Ajv79h zl7UL9$w50x)FVX(UlmJ>n{F#OKi4kQBvfkAF>4vW0nax{ zNd{@2m;H_sta1PK$nUN)>?3BG#uVb-YgP@ou&{8Nbf}b&>-1P&Rp9^#LN;kT&2GG^}ks&UeUBOo&WWUa|J=fxI zqcT4GCD1Q%RSp#*;k*4F(Z4G3XQF}_Vec%?u|Sd=H78!@f(@w}<*AY#$HVASuxx3nOtQ508K$HmTx1gA?nf-w{YJs#MEchxO z8Qa&x$GqF_yq$DT8`-P0-9_5DA{k_kZAD4;6;%;P=b?{ig(dJsqBEvNNO1|ocYVqn zINXIOzvD4yGpsO^VC?At=6l+$LFaO|A90)GX@w8Pfg7O;=#ap?v4k0b1=LDFcOi@*4On%P1A}ajM#elt%w} z`V88_t%}V~95|cXZ#wuf7CH7eFOO^AVR>^FC z%x)k|Y0yS?a|q(3RgGF=p9=%-5c)YM(+br1o(BVuF#Mh2N9T<=;vZ%Vz~n+DgEE3* zuj@O-|Cn1w9B9s9tgG#TbyE_hiyF{D202KVRi)f`JPB}V$O$=>#LhHbRMtFhK42Ckfw3O2lXsb!plHm8U*}H+z zNnSGYqJXsMZFF}2y2dCZ&09E1<3M~p1L1SkTr2d)kgQT;M>F_ADfEdUe-H3k-H zL7Zy?a;1ZL+nsSl6Sf2Z-S4xe!&>c=lfo*axPsYV#S=qfQ4)EzPVr0-g6Ka>7T!Zb z4B)q~n=g+yp{`Wqb`y@6>bmXi0cImp7QVq=4(y8&r@Z>^a5Rx9;XUG9w-BKNeD$l`JNg{*r`vBUmnGpUTQ3t7lqH8NoWviDK^Hfd| z<`uzbPff9MZij??0F;Xed8~lX^XE7r%dYrBzGhb5EPG5}=6-Aqm_wZ0!RSzS?T)&|?If!3A#O$$hlnFlhUL@2QRf z9P{5Y2rCS*vi|zY`U7aZz0&g&TkJ 
zqRbL!iV&nFpK9OLko?DsX`$frupVirKpZj4WT%3)g|MYdciX?^JLY4$`LBU%-YACK@{*SlZVgYX%aHeU;aE>WQ>YE#N zoGaq7m{=N3`Q&u=_vf)np-Bj^buP{>rl=4RQSgvsfZS}fs&HTs^z!Phb4 z&bD~OXBpPG-1GF@QKJ^H&j#iF(d$6JVr9!L9RGI`3Sh#az}4|s`~MjIx5o>`M1B

Ol|cGV>}220cyhAiErV2pa>#3Y-5+C)eFefHomK(oi}(DA z4(@YoN4?4=vHdaHIQoFyzG7xG*aNNzzLO+4yvfHwm5Fsgi=>JVl}Y z%y7$}_Ud9@?uLD7O0>zPI1aoC50fI&m?CF4P7CmLQnrXt(mEt#cf2Vpu5;ePQp<<` z1>SOsAl+TWX*Mir-pkXL-vH&ZW7Ev57WV!Xx6y%4~xosl=LMii%-`E z-xu=QRQWJjkq1t0x~~m(dcU~pUXBoP7UgX2jyCE(aQK$y3R9Z! z*_zVn<+kTA%6kq$_?<3$l$hlje9Rh57mUIx!fBT4Qu9>X-nQM^_|bJF*zE_ccUhBT zln#cQ>6QesQ20VZ3lr$G3tYu&-bW~SZS5)w(bH<%7qVKi7#}_I-dsnRNca^byJ0)7 zEH(;;7D(4Stf{5eKF!BvimsZEK#sl=zn#gOIN4zbUk$7`JdmmD8v66~pw{K4O)9f_+?Kvz%-DpJpDONa*Iufr}rrElO5DE)q%1WI|j zjwh~`PsLiN3DSb+r=WFTEvg~zt&v(N(mD)@Hf8BFC z4v?ot^FQs80Xm;-;9WPIDqzp%HrL|4HNhQlb0rF-93MtNP$BkSW8onH>a>Tyx@t0Zgq2%~x_RN@=4w*tjh2Q1IpR~rS0##;Punq?` zaGMt}0(9fvKkV%YOhlLy=^0L~`fNMsd?nG07)Q>|ir*%B9bJs&`2;!@>9{`}dXsAq znIqWahEC3QCB*VKNsT3eLeeC0;Im!C>-(UW!uJ8>+qK%gk^lJ^<}V@cUS7GtgnI}` z$Jj%z{+2+monwfZutY^gliJ#D&-{xe^ufjx1`d^6j5#H5Z3s%t+f=I@GfUsnahlaJ z+xA>}K}`@qEo%0=)nD>No8DGBy^XOCGDJP%u#Uag z-i&*`E!DnuvMn}$KvXjN@ot-e_|;{!?%$4@_CtJ&qJ;zg4`pv1mDSpIk8VLiy1S7s zk?t0xyAc8DZfTH|mhO^#Xej{!MY_8~y1S&${p@eQ`}6(%#u?+Bza9p#p0(DU*EO#> z=WXG4h!v6Moc(Ys37qm))n~W_(4F(igEGB_eHUC{7Hw|PNEJk+Za+?^aq1xL&b9It z$Yv%jcwKCT3CMJqL6+V`&0W|WpZCNC`I|z4V4Yja8cdi!UO()^(^`6gwE{U80~|q= zFYESH7@#biOv&W}Ah^@?m+7Z?^d#H&paer{pZ$q>(Rq1!At55Z*n86`1Foxt#tff2pP|wmEwvDs-Fh)m)XC`n zdn;;}`>SrhP$B&l)OUZ*btRboO3D)8k9p^p{j0(L3ebL4n`emluYRpEgV9aMl$_Q% zW9?g@26<+4Pd?ibG<>6ihHo-yr_6sW^pfl_9$vl&wDb~)+WY|=Kijl#a7=wMMZv1^ zR5U>v8ZKZ$!-Wp+p67pTlW#MhQtTt)#!;w z`W5OkraGVh*jrn%!HX_GD-+}vhdzsP2HyP(OgFq1p_hEnwF-I(!Q0+~KSn7AOaS=L zcx)q7=u`Y;l^P%cw?L8h2nS#LNeNuAeiSeOn)k_nUP==Tgv?1hgM_rN0J5oB1kE+{ zre8MLzZ?Xo|6slU-r&3YbBgvuf%ZgcY3R0(88Ws?{=9u?wLylK9`r>jxM}J$SypVA zf3E=jD+|qEkgTDRERh?7@g_Y8@Ob;wK=XuoL(ySB^mp)o{C|}$_+^>fUfx6pc=XHz zJR^M9OI=OIr#6{jQPNvB(#ig_iKwAXgjpfT-wtB@h!+WXkDz7o+N55Jfj1lgk2@r< z?3DV)6+xoP;0EjJ+E5^Hs!9%%LmIrvPn9eX(u=M`Bl2r63XVTvhDj&_RHuhuK+D?B zx@*$_ZKvrks+4c{!28bLp!MJU(bdPKpaX8B9X3WWq5Ef0ZtZUbVEFc%CGHs#7#+0W zre+`F`v16s;^_x)1G=JiBJc(^AqL?oeBo6Q1ktBB;0>sCW*o~l6js3Fu-@&L9Vp7-xS<~|7s&Z#e*TgO2?=waTCYCO 
zj?+C@XwrFlS#h8@VrZ@ipBf=?-?hHGg1gBFR>Y5FBA(FwE#&=Cz@mV~+e+3D>(#CZ zrN*<(B5u~h1R-I@sp2*Pn!zG6L%+aJgjAtDb-T8@a9k}4O{Z~6XizwY!RRf zQLX`*Wpj7#sL{jd?73~zH0=ah|&iF z?hRq6flqtYaxf+xit#OkPgc8&BtOOnc|?P9@O^W`c+MLG%bh}@yJ+Ms#qmNere`EY zY`O{{e5DrLTOXyK@cXUZy4iVVzP`3021qheLI}S@JnV880ep$8ccs=Y`B9=-a7_3y zFX-r>{Kn;U#;9f})A!4G*X=0Sq)XJ$k7V>D-oNp_IE9lco^Ptm_*)Qf@2v`Kh2oV)#qR8g5#|ry%-gRMm-W>~bSBl67^ol$(_~H`XH1P$D19I5F+cxp;HSuRyROk<6A=Brjr~8*L!Sm-m`& zzrwIZek}wwAMZfFM7cVy7UVX%jTWpeKDQx^8Z|{AJ5W8}9(}ggn{}o&*WgSH%{2Qo z>Z!LXT6hm|K3SC}v;MkDVAT4w(B!4^?e{Yn($IOH$3jc&S1MPnfq~$Nu;wvb#(0x8 zydPw_;GakG{C8X~Cme4!8kS3poat_a{KS%?khssVt#%UP>SWL7ak-K~ z_B^(OEzQtf*;cL3k>rgLupR*9^eTsdo z=Q2G5SwcQ#Ur}=j7~>R(E8jfGtxYbG@l{ zorCJw`e-$-M(u52eVLwA7_H6b#HSZqYP*q}Z2FOcHz=BWlrjJs*Ic1joN^{N?#<2M@p+vD9a>MEJdQ=|fF`!m&n83<~E7x;kF z-v~g1q*7>3vcWhzYVUD6YV&yFt%KmM1dZPx`M+*0-f!WdlFy{i@w(^2NQIA{P$O7A z$ouUIxZtp)2t|b_G5-|d38m{WL!aj6eA~e*n?;4|S1D+a*<*hGZm!L`dG_pX$*(@V z>UJq;$Hl!6jNkY_uC1T+})o| zrj0o*V3L*LYjiWpMYyiExJQew*qs>w>iynfD&)3HQl7*Dc?VdA@Y1A#qQr=;(UIy0 zSzgo6Nzq*Na`hw=Uo`~y-F{Q}{mqoDc0lWA4I=^WLd9Y=Q?diDIiEu><3ZBmbh2Ik z)KK%3cp+TQY+2p3^?Xn*oef zo!8xy%-U6UGF=Q-6_>8NOneiRBWDWF1oFQ>csQ^rq|}84F9JS>PSsm*(su-n#cr)- z$@KS?fUJfWZ;9Z3E9C%I0z=762v&*frHi$kfhT%S5MzsLl@^cF(QJ%tV&%IyI#ox? 
zeZWEsoQI6%>eX7?xpt)8w5}aR$yreo!A% z7I057x7b?QS(7IDbak?Z5Pce?p3XlMlcCP&lw`x#+u)~^=%kyj>*|19yGcAOg}kSCioWKZ2mj7Kv(9>PR4q-O3B`%cmR6D)x-H$xA0aPFpR z?s{hM2@W`6?j|wPsRStU$!xot$V+{8PcmKn;IhV&Dpf4CQuF%^c5`Ae>uAgvAQgndf- zs_ih7{`Hr4A#?U&9l9^=X$N(ivbF;FL_fTBu^LaYny8Qt4u@nx8!Q=W~rhrBFOBp}*5Uod2xkdMf?p8Zk+|-7E!U4UmPboJin-@$F zgAyM9i!QLW*yntvwTg=6!O@wxBxez5!%1W0uaAST-}hkFCmkRiA%9!zz{;xN2zl`a1g(+Z*Z-@W8Sn@a3c8{n{~NXeZcOd!xe}R3 ze%EE<$(S?0VS6b@M;Ub2?s=5L=M8ELiY@+GBoDR5KR zkkLC%e6B=Sda_fS%=RF4AL>&sQG}}2K4d8M?4dLdQ7G|6M@=l&tn;rItt?rq0YWHt zqBD!CV;b0jF=Pu{obo;F3Tac;V`CKi=Yz zl8Q$zsrKi5yZS>ReERL;%PW3B*NBVT`y*Hd!8Zoz z#n3;3Z_tYvG{J4#*6nt)n=Kh=pth^>@Ps5PsL3AC{7Wpdf=Kv)zhM*=*3sCtGf6M} ztVz+Be4Pxh8WUjX*Z6c_JG+qQ<>|U@cbvksqphOV%>%5U zjA|<58kbGOgwiAjBVM=8B^vG-0YGhQ!h}!IDDHjohz> zL?&blE-$xvg7!9n&R>Y29quLe#mz3KO;=2)zg!|e3s2OD`$yD>DhcYKO)I5X5OM*o z+|YzBl(ZT$+S4xA`*MP1B0mM>R3u6YGYUxB;!JphDk`2lh+}A8#i$cv&c(`+KuXs) z$fitZTMk(ZrI(a+qEg_pzRd9ETS{GN9-aZkGg+u(0wQx%=#TFY=ZBm2P4m zzsdmu)na6@fkg(O|9uqVEEmNor` zPLY%dGZnD?{5X!PVr;5D9&F2qh3vW(qL8BKafVN|g?GW=-U`F`VIbPG1m-cUM`9dN z4Hgyg)%Cf?%eQpM{N0d(#@;7Zc9gQi(vw1Z05C67vNABnR0!d>tkIa;qrz3Gb?-Ji zCFBUE?(Mok%h9f0bBN+qYd_@CFd6>Y$aOueH#BOm`!f{q$>7pxcash@U8gWK@U#jn zb`i4flY&TY?k{gi)&BLVr8jc4q2v7dvyusY2-0lyD8S zn!HUrAs#tgp&cB8?RE}FcC=Rru1KGBqY|d;qpWMb+vXF!4+2C)1L8cAD~U8kGLA^@ zN&|-Wp}a?z|1I)E+d5MG*K>ywxBz0(fN=; zBZG1c+64LjY>ChjlWIw3^X0mQ#DvhfY8~>4GfI!HOX>aZ6^Y{Z!>uyDuMDyl#!7yq zRNcmTSe#B_H0H^(b2x@!nYfdty*>(LCK2$QZP&+E2>Za_X= zA%d7xY!u?Ut)~C;R&oC#UK*W`RHsepm-Er3NCKw^k_1chaqG{8+sNTezO5&N9L8PY zQW&b)8oEayTi2Mg9H7`69j46lcy)dze4OlAXADgM1|D*>2nu?J*4KZ$K z5d**oNc_WaK&}yDk-epq{j?FTy|C4a0E)ssp~TPxYgoIie8qpkurL1JlKu!RUL{pU zq|Ek1F*6L9M`(>)@xN;1qF(`ugUrL>C;lc^*M#_*#l^P*gv?BAOLXx zoq;MbVAd5kY)ySN1~AxH>y`8x#O7ni_-1gADnam6%=Sv}Uj3P55rJQpnU&_9Oa6_^ z#mF0l!rT*Jj(+{VKq}$*F#PDlUFsOzIDDrvEd^hB)X2^GK8#FAIa0LyKU{#DD{rY- z^R>)T^~{d!JgHmCqx;dLTgr$pzmfs+^=t4`$sa4k_c8EJHRq&p*8$Kcbw&d$xhQgW zZP!ln-J3 zryOgBz-RuL2kFK$yc~d)jG9a;)TK{O;HJy-kDn80 
zZ9O45mlR;13TQeG+3;8vye2>Y3)EE+YrnnvqtogEl$XhJ2stQ!9h7-+P*Q>`MPoGm zvj7#BRV9sx+a4RSd*!~_vM*(Y)4a0~gU1&tLlmE5TDkfRpWpXCaFBf|t{W~yvuL-D zq+E3dDin)jD9K=!lC5d@X2KzDHP!@+{3{I04LFiffxXcK4Z}Z;b1G>=8MqCCu=q$W*r%fkbef>e}L4tgHS~J>`*GR z5$Hbny?_#pSg2OSboiR-V7@WRj*(Eg;tMIC`+*dBg5>VXTnKz5fpo80ejcdm#TJ!f zUq@!i-@4lt%{M$0;tyw!maO@yHp;qzBDpu`yq{p;!LP&DowA8EKq+^-$_K5MR~WQP zg8Z4f;czK|!iehT{DgWimHX4>;esqUq$wYMP*?4maZ;mp*4)W{Uj21MNUgWnUu_~j zT>~Mg{~{@u`}rW1RQ-vksQxpTj3tXB5L#>nL3NH>reLwlb)WYzU<(wR7uCBQ4cRiP zmxrq^*FA*fw zxNOHiNquDSm4SWrd}_A<`L|F(pN6{iRo?p(Y=Wr|0EIc-T;{P{%vx&mlGB=~b6oA% z=qVv-d*~Wf2LgEZUSv!iZQ>wu_)Vc#2Rl_K{{d81~%5>_(mP#7MBdOXIvEBR(n`Ps>Q9gI0 zADz7!%HC0WI7P(@pyFD3|8Ue6;6v&HG`BAD*sO(u4DjBUuM} z#TLCUz25-&lV}7?;rOgt_fGP%uhyP(-trkf%6X{N_lb|S4wve(Ah~xv+LJgdRN8D5 z&tJJ!8$CqFYn|~^+k1AXlnt<#aT!-UT>MHB$ul3>juI!Rg zXorhYs~r*2LK7M9hkyPSw7$kx3CgV7PZ&`9W1`_fCmNhxxI6(Z1DUT1o$t~5^MNL%w)%ilPO zzvEhG(T#$hN@mC)%IdRb?lM26B>7mUqor1wWnUq_FNpsm6iy^4)u_ajdlh-LJNpk} zRb2(uNY=TOi2=FrukC!3Q1sI+XHA=R9ElzhL{R`%X;X3Ajt6Xn{vSZqj~j)PTRv+y zh+#SN1>wtZtmm?l;}dhG#kF$XT`VY}M^HTGI<(0vR&buD+@vA}|2gBB*FmCsAU#JX z@@h?k7Xnm^$?@|(R@bC(kn0-f(6iKneiRyq!?>8M6jo0GCPb4JP zPsKlKv1s5sdz@{e%89pB{S=_kDxECrmv^iHsl|}CgZ?hJm11UQns;J@?9(iM&lJ4j z%-2i}r0QNg$8sbkurl}_u5=% z-#uJNOZ9Md+;n*(a630q7$CI4q8)VQc2%$N?FdMXkrK>2Z#S3M7Y9=z6An3P(Wuu0 zMwtel7h2|9*u%CH34h`F6i&rc7`0RQ{k5V`CzlmRRA(c1vEMfm_K}0$Z^bwyLEFSA zRNbn||F14Wi`*F=&8sOFn~q_$y|c0nB}uT!eo=d=^haQ%4drLyePwt_14h_y6hNNA z5QD!C7%fpSsd&_^&oc;XKLkMwJi|neCm$YMu|_B&`?Myi<=R6$$uswZQz;@IlP4lx zi{YJ|CS%&VE%y|&i38nn%BA;Q#yR*j%3lft!`%Rx_dL$gpqi)BP~||D2BzRaOupkI zaZK>*hG-U@di9Do93#esSCJxqvl3JVr&f%4X+R%b09_qtL&EbA6u6c7|u z4AHfsW95@xnuh5EB-aBOrw~whEI~R>avh?smW+q`0kIiWUEuoRs4%z8mLM!Z(#C&t zx`E79rjo_)ahXfZXoc}91ImtZ!od8kJV~?9_m;qL9Pn$xm&(37l>#9ikkgaphZs~2 z9WZpWHTJxIX6#3!WGLv>w}pNSln^rflJocZX__Pp%@HB!+X{PEWS;@I1YwS2Enx=0 z*VXbJ4wke2Qe;65B^MN9Y9gG36_XW#B#NnwsIEJx&wkg1fQ4uL5L)ma##Y6felDLd zLoxmHOi=FevuaRg!BOmh(w|F?JaoyikCq=46@?aIjR1LOC*sh{bw{RCE?JGL+>FAg 
zNa~d(rW<;~`_VB#a9Lj;OAH>3veqgFkO(tBMtL&5qpH>9kEj4@tY;sdetM%o3L}T? zjpjIEJ>r0e15Tw}vEcp}paDp3w`CrI#e~Dfm=w`E&ktZpwJn-oqG(t+o>L_ZukqWC2~m&N=LKb510J@W(VL*I&dseVowq@nrM0W8Od4|CHr% zy=q9pJ_RM$NFcvilDOk-xw^k~pOBP-tydiloS|lES8lfaBFiPkzimw;{^q zP3LgKo=1wNeQWbc71Cp5>M1jfV#3V?qnv6V3KDU;dD)-lfnO|QiOy|1ise6bp zqpr_;scE(kt2;WslZiq2`P_5C$$Dd}sLkpwx=0TwUy#e9(iJHRJbp0yA!*kK5Iv{_ zc+&sSk(?Bm0Ovz(cyJVsIMbZP>Y_d~1AHAnKMY50T_5zg=L&XCLd7aYP*SbxX=$&DhKl7`8rePdB+c|)&zf>J$Z@>LUam;dz00L$QwRf_A2F!TSfZ4Pt z?-B}RWZv_D{%qDKLT>(tGZ_-b08IOvQ^hU*Rc_`}8oZz&;I7nrEz$j3tsF^8Ck&Tf zKSRXp{Fks)1et=ARsj!Ucw}nL(>eP5hAC#lpu4TgoU&QxQ}b9sKHtH?i;01cRRUh8 zyca4erlsl$aldBwM)i z@rcnS30YIP60MP-(mU%1u_iG@Djz;h*5NE+cEp+|S(ywTeG>se26HZiyh3;?UKoMf z1M}bN4M3WFh0qPhP$+X?@Wqj*Q|${9EY}K#ZD^S2y$4W%<{@E{@_7uY_EqF9ClIxh(kE5XSWG?I9`C6g7)LFvzOgtfJU7o!J>E&3J;a15l7Q1_-I z;GPB{J=cazSoRted=UnEZ8#|ojnIJ#Gt&5Wv}Smn1es?sXqjFk^E{QkuP05;)IK1c z4kT0wIK5?)evun1@|09nX1qO?(iP%Vr#Vtd9}FJy0KU^MO*03xITY(AMGC6BWY>0+GQ0AM7hhuu|LWE!!owg7sR? zv#S%u(iPak2ya1%Le!iQlh;&5W!>N3Bgn$#@fD;o(hzEP5FFNhkOcZg76Fb4BUM?iU@R{N)WXhoVc3@9=@6O|IRB*2Y1 zToYeu-V{-6Yx6>*@2$AvtmO3gM_&oP1B_v@30z6a^;vV8^SKgTdki%NQiy5`EsOeE z#3pC4yH@nOpP~q(trF5|5Kc=4B!Odz^JOp+8YvN$P#NGp6_LkU-KAa=fIH!(u^ z5j<31_@2@f%jA^aP`8|F^n9nitBAyjp%J1H`ij6J%il#6Q8GW)IA!&N%LMB|`q`V@ zUPL1AcfCpD;Lr)e5TEb2?C2<*?!ThzAfthJs3n8lILSiuJmay#LMY=QcM{m}CZC>R zzSqfQaNu*l36c$TUGaHp-lMYL<`_gu7T)r`xmz*Yds z)vGK{C{#Yl(7g(x;>~M(<$&q-zOCe9`wGr_HZM=HPTlx`!+TnJ+Oc!$o;l^jXQsiL z*!?tk!X|&rGfEmH_z2D1 z;qIgCxcsh3EU485;5F`8o(`e4CWj=Pko7?D=l;M%8I_}J<)W<_;R2b5jg50 zFNNJC_-kbBdogZdl|^NtTXa-Zsj%$y)l;d-e6PGQ|9GZZ$g;#*Xzmo{?mDFucmAHd zqu7?AWM+X(VxMw6OD!Cw#?4BgmlYd_(_n)=q0C?iZxl8U>q-VFcm@X{QpFa%CW}<+^FyL!eWTroL-_G)9(^JBMFK;SX~N5@m8DN#|^D6|ARCs zS1p>>nM+JuTsW*9Hdwonp-nSgY|)f${T0Q}dhZ%Ng38Sd8zqWYLKHe(sh=l!jjR&P zIN1oxkrGWTOh58jHnH^GHj%f*GV7ts?dh^#Nr;3HqfSNM?4V}*%eE?grgm25+I8JO zQW3-e(Wf>lGaK?ck#qRzlD%uDFUl?s+q5j^F!7tN_xIiTo+N&0$lX&?8i^_@vd}ND zaoL_i()G3yO<7Llvn1M=RHUIODS+7Y9AXW!vJ=V`a 
zy@^aH^6<_%-t(MT9KcI?BIt(&b^UmwK&@)gwaSG?S)k(lyzpppkvGpT7D1aPKuKbOk5k zxVF1d3u?ziLau0!WTZA)YzA-t0lR4A#Cp~7XOGx9jd+}Ub^B8IOCYQ-@+is-bsj{qlxO6AcPEdqILp!zl{x#980 ziWMgQ)J9>(^mS79H^z#BtL&wblX>FpffmvPS|jcoYAw2OG8Wp4S^X@smR7b3akZ-e;G^S*w(0ahNV2{@ z7YsKGj549zWEYTh^7~zK0wqg`e)9zspBian%Nj3ZSlj-Um6;%HVg^#c?@ml_cO`Xs zy%@oAmezE2k(<%>@M=x^_sAz$FTS`^qcd($tbhHXsy}TFzsYrtXi@#K)Zz+3`jN26 zbQc;)?OFcZHJ$MKo zqs0ylG^dkPRdnnYTYfWJto1wy)k?b7lk!>OaRf!~6%1)R)9CvFJ<7~k9=;o0qP%!5 z@jJFIBAS%t7O#GDcwnMqzg9{5ZW4%vCOw3t+BSNXV#tMaUgDNI1Q|FiI79<*ZFnEk zmiOHNP7`E(`hOU0CyydEicm#TEzo4epN$Xo2YOw>WIjIMy$v!4^c@jD`~|PeHF{p# zg*Go~AN2YEi=C&uOz1Fl?u@rP(;4ry#e!i`0#!}HPxg-jw2j^)T;2O{Po4F31z8J& zDTI^Le)ApC!J|gst~9_KCf%2#t)}hUMT7xR(;0=IJx!P@5oGXgEZO}JN%Hr;XESM4 zd#avf8?W^OZtO~A+#g$PW-=js0?KQ5u$EVKjkO!6>bvTU{$`{H;?r3q=DDHE)|l^Y zBchA+OJtoH2QP}~EW17pWvI?Kw-eo5-BZh^zd)e*n6A6%^`Vf&N=Z?wTXhBH^8Dr} zvaYKgFRr|yf>g&rPH3f_^8Gg-*hka#10kZuq9?e6+7Xa-3+WxKt^8ZI%JD4gvxJ0Z zsR|l%)jREvd{OQLt%d7vJXu+IcNDb99!v7z=~NFnH(qOW=ani&t9(~M=gRJ;-7=N4 z{0_pDJM^>{@*=<^E+&+||MNO*2ac+-IpDG&Tpd#NV9DfziK*uQ_;e1$n9}!UhK&iG z^6`8= z0Bg0X%zAH5<8`Jxr&BD?-#oQptync?O3Gax+XB@Gk`e%?^~EyO#1X!Mii_sHxzUs_ zdqpaei^wh+WcEtEjs~1fnzkQ4yi!%EYSJKVUT~H0wlV?kg`BP#E*9sDUI!l_kLtE2VXwhGM9H28356b*p2ij8a>GokB&kUzfw*EJ(UVh?}8rR~W= zYOT|}n1&yVQ88UlLRGrRc=qQC3iP64zM1Kg;Z>BgQq9r|tgjoz{};KAQwNG)9(mvV z#;hR&%0}cu4!Rf>_pPZ6Gk&kW-*1QI;_`bv+Axe7T<@(sb0r;kDg*B1_fNn{GsSA? 
z_VKrwm2W?ZT9b(wzmTKPu6SRcz&7WZ1 z7RT8_9Sn8iyVEkVY4=apnQ*r6uJ#UJLBTB54W<&Di)-fZ2r+5zQaDt{wOCSzclIQ@ zZ`3x+W{V3qS_37~r*X5*=Bj{c} zlQt8Ad5t}(eoFN6#jAmob>(Va>9P+hW!lvq_LPj&ULXkGCyl-ObB={W=NPfU*>eEE z_`};);#p`Z58K(4bGTfeVCp(~Qe2dNn~k*!FBZI8vR9IjCE`J?V||&1wxnP2bb;f; z_WXNCDi#rrAlj+c;&-}O(F<}obTk_>DhYU*yGr&V5B*pAKHdpQnI}fQZoj8w=iE&& z6MIiD3~x`O%X*Qj{v!F-!cR$cw3FU^z9nK!Z#c&YnRc0W%tbo zG$M6ZIt9Uzkkf+FysXI-R$)PzhOF$*eSCArm6VV2gVv>3@%1kO2fa@*q&4~&sTqha zj}6x;w>XUuJ~^x`Sf1|lBm_Ts3bc{FpG(%Hv8enhuwGEGaTn9GBhuUji)f_PCcJbd zidF>tJbysl6T3#nr)Ht4ayW@oZ&rPV>&O+ac8H;X`%5H?B;i~FVjMu#nIPz}VJa?9G8PQBSMA>9>o@*lDMm>EJv@JYgFm82W z>Foa^pw^!vP1uN2?u7z|txiLD(m?v_nxqOIhW!DZ21k~c6Dv&WWjM(EkqXvYBi{(s zELP?Ym^&~W5nqP|SX%?KAA@S4S?X82D-}hZE@2z>(O}@{Mgv8L|8E@Bhyi;J7`6e2 z$M!r_*gTZc)E4#UpK14zihXP~^;+~^D@m%gdY^m{E>_p`sUOT7BNj~| zrH|*xn1UC|7Kuvm?{4}Yn;r|qB@afV$pmH2M6CI-YX0eDpk=DKWM^G$b(N5+*?3l~ zaFd{o=zPRF#LP#T$?ou``FyuXwbuG+bBl~K=~N}1bgvk^@nb#x_3V!pgd%>n456qt zJ%xD}icqA^--PA-ZmS%@N0aM(a@PWQ|F7Fg`tENY>A|QO(pus8?0ZMgiwA~5z9j(W zMc^}G7AVFhMla|p>|j5(*BygFCc1`RYs?-(^q%@^V#^@ZMd0{KHO=cA-J`DD$4*xa zw=<>pC$6!o7a2`@G}3yEpBl+L>{;ccUQTf4cbLha zri;zgsg1hE`ph+Dghzi?P2F?ewUIgda=K=j(h8Ed+1`<9%r{2U6^^!$BC6p&tqNf8 zT5Zy8Wb{NOncD{*GG7eGH(_6#XMkauDl;v4!=#EH z6>PA{tW#nT&mU4NbPtJ4Clr03ps6AwV*vH9%%?Y|;Ht9wym`KUB4K4?$bgvd#5h?w zowLI1uUv$#;{1bax)?~j0jEh(c!^9l=c(d-=<@((_C&0R;_{IxTX05`K1DeIL%utQ zkVp`%PyIC43DeM0@z+&DsAMJOOJiw*uzxikB}odssdn$y%xXB<2Au2k_d(q9&t9xL z4*!P>(2Q_(q=HVv#~#}bMT|2P%by8=%1j28knJ~e!$2IJeT1>x3s{COWV35E|8Q`! 
zj5J(DZWtn~+JZPZ%EFgi#9R53?}7Yb!&PkM*^jS2XZBXWN|MrwRp%7vjzdOi0QZx3 z;a5|228gFo15nA~L1Y9vub%8cfQ^K$Z`fa)H2YQ4t@nc0Iv7@i8LdOoBIoUB;v|;c zHwmWuPsThQFE3!lvFaGv4H3mAzDO{>91?_=WP?|orrdN$+SO#}{X#bCMS!stdnh9v zGeFtS`f{hg8L%z>lX|}0fX4Juzt@!T-jQmVp?=|r_1GdWhPl|1M!<3lv)I$|rIDWL zRCYZ~!y%9_!MdxI`Q0m!j@o`WGKiaz7am~I>U`y1Ix$y4iPZc=qh6)!d85U`171jz zn?c_|(KkybN6%8wsLdsz2E`I5y(BxKh{BO+ZLS4PZCCkrdq^X1n+@Jsz0vd)L>{ z4BKFSGeCND^rWTQfsZD@r*E&2=^Y~({Va7%@zU-foQ9bk{^}5~Sk6YKy#iRgGS1PH zMg3zC_{pV?5vM@R7rR(=rLu073|JJ5=1HrmnD3xs>S=>T2vkl7tUEv)!f#1sfQbdS zRshMql%EWhKy3hM;cifBjhzXU-4K-Fw}HiQaO-AU6q)Kcz)QGyNP{+;Fyc)ESr1cu>@c?$QT zY(6Px^~GlO-RZFQadiE_Vs7*!v4xShok>1chI#4hX3YCQY0*CVi2%^k)tqoGXNJnt zd+(>*d8hK|E-+0LtTYN=v1Dj7r>$N#KyZtnH8;CIzd`iA_&SV(1Xu&Xa<^1Tz(xv_Rn%JDidC3|6^nT{cz0hDj~Uhx zhn7t2wpAKzaGLt$Q(K`vWUw@g6^g@%?@}7znYT5>&FR*X1udsfSE39^JjQR`>QlBK z-Z~@}0f`xaJpLYp6tWmqTt?G6lX*-zRy~wa=f5+ff+|W`X$nUV@bNWl8+S=iudZsw znfGZ)ZTeO0aKF^$;{}J*RF!5opwq-;2NaS!7WBtF^*?K#QFPU<-s=y4mP;9WV~>)J*k zG?HR`x=^s7;?li|3h|wm#X}K(v&gqvEO+z($bY-n&+R?ZgfYsm6oxB~5aX3j0$}u*L z-P-jEt6ySfVqKhSXAPw~-8M!IT3ntp%_bbz#jjvv4WU3gGp@=u*L%q-0JJj=7&Q1I zs07*oTywXPT0_ErV!b49FLo{>i~+b@UW}6*^f^d;gh0qB#^Og0>U`+XD`=(_@0a-* z>wW`bKyf9O;J8Lq9j~mF_|$~z#hh{_^H;DHBr9+=94mW;^hjnrDOU+@B}=~)&Nnb2 zv#{|`n2i0pB51`>Et4*VvkZmi(R>x+?K-&j;P6PcS+*Yn2i)BZu9aeC&czhKLM;{xn^-R=)=baHesgTKMPBj~l(kPi#MLvW za$QT}%5~3)H={$CKYQOm+z;Q0g&vV#5?*X57T<#+lLIKlzjZ8)0;{>|*_>Mj?SpgD zen>WTCox0SSNryTsqq$pI)p@dIe>#jV;=SI`2lZ_H*aQXYI!ic0Y--Q?Td2#V!rS3;D z@y`7JN~?QEDovUiS*14m3d1+h(|zN{ia1jN7EZQ@|&joW8bG|ZCLJyT* zrM?ePhg8bXCMCkA_zVRyy`A4if5DuMrjczg=N249X9^D=H!|2^sLc6xsv0;q*V+$x zh1Yxtli}R)uYzsQ62g}u;3Hn}qv_b=r=a5@W$=%29G?6=@prj}6hVYUQlfcA zq7d9=6*Ik>m^|6{@i!@%FFq!%|`m$@woFff`yfv&`$Q zz<$WCqF9f|+ufiqVci{)+?{UxDrV5YP{samE8oNNa@1};Wjf+=S-btpgyQ~^EfGs# z>noirOO0~DSIvA>d^!i`FbRky-1E29@&S)8PqVSyY?ICNb0@D#?P_MP@2APgKg+-N z)B14uDSY3VI_66Ry=LV533F#YB2`;9p6>S*>!f2AC-NwqWdzM_gzoXd-ke-)dVCO| zg!_BT2g5N}gz^|yCZf5rsijaZ`%^F(JaCFvN{1w-@unP&f%2<2m$tszRh8&dyjHw3Ckm87C|r<$nptK^ROv 
z!sPkWQy_&wKVrY*gvESUoNtfpbyf*k=i9l?ZsT}wtuvAM1Ygw@lV~LHOpzzJ_csuq-3cRgOHvkxb*qmaN+M9~!hn-}qVhcl24;W#?ca~_&(lyKN$LjtM%kxg z311-C!@+oqzgPc!j`p2pXlCxmEPmf0EILJbntsi&de@%sf4>A`RXoSP7ytcL7?yBP zXQXub8`fKCQa#lS}B1fXNeuQ4#ewfSI+>Df~|xK(8Oo_5Z7RTUn#4 zwiY}UbGgMnsgi%DhgTqx(K6)5W4SoZ(~(8_5iP*eznZ;u>TcxqRKktVjF(B>n(^nX zXO4<6`&dvHMyV$%qBwuQo+wULEcC0M=c8bkBOz8rXnL4T{;I~D$))^p(Vr~SAOWEq z#k(EI@xu)g?;c1&y04wa+EaED8f?6ZiqZxXCoLbz`GY@+#(s&2U_uD{r%?g^J}KB zj{WG6hzh%Jz87(K8a$f! zy*M{&enD1uun=hXrcE~Bq2Kan1$7Ww&85MdGI{1zspv{xLgiZ{ey`o9QDm#~q)k_O zwY&S*#aFHa>pF%liUofO^m`BY(*OBV&tb1CoKYY?daYBl zkps59v9qF>5}$@Q=j)>qbFHSZrbBU2@LtiMZDyUvK&Wd`eI(EC=0I*boEkyIS(Der zQnvH`@SxTsQzRCqEH5^9_|e0eMEF(jfcQdnJ>G znf=#qTtL8jb5gYM?r8s&lw1O>LF|*{iGgRIORU-G81<@=4v>}xey!Z!wjmzYDAr$Q z424~+Gbx9m9nCBF6LRfwsbiSX7kwlXs!HZAH|QKLfh>so7%Vb?yfe2Jt}5^+k|@g6 zwB<}f0OjH1TBq}ACU*c(taMJF?+YjkmA(La6ku)eO$p%U8+^Rz7o7Ye2Sx<_&JH3( z!oG}qf&eo2e4I917aWLxc(}!Gd&3>BZ zB{IFJ9* zpClTxDti>PjpyVV4I#m22ziifsGH**N!`=3yw40-iU*#VUyv&-`|E4PGWG5jTJ@BC zLnAYt|N6Rt?ERZmE|ZpJmXA-Zz^bmj3>K}$5T~j?gUgyA>~?p9e4@fp;dHqv`3lvu z9-Z=k~@&x9d>UfO$bWe!9=DeTdLAE zGH&jJ9d(a6gs0j0FVw%zX`_j{FepqR6YrL1(~y|7Vq8Nj?v9o?w%FA5Sq&^mZd8ne!HF>biV0efAF3Sv8^?_c*IwH0Fuo zLLm)D>U+n4u0SY|QMqGdt~BQv%ZlW`&SQe5aH%6Nve7GTOFWsg0vPzfbH}0;=G{fQ z?n#ELL^Juw!Lr|@5e3JjJA?Jua>W|{%w%WM5L0mtI3fsX!6&lb=D4W9w}uchTm1j< z_10lgeQo#n3@MFDHyD(JbP6~U(p^%DbV>{$HHd^F0)imj-5o;=qKF7XhcrqvfJm48 z_I#e_eIK9qdtJZ3ye`k-oH=LjefE8?d#!a30N=K!UO_Db8u$11*@5pv-fPmDOQ z1#0^X`n@6C)U8qX3*DTUps=bjXzQ($aVLx+^D*2!H8>{4b2Ix=MRID_laOrd8&e?C z`+m8IZ6Y}{W<(KRVzB_PO4ujpvufg^H!jVhdPB{6+2O(^JMTh4_Fk>kfs)j7cLBb1 z@rc`fDOzDs@2_|=Ax*?>k0*-!obGx0Qa~^bu4do%@CN(ic*R41ByLIV)*f_~yJ>}+ zXqYxEp14>brxyI*aVYXQT9nExAKy)SVYBV!zN0x+eJi$sMM{I0DnZx_4RsnQ_8jkO zmrIv=_Hw}Z*P`(L#nCI62JQ~5*n@hl@2mA=1Pf}z93LqoF-#K+qPY)lt2Ve1^ck0)~G!TX`l! 
zu!%Cwwc?3*L*YcAArcf0l3HEw0jBQ<1d(?g(D(5nJVN@?LH2T?%;$?4+-ioPd`*H( zWI{HOG?QoHW|348WT8Lb@G`#>SnsX(1j-?k#@y6Rmqm(Gh4Ncdsc81z*=8mnsizvn z$%4J{+ENDSk9AMvBz;{Rj^Ul0&rXfMWdsRPHy+4MlauQ88fteqeMdYdnh60-HE%a{ zYkyM{cMo3wP5u3AguK*aPbWtBu%vfmHZ`LU52v2kDX=2#DR{`Yu_~4ZQ4qr~%Rww$ zV0cK~z{J8qsqUL|)HqJWtwz&Hdy|_>c)kv5^NgC2`Legu4~4sj^_w-l{z5!e6;6Xr zS4gEgb~4VtInf%*c=fJBopiqTFP#zn)TFx?j-w8A{8iFE8+j$!4QUY}xuwnC{uTK? zdd>9N)4=kh$1X)2Io_9S$-3NtSLMmPs7OX-rD!gdCimd?68AdjnyECP)v`}-Kl7`z z4t~}tcodjWySf|gsE%`mz$)os z+w~8uA1D2Fn~Wb4z~fi>f7I!sOwv5_BzAPkHDIeuRmVWWu_Wdp6YA1xcS1sw&HC;p zk+-eJ1r*(hH6>JY#f&zSK3gQ;61EaV!k}x<#-X3f9gUVABd3q_wucBwX#*3>m%Tvi z7aUA`!Rc@3|HM9&_&=IJ^z-AG;CAdfNtGHKV|k(jag~#^O~2sM=e@$wY!h<=CU+Hl ze1gGt=1$Tlu>`NZwDSsZ`ecJoZ{kfV^k-lAkC-eM3?mGEi8pbi5tMnw{J-X@JV>&1 z4x|p&^HsMe=pfOgA_c=nC$HJ;$?R&B`3lgKjD`Nl%yhJLyJ=G26F! zVEQvf!-8W*74zmc3XL}ADcJPN4bAhFpa1^tPE>oRX#$vqQyoPwWdm$j8O>tbCwf9)4g9h zUF64QMR>%)6qVAo{q?7^$Z_x>@n)kZlZwH6S0A_eynE@Uweh5`%ATv@UVRS;;3N~y zw&KfQb0hR`yb=(sR}bsOToHx`{Vu7qA)8#ECxo=Zm!E{UUdekcrmL-o3zs{OeW^h6 zVC`WRrz&m#?`W6ZPIH)bYE<|COiu2}E9BETe8P!u>z1}!-;>hqunraPQc zJ3Zg}knVlCXWO3U^81&_Am~v>%yS>8y9+bHFF)9Z*i6M0G6=BYwIWYJ z$r9PInpH+VW2qqBw64j>%l0O!in4E9_WXOSA3$?Kgd5*=l@=Sb)55(<>U`yM(R&MC15FXt&r+YS&+oWqZWYlD z>7qs`oN{h_>TC|LZnHbBMpA6oy9S=#-G^4;HsQ)}>$IeK3q$k-D5f%&9ZO8T$d#DwC|}{Ro5{h^2+GR^i?XApM_^SQB%~$9mC5! zyP%hwOo3a+nHV06!=6w}i`R>Pd${KDl`7bgfs1U>YSSkl znowslkHr&+IbAM1wA(C-d_eFp!ycun#@&64YRI6#fH6fML{Yb~4 z*2z#M$ym-HW-GSjto}iB@VS9?6byLuCpq4P_=9f@+vXP%-Zp3HVlsXkeE!Wkr5kPIGE z8Xgw$Tt+kgkPw*wxTqvYU{YoN&b3>*>oKt691q%t9mg-R!k~;TB0si*`vE@yQ^w%U zW-ifs;Dox)T>&X}H_N^X<|ZoIh9A4jlchEi<74&@aE-z8FC5+%UYx%RZI z&iLmsE~AYQj3sPgG@JW6{!#nRdNABLd*VI4?Ev;h_IxJpD&Kiqf!#t*3f5uYgH{G_ z26ti7(V|anEEe(Wv#H`Gi;>J^ERt_GCz)qzLm7>;Dy-EbAjl7fW*MA+Z`U%qV7r!? z>ho=-g2H!wJr5FT46Gu&WWC}Z92w>Hbt)RwJ>afb^nTc!AXOx{4IMOep2;3&9vOU! 
z|GVOkM+TJY^#mg^)i>KUI?`G3K;I`#nQFj0tX^xF8*v^L?*Na%8;R(oH+Hv3f6bUmsC8_UMpK zCL(pb+;)Ni2SLvO6>~EvUXXZXIwiW%klnz>_C5$QIi|xeg0cnU`?TD{mH`jm+usmu zSJmlBPRlKyY&d;a!CQ-7$vQg&#TMD zs?S)~r;*Gh@F03w?mBRE;OQ|t zo)W|Zdb{)MQ&pWn8CGqmL(F%BRgE`JswmO@7!RZMi3;Im3$sws>?^_H>d935YBu`~ zJ?Mc^zn4DDX%;{|WrZj%B@qesy%bkX5uu-8f1`eWvduhSFkL)0^DM_YXeEQy1zq&{ zs8IEwFNIGz(h!nq-!t?u->k#Xj!7!z`bsP3)#xaZ(Jv*uz^F%(`{8ijRdbf&0q9_d z&LyFCA>(B+)x2^Hi}tl;i?Pc)x)e1i4FIUH<&d)fJ-P22fXRJ-_N1xa6=X(E9XUde z)NrIdS3*u5D*yOiU7>;8q-sl|#D7wX!%$W0?e(@?PrYjInpKijqXiBB-&Y#N0!H3s zHZRwVz$N#cm>y;Psz;bRn{@QSs058d^uEMs|M>K(`001zl{uaXc|vDE@J*Q6e}^9UM)6 zb^oi)-}7-19m_bndeV&8|38U54~5DnCRNT?2C1J4x{_4EZe z%_;{RRi^@;n^^Gge?hVAh48}=0ou>V(QEEsUmlC<1gU}M0JYV?oZK^i^qUOwl& zIlu)W%)z6Z*Qvm1*TGS{9ALcp_kiTc8v_LBYjGqAxZ?W5FHir63lK$ue3~ptTvhw# zt1r+olNrLTv*`}X&1qs0Hn!-mq-R5a7Z&Zx;- z-}-bA={8;oI^e8@O)zV=H?jVyf6p2T@T!p#NN*y?A7F<^!?ADApO2Rs{|6HLM$dpf z=-;=9R07YYnA;>&_ZiGeySv=h7xFlmCl@qd@c_z+dH{|la{)Wfzg7@}i_#E`qX*BA z7uYL)3mzyp_YDU)%hD1Yv&5RH{yl3xnScv@{$8%8eVrD}{RZiFT%?^Ai(ZP? 
z`r6sfbzqZr0jxX+PNv;D>W^nGFODov4^!RNQtj!adEzh5cC_|S<_Gp+?`|qZoUCRB zbLr$f(D8dS!>ENF11)jG#9M#;KzViS?7yv_X!pPX{1_br!R;B=t5vB$;JigTr2=B{ z0@mQ(>otO=PM_36I3F*5%?rOx@f(3@>Zpe)1{tu)9;B;3xr^Sws- zP=^L7AcYS5e4F-b$Xk|4wdOtAEC4xPLqD|Ub#fdW?Ohr=PvXy-4{C$Kl>AZyV)FT^ ztn9dN*)aXRey(A0_iGCUUpjA*R7HiR(tPDs#={i6Vb(;x5Z42Mfx{0B{(iKF4ZAuN z&iR&!ZGQc_!}sv5xHR)6SESeC9kPv)=-d4!Sg~g=aLgO5@$&@Qd^!8WCzmQ@#DaQA z1#8Edk5alvj4kV+)6@R;meU{m>4JkgJAX=z)W$c#;WykzrVNOPC&}P-`5wgjQ(o0% zx01FcUq_?R!>b>k5Z-x*``xFZ;H~VbpnV4xELKax>Kg2S+m|Mx3+Ji0Sf!rld_6MW zxDP%Sy1L!qWAVSA6gDVW82nJz=S*fbV7fOs#HH16qqc#$q}Rr=4ueEryyKW#pVlxw z+H^&g0JUHEAZ`^EEN(0sD^`PQk(M}LNk-5-)CnyK=%=nSlUL3f}X zm)kPr<6~P_`_`OWs(Qb=y@{=4v%gTV4kpKc;R?>?SXzgf0(4|+-PrU#l4&I-o9ER-a^sjxhK`R zyT7-4LtlPY&ZY@~|&zz{7~b})WW8ZaQRI#Bg(pdgBQ!2e>y zX8%v6-1=d(lh<+*iGI7Pw+Dkr!n<2qkADZuwFRo(8);o%>kI^5A*MC!j>@2jP_-Qu zXs0`lkt|9YZZ$1`-b#w8-JTO$(^v@O?#3(f$yC4u~ie(nh$8YWAhqgKpX^Cgyx2ueri(N!i?AQm10Z z&mO2hMsqX>!`)kU7moZ#b6tdccdhyHb5gjI%uRT7Tc4^hOCvhKx8~j1&o`U^UlQg z0>U$;d)iY|3US|aTYkM`6>)Ii2t}P2CFV4iII9)4qG@M5p_hLWr0tLs|HXIkY367B z8ub(HpSU#%+WfFGcK3c&0$1xow2hU|$q&hJIYj)RN6C)vcyEKbFi&hnvR zlj?PW7!;c*Z#noAv|o|faL=goe3@Vik;asl$F1}i^1QwOgJRQyIsbDP)=`DNh6pVw@3umo1p>m-M z?G;_c-WNQ*$;epMq{<$D(aEoG#df4Se?CwK@d>$sjrxC0r%3E{>QVZ#YXkD_pN#h1 z`x^N0wo~$)pay2TKPZJlNsNk3&vOasm!~(FG9@XPi~n$G(Y(7xQusm7htah98K(_8 zCHl|Vj$Dm{Zq#WrnH^yU4aB^=J5+7E^)xp)Wj?7+N1`U9M9ME*cLy{RSCrP@Y76ZR zJ2+pzPJjo|I7qEq@YdJgmLyw(++XfZPtxW6LJP6T&bhVredNp1@^;RQqG&?uH+H6K2s)9rW)RE=k`X4u+_f8cOxYF`eD6N;)jc`)?h1WQF zp{2y@17FCu?6R5<1|7ns+*_e24tM4SPTd%dYvqy>&vMvPI*?DdM#?xD}>n$IOx#M zoxM5IS9=v!E8>~xe4f52?pgDkmbG*$Nw5DnTxdrZ3SU{jWd!2=|1Gn#5wC&3L-Q@{ zKjGStzLlMvm7qB?K&SbeRNTPt`lToHp{bfOf#;D<@NqAJ&Xvyw z#WDx*vsix=>5^Ce4oYTCiD@ek$C2tsTvkakeyX{fMf&Bp#p57FFh@J5_c*2bQ%ZG@ zx)HP2>*qQz9|;WWyO|;T7OG*jEsw*?Od8q!ie!h#AaXl#u4Ww+>*el1Xk>arT^O2o zZm7pEBS8}?R+bGi_7j#eT!(&!s`6{_ z?@aMrol<8vnvXFy6vt<5^MU1c29TBoj`XblJxY-{N+4|HjJWZiEXkg{s_VL4kw_1+ z(RHxW$5prr7FfT?uUYqO;BG}Q!Ribx?R!?>3BrRnnY}Vo1r6k~nue+#c6dpQqF+pF 
zHWJJVtfJ|dcYu3Vc8VY!F5LZPwk^#<6pa>(Y#x6(!|+T#l&9W{19=+T`%1#Yio&Kh z4XCuY&3 z!A+Bl&8@^ef6&1ZII0|TJg76>F^vaXaCQ^iX-Dz{8g~F(u#6MPrkZmBY_5{Hn(bAwWCvo*}S(g17E0L zK3fi_fH>>5DaBrn2c|hrHRzC4n2)QDbx7q_FlNio;}|gV8D#{p$Kz89iOrRK4!R9R zf;gN=@gT0`Rf_9n5rYf#8*ObWi7T&&&txux8Y=lnp?P=V)U?Kxw=e*w2?&Kef8x_d zu=L)VNNzdUW_)+;QN9v$?j}N7pb&Ro2O)MTDNVNiy?qd`r z_~njJy%o57&JFv7-e^{F8pVgZFpMvl2D*=>8yp-cQ9LWCCr%;XnxAH?3MHw`bWyIi z^Z$-7?( z-~BhygbM|m5}{q&-S5?ALJY+hTJAHj=#tj=7*z>+>m2~8PMuud^N%`sToIC84ijx! zmTar z90G}ANrzejY2hr1#oDisof&e0LN}SOr@T#o#4Z^A7nE{!+;#KKdb7Su%xq|Gwc z(%z(mUGbU{wMkzzr)8;vxu|$;DV9r$oZEv^8_*D5kHUR%KdZL-2%@Pyd%`l;k=k8y zq-v(v80}Hf??Q==_FWZVWgq<=;t(u;rMkV9BHpB;cio4paF z-5vYuKO+cT4Hi;K0sY; zZ{&Wm#kbX9_=ZA{4@rt?>8;R2QB5v%n0a5bP~)3|=Ou1ru8~9GL*-R({;Q5IgRP^> z%+2`Li2y*&UigOb>wlm>9rJad0n=Zd{iI*x99Ber>xh#QAA<2XKfeCHRDbNLkDY$r zRV!NRgC&dqiY1$HAndU+Ynx zimgKDrIhS7EKJu9)@I*?`Zg9}o-78>IoT{A#QXPM{)0T%I7e||zrwqhZqzp~l`4rs z6>nSrV7n~lugav(i`m9GysgPO^X*PC?f8X?=+N)6aDcRheZS(om+r|nC z$G3`Hb8WBA+U|QwMB*k2a9rAutY26L|Mq?uf2(lr!mP-lv>}c~ycKk=Xi>?OyHo@8 zs{#<%H0^CIT(yXMp4iS3U0znKpjWCAP5nMKQQ(6bR$Z*HotrD9Ue+$2b98-kXR@_o zNEDA*Lb2cJec^J5G-%3zseWD_ew!1;k9J8`H~z0tV7rIKdhhL2l<~Dtt3;xy1&k|QOnYE09B()S@C*=SuS#40U|Ni*sOVsL;vBliONf*Xi;6WFb9pY9F#U_j-r6_jm< zVuNJYheFSYgT@N4&W+sXesyrFzE=Q{9-Wjrhpz>t%%atf*SF8T2q9*V+Mcr50KSje z`x`EwaahM!8j+3b8K;GsT?M1a4&Fi=h7m2FR#3~=1`HvG9yvHbWNY7FWyiSonKr$iEAit$ zOn>+#!S%c6N)ADPneWd20Rq4rQWVssCRYWC2jm1z&A@PPn9i3 z0wg|94?eG2Gw)30)^4M1VsY&UM8$wW-yORLmRAQq79+{9WESuH0q4I$Ld=sg-g?>Z zXx=;WOf$d*NL>9JVaJv!Zi9~*8ZU0?nhq)w=PDFw65lQK0mZt7 zV3KaY)3r46ade%{IGy%B$C+)J96kYUKBHBqNoNApLsIEq))|;3!J2DqHR&|?roD3v z#nK&yX4XkJpG%SDQ4o`>8(P`d3_Rh{&yxs!!6dhW?QkhoWChKr$7ZAQBW=ohjTUeB z8%rv~?gAJEeD`N|#V}GI<+=XNocm3EheMYgeL%C*seWhk1}y-x zwENvn!8=C3>6YXVQjNKzo$V8*AuR#BOp+7_`fTg0DS&~fabVty>iZA`gtVme6vf4e zryzP4DjJPeqeTyA-J7-FIt*RfcGenaZV8u{{o=Y+6gZKA`CaQW@dhg(Te#}JIl6m8 zB=|r{)7!kcC#L1lBgd?H@7E0ENorAofdu7Z39i{pt!!jAi`Cq2*BQ0^MU3{U^F~u# z0;eT3lc~9VMDn(QRBpBV%e_a8eBee=cC#F 
z0#boL=v+vRi_K!D?Y}(_4&B4y8j1EUO$5}d?dOw6g?gNC2X#BXQw1E8ZcVi$uMA|U zZd-G5XB~vEzMvWabHvw7M!+)U{*tuyHBmk8&+afKueTn{=x>y~_FW7?6j?U-_{_@9 z2b>g@UsYGbZ*Q$;?#$_wK7S9>)r8KNPE>w01s1*i3(wJ+f0EPP-|}}+P4>a`y2)2p zVHDOoh1}emi%{v-2tY&kTI?IZ5^^^g=476Hqb&pbT+MYMivQZ@0H(~1&Duk_DE;t1 z;=)yMY5~GR&=Qja{KRWLiChJwzd-c?y$ZfsHrW&}WY^!Fp7P8^u5po+8R851M2UES zcpATQCokMUHQXbrlPK}Nf}Aw|F9_!LfcXQ!W4DUPFi2Oe~!{CSb^ z81gg_aXO)R|qx=;J!eCPO0V*jE*z6vDh8QA#%d9*e9X@F`oA z#y&|`BKkx+Y-H{*d1x5~c4X&rzH?Q*ljZ;zDj5{3Oc2IHLbj{g;(Cc?33okd;Mb(! zE?>^b2vgcwAaGD-XQQwqoNC>{Lyl8Ux~*p4SU>XjiAJmWTtMFZjS?yC(cZaV@m4k= zwDLACAKt=U#58ao8T~Hv6>$}+a)ep8X85tryeOF>;m`h<6)Nz|w%#uWu>Qhc)99g< zmN`U!-&@HT@}vF{Ed5*MNhqiz7Preh`mYUFo@x+V5o99LVf#(|k`S{%Q(Nz#>Ga}A50LJT z+1*ds((h_a?Jq4aUR&wBm{l4~e>7q6+Ugx8s-XFkq05{`df6XVB?s{d{q381F*uWa z=B(=9Jcoq>T2H6&Cg@K2kR|$UBl&>y-ng8=&vifOirpLCsI6!BRUW1=+-zQ zd+0NEFLb)!(>*xc4c*rA5(nNURrgexjtQ_6%Z=C6K))>oliJkJ$8yPgkG#qPCafLPb8-yInE++~%InF3Ui9bseRWPEIk&j_(aBGj*t7y&hAY9g6jF z_s{gHNfl^c0xKGM=JBzXKqJjS{A15uy6XkSR}O!_GOZk!!DTz{uK#A6o(o7k)xtWd z;&UP38*`UZ}MlDBfamJOuFudoh}hmjw;EcEzex%YeEu_(N8#y0g=nTFX8)3Q;$ec zW*ljp5Bzq}x{uUzM3HOqAUN|zIc$seCnqW`YlL}(e$wN@O)vPV7L`jlabh`wvvWpk zT$FS7qOMG?!(WXd%zaPS#~oq)LZwsG^bgn#=%8Pi5M;W_*Gs1yAkOaF>|1p8-3xGM z<7PZER`K_slaTtX!N&SR+|MH0ws2B22$iT+;k4)&mDSO&&Yh(}e22U?13sBY85CKX zK%`mZ*e^9&?qdn!FuiAWB_p=IuF_p0xe& z62LFpPftPfBB_FfTf-i-72nl`^@+vIT&Daa|BWpTIzRAfW2h2dWdaWe0yL3R>@Wcu z8(RU?1cGd=oToDz&^O4eda&-%dv^`>O}Pz~R1$P^*bvG|)|RP*{~z=xbWffSHF*tG zg}-U`$ozMr^Z=TVt?U!z$S+Yx7KJnG{?RSaEZ4n1yY>ULm~-rbz;BM}laViw-Y;)7 zB&JF~Igu!I<=3ts661}n^HHuUiKkHkMhzSSK1Oi)YyyUjYG?+jw&<*dOmJ^(Zf!|S z#QoE51oJT85oWRkGD>xNHo|F8H~$tiXY0=bBPXkez9Ub0S$@{!-r<^c+k_`?{*blL zp&7naZvVQZI%4W^_evoJ1_I_=ZgELW!Tc>lT=?R860cUfw&x3B()tO=EaY?S`0G0L zmjtv>nz$OwkGrDkf~x<+1xS30nb#QU_7}nu{do^+w|dC=n+R4lcMDunC-eqG^wJCg zwTa!F%Atxy)sycQ92FZ^c0}>;-Qbn?Z?T-~Urg^fwH}ITIMtStnLGQG*6ECzj12P} z9LTHQIh+Q>xs7r(MbN!cbWBo2X-Nn$N8mdYCYpR)>A_p|`t^Uu8Fk23_#w8MTy7IT{P#2ojOqfDlLbYkB^3kjkQA+USCQ$VR8KM_`vG 
z>%;O*?qSnD=*;x;V(x`#W8cm9Bi-|)p;yPwAF+5RfkNE}Zfb^R~O@@XHPezF&! z!J}&0^D+vIt6B4>7RUO%1&mrUk7(gLF+1)|dUMvleL6*F22K$%+$9}n%OD$ zKyW(}gdYXr(9r5xF&?m!g_)o<+ae%a-7AlOK`j>nq|^Rl?YGMR86OmuS+eB(&xaEX zPh1?&YmwnHp6IJ@C1>S14J4|Cp7!QJjCuZCL6Jw_lq#lbpCHbF+*u=ljCP%M=;)61 zK@ZUCL^FG9e|CU-S~sP9?kw&1U99@h_RgCVkdg%%ni@1}mD`*1!0a{!?Il@ zuv2;@i}yT4p+jw-c?UlOIyq8cEl%)sZG*TeUd;Sd5XN>ux4;LeU#;y}m17mDz~IZD zXP;iJ1MWv^T|ggU@u2U$w5JfeDYG~0AtyL&KH^R1WuPOO> z`0!wOC}%TSTRtKpR!+vE1tgIBnV!1^NdCB?hHR z@wYW{o_v0t+=ffWAd)qU^ciXO+bii4_mA<^5KX)3S?@_Z@pjluT7v$d7})C2*`dwOo-zOw^zm+I8KhBqZfUsG{kBc5FT z8Xw_0-_E?zUSso3DDA-~TPwm?QzYDl!1J^MY1^No^LyzlHyaz9>eT0g)EhYPGdRIW z&55V|IjP)o#Wy9;<|RA%x{_Z1^XQ)O$B6H0Lx|ZN1RucLTx)k?iW%?VV<3{qqjfca z;4Kzk9&R#ke4pc)3Spjc1=~k1PFU11@UH&)Ia?Z^S*CwmT79OAkR2qS*;LBww?#Ti1yDf0oE`nYCpcu3ofZFis!Dg|`bi>Qb`i2HEhewM+&@@=Oq??-?L7;_&#nR&9RS!9)0Kohz zG$=LH0VqcYE-#0f5V(6JHN{@ zGZ_~nNY3OB3B9G3xTHS?mZgw3Y*wa;(tm$VVRO9PI_b$!j-ta%(=$>|4I=Xh^C%iV z2}OD*lREpW->0U+8E_z#P7@Uv)VBogpuBTMJk0HAxY2e#p-BJEH`eXeiIU<^K%|QW zfHx@4#YLRn(Wc8hk7Sk*Gj}e=rr0M>=99fJ3-@|6glR?XZ55**84E{-cWL6nu>`rS zOONN5e*{=6lp1;3`Aq1jrRg(Sp_oUcj}0|7BSp#-UFb}jW&_!dluSNp?hFjXRXG@H(M3WY}SI} zf91hdVbXvA(n4jd*mg{3bLa7VX11yCp2+eE#29vZz;On36#>G>N^1UMP&tM53%9OX zRn(5n%{9TpfQD1NOI%{g3-@fkCCGq5s5kXUOC0hXccSd6gyd%@yfNKaT2J`}voWXP z7rhCU4?yL9x$Tc)=YoJ;&)e7Ud0?HYNvi|@gJ~MAqWV2$%%y@8Zx@TQOC7Fh6jQUE z|BP83t@d|g7_5!=m7s+(%}!$rdCjsIcI*|9%-mltqY%{u7=nfkSIb>(wp zw`F7(Kf~wu+5b02@T891M3HZ9jTBP!XUbV#=$s^uJRyT3k9OvT%Sgz{SG=a`9AKBv z?A68PMe{my;OXh<%r@T=qccOOw94<%Lg>tb5Qwo-BTiOUR!zdsd*c=6PVV*+E2>HS z8YHy*O9P@-gvNw|gyLBQX9b$+sudL#X%2I(;eG1+8)vT(%t#f6`1xbE}2l=x2~+^dm&8Y=m}b0pIsHJLinSPYJ6| z+|_$VE_p!MPl+0RX|h-BUM)arPGUX4t=oFjyVf;l<$b1<*N!5P0p6n5)H>`V*@@yEefr_-mO_3LD&C=8HKvd1HJy88f+Ht;~6FKEPEt-2xH5hXd zGCNIwWx%p6PR|^g{BJaN5cBnCkzy+OhLqE9gH;kx!wEKmzgWBjU164M%>?lfYtmKF zf+26QRhIqBE_IS8bGg+1rpQ&DYxW=0bR;&KOR(Dr# zT!h*`O9dd?8*&M)IiNb_M|XZcVHsQxI3oWiMh;Os<=0J5aK~-5Rdk0L z9jG6|JRZFVS=%a|OSk1&rbpau_4~9$n+>Ez`j0aHCRAe?P7ku=10Zns)@ZuB95)T9 
zQRIU}zeq#mKK&BL5&abq2tz*o7~eBA6jvGi=Ug&hD>I{$9x?^O9eUDx0mrd5b^~Zv zp{8C+(|&0n5XvIu&my{hdoY$AcFaOH>GHiMtdl7PX75mq@OQuZWSfjpOoV>(-~8M6M}+l||^$dPxq)HniyMXU<_*;7IwUK@oJAH6@^)?j#n zq}+Q=arJ6d7sKMa=|*1Tkc3=Pw8!}eHE1Of5B@GER!Odrfs9m5ym;5$HtI%WQos4& z$e_GIy+lE=yU%2Dc6qX%StF%3_BAh1{b6YOGL(QYnFLBD9V(`adaYRJyJd*4#(;I# zklZ>XTwZIwoHk!>C!X!e%XA@Jep&JrK~XL0UXCN}$6f#8JH$8abz+uXuQ`@wXQn4w zq^3nk@Yo&Vcx3hFHYog^X^H08EFCLC6+$D%_HQxFQv(a$o7p3ncQU~%tCT*R07+(*%lj9ST*gl~a+^S& zNClshQ~t-v2q#&FGQxf~!FUa7mBbGt$lX+CzHhxZJZ9u$QINL!3z43EMnc7-N>-Xc zy6pPW^KRISuZCMvx-kv?_Oowcn0tyP0&|^3I7rVzO(BS)Rl9FTxbJhubV92D_3;0{ z`PmgtW_^rLd3bR&FRX%uZ!v_j^JzVHA8Pw^K@}kr<~{$$W!S0I^)7y z{QM6wiz3<*>yWiNd0`@nQBp}v9lE5YxVZ&IZq3$4CAz7bulkw0nU2$YW7HNZEyG&% z8$?I!d-tdKTbcoZJFlz3VdVA_!)O*rAmoU=4>^dmP6QgbhD5l#6N9$hkl5`BrLVal#<^*jsaN7bI19lTt+E3sn4_PEEm_IVT- z8NwqywrvY8dT0dX6@;Kwx#d|dN!~+vgXe;X;w(hoW0$UrpKsRpaOp0e&H^QxmrSgF zeUcAva$SrXu~({FpMsegCWE{K_7HU2?2s7#4C#K?TPW!ZRDHi`Bgugi;FKjd1sw>P zBA--yeDBW&gj9NWZ+nZF$7(<~cXF@HPo*QHZu67C;}ll=y_c||gw<4B4C(srtk0_^ z+hkYOn^uOP@(UZw8qiR`1XKN6{WYmp?IGOmbsWYUC&D^e6F-G>yo!%0lsij zicbBqvE2bbr))MhH(b3~Sbi3KTARp&AnD#eV)<`b3mfsvN_jIvxceFvkA5WN1v#_S zrzlawF7!+-V&#g8f})+tXzmEkdWDKG=z@$N$~jsVdV^f!W*y`XZk;e5W^k59oAk)k z#AKHbOdSX%zqq`Dyj0U^>^8|niX=HQP0~|01P9-KXRZxQ9@RTp`TSMW9l>&AB?i1p z3~Ldl0Vx@a>jj|5S!=7)5p*H(p8Q~kob{f%f`Y>Noms?4C76h2I|BC}fHZNC$RXl@ zL7c*b-?6I`zuxciVNHp796-wR1Mo>o^*;^NDUQX%Hn@kyN_n*LBN&F27e6t|I@oeW zFzZ^y{UD-?kWa5p;FhY#BmS!noc(d)-#85reh<~ZE zi+nbs_DDU$rZqdllliM~Ase&B)G^OaKj{#_5hf5ThC8Ihq**_EABPjLBCNw#UQsFQ zWw?=73GXxIt&tm{n}Ns2vs2;r!m5Zl;fo@bU0rs6#J+VJbd=o7wIgW-TsZEmawFT~_ z=>1e`Is8ebFCs4S+`Y2oy;$l9C1}RjwJq*NHd~(N3bD9uE3jbt%v2RK$J`0({^Y-P zx9^}T>F08(79Qp-MAOzg7>7D_cN+K@j=f@({ggb;qjDIEmQ<>JElbRTb0tuF;fkwV z9^xbP!>vbf0+kQQC>X^u@n+wzLAoo#OI2z~_i|s@wR;^MYT$px&>KQOV9QX?i*M7E zQz4}G_MT}HnP)jxEcae-*{k8N>XK~E8kQ!M{=ORrtpeS6uXiaPRmXGZda$=}`v)(BLh>J0O2 zInE7T7o_<45G4ia?wFxdT6$;%=>eom zU|(ZA&-1>2_V;ri`}qEO=4t`dW^d{6V`ouzebnvAhgkKDCoN3a>_IC{RzaQ)tL(z& 
z2u6nc;jcn+BHu(L`cYUKnB=wc)aUuUiO9ftz;ykg)5BKI2V)Y-g4}2_lWrT-=4S`~ z^gJIv(I_IpXNUdiB`V+;x8-QZ#v(h&b)4NzfVZ`2f~=jIXj&j~MVOz`s*1OG5z;w1 zXT2qwa6?19xpuJ{G*(DuRj6kojyA!S=kh#>0(sYj6oW?wZEnk_K@eXyZ(nnyMG8#| z;~9#jI^REp4HYOdNCyn5V^2Rd!85fig+e;BNDcayvFY!U;`6=J<#u7sd%^oQN$N@3 zXwoC~!)gKHg*W`ZBI;aH!#66()ztg%8#wmY!yQfqXIblv9fX-##}Ma&BCXg$yP`NX z`l5ANKaNJTT=Q!uurOwP9Jv|KY(|x?=8n@I(P?julbi0h@G1d1@KMQ)Z+)%vSJKx& zNh4iehg$1@L*{QW9B~mdwa#oKr0B1s#3mbkeO2g3G5Ch&mnF4_zPwpZA5KtQjZ`nx zQw zy}ylZD|r*wM>~G&x*vs~0Sv;wCeI{^(j+}RtD2QD{;UcMVTHB8?gIS``m zPK#h%$9_gs!ggKt65r%@hSM|O=p4^P@I9mq5Z?!`!SXMA8!xEX5r7kSKmQjCBAz82;$uRl415zd6ZMRb2T+BK-YEO_i16HN zr*o_?Q9#J`Fgx|-Gac-pl)EO`qou8)mYk*#G`WS39TTuH)e=O^pSF49LGar^1XY{Eqk(S(k&2xF%Rrb9FwMUg5 z;j6W~wd}-Xs46mU+!FQwWdEl7g@Av!|CGH}iTutdKhVyLVzmJPQy~0nI>1NQXXg02 zUC{~by8q?@OF z64Kx>AveE&LrS2Ctdj%@4xM8*@w`KAPg_SCS3pbDg}4ktttCa!$NN+wqw&ySKYBFP{#2)7 z#-qWsEce3^T$;&ZD()Y1(x}!<@L_OM_5eg(f2x?byDEU)*ZMZJD-dqC=tXKG%T{`2 z2oiLkE>e#Dhv}P1aM5#OA<_IFDz8R8)Q3jc{MBhKRc+))>5Wm>c-|xezDRT_S+xd? 
zK&!s8zzXnPhf$B?8+lJEZ-H5XlAT1G+49}g!K;YR0+}kwZ!V4(g-Cnz?s zwUO0O7HeV=bfhgWe9*(RAGY&ol!p#!rug|*U&cQ5yXQMlRuQVwuaT?A-Qc7B!~B}YM$>^SE#GGp(^Yi&zT3JxY`4b)wC0k7;bsAO_m<}fl5?#Je7eEiS$gwj0EV?J^T@)nbX6`$-NC!S#F5`^lag+Pt5Se9z^p?G4n= zoc+7G&XchSGR?Zwzlw8m`%e56asi+BQ%GTEtyH%u)@-~xU?VZ0G6IOkg~pY!1GxQ- zl&5EEpPoE@z4Fug^=k_S*v8YoJsfjX^}813C>m>wR1%C%A!PgPY_w<4?|pB6VtOk3RvFD@>Y zXBi5|<8SLm!?SpF%UBc!N=r*k-T9zMoiTQH_F(vQks+MFw+0G@R=DpxvqqFKZ6Zo< z4h^(V?Ck6?VPgl$MUqU=?qCBvYjj;NoQEhP7JZ0}quC%@OeNYYdi8>;e88kl^c3`HRcDPUJ1Y<#%0h?`QRv z<_{(c)PxCufQBU*Os~z)CSyin-0>Ix35q?RKyx~RFK`23gbKyR^%>f}-ts+$)z%y{ zwXUz<7Z5C15TVffpz!oq4)sA)N4e28Dz0V~(VBI%%tKqEXTj+1=x8pB>o3+bn;7O3 zkG?Q{7jZf1;B+U}PACK`vS=8VF2nL!8idvo7XxPd7RUSQV&-%jVJ+E5shfWPn z^X@3u&qg_=8{lI^U(WE{tGq7^u7;-!X*6cyV-aksj_SEi>;#?6Bc~fY!cBRXhpN=w zuF|aBw_V@ne+An8wKyhtyL^x=r+YK{mA+D47882i$;A>lC9ShG!`eqk9^*R#-|5iu z;@Ax%?!-6oaZ&e%CkqRV=yuN95*YNHy^~7rL{*b}pvl*177DJ69cV;t0JtzoSZu=L zZt6|HHZTBvPtU<_mz+bgV|0!R4Cug5>6DaFbWlK^CI?wnzA_6M@w(^StbkS0aR~bG zo%DWlztwmU59>PR3Oj^`2_*JIuC)mX!W|3Sa*b~4VAl1jB)kz)nAQuM+ujDG2W=aMh(a+%o^&pUj$Iki@JD&;c+a~;W2 zrcH?r3c?P*A)Ycl1BMlsnN1Tff+s3WTd^7rbFU=N5W2##Nm9D86j z^ugQL#TazQxFHdD^Ykh?Mcse+!Tb1a#@@bq{n}&QVFo?I8eQ9<<1Mvr4*+S0f$lNa zw-w_e@2mkXKvaG{*G#=v{6P13@DTUeSN0X7*xOdocOrvq0GgAB8Xg=x{P-vxD=u_6 z>gnwM#%3a5Q!{F0-PYxJzI+=W0w$Y&mUpJ){Gb)>Km5HmEC)=9!dEC**La4wf*tps zG=H?@O35&tf?fg-n*Kzn=4!R^c1bId!{SFLE00<@JCz>6`p~eyi~7!kCs%$9l|f!$ zxzybF-P?YZ_s#X>cnGK6ZvwQ-!O&GFTA>s$p0`JW)3=%V0g4a(IibeV#M$EO<7+yl za)m;{wh1HY{#&1OoDL>`o@2~x(qT*E0B|-tbvHki6wi&pgb;%fi;+l0_)j@uRD*@q z*WOg+eELoK@YArw@uKVXMPOJ5elKeJDgCRkQP{i|p1m&`vRfl3dx ztp~YzctS4wZFe>leoB#+-MI6?SKM`4F2w?b*2r4$?I^E{ zZUUI&iniVzXp~H&3P?CKRH&8yN0iN&#!ej5v<09J{Pz_#PD|HDiwyNMtLw8nBX8Vy z0nCpZ-MFz@nUhwf76S}eN)Ay3H^ipDZw{JJPJ5pnZa!zzKiZm6ZV-p@R$!J~`E(_A zc=|N}g$YFjH!ILs?1nOMFRufruU~n&_oEfyos47;#JdcAh~V7B{_4lTAIqbhcPA37 z!b+z$z-bKeuJE&S&cM5f@hnqUEC{Gj5?n?8G*`)L6!&=+L_hsZZt68HVG*Mr9jH41 zZjV(2Et*6l#m3rNj4Y5}8nyi`M1nez$i&M*EfawtzhI?-4aZe7y4)i`i@H|a`FPl70RC1frR&>c)H 
zP%k)eh-5DrZbmRM`5>FG`HaKx6#u7ClXiO^A2TMDq>t=50DES#C?MS*b(CxKhBA~@ zL5a(>A8p#sPezY!63#WfQDQD3ScArQhkEv6m48rz#xBU*)e3aISc|Q(v)}PM3Xy z`NyN~wn*e@)@97ci{$$Q^>?DW&p#xN?-eV%494y)_f~8jU9!pBjM`Y9r_y{Z;|+bK zxcyK8O964hf7?MzW8m&*A4HX_v&2_Bm$E0D2X}d4>^dPe^U#?WD_6}7*{Jxk48JURCQ&o9Xo#_8@p3nZ5am<4z3_+(8Rz6?-Cln=Eu^NoPl~&;{qd9_p~kNrjh8!Cdndxi0pHRcGJ=GcGZPZZobN9Hk&;D_7*LAED&dGXq3NC!cDY z@%}B?Mp5wChhfi#pSz0sb?%Pu4<8s1rTVI~!P8DywX#Nh)7f2PzCN|Pdn2<#m{^q0 z?Jzr{OW%LeDD(@>S<8KYn4Z2mW^-%uq;AN~2>c9~m?d|jp-FZM_&R~RzJ5F;_e+&5 z#zCETK(Z>wXw64l5t+5ztz?$OAxh0F%64rrR}7@5q|hg_U>2mRFyfo_I3k%n3fyw2$TyRDRdJeq z>B(Arw4}Ajl|iMcE3S*Nlm)AjneKTJ9}>($rLFX;+px0R%=uLh;s!Ixj1U~2JI!A9 z)#KT{I7#Q18QIJ&+q#3l{;Ek{RbI4#W20F#o6R#YyHz( zrrG2Uyo{R0+nE+AP8Q$G$~S7mv^Bh*Dm(4YOA!(}h1gda0dQ?}lJ!bJKS*J9*vSERA1MyjvQTalWP0qEU?vF~SuK2vb-kh?j-?pot zOIg_`!ht7|e)suDCD8m05ka~m`s4GhxWH)#VBkPZ5 zZ9;WoG4W*(Ye9A)GBVPf!cUo>l4|sgm+VoA%S~F%tZNVYJaavoxs>UUP8U7-9uu(% zn84FQp->x7Z6f%7(lGM4Og2-fqR~JyH@L63wpnI$72Ac+__;EP0B7$d;_Hf`!i=xg z`*BR>Xa*eVF9iWbXh}?4ly_a#L7~d7D1FYOESNJXR0{jyx^;doL-J|E z!mDBA+MqoGxi*aP?y36bpoGSU3`ER11Ait<9*=$`_Eu?&Oc>1bnIh#&DPn5DHeQ@IUW--+p;GocY z7pM?OKT>Ql@BDsFrs=@_lO60VhQd^vISOOS{)?PG{50FIrkfdfF*YlX1dj zZ)a$8aM{1Hhm74T*>2ub*w$K@_mSUz$a$*KqEbm8$H4P0KS$P3E_(K5E`u3+iwc#Z ztwH11-E*H6d*}**wqBh2MZO}3f8pgEtkGt>srLMfAtAG=o7ueh@wD@v>sFFt5|M9u zs`5v{+1cc~bD3`00+1goPo9=Jn=6&~VED>@Ke@cLBrj?_68k_c*UYQ+MR>;3wFc&L z&WLOYjp_f7V{yXs*TEC%VwA~~$K)^^&2Hnj{MzJW_>LmfK)sL8Fz?f(Tk{|-^=3>X z@)-zscup0Wm8tGM+-W0&UY=uT6=#mdV-*?8rZ zG~c01T5FU46?N+)4mFj&qa)yAbU-3;LC<*#23iK-8n9=?sd(!LKD zI{BU$5ctx>*Q`mM=QQWsPN2p`JF{LPh2FT~m12mdA=l_vwRMZQbv^^LS%I)dCp8x3 zyOhD6F>hlYetm&G@RngaB?eP%(sDhEd`!|m?3SnKv2h>g%yHhS9;{Actuoq|R8>Kq z+aM#y?pnq*D1OE@PS?G(L$|B~`KLCp;73oNto8NCE}jVGG&OvWob;d)aO9{ZrG`EE z^nAWUCp9Hsjzvaf?OCW1(d-IO{1f$S=0^>$4Z<7kF{U&g_fu* zpM68swWUvB)OA?7AXi^+Aa`IR(I_f9uK?z1 zhe4{rk*ufcdG;BPsvnfHfn$WGbyf{md7AAJu z_Hs|bXQLoh@kc`}&d8F|#={Ge<4Ar|sJ-ul&SYmp%ye2k*~aq*5m~Fv__27sB<{0A z+KpNJuP^m;g^0$LyU;zdvi1y@Ka{B;PUIAP4CAYj(>sZk4fV-nfuto0JDM?Vm)~C> 
z8PH31weJHD(|k(DFT7~rMz$Xixw7R((ut1EO>2Dp@`7zF=dld@F7X6y|CG-w@ z(;)g~$n41VI4*7I0&i^Aq-Q9PdS2f8HU~`gUeU|*(<+;B9<~kgZ+w%ChlTVg=qO8EQpe3 zTcX;b%UNbaBVW|4Zu)l;>7|Vhi*;mji+)9aoy*E;X-W~ z*VQ?aPDLKQGa+>$9frG|GkmZ&ihpx``Ve{2!(XIt(+;8>Z*F*Rhy1zEXqh@*1Gu?d2sCc;f4%H zK^s&J$dnK$pmN)xeGjNl zd04vWgT+UTW*A0@Q9E~#J_Kw$;zg<%*O3-oEM}$(7*kiKmW0K7ym-MNiWOcV5ad$* zT;ayMxbb^Fl&0|<%nYckI{p0WY_!QE>GbDZqpUsFE|lq`i(E^!}Nx)5&`bExJg3;kFg#{X=sUlty5;QLkn&Z zR;tJy&yOQ%SPaQ!gnQzTQk_AQ!AKW@UeK6m*p=xep|ZjK@Bv z-#i?x?%7Oh=G6=0AD8j6?07Bl1#>p>N7iV8zWCDPZ&Akht*`f1`l0EUv6BZaU-f@N z)28ex?A#`Gu`hz+j*|~A%@4;@8Vj8-O{@0nsanJg0}K)GhM4}n9|;H}F^FA*xkfeA zV?RRfYQ~~u!Y03wkB_cJS22vPV)kW^Jq>TGr=r+wtlCBcE&C%?<|XNku{DqZs6M1~2R@JR@)Qm@8-9+wNbP36K&TGsU z;E1GN#Gc+>%VB?HIQC9U-S82vSm_g5 zQO^o^jpV_ui)j$g#uLoc4zZ(Y=b8Hpg|s&3?KFcIy%(@OH<9wC^TRf&LDkOiLp^FD z3}QJWjQlaXifRl#WR}<*FEUIT5lGKLFZm}W2Ez3n3uBNNda)a1lA@rgnUKxg_DWrp z)jjrEUTn|2##jL|Ffv_n+X*)lvVJ%UdXnIBHN6SaGlI}ov6i>Cw&6Tw~x2mQf z&ea2X90{96@GXVf5y?T9&^-0w!UBNnx^a{$EX7u6xTH`)Js(FaI1LEMSAlLa*r|tA z<#xkae~92{*B+Jr>N5g)J{d%=mC|T@jD)&bT0%fFGmVd#i8;Qvbh58XyjIS2 zsw9u@mLUY7jHn}C<~#}7v{(WZXiHmeo<&|v*aPRelhg&>@xMqTpv;>z|cJ4 zSnOZ?j>0>*%$`txWom*6t zEaFx~9#am?J8zgO|M17x!78r-ap;9rSMn|uSHoL8bR6lFrZ0|U1$}W;Q-Rs%#R1Ya zk7W>nVJzo=aS@adaAJh8Bgr4VALK4p(A9+5nJZHx zSTV*e2X?mL+tA!>K$D6%{0gb-1#-cxdC6<=O!=LEUJxQ%^@a^-nNk`)(2vXu|1Jd{qLV{2ji<|&T&q{0 z;Cb>%LWwei+_R^R;O6>$tU+=!CTvuIj$Y!-5bYaFp8DO(HkE zt(SVUek*!6GO`e3;Ql!W1E2?zdb|K*?H$(=jRn4EW{G9&7&knNzpg zscRHpZiF*YUjY!8L*uSMyDp*_79JX-j)-CDcw`gt=o3F=2Eaq9NIWkWT(7RC+d}I?blnw(YAhPMKe0l0eDcR<1Vll39_C89W7<)0H$mth` zSoqaK#e}75tf4`*N9S-UgbX=vvedt}kcA69z_4uV_C_d$e@C@jc{%N9={kk#+fnlC zRH%)-!MvrKXeV=RxqSAhDngJWz_5+v{=et&lekAJfnZ%MD$CL#{UEq`pLS=`Yb>p= zvNQ?nIcgcXn5JdBM!kXNfr*Ov8`VxP6l9V}TX+_NssG)B1S2&rBQn}iTUWgRw_@vF zQv{FtWWV(jt}%5}ZF;kw9u6Ws4!_}@e|~($l!|Wz|G%22g!+Yw z!~)h7oQH?g8;7lV2hhzIRBgi*AMgN>8iu=1&EE~1bFELFDls2nXGEsIN$~$CR6)ax z4gmuH5gJq3M(f|Rg@kCTigoovw;7738aeTb7_ldYz@TeLRZOqI zAWp^BZVd&OUK@|-%7}EL@E$Ntq;H$bi|gzNtToE8Ar1Gk?#m+D$;Ds)SD$G7#|rhQ 
zzIVmg0bZh3x-_oMbWLeE4Ik&=`(|&Zs3(}vpAAz9b9*Im9GXLh(vMz)OC`p|u&=)i zq4)gK%wOR(CFhcwDMkX-ZE0`)&;DwYCDz$R(l?yWLQKvFnM+(wtl`xI4*7a%58put zU-b(Te__SE_-PXvo+vm9Y^427uSz#6#da7I@>%_rw?zLD69-I<9?17E@1T2q0_t#{ zZ_r3wwtMKK?_n#0->TfU#o$G2fdI&aD75o@S z-NFN@dKK~?Ys7L5{l*Jf7sGVq^*tqi3|R29*@f-R(;nWkd$O@|RZFD|K!O!8DSgjA zhtpi9xYnNQ*?3IUKBeKSFeQejIWBPgbsdaRHn)H?W1MryU*O9A}q6^#4KJSCClzNW8l)D5c?8uAgES6q@4LmjJG zm2m#iu{J7nT=-;VkReHx0g!fZoDWR2|d> zl;QX5_D7IOAlhFNky>Hlz@;Y(zb9-wAE<|Xo;&%<(>SjrQvRs$yB=M>KN_Z!v_y4N zgZTOVv)zqtH_k)+QOKd}7s zlfm@Bbs!OJ3tzx4rz^iF?gd1ePvI?Q!vd4Fs^5$jY27`c>^LN4hdA2QEP|}Vn=C9vp?GA92;*(fFEkdcLNZ5z!66cR#T36KNeHe|4(uDtZVEOo^C&gHQboSg#5|I_+$)i?$;LVaL?qmm+La zD&2a{1v|q~MkI4cUaP#jYhvSVH8@-ibIC@exZO8_xvBKbp6R~g z_$1xABG|D~zD9$ALEruSm`8{gan}d_3JkF`Mss?RN%N~JKtJ+2S&A=|38TE5s(()f zgeh;~G_`z8NaoJ>Db6Y;N2$J7A;C#D+e&A>ls?rhoLDbEiKk89)%tc^2KAo!>?DRJ z3RIAU?Z=V$C88|OX0tQS#*1oOtDn~*?%E>m2*mHFmxQ1&It6zt2E!*h&wKQK@QTKs+FTf~KHE9aiAdI@ zY<5oGnPK75LW;|0``!>cySV&LPk>=X0^&&?(zvE#5#Q=K8{ew94gSepKHBNxQ~~?y z8Q%*l_~miW`GqW_r2Y1+FPUpdFD-_n>ik!!Jsp4Vpz|XxU$LWn+mAPq8BVhLCr+{x zJDssrWZmy^<@Ti(Uvdj2Gq&(#V|x4vVxy!mw(E~l_#o4WgP~$$@tA47O6xedgeedN z=t{ENOp@`S{!MGimZ?ZKoND2+L-k7Q`uC$vPWBH<14WNz*YKx?A>2v$j zQbeL=SFe#f4BAD9*w-Jt-<+-;Zn!wL$p^n%6_QD1s+}fz6?`_}M%fH6uep+6td5GD z7KnDrB*QHJqzqQT;yK>?6HeV_`C&2NI2O;T+1-Z%=(TdyGGixf=Ne}EMYbR6*Epp0 zrQTqVHq2J(4V(eJ!Q7w;d5v_h1u|7-r&Z^^6T2dIar|5a1Pg^=#tm@C)pTU>HR{F# zVrKi`vZ(}$+QZdA5 z-X9y88E;eufTgLA7AaN(5BJ{Y?__1IQ0(ygoVoy^OEvV~Y3C8uG zK*-Itxx+yaxHOhE& zC;YlX#z|o625%V!-M;}AM|28QAoa6dfAgt-HQS;wtY(j=3e?9|Os zW!6P1uM@AFd32(gADE#$dsHF$d*S?Ym5@*W|cN7iop{Ak98HeaO*ILEh~ z0b1r{VEGL=PCW5pvn`U}`PuKi>!J3}+$Z>F?+4OiE>5wV zt{IfCudqhQBYci#+_Xxc$sD}|GeZmt1xH&~ z6c5w*cMtZNdFqw-1wv0(TsYoX&#YnT%ZrT`t0&h2WKURLPVqLdT5XfXIs+iS-k>-6 zm4Oh;VS&FjL&%K%P62ThB?MaY4)q@AW`VM>D~TiM15i>&gj3(|0Fdgt7Kydeo3aru z#x}EcwaOpvG=pTmA$FTn6sN5RqFpj*vD01~3AEPZsJsChU^w=1=Z|k^H5e?FRhKboU#M4E=D64~vCtl7NDnW#CK4jn1qUjl^YPvbgSot; ztX1LBeDzcuc<&KMwYYyt&n6qA&4w)EML3n}A5S^N>{qgA@F&E4rt}H)`Wym3r3WCF 
zLNID(3j-UMKeL-NS(9(|^k{qAJXi77{(ypVp?Y@2jMsMk)_Vj%B{K?`1zWblcpFLp zKTsdovKctcJ00|@f!Q0gLJPq+%UF=0AX1g8vHyDgz3ynC;>J+O^~D#*|3}A9BfAbj z%6zu$9%^qcgPIa}X{GDN7>Duj5=nM4G~ui~eJ{m(S3dFHi*8i0e;e2U`COaq23MN` zdsGRCZ_E1{gq;Lx#F6hta*ed4(!qK$6L;kKmcbA z$FP0{@`K=V{^QQgBqap$ij3+fwF>o?(ve^yNc=J_?7>2$_=UruV_FrM)>E5k=IU$% z=5$4`XK959%MgXmfawa7GA+)!jD96O7`GYt0_cPO#Gp~mK-`n=XJG#g%AZL{q!$5e zS_PRAruvA2pW@Y&{*SgM!FU^!WnF{I9>g14@p^#v44PW)OxIFX2wA`j)-VeMi8EJ) zJLPW?j^YRM%!UGYP=Shn0Miq|p3twd{m8+M5SS^*r*(*1R}91V*PT->O_w2ZPL98w z+G%Z|%YN_5pp6=kS=M&`GMwL*{4Th;w*r3O{^C+N$r5^lu7K6y!gmS8fyz_%;YU^s zw?>!TWcP(j9ulFfW#8QhEdL51D$x4%H^eV6NZo)8Iv;Gfq`k?Q zsjwO@0EN4!De_y_BVrNLr32seSX+1uSk|zA{RgM}U>A^CpE1JU8JL^{hS7APLG4hU zRv}GIveTni#q46)ktJaRJ|W|}OvnXSiw{)4Ll#o-Kbbcvoqq+#{sxMl?B^P`(gQ~} zKRk~pRphB{!y=@!47?VYU4#JO^yV8{NQwBxaqBLq;1Tz-ZeihHH#w~bqs(ycu1F`a zwD7!xaveI$pi+wu=)YGG11l_SGjaXCYy*i;s;Oc;S1Z6Y859|%dO_6d5CI0LZ2}~> z6etk_6eCF~Nx?|2QXps(e|)d$T8s*5B6y3XN4%5evIv<(!Np(6%s=4JtzYj$P8gJh z><}f*Gr{Xqpag*&Qsk{Xn)96-b>af>W5jqwNzdJuj13S6yXeylUIlS*Xng;#m&X_n z-t1CzxcSFl?;r|+IYb848wD1}9{7^7N}KTnH8Y^qc7R6XN^0L3K@XkvSg9o+ckUPJ ze02Sxd&&9nV&oVYql13@=lT7oAsl6?**s_drtHxQ+t!RA%@y^kGi5G85#d;KD~J@FwH z0O{<+?YahDs+Q%>?cV_mqa6m0h{|7YfsW&j3daIt!x&lmSe8fL-`0HwCK8l4US3QU z)gS7?m)`3e7>e+PQ~=7~bi?riUcP=cB(&u0aLOLVRoeOpcyKu|Mx8- z3|$6gv@#Vav~Lu-VF*wE6b4R8t}KUPmfu(MoqkekEb=z2+Vv zmS*cMmVT?07`WIUr5OO^LMa(=GVZ41U}Q(EBAZLt26$K2<>q}(R;3DxnlTkICc{VM z=xk|Kpu0U$z&-`URs*{wKtPkez8H^;WS^6jxT+RY&i3NzM$HST){3FK)hi42M>UXaRJdXo{_y+)T(6*`fXD9lobAN6&*@jJ-mwCUetXVrgWfdWR;1hW zcQix^?yK}u|A#-u=r-w>)u9l;EgLq(0&J>Fun#Zzgiw)~r4Pyn}SN!Kx zD>SF5rV8SUCqoPI)Mk6EeVffhQcm7zitWCQ<=wnE6VR`;3klZ+hG(|6C4y6}`PAUS za7xnX5T*T3!Gc7mtq_4J)hWu!Zo9Cd&7<5bMZ3ei@y+n|`W9iRmWb4U)92qlvR1U~_656}-BowE?2fdip%eG>rhn7I>Y?Cjo!+g|U2X z-i?X!Y%c|^bd+~&wg>kRMK)DjX`rCUCj{(bhRQL$yUe0OQ7v%S>0!k*a2 zRTkT+V-?-!hh4i*(VJ_83levc6kMC1Ukpqf7oTNKZI_tOVF%3E98RVOwf?`|k{_l4 zKuZln^8$1oXayZU#aO@ywC)vwQGO~#xofv{GGUyoTG5KzicYy5hMpXyAi8oD=*W9) 
z)Zl%B1cMbT)K!Gq>+^9$>Zd?f$kyu(@O4-)vzl_c%ElaE<(`3#(K{|XSlqiP`AKQ; z{#ban=_OKXW|&yN%je~I;R9*5!iAq~+|pespRKYGZ|o5>-m40$+Hl*UWBbaoH&Ql~ z>f1^7zJx2GI9BxdBA4Uqz{9l6%A&mhpAy>upC7H|&~2;Fn{mak$^OPFy|8t|E0Wt9 zXN%kP{EOLl4`L7oH#JdwmN#RQVCdtu;}Ua~KuJ`*f=Q z@}NT%IVvMMt+zopjDT*Q^xg8_`@lW3pF3mV<{-h4Vmk#buKvhnTpH=?-OPf7Zcada zx6gY*mw2D|RQFetIBz>h-c0~iQRT_ESARKxE(PGQ3PgHfzcp5y>b3VXS5LDXQ3UHDSF3ky zKbGv(k4Rp)8nFYBg9e)03OIqUD*%B={P8M%>C`naxSb=p%b ze`Zqj>i1?YE{n3Hp3UZmc}@nQGYs0|z%7T&f9KtDTBrIG3b=4{{c?2l<38Mg_nRrj*wh>~yo145?3qs^?3JfhG}WBnvVw~B9b;=7a* zuftHyLtTltK4s4|fGb~_6CJxnJfqES?>Kd~J$InYnh8SNy z6lDz;)WgOzrQ#lXd*!!o{H?wIIikE=Qy3hCMtUITuxoyGIfIrE&=VfU+AfLU%by%f z&}tBWQf}v$zmf$#-5G}o+aJ}b3pseS#b6rLZE@b?A>05% z18?sS0K-Qi0F3&mIHsjT@wZ<_$7xIOYfFZ7j&A4m$%fJS>)^mUc|LAXj(8}v#s(%k z{&X#RKZOV+(rs4@2Nh~@+l$*#l)^(YV;rs{0yD%@w-`IhCXMx4wmziCVXWmuVl!Xv zA{^WEbzA-X^lt=vZX7o2s@ysSqkn83+BRNR_RXHcQWgE}G|%p`9k9@AwyHQef^a}5 zru%53*w+?cnqby4((tsFdqS`;%Uk?GRKHiTCYO)Od}>gwh3tdNt_Gq4)p)u61_#(V zeEUWsT}_*$z^E3Ft0&`THB>^Y|1m;l>$$|hpRgkbghVnF0IGmM%JcMtvF^gt64@xNdkn?jDb|hTHC~_$zY5rVjmF_V%>a^C7S5i_GoghtBL7 zuR>4W^A^tSXYsIVwWt^mA5|gsY;KA*wrbfLnxI0IgKj?8wMJM}dQ)29mFjxHP1qj8 zpOzN+nintBIg9tDj-Zj-SzwsK^NZ=$vW_9=-L)>coD1a?nM8aK|^C*)OT(?Ap53Zcg0>GP!=mjxFrxIQtA z(%`ZwBip@c@*{TV$v3;LYVlc*fa|V`MEn{rmxI zzawW!R5kv4)m^LKku<8dDEce5)}n7{ydvI8NPGRcDk_gsTo-(XoKNk$3Cc^?Zzny=dLnX)UCs^$h`=HoX#iaB*< zDG^h3xcuLm2b%vXz_?{1UGKdOC{cW4Xh!hsT8VAI-gLIn7XN)t5MY8~djz57YakQ0 zmHGe*Qb6BWSis7X5Zai3DHMY@!GjDNX4i2|*{H}U2U{ja0Q$$^yvrulCX{4!Ih-K5Kc2Svxa?f`lMHS`_d?+Yye*Zn`s zrwd8oaQy=8{aHaJVdVs-!oQ9Z6$Mttt+i94`9~_GBf*271u>>90fzUG%r?JsY#r2% zd17Dxmr>Ft1P+&E|0GG7$dt;0w#DW5QJBEWq)t;G9{jdAATbJN01xi#u(itl7m6&s z1uo)%@Xo))tzZ;*BRKa-Fp=T^KsHuORo=V<+=r5|0`y;df2IsB;ve7>wQJ%Pa5%?? 
zM~?iWV5fFC!@r71{s92LQT$@}eCXE={4$H4;K9M?$@n*!kkO!e_KSG|Y$LdcF_EU@ zf1m3IhgdQe)iXE zF^9(=(BC~U2RnX?X2&d=PJD3QwV<*0{yy8maTMhPzR|X78rB{nn#H1 z=Kv$l+jH9OW7G%bS$bY2rL(w(Qw&-PiUNSn+@`cr<+P*#{~17H3)B^EgMuq&r{1j8 zOo*IdL)Y;qhd_{)f4d#{3IINDVC&gd^F*Vw1pr;k4pxUh-Q_T%tZTn+S~;QCt;vE0 zc$PH~hvF_drvJ&~__LG2>J@-t+65})H4u6kfqe{DIn906LvZ#c$2ErEB8AoqD38cP z2myZl9MCx`5{x|i07y#}LU*{a=Z`~O0c;y#H5F$kD;cFqNnCuM`<58SAas!R?)8>~ z34Ikaz=r4uX&9JNeSBdCDh&0YwuBlKIr=_89ocupfG%s)9;KPp<+1un+_cnQZey1l*Ks$dUk%3jfDP`Kuj7f>`owXt(WIJ5Z*nsU8&S ze?E}-voj7`OGO{pJ(dDD{KNI)^5vT1;MT~Bej&Yp8UF!dWt9BDTE`7(uUWTgyTbwQ zA=F148uVSh7eB->)nJUG9Z)mAme53Ddt5h3pncHA^S8|rGuO<-XbS^pY|2}oe>3tb z4J!e2BEts^>LIum8y>PU4eZC2)>4bkV3qD`xd}~UO~9))_r8A(sHs2FE=yIcjcL%( z;L!rb2m_Z%K!4nVdb78mJ8jIhw=1@v{rYrgb6A$97F;a@&wV>FR;W=5N;!NUyJi}v zMnKb*_Ec^CC$QWxpl}#U7n%9aa3A!css=^b=?)e&J0?vJWu*eTyFvdOxXh#kOvFO~ zI!}^m8$^$ULy=?ci3S)J!I!VHE2W?Y_?D2$8YF{l#WVSG(xub+^3NGNR7&37p|J;; z{ToGHO`b8JM>9i8A=v?VUQZ|aw&4Wacrdp)%KOb=0opoMYh!^C*oS2i_p>}H(gE&( zhl0W{p8Da4rz~_;4`v#n>4r*x~CIo#4X1tN^u!6QowC{&bHn zh!WY+SuVZNimvLHnCuAfmMN?d!Uq>3nM(|2{4aa?b8~v7ivXfmTs*AYG6hZf4kngC zd~*2$Fr5eGNkZQ7+DxD@%$dQ!oF}!`(1@bzm3Lt^afv-vO% zp_5F~R=}g0l&2Yw)-W3aGh=gBeQGJ3f7TLIIxE{RPaiF90EY(GU3n+lmBBQ7dxf7d zX3?)GAN!vBBH3CcQ}s}w4}1<6V^oF28BDT#XJ0N594W2^AQ^zBE?jR8qdw2cfo1=p zJ?Hm3$Ue1C&VYJok|+nBSF@lybrG_rY8Q%X7;Zvr6|)DkAmhh^3#37zZgrmgT`S{G zjFQoQ4S?$PV22bGq+bT?iYg2Yy!lM#!p6>Ms^D&BH^z~n*gelD`j>iVn$#4g^wItG zJIo6v(MeyuoU|_3*9+>A^bB?y9hxoU`OB@q?uW4$)&?8tY&2^1j0icDb5i9Jq-O{I zu|qEu=KQW&V`7Fxj#7(T3w*Ri;5%Bk3J=A3XJK{>Z7>Z`Q-Ng+<_APcRRyFBzS3C* z3v&G(Q>gOZH=U2SJGHQsv-DQrK=Ww(>&HiX2T3bpH&0{U2fTT( zT>#0x5G!p0&Yj=F%iD1b3PY3LKfsVJBrb`sgS)Yu=VeVFX##l`Z-{WM*H&Xs z2tLPv+vb!xxPQtiS~kH1M;@z3N%yca~SW?=_ieTONG1|sZ&lrGY znDV0w#F$t+v<3N8Bj#(&j}tJ13?A!3+=v~Jz8HSXu%orFXy|$}+`TM}DVAs?7Y8YY zLGZa`w=E+8Z%fYmaMEJt%-4@$3YN^ZF^3O#yh|eeCRZgSt}{+xq%9p3w%DBz63WAY zpz!II=NI=Oepii z^+FqzmFU#AL)%F=r`jGujaVJ2ZXsEv;~cdQ%qZ((My-+ap;}{}w@8^P=FWuP6!|c6 zGRp(47w={ycAiwzVC10si&ANkVkSgdy&4ceE!H_GXeTr%7t3TU&X>1eu3H~7*hr!n 
zHU_KM|8bbp@`t3HU$vF37N`XpR*b2|#vm9O)?09+6QiOKkc$DmiVBUXtj_5i z`Ir*#-LD z`(+MDtqv znNn#1xre;5%YkVC57R1HH5#U5n1L~j*BD7_p7zRzDB_v>7^gCPBCS2_BvDN~d_>Ql zIl!!gl7FgbUub-=oiZ3QQQR88N?=qD-7)f5B0DNLls8o#TIF6ynaF9?+@Ar&fV7I3 z9L=VZZp;L~bE<&jv8EkpH49M6ZBI6Nvye6pqqT@-LZYhWVx4^iJw+9;s%oy)y; zW-0L4fu<{;i7)radH`5^QNK`tZNO4VyB_|`*EIQ}IQ(=H@XK`FmXj=m?myHe`A)jT zzz3{u6$|67i~RC-##$<3)HG(q=Q<4_5qh%EFBC(W875?h3L|US)6}MF4%MPxA!Dt5 zza(7*&IcO4V5u=f9)8Mb9P3btekX)O{6;U*tlCVcqCzoJ@%i5H(VM^-jDFiJC31Fe z+sjJCW`b2aTV+K*BYRW@{w3GTW-B=(OJO{yEx&hsOj+G!>JeV^)0+x@T85%m_y3^K zt(PB4?P(~A4MuJGC!a3f(%FCNzr8Wki>?UwQ&Q z!S`#-m4%>n0cq{$P=OJJwQ_~McW)~+uoD&|tvi^B2l@NBq`l-RnAs3EznF`VF31>S zQ%`NzLD-XIUYlj3BlS0+n?((EJAN0?Q;SwbW`-V*7+D#}^GGW)v-Hb;@-SKHDqhY$ z(o|H+enNDYOMpQO0=zzuwW8sAR&z)XA^1l~eyDAXcg*>?$7eaz zT6O+VF)Sr{g3*diu*v7_@Pa*D&cWw$1TUAmcn!^>r@GSBFMUEs2sBuo6}9d9ojC?IkZDZ)qiB zI=>NbZlkvV^P^p2aD}NX1sDW2dtRs8QrEC+l8WffLA6 zUIN6veGd&^?)1_}HVAnjlZ0s2^$oUKkBkyLi#c6?1|mu{>H!BY_suCjDbL%i5tz=C zW>sQ@v%203PlMMEk5wG*c)O%Pmo(ujEey&GadErF)y#sALh`whr|~)=Jn|gl38PI6 zT@1%H#!WiXSlWfF$a!LSD^A>q3XO2#&GNx`SluNnbIFnGA#4ve%)rTOuLo?OM!8SD zi|(XBLY(|2!`5%EL+_1eX^vWmVIg?;k+}dP1bQc&e@yYH6g*g3#h$tg@uN$pkRJqx z9UuGe;D!BAq=cbi&VckIO2sMQBwJXEq+-e2PRaTd` zCVspde$R^2qhF2OASu)^ee$N?m@<0!;}aL8)HKG+N-7lk9%c$-@rluizO5b*Yu&y^ zj3a%gNIpv5;H1fFT~b0tnOMpu?rDrBA~v=&y!+M@6tk1caH!}P%?n`VmB-?G1PD0p zSUulv?7NP`gr4-McBmv+t=1uU$De7$#tRgikF~ESui09bzv(KT-GjtZNJ2u8QbPD` z%u}l5%yPshay365ym@of*J6c8GwyXu6NuZXQgJbWv)R*$jkLW1GIvz%D-3 zlFAv$zp#KF=UGZ8DVRN4=hjVz;vWcpT?zhA&GK)1e~3KoT7aQjo%rM4azIrYuhGzq z3Xld{`h{X0a}j&7x?ocp;+g#7QqEc(BKIQqTkPy3=GdmqndWbJEMFvf*Fx{Id4AwV zlAq>|4U8%CqDdi4Dio5=YEwBQsuu*=$QWeaecth;(LH2z5hP!?F%luHQ=PdtrG6$2IpdjagHVGjE6Z?Z=WctooK z{;mYs!9*thophiuk=kmS_pb+yU}{Ehs&_z|*7yM7jL>{c82^!!HmVqRRZD*+^nrub zPLbgX$9nIR2OXyW0tjFb{|0h5a)THCYg)q~PUah(md$SDzz}vpI{e)TbzRYY!azcO&-gh9Eq1oAi^1iZPX9Z|b za}v+}lyzKR5Utz$6dm1Dc%)xfQfoEZ?Rkj4|9QdjN(o$XWnkC>=;Phv1`s#LS}LWR z{|v^&>To-2EA*?Mhf{f>EA35yvtJRQg2VO0-gPPhB&M2}K|vs797r>5?>G)ilMAkK 
zn)OE69zoW~vzgJeK1YT?(bA6w9RC^;RbkE%#W0<#sZmUHWAyT8U~o93Kf>?w#Dbhm z=udb^BKdQ)EVUKDnes&dPDUp}vwcUNB?BCk+1AkeH>2 z8Q?c#T-Tah)=U9?JOTr^I5IeP(4jSLiZE6_ss&`T&{t`u{rSiGq3j8_3)gcr^mzvR(q}uLXuE zh7t8l0P1K|5_+n(Ki&7#$_of0Hv!^P4|-?{^neLS>Ro(nC#$-Ew*TpN%U%Q-o8A}( z>9PTGRE#;u)>(PmY0;UuK~)9F?cBiFO#?u1+gD6g5w$k?^Hb0?*d~5-BaB1eE0!}) z$yo)4k#o33DrJbyBY_Pb@JtX|> z--e0@y|L>rzZtq?_)Ciud*Jv)0maYaAtP{dbld47Z{8W31D0GfXPy?|7_xsH_Ze=| zfqmq&dG)}d#RQ;ZE@zq5$DmiI1Idk62Tg&Rhie8aZ4?j}n`2*ubDDo9t^bz%W~7U3 z34o7&P1Tu0y#T&TT|4msxu=0~D0)8whjUaIka|;0JP*OpeD^-Tob%i$RVjI-+yYYF zJP~mD=;IOq$vs@#8q&2k$>+w%jgptMhTLURMXn!kyruE}Tz)c3fbzf%mE1N=WZucUCgs`FbN?ESqj|5FM5@01n$4hhybVcel(rtAAXu($9p zE&v^!%b2>XF5uG|*l}|pxU!i&hFiVd8K1)(Fd`%J+5KJrk()`OB@q&-?9y&bPC209z%Y^yJo0DcYlA%rd&`kyUw)h79$_SaI8X+ zpe4Jfti&&{=hXrMT!RJR6j@`mOF&|&TWQ*rs&hN{20px0ej|E+M2G>WOZT;6)hEyA~5L^GMOt)mRbDtD=;Lx&=NHQ{dj9<-soC3OeKQ1%1vk(*m~C zt(LINfIO7&!I*{{K#j}<7%IQ6AS^VwbeMlCT#bv$9m`88cq>pBJEWOt6kR4j z0MBRIiQ5c5j(dqUY3070sNzk9fV;Q)AN$*Q=c?{aLKelp9fIDSsWY$ z3j==sKiw%X`54g?EXPM^m@rHS_6;UQ`Z1U07n$@EfLlh(Fc{l9^bm=PgN;{DWdJa(;o`|2vD%7cmCT9;d5|2Iwd(k5 z4b$<+!Jy9g_6AAylu%R*_FxI1)af}E-Oqnmm{;2ZI$4EBLBNKi*>z+2H1<{h>Uq%} z=>QfA^%3QSd^x>PKPCxbR|2@7+nXD;qRmaxPn_d$V=G1*C)(u3ZYY)w2q>~EpyWy0 zfd@{6zje==twM#uy!pO|Hjfc3c`Rsf~GE1@UAXtUwfVGRJY!7%v)>W(*Sa*aQo>bE*pp;2g(XLPJKV z8({`C$`937q;s>bU^f_g#<+yqeT7R^SESEaY*Cq|8VAaUMhJe|BCk^9 zAl+GLHB>`$`mUJSnwx&`v@IYl%gpCi$w&%=8N+AT0M}_tP*omUG}|?hf-Px=vuYO* zPyQ$A`EPO0n>bj}WQE>uU$Qj;(veT|i{UQN6RpEEnUn#{Rfx?-o+)LgEDvXy+{<5)&VJN?^@r+n6nZa z+M9F%tny9h?>;0rY4^wb0XFE(aHM&pk~+Sx+Z$XBl{!emeNM|D#C@`ZY7!@J~8 z|EyJ)!dJeDS{inY(jBWf-eg_c#vcnK6zDaj(xF|Nk&43>q3As)*(wzY_S%Y=ui)%9 zP*~5%NI=*!kn^gn=Ow>H5)}n8dKAPbRcP=;FB4H(+A`GlsolqqbuT6gk zy^KQ{Q~BG@9iTr|78K*;ck8LZgUBe>f&1F1W73wPhsna@pkxFJxMot_o1y*)wNZbfS#rM;f(75pMnBkN0 zS4K6bu{yaEEM0uD@p`C}p*+!NO|;T%rLj7OshPMkG?ObD-?c@-3MlM>?dx7J(Tc@ig0Gu#ak zPh@oB6pCt8FMemcmcPnj+^+Ik8A}X;2-QNwD5;&Mvm>GcG^8>R!VEev514)IVNtBY zcntnsUJ4taM=e`U$_j*ulFH}kIX30cx@wg2jJF|KcU`}5k+>;jK9P#` 
z10cFlVW87C>XC?60jlhDlH^c{m`)9|oCL+mr6M_2!}QF3CBwXE-MuWiVLhZ6u~?B{ zeynK-oFLD6zxi!-83eh!7Tl|!uvw%_xBX^ibdZK+W%s!MDASPwVe4{Y{50p zdMzWmA||STb&uVuyj>%95dGMp@LR#@D=?vMPI2UFKH5cOJrO=gav09)@;!syFRs4( z*-xtXomE3y`Gsb<%ta6G|B-O|H;}j)eaLZd46x50yOWh^koaB{<_Mehj){#)RUcJO zMk=o>Dnn$DLJA8p-lL_F#KD(dNHrwU=|^L$Sfm%RFaTy`J&jyHduuEr5hSC{nTVkc zC)^pHLV(0W8(h=_lF3z!JmpvljCuV?N|v)~<}lKqMtSNSm5}jVBBZo+=vNY)FM_ej z<{uMOm^g^IYWum^|65A4b!QlN)?fbCQ{t=Dmu5Zwa{X z_#ZR3PAMAqv4rU6YAJRod^mA_0Td`r_*Ca@H@Lrf@0T4u*|wW&^J#bfv9iH|zr%li zim!Zr55xp=yep)Eg}7WHTz=19{Ay~g*h_e{>eXhXzYbPJ1L1ux*yvudGoxc6k4~v9 zWi9o1tHu`@!!o4q=t&&*QG!*$Y?gf|fzh)Uw~9M+p+8}j<1p@n$(;i-?)lWa*E5KV z$J}OOQzlnL1e~}^YfqZueJ_ajgr<`*{&E`Zhq$wh#?KJLiP23cXtj^rg%41jh3p9b z1J3)+|CpZ0y@Op-N-V5=Juugv!Up&Wv0E#2e9BkLR+tb3Foz%@4q3V!Rf3E&Iuc&I|MVp<|<6 zkjq`%Qgtph3mKc+ONj6vj#Zxok1yVdZX-X0;>HH!Yd3>rF75qo704a!xyel&lLSMj z7Mwq;$A8vLXnk>EC7`;Lll5lArpPeAA2Q#%Hjjn(P+D5Lc`aT!_m;(2EH#DqbsL_Q zd*LF7;JABVX&p+6VT6PsV;4hRHDaywqJ4dhLUS~WUS#q5OpxjpwLj<>CJkZySbRnM zes%h#Rj8QOSRtar>i>v{h_z4*54Wycl+v8qfM6yI=}lo0zDkT0!qn_o^5EdR}T>ksI~3lM)M!QEcdxcEgSsg|K4{m$u{)tW`{^wf(30(s4tZp&Y zrLIKsuO7aMAuHa&G(#THvJ#|H`Vjy8cmvEkZR*nl0VTUnPOPv3D1d7~-UwUR)gk%k zEBy_`!EYbcB}oHdt`WmztVi=~{BsD9N#M^(?g_P^Ga6ri00pHIfautl;x-#pF0m5- z`F=guU7s;7LX$?rtXG;e84?ppJ3ZiD6-M0YCCM z%`s0}gQ&4J0|wVL5RHeFEhPT43yI{^z&K#f2;^bNp}>zRvK&pHC_s~vk9uV*uE~%@qK;u#{I~R73jR{zR`1mqHe<#(6 zAuxTmR9D3Wen<+sM2%@jDF-|mDjo0pcW^lB7(6AUJ*Wf>YVSiZntUI3^mH&h#@Ep* ze_vfuvEYaDjis!>W)BzB22urK5@zsZx#-+~6eeMRljT=o+NJ!W0bz&3OHDvqNI zmH@rz-RpmjeHMnuX*d54EWJ(w@XYVF(q*Y~!np8@_r-^x&@~OHgpG5e=;`_);4NVR ztQ@K@FV6GbMScgBDN_Opp57PV0mows*t*z*rBVyPbm1cB+pZOV$IT1M&CVM8Vi@(o1ushH z05{Lr^fkSh2MsHgi-x)Z;JC+=1#SZwrnm=52^u6?wrQU z;@U&=@ynmGP0fm3So)Cb5AYv!9e(Pt7rzkQaHvO>qS)N(QU1Xpru1|pGrt}meXMS~ zUCmuxT^-ZzY&f^nvjyK4-1Zqnr#BvXq(8&pWCZtq|KPU?+<5Bj>Sx@NzM+8$@gcBL zI4SSC1uvVkZ{ECJ7*x~$91gb1O@O-9?=18%qU(S&f|2L<7oP|wDUAY8Kj&3oh?=&M zB8L?jzNdX!fQNc3oI8X4xk^3C6#oH_rJ5+gqi4@fKo0J$xqAJM@KaD@S6*uaJjz4l zdAGhgIdCSNZ 
zJiP)AZu8uX*SBYGP7iJdcyYo792bsFU3FVswqM)iXwo8M@?natDTaku+PIwT(M-XB zW)bBjcI)SU1T(4MC~Kw|PS2DMq*8CYjzd^eT-Nki7F>g&? zi=SrcvhU=iguDVuBU+RE8&E8$S`~K98vA77P#DfokTQPh1x4lZRa|EO2@e}z&Zk`wBa#3rW;>4d^Qte2oS%Rd)<#VLm6Zda+!jI( zWOJ0;eJ{{p9n^zV7K1yyv!sODgCfsb&tq_nB>cy#>*o!iNrue;=idZV`mnG83{@hi zRMkh|=M_n(1DeL<e$~{1ybMB4&d>aU2I)h84Vw~i8m|$cJc-!IG5kjA3Klxr=99@d+g1W-7=er!f zJ)_%0_a32NmbZYxd3$@oXXtDR_>muOU!T|j6l!z^B*Ooiy4~;Io3-CCSM&H9Qu!YL z^lbn>6uX6o-dEr#uK?~RUfmCDE5s4q2EVm~s@!R#i5&)vz7({o?1*TJ8Q|W&F=6wB zf0L>xxne6LSWFa6#Ulx&#hRq2HAxVeSeM6^q8-Au98I$zL70B(1Jexo=FW^aim~3E-$$S( z0fD(iO)?+=xBZt16{IJ~n}XRT=f&W{B1#jqeDpLoZ(!|Zxl%sMa}+V((8vEPRa z=Cr}!aQzgwwyW}m;K*T@WiL}clU@GhK=C>%SSo!fCq`u#!5Md#%&iZ_YS8et;S!YF zc-DRC)Ps|AeD6mG4AAU#QSLon4sL|+4RBw$l99Oq1BJbjNRNjikR++aQLFs4<!1xv%R5N)0c?!_nDSW^B0d zQ~X+-!rb1L?pyFnfe&XSy5Lvt51U;Qw5FQ#WA9ZEc&cb zkOS!)4FLms7Bt2J*f=%gEs74*ITATVLs@j&y)(S%iK0nyDY1J1MSuOyulJBI0jwCO z91Xp0bzmq*fOU86UMGZ#w)e-!YY(K}L#P7fZdcvD7MLgweWHnieC!s{y)M-=++k2D z6AYC~uYCTa*He6z?w#d36A&*RJ$plLgDVnA(;he z@1HKuq=u^aASES^iS+LvpQKg;3uKnlEL$*xI14wS+N};{YSl66X(j5CvUT$Le_6G_ zTHRZKQ7BlBE(<9@b$7T6$Guasq=McwXkREx_xP%pOC^XQEA-A)60w`qDUmvfXh9Hz zL=$XSS@P}e9h%FSkZQ9I!opVp!J(x@CbBi8g=swNI}rKcR9bT$%qYcJy&3!QG!m#k zrt(!V2hnA^ls7~WkK~~a;Azj#W~_b&lAUqcBFwc}e;{`Gh=81=L(%CogaEFf?M7iDCni`Kdba?XP*#ui(amGr4l9 zBgUfJCbB#~o5^PJ5ux%LuD3~`g{9De%QJ4#4VvXi9szIm@#w8eA# zW6_R?g2pJUFjmRlw98NCz zRAfH|*W@*sc?|LQM!_xLN{L}t$%=KbdzFmG;0BAOL%!z0haTp1zKyw)vPFvy)|)g> zFtq&~%X}H2Xb9BSox&Lc@{74LWHGgS-H$Xxm=wkpsCe_-s-|tyh<4 zZPbP(0@=P4&+vImk$CX+?{v0A*Mi0LfXMZe0ES{aR48mhKFzTpfuQBI6h}dndyXyF zZo%&{K9igg&YFzk-x(o-l;wyatHL^=1jt6%=c+7-8=V5DTm`D$xgdp{Ors+6ra{48 zwljhD9CKC|@k~1)9vh(`f3&pC3fK66Zne(E<~|HKYI` zS6!DB?8*Y8+MwHgdB7$#LFIg3q`==2KKm$`C-J69&y{Q&$rF4FitQV-BC1If_8idtC$Aw(l=eogXJ5OF$e6rH8CYy?R^Y1=V;{LElBmIVe!n zWVNj}9^iTvFK!EeO_ z==zC7mhj+NR>7V0WCWZtRBX3)ED)%`DNWKAS#*F*CzZOU_i?`H-Pg<%J3N+OX*0GG ziYoBen#Szt17qTqP&?;+r=0}e@xcUzi4~!Fe7n5Rj9?YBt*EDCy!a#UETG^z 
zs}92F!W>u(6l}pYKaDVK5^_E|!#zN7{sqyKdcVUD$q<)hkIL&0ntiQ>jI)$*u0WoNCWI_7=#GH;GsM$){>gmJPh$ik09<+ zJqUf`pGq#&yJ2$dbv?KL;sWTV$Gt&Vo(t19MeyiA4@?884z`H$z~HD>4-Q>2iBZM>qK89DVUu?Vuhv1@>8G&!Dw+-+MQ#5g#Vp=ag_Rtj8hk> zI=h--8FO_(lRmne(FDC&YTsCsD(D0GblaaWqwiU8@rkoT#AQkE5y*e~?)VZ1`9@PN z3-Ns~eLn)uJ?HQCnModB$7ey2d6|6r?tA;RiL`mwsyc6xDKp>uF zYtkvw@c2B5(YllFGcg`KfMz7BKollmnrNw-#VYB-{2IwK{mVDZr~73`1n=jw^k5M^ zP1KIC+z%p##CN*pZQ#^4UtOz>W5Ar+pGDsoabQ8CguqN6(_K>lNedK{v_vag`io#@ z`X}bfv7+q2sV8*MFdh->+Q3iu1epBC#$LSoZNvgl2ac>V=&Ax;x?kpZI(fpZJjp~D z;z{{MgXuLTvLBDk1?Od{pJ$M8S&hw0-iy+c*Bk~MrY-t`I*^53`9MpEtSZA_pS_FL zV73B#i>?$49UCrVz)c_-ZqG4=`U;f1mRW>CZ)LtF${gPQnko6n;1l6+^sNUfOy)+A zL>@wEG3(#?5H4#OHFLj27O7v~&XkzpJdW#Oji_Tx-S$C2pThGlM?Ruw>< zJn5uL`3VIxws5l;?nL>`V0c(%$qA^0XW67|bIax&ay{O;7Z9t1s1Rf( z*X^gI4Um+)OaIJjxVP`+5Sc2P}VVBZW&YL!^&cbztw5mdD&4YEaSy6uItLA~sVdnCNjYzb1m{U$S8 zfzp^>rmQT}ji6x;#$U);rC2())LjLty01?Y3h2hLj)0KOQ~mnJi>IKZMxwju1nXq@>N8qkR>y5m_>)mci_wmyhKV9+q;d)8FFu6tWds!0(8l@M^S{ z3Ux%?8cqssF{tLi`z?&@t)Au@Apib7u<5NW{(j#Y7#NIA$9Psb-hvV)luQ+Leu;uK4kd&pzPir%Ou{0*>|L z14ftG=fR21^>!wu>zefvd<`lTP} zja6OJsMIm~>K>lHHV})Wtl^u9!`6}J$hS8JsqmvzQ61?|%!Sj42L$SOmAlxOfAd7$ zBB3=;9Mhkzp0qw`aqqo*{U8R8H3eUcX!X47Z-Dv!8?Fb;VCy5t-t`y-cQ518%O)9r{Po2xUkP&zc9)st!8)TR#;mWWZAF9w#3?T;sARP>)p4HBS-2PPcU zr|+^FnQ}C&Ha==MpdkBJtYHXuospTSc+N|{Q~$Nj%Gg5Cq-UdjroENG`Pmd4N}w+w zF7Prx{&3iM4~{dxd68pjfPL78|R)+YHhBgtix@lRPEo%wsH5Z zH~JzP?dQMm+T<)b(OHkCDW0zDV}oFFgg$D>D`-DmuABu1dWCev)~W=m=7B}*7H3fa z2QU^XlfU<@3qAU@DY#~%J$-9wnnh)7v&mG%YbSEj4IL$4ml5Nk4KI{2n;!EB+hj`c z^xUW)-E6JgLnl)!`%tVi7%Lqh!)!t)(oU!bv3 z>?)nDYi6_2+e$Tz?zZL+YKzRvF9ds<_)3?~+H6rgUO#&7ABK}SRvY?mow2jkTbwm% zm3QCj$7}wQCgttj6MtyTc$0=&aI`$4?w}~46nUS0%6|DZ!_wj2V4~pR+n5-h^n^3s z(pF6(-w&V93CZ&0H3y|%M~yw>We-k|gg0iJ>g?8`=q6FVZa=DzGT7h!W>cSL zY>y7k-V^B=bn7V1+t_PN)1Rc-`Vj0TJ5ZxvXKjzuZ1eVA+D0IB;hl1bG?CFp)|JX} zejSx*RB+qZmw7V=EcPB3sY|EQog#;OtniEMzPsxu(er1G56&!nrY5x`uGjuTG;T35 zGVT4n!LI_+Cd4j@zs}P6t;o=m_!rTFEt5>sRTUg2uW6&UJJ|Uc4)+fpoVkoi&Ka@1 
zXGhO=YB{ZCXF?$4WNtol$xp`;n-|XRyX|~6&mJJM+}!w9L1ld0-+q2rnsR0-gs`NU zZCi5M;~&W#Bux8#j!>_(&GqFE)vJr~G$wDaAwDpQrBf4?vIbPGUVe#ID~)H-P{mlv z@F_O}15G18#`(`1E7;ogcOdRrXWP~6rAO@#cl~$xXN|)iZtNBI)-nvJF7Y z@7u!8*AXGtYI0V$SG!#2>t1g}TGi0r&UtJCLA6?|$i1I&_31 zb9-hwO-ID3+qZLiG|~_!#@iNb+Y2R@N+ACL=UF~)R6BN!d+%K1aIe&R7I5A_x$$I1 z1G5F=E;d)tFU8{W>XEzu-bwxFoAXF$IUpJ!h6B z+y*7iPHXnof(6(o8_lR%+3HWyRoOTDl?+SsPLz`#(K z;`Mg-v~eZqKC#=49l{Y>X)*Az`?!mHvzhzPv-i_=I?DE!4O9)LuwiHgugimdsM-i- zH>O&z3&hh~Rk6AFB3(Q4gD{g;dB3-eV3U=6Q)_xWF@?{svYfNteoxQsts<}8GW7fL zteSRngy)Mp_TNUP8YF3gZM{<7Fc`Q@8&20$#Jd*jZfjp28h&pvch;}AQr=jD&bv9i zzVmb9D-}X~;b~~AG1c~+(D!^)yz7?P7HlJ7C@^bgIm zc14{Lj@${<&3zir`Zx0pTPZ}x`Nlp!_<`_;Ad4SY@h}`JllvU?@84i&aPw67MzyEv zV$#OuX4kZ5C&NPxA-V?n9b?yUust^iDgiX)vZC8mT!MaMb-AfG&O&8pXx5OmpPG4o z<$rRv_Gz-yoQt@8UrJ@rNLY1KqIrCLXy#e2=iz&JZ>W#5NBCnGqb7Ncjr|`ZBUD|e z8h55{-Er;^u@lgnH9}yMSBMwObZCk;1*b97^q3RzMuQ(=;SVp|y01dby>;z|dN%tu zeGh$G@7CHzH226QOi|eFOHOU@$HX{%zK>J8Bkvtfo<+`=7Yx~S0V5peHY4KX)J5!HWaTK5(+LB z2^%SROy3&rEY+ALbYF;`-B{lm5x^7m?0_4uoCQl_n|csf_0kA83kx+#oTb&=OwA?- zVJ`izbuG*tKppI|KgLVJd$;gnJM!R1vAnN$0jykFO9xg4)`{#>P@{9}gvg?R8^7H| ziEe*LGXe$QH!iQl9M+Mc+Y?oFu3k>IlQc6-pByLNDu1WhDmr<5Z*~=Wb;qc+t=wI) z^YIdofD20<-FJIbR^yc~k0-kU*yzj%Y`4~zMkHDdE+2Pn@o5{oN5OY|Uy-F87lv~~zi_0KUoi02xw|f@7}iIp zZLNRaS|V(a_zbyl3xK~`kE(x&N;s=iO(a%KX2a2T{52lt$Etr6&rASu3;vjGIHR{d zZusj&^T8UzPH?rT9{y6XV^GNt-9K{YH&=Qr4+$_|R3A!w!hDh_2E*ajj=UoNnx|hZ zjtPEtW5k$&hx?s`EaI|9&sHAbh*J*dW>Qt?`D`42*bNej)&8{QMot%m3bSi(n5s)v zq5*K&*oTeTs(4q&0|yQozHyq+K-!j`+}3Uqx0KpKJJl@|aOFHX(HbcL&u5Vr>nKD1 zcHjg$s{_Ok30KZNH``PWte2>Dq6-;mzXWO6R!_pvRr2~ie}o>^*aZLozw~AB%Y_G#;mF+6Pcc*d289mg;U=cNcDtWuCc>c&G5V?{G!CshjZ6QoVOggU6+h zt`-ZLH&nuPFHiA}Uj#XLC#B00R*ljdWu<19pvtO-WepWi$dUbfoGd^Q56Tdr^PBQy zv1xy#y{c8FH9%d?>X!i$8{U=n^N0LXOi-UAKDP6lo~?}Cta~B|9XWHstycNNj_v;Z z%78GB^sQ2xX%Tx3n&}Ai)Yrl3f{@bVAG>~Wsyx|axhKBsSp2h-Z4rZQTRBaUGa;@# zw@fweo#ANjCu^qdP6^H=qa1B`YtDRAtyX-6a?w@VGn6liwz?vgroHNFeT?;!=VMAA znk2A|tjl8N1%i=CB_y28^(W!v6g)?Ce!#GCj^!X6xA{NGN756wg~%g&o7$q%A4iO6 
zJkfKTGF1%`^Ot^lH%VwTe-8(fVG#o6t8nsd0%=e=YsRjsU6V1R zTP?UDseJD;^nB(T)xf6YW)(-?RO#H8Uw%Hm(_~J0&--;V&L72@hNLd9_uO{(OY*fC z`0ivfGwobhk^R)!*2^U*-uJqar+m%k)-Bn&x+l(8xou^4l^gurwfmRvYxlo(+r^1N zTQTDI+q~Q4Vf-r8747t(X+J&Y^^u6@|oPP zFakcl0Yf7gayLMyOVN5^W3F)kGEk1Ng0M|NeXk(2$WFV+4o-XN^hD)2_5D_*qn}4i zXSZ@@>9&rz3#+o9v%7HF6}9quMqFi6K-}vaGz>W05o~pI=;Q1K69?7(Z#_2Ir*8XJ z74?likoG@-LJKknW#tg5a5 z8a7Y~fxStk*?@on($WZ?} zJn!{eE*h{Qdx z1b)RRuUows&bEa~OAbynSyjk>Y=8e%b_1M+HbfZHMTvB!c4+Qfj1aG%;O;zT{BUu) zW5}8fv$E!Y;eRF;zQ3hCLsEZwwxz5hrL}tgZN?)?wr_=6$8elXUbHM-==i*|+IwP> zAZF=8$_o0nyu2Y;nO91_aq-b(O?U>~ZRs}lVc>l!zy78~wd1nO1?xCwMS;IUMLmnf zc8_I(*1YDBg-?(ZdUk!aiFIKv)T+fjHpqhR+olF-w&N%A8J;MTqf=UplF7pSup@Na z6yx~x_SCRPQuXIhH{*<%%3Jz1MrghZU1SUG1VJa?7O9X)w{6>2%_>geNRX`fYt>O& zqSTo>`+lb8XDrf|^&HYFhX5ppd2PTtmZoX>6;yY0k6P$l--Xk{w%$c2uE>Xcm&jz* ztUl>#P49M>EX^nE zTIW+OcoZ#hnC{W?tZUh6oLwKPTJqw{Y#t&an-g zFqYT+{^8STwVi0t@M2$|Llj#0{kkh-kB39EIceP1Nluj8Vdw|bkv$VLuH-Uwt9ZTO z`W7Kr+A(WGam_|G!yd^6oi}GUJ+n!)MohvivNAm;bCccqJc`#{n<(eZ+RwPJA9Ii0 zv>NUXi}_U8t6MwD0>GOvGuMfba}~yW(gB-4v7dt;!kPHm*)fnrB%r_PH=Lu3J8@9= zMri!e_=Sz9spQQ_G7vk__h!8+&&adRoQF(+JNvHWD4YD2FDApcuus#HzLY#CQ~QT| zB^9yocxLL7lgem ztm?yTnBz%#-wd{AXGD4&M-XZ#)Sj^7ThH`M2I#CU468CuJ2;y*xUczW$f+A!sUKdN zs`J7nRL5jkhBz_N;U#)_B>iN^> zAb$qvtd9)g-y6k-gaWp7QQ*sosrALc2XPvAfO4!>3_Ji|BiKnI+&ojsI_@!t+JVb< zEbLYajN|V+PCAKACHQD(tQ?i}kWKIx67J|xzu;3tYsVFzpx}u?%B{!Wv4qxY`ofKp znn-+fTp9Zu&X#FzV(5?LR9Toa<^v|paOhl6YdpNnPA7VjU}8$Vnex{X25T?OL5#7x zW?hGGyGvdlpRSR>qFU$2bWq~=Qmi=xYf>de*>jqu_j;ErAEnbS&wse3*`rJ zeZ@$yXgM-Te|A~%0Uhw;8vI|#Xvaa0Yo08t<-EL>J;Sn_A%(p<%tWjHT5qLy@*T10 z5S!eup(#}zp?BH)Jl!*6t8i!AJ56buts0lA3aii~_Y@w5l+N4ud9VbsVxm{l35nqj zMA1v7{yLLa;mDoF^)@k?HF6eqT>YA-LE}x+RE*WfbG_u;WtQ3ztGkGeIV^s}MxMe* zzxayt*ov}2YH*T7`$%{~7M6d4M<1C3AVe`3%A-s^BOj6Jx7){Aj}M;kc}dRO;36nY zDNfiNmJp{PQlof{CkPC?oW>z3H0<4k9S762zZKH>N!S6HgYA64=>X4r0^X58j)4bY z&k6Jqspgp)EM9N46--sDZxQaZZb!Ee_BhxNEwO06DyYYNJTAuayu!cWhUESGN`_f^ z@{X5NZOJdmd?-g~d$vyBdTO<)A?hmFk`rg)0%;HSS 
z<1OTA3#AF%N2m9|{+xq`l)p~X@p8Feo0P9iGMb|K*KHoPCjXaNemVmB{wN$Jzsrfz zvLT20>65WfvCI^d2!fKgk-IrkxM&-hhX$^F;^#%%h^32lWwZP!$+cf%EL8_p!f}G` zENY{)xBWN9hH-kqezBTCDgC{1GLG zw-Y4t6~YjL~&9;I6;%)2;5x%N3fujSBu8BgMaB|w_Liw|*CO=;67bj|G*)$>2g$Sif+kW)TXY#6q3nf6R*Zbw`=4J&hEw& z$L0tAf;R6YI7XOn$=tR(TwOw3;T5?)wjWODzHe6e(wINMA)jBrI;sFI$=}~8Uo5gBcvh-U)b@EiuUe>DExun~ zUxEqnQB_K)LqBdwzg-rkPiqvT?MY~Rcx@GY8p)<6>jGR^@h8Tv`@qzYbhReSNpO^s zfmUkIz7AzwcApVv#&CUAp;4j`7t3lPOh2}8ORGGwa9%#-ye*kYc3vc?E{G&RJCzKO@|61ayM6m8F?E z#H-8BoGwU!I0xvB@Hzw7QJmiI^gbv$F+zmay1vFCw=J^zWMa>k`zz&1X?m1>(xi)0 zY(H-Wmz-X0y{ETlvQZpUvYc*4@_y066{6shM3e zE&LEKJ~NGf5#l-O5G5YsG(2FFw{6jBgf^tBUZOK^w+G9L6^I$It@C8#+!u0RIE!#q zh^=hNXIgIJHu~3=kwc308wqjOU)xyirFFZeUq0u!e4s3Ik18eKSzNE(WUUwi(nxrYSKd}J+Uu@R~_IVu&`$UJdhrkvb*L&GM1bK4%yuE zF1paj!n}ypG`@foVh9)*eeh!j%#rNXNEV1@l?Mljue>F`yQ%~ZzU(SwxzNIZ63GGc z#V9aqiw-OT*q~4u{n-6raO)c^sZmq?g-GyC@t^4i`#Te*mt%r+ zHX8yWIG1%49)b2j!7O{Vt03a_nz#t~JLbKzchpati{CLSh(X&XnEXND(KJW*5TJen zfLo|%5SFxqz2R|G>A?LC25`T2zxoWgAN=Y7#ubONwg$n^ilAY{Yu%=k;O{Z_z%(*%@F0SCPfcd~x~p2`)pr!Li7naayYhq<>YMF)D@ zWum}?nD(GkG{NVgAaq;$sdm)oCz`s;$f`VIq>nXIPVu3pZ;xTan|N76z4~z#r$JSe%A_sbI0Q3aA zoa!O;;qcF&Mkru^NPMCBngun{UaemF-$fy5g?D75{_$=0h89a!fIF-2dlK3YtR+Klh;6h)BS?t49tL`~S~;`CnIh zZ9o%;#d;r_IHS0~^h*vV0li_$vLqC8+IUnO^v6o}M$szKQ73H~y$#F6!wr_e5& zc3B?-X*NwjB)WCX;RT2k9tR;iR>GHOc4H0CopYc6_yXmI2DYr_T*Dff*jp*5S}Y)2 zvXkDf8# zmq^Bl99SEEz25{1kON>kGxo8dSd$As+8529t7TEFdmTQw+66{h8)%}_%4oqfzW9!&>2N~pFA$irCL=JfcIJS+YsoLehBf_ zF@Kh6Xng{UGW}!M1B(K^PfeB;-?`&+2Dc>HySfJ4&Op1m@8d`YnFPtc!`I zpzXbG7}Fqx-55v{UNcBPpXvgllODBZE9cj~AYt5;y*f1|spkFWEl8N5WD7iUcKhB) z@n*aR{!#Lfx+C@~!Sw0n>CEK@=tCZl9!O8(00bY$9;?FIVnkcO!YBDHTxOI(JPQYU zkJ5sn^~n3r{t32FaJ-KEY!D{+`WBT_T3F${_vVE4CrglkdXFHC#AgahJrO3bi-H7u zLSLgv`t@55?;|#(jG=s|o4}*_SJk4_0R8jO%1Mx*`EK76IQ zc?jfFwg&mD_uw2^18`~52^2S?D4cEJFU<*LfPQjGdj;mKuKmWB)V1@3!Sbknz4-T` zn9VduSPt2m^MlxXOVFhK2tS}O%Kx;yKcZ)is#Vo2ECtc~%V$R~!{`|ZFx9HehIVTb zpS;QNynvFff;^2Ir$XRm=tAHE1A8UvH?7(TRiJdz66RjwOvdDY@+HJr3y4oZn58gC 
zM7ogqVW~IOdT$Jv{R;CG`U6MwKA?f(hRl@W=vw^o|Kz;PrHhPp0@3v3*L)7ZD?R~a zLz~O~drLsmHg?#4j(Rt@->?LNt9O4(V{=ESxTQKXimriK_51e@FxWoXM*{=2F~}Ny zM5X*@Z{y+gM5xe-$xJ&K-wl=4yVtjIlw+hz4CQofI~jmk372OVG6}9Y8Pw5(%RTy8 z51*s$7|>S?&wB6aMD6$-3$Nv6-rnH^wpS1_ct+6^&(WTns^`2IW3>KIZ4_Adg2+L4 z!woep%b0Mxm!Hq|kHCyL?%*6A0;X15?Fv&hTGow2o{$LT*S6pt|7F>>yB=Ai96W`% z8HUdCVCsb)@|NTFNzCKlF0`EX22SESIj&p1WOVjTAmRIk>(d`>{>W8dl;H5M&tBJ~ zaA!)?bu+;t4{g^;U}LbAs2Bb37DYfH;zVPZrz5$CWqSnU%~tuCNIiEz)dK~$*ex!cnb9|H6Baww57 zPJvTZclh%TACMS*V&;0^XMekK&&yN2R1#A9Szz#ld#m%8Jir2~7d5V@t*!*g4k#kRvuN3Q)OGA*k{!lPFL=dp*_!<6 zpKg_ghSPOhhucc=q}=lId5Jz=N)(h`&;;4}+`l2rz3SfhB z+B_E|Vp;CXmL(O&^Ra{RU@{`v3qRmwLFyGNu-=VP*j^@D)pvG@0^?#ayb6|^_*5Hp z_vX9F7|nv5y=b?YM<<=;Lmc|085t-D9)^b%Dj7ajD#^NJ$Hx?=zdJ|h+}$YVVhVF; zkevN|8l`$@5XyUx**MGe<>ynZyQp9D>m9t)dW9_@DR6HuNBKdD>();GP)8F@6rrF7 z%%R)d>65Z-gtMIZC>!gqmyZTCSW%aXy)5HA_Um_eHbe~?I;=gc!ewOV1;6)FOe=JP zwUb@+qzPJh&cJa-{x6ykT9^Qt_w*fp{-BTOZJ3b+C-!?pLNN(|Zj7$_ZPWkOVuPfeffc#H1&Q)0jR~m^kj};-;!9kBl5wGq;)r@Mp z11k*1&989Uc}2bHyYe*B_R()b9e({szbMEpD1F->9>c+_km1+WO@~C4}Fo!K%-=7J{%>2*IpGtliVc&`Hx zDwz)q?N9!QM_V)AEUuBLX}@sTq@a?uvlvf9R+ zygVZ<&U#EJF1d%m?ezLFJ0@xnI_vvGU?E)0&E-LaZ93fEUv-hI@CKNMZhhvj%ts-i ztk8`+yd4!=wez7uYb*x~Kb(HUd~{S7{7l%9IL1ivO}fR~+l$?&Mj#6y-4UIQU1o?LiUJc?sRQUmTwqWn_Y`rc49N;yMX zdO$*B_aX9b^M^zCkGZUXtT4d#R}5#7qc_YDW0Ybu7W5G&fo1!5%iSACJzhND`}UyZ zNY#Al$^E79DFJJZu7D+pW|o+k0&{45UNVCtThZT+Y*JBI*l*7I4L-H?@pGFMvyplc zjLon7DBv?cP#rdA(NU7DiejjV`%o$eb9R{Y9mDedh-mIJt0gH3TLRa^eP9HR9Q8d> zLtQZ>eFP9`e>b6y?eZ>`MQrq_2i&2flm-ttuu8;aKiCLIqDYkQN+06juA&~h0`a6f z<f4So*-@{_AvKSh&y=p{E+n+FT% zrXr;tBjI+$F>lhTmwcB+6ya=lfv%#$zfW)uOK| zCmcG%947rS6)@PFN$v(WcKe{gqdzv-2 z^E81eHm`L4C8qKgYAIcUoZ3ekmpIsG)D@-y(AOZ^WP6H zVnY%af`W|Z&uThwPpOxX(O5$gptsU4&CJODRf-KO@p5@vM(V7e~0};&%83p9F*xFp7?gqJ5@GoZ*I;b%4GOk?}Go$Zwdphdd@dN{oz$k-R z4$91%?}@!7dGt4p3!?x`7kOa;M0L6ia9_XH%zK*Qvd9$p4GFRS@K~upHYNKk(MUcV zo-JS4{GiHGfYd_-k#sM8GuQ+&P$rLBv9F;L>@am&bw1o$xT9n@*pRtR^G&po8&4_C 
zfwF2c?*ZzJ_M9(iQksZ<{i<~9h_j$lFlIL-KamzYqM?s4p1khm{cK)0ORgoT>hp;g zV_$mNBaj0=m_w96zx&G_WEvtg;$sJ$Sw2|*W8p9S?<|JjT9ZSx!3~k>&Ee7VYuC#i zGILuA^gGUWsSu0N<}mlgHxe2Q1P++ONdAaY0i3_oF9Eh@X@?{#`FaEx`m+A({LGw_ly9*M(P&YnVpm-{S!u7shH7|Z8bpe51dvY24R zdm!(~h-Amir4fwW4~nKadBn{SD@H3?i67x5#ff2fpQ{T5o`(BLWs#LLbJFA9y6Q7- z^h;un?G~pGLwRssCiqnErRKa62Pt4|{mWDwZ|}&clJ(hjlKl=zn@9Hl2_JFr~`d1Q1`F^~sRgi0aJVJ97Hs+5}ksg&DppRf>9S@MWjlqTue)!8+vdu8)W|U-Y zG@DE=*6b7UQz7G?B%e6CqB%iwY1Fv0l-M8f3tXv9`h!#`OovF`IU7&Q zggE}e6FM`|`nMl%hf{RCK?i|W*8lt22k7p@tvYYb?vnMfQ+y0BlJKEfO++plqThVG2Nwt+ z>l5lD)TrQXv*>;fbI9^u`Uq#1h-T`Gmnzog$)KGVKk+HAykv2@4H~EXwQ(sjRfi-*9+hk~wW%^eke-%iB0}hDgS6=9VD~NEfX19`j zS6zGh-}B=CT;&Onyv6NF6GN~L97D3(%OxZo6zlT$pH~JH7xz}><1Glr69o|Ab4>M` zeI{JSigYh3`VAIN|j?llO3**!Aq0D%5SQ2;{*Ps}1`;}Q?>QE9Ml zKIk+Z5rYoofB%U%fcuT&YFq%g%VYt%%U95otnr`MfKkB!9r6CdJzRiC($c z&cB~au^Y58(Ai<%0{lxnK#k)}GwM_ST^8!IE%ZIW1dT6^J*Y9iqCe)}MbTU{1y3wX zn4}9~z>(kv8QbywAOF5ATnF4gwpcU`3d2qW-KF1~D)DEd`1_SNo`RvH$&vO3cytFL zxWkzpPUiRT=OW?&ZG4rYBM4#UIZ&@8ye|qAMT2?!L4BBi)lgl2h&YZ)Yr#BE;s)<+d+H8t}&)KWlRAJ6<{AM0HKKh zH0}a+!TOLN5cpgb#Fs3j{S(mY0fFRU&OiQi{omi^v=y+6qDOD3yw&NGtD*w1ElmsP za`xM*HUB$HaTNC%z?839n~pN%5;!{(c&q{PRh<_uf4;%* zSoYi1K7TamZ54F&-(H))zbN2+jJndiH9k8~q>rv1+fZ}oy~fcNT5++?6ZzMX1Z29v zLKmj1N&b7p!=Y&By|L#9F+j-0kT(L}K~-?@)B!ns{x?>m<0`n8LgX@67BGip>b(fX zkI>||U)qOwyubf$h=bSLaezTTj{jio1%^xvC^5gMA?1&f?7#h)mIl49R`r1SZ`TzJ z63K^Ff}!{kyML_&L`0zCNx3*5ovqOVV>r`u(AC*|zg+Oqs6Mmy1oBe78Mo2W83oR03OkxBt<+;6}(U}Yu)R0|01EV1Hof&<;a4jYz$Rk zM&Acdp`7e2Sc4gT|BH`CTrhRw~?7^fLxouU4}u{ z2a``{yFt_i2~a1AylnQts&2-90J(?Q;vroVC*_`&om=%#N`t9|2!|$vs`M%!*yi2? 
z{k!ECx%iuB_+A#COdo)Y$WV#i%;5!Ccg7L`Bhsz?>uHPwaCRC8R6tGlA)s$wCG**r z16pbV!1MZ-=i8oLO`ZVap13@pxf~0=%kixVN_mS0Ze4}Oy(uq}w?X!Sv3v4KCIBTh zZT8r%oyP+x@#zL=N6*@k@+Rcu0aPp;XyDMX87_t=y>DcyclaMJ?w6KP%=7*xg<`!U z003(N`Xx}U6%h^jB2E}a$rS>AL;4M5G}Y$113>?8C$4O(KQHQD0*s>uEa8Bz-C*Zm zFRX}l9)q%^0y?kh5n%AZO0Kl^s8`_kvr`}x-ULo^`y;Iv{3p%8{p{~d+#h-nWaTIa zsa1PZT-KN`D>eW~DKo0~u~#jA9Zccdjo%u%C%_!$(QE{SaoQb-{cU-;b{EX@IiSN( z!^u>{GCURg@*j?;{i}aJpyO=rzI{4sS+^K_0FtCW`Ynj;Xd}1Y_~dZB7&mcwaVUIw z*~`BzfL@4{J@cOieKpPJuwepnnP%&_tY+)}6x{8gGje}zkaF}Ba`>5mK=w^5K$v+X zVWI)4!B5`u(S|yTQK1^r+<-rr`%^k-`0DBPkv9Bl#;FFWT1Fv9L13b0Ca#ylGeXE# z_mxD;iB=^TH@&h~==5ks5vg5%08X|pS5HCe-*J$OH*RA$$xRrOpD)av#X~~J=F1H% zU!5a^X5Qr_15jY$SI~os$lwKU@hRpsd>fac^1u=EQnq>J=1;aAou00y21l>j%r z@t3>Yfgr|IjJodXB_+U5@sw(SJ=7?$KU?1>Cp)*O9(#R}3QbM+nF5)Hd`vVrM2tUI z*MaEdPG=%HXOQHxUkiL#ciBGR{9pER*@%ti-gJ1!dwr6pl_S=l&WDFJ&5IUut};ukZ2*4U>=uQMh|ItqDB^fU0C9q%Sp#{F zlRZvWJ-P?m>B#(1iQ#7afC%)oZrg*1P$(rWNPSu3$%4S1s4}n7{)D912U%-m`N!=? zbe8G5p!9`$VI2$mSvPCUhVGkw0#8#1B_tWvb+t_7waYOcthUsG*kc8)1?SpJ?g%wv-6d0kOq@8%Ler-Lt&{2@Uz_ZYpVP!Y2FrJiPbA7tkBaEOTYKdVp>diVW^xJ~5yu zG#Gwj`c*;bI4hNK>&v);cx$+(9LBx%Bq1o%Q*Q`cBQWO^I*Hhm8KzgtdFt`IyHuYF z$h%&HdW-aMfLGtab_r6Opivrv#-Z#!Ar!p*zwhZpFVl<19(DkH{H6W+@P7XHthjOq z1797ODl!cuV4Au(k3iYI6luMCThDcqi#ond8-ytU=eew@$X7jwFE05vE2DJmzH>EQ zs*;_7OwVh2?c+3okC9C|h>f4`EX3OSUR_dBUOzMriO{528@v(FPC<}O`eDa|ZjDo2 z&NWdxf~RX~dudWI+XZ}$bufZ``k0fPEC_7l5?CfGiAqWJFG#Qqe~Vr1dxMOD9s6ml zj8_#X9M4_1FhRkDKG1dL-Sxea=8IqH7B2DjhQ5E6SPzZG99PpcAiP*d!znt+%6;+I zP|1x4<6R>Wb7Gv-rEsxXZ%#xg{xPTWiqz~v3s;7pN4Q7_b8&jgi%Q>d8+@UowDV=* z%bgj~YqZNwg72?5zk4gz4COGAK3xP^phJ#?o}>z-g=z<#$UvlFQHr#V+pgA7amtXkIj>jOrF3_GZYlQV8CB zyIl69lL#(_sLv7AdBG3CvKeg9?H6Ma3zK#bQ~n+g3tPo-8pOJ|QPzeh;MD*(CHgH= zJ-1n3I=9F-q3mhGS=`&6Sx@F^GvKtHZoewLxq4k=Rx~rLRDeAUTd@u1?JA;{Q??f} z4}aUPr2D(HRgQW}wu?;hwNc7(d;;9*8THMdfuD67oXS9wnyFB}H6}FZ4i#sg`*jko ziTA%G^vhKoqj|Hhu0la>8Q^zPWp#x z5D;_RaR*^}^g}@`1BT`E>t~CZ6tmbKtqDqCp2mk}qn1ivY(c3(q7s+7Ugi8}m%lxB 
z7hxHx4<2`IW1!jxM~3(|mG>e$115#M=`#0csY9BEgOgX^?IC<{9TlP-(! z6=9zWTwgL;4WvC1_@uZ%-RT9J8*BMZ+Y4((8qj`&S*3d}If77$!(eCJ5`X*%RG~DY z9hDH2qvBt6g?yNkfaqxe2E9uN32+yUk>(XPE|vk;kt8B(<}TF7AYkXpa=G_k-#Q^9d6TEu>Y*EO6ch+k1r%w6V>92vGEB*(6Of0l zNIgHXx34af3q3Qgy}$wfod?2>N36Ino~be!e0x4HeNTOX7<`oMLLViNRBN@TF-03A zFa_)@$82980lO5S{T@RxW_~mnOmAzHFAvZfoc1y_3?#+{5-l&{i1VN@7K=qwb zN6%XQcnu0sK~A`|ejQJ9*jPIrXOx5^CTjq_($NlLNoI2W{k)}C2$g*IkU4!YS4_H$ z9~!R6K6o9`Fc~6@i-hdGf3#(acZbvx`RZM=!yNoO1kb^KITvhF{{z%9=C6Su3oiS^ z2yRApltaBsyBjz{Bf)EOLy6Rkn_bQ4IVy=>lsE#*c!6X-WcamgE5leVaP>OlU5)0& z3Sr{!7n}_+2}(I3C${b;++~LwPtRlWXZ#Sweu%(1CYO&|dW=I)c`bI?heOHnX~sR* z96%S1+hE>^D3=-mSqYL9zms9VdUPWavalwjxG;dYk3M4mGh5{Y2D+-N?KoF|pO5(VZJHKWX@kMe&{%?{AErX>|X6k41 zPcm`MieI{`TfbDeV4t|xqeFB!4&(}zHHQ`kkT=D4aD3tccAZ63PPJ*``0($#PqFBB zzVBvZxLTYQLx~R`^)&u)SG#=O@5VoSF3k-*fhtcQ`jWn07ki3YnZ0gbNvn(^Kxm=> z*-zkq@;WPQ8h8uq+Q8VvFn}%O&KHWisI!kC_G+vsLJm3StICm~5iTD<7NL*whV2j+ z<3^Zuh$Gq|i?r`Bqx5Oja!meyQ8=oPObyuk%5o<7Rf;N~ej{`!J7;Gxgb{dydfo@` z^z{=vWNF+p5R>@{f%WD^U(NUA3=y{1I>j0V4K?_hSp>)l@MuXa3u(7HJu_cnb0c5* z@mag2t}tWhm>{nryy2Ee%NtjTP6bdx8RGNF8RV`j7`g8vKL!;d9r|94XgII4|HoJ_ zf)G@uNCi^H32~7{h_~J&2p}4Q>BpgD%Trwe(Pb}5-RDi_B)qGe?ttJrt-TqGjNMH! 
zq|&&B^g#>}yveyqhj{Sq-62K<@CAHB?Zdhobcv8Mzz>t!(=Kbqd(%UG4fV`$la4Cj z8N&&8_0F@7FR<^Vr`t~FeyOqoJa*TEMkUDjcax33KYy^-C|6u-lrdugKFL}&?15&* zYYv(R6u@mKM%aB~?R>xAC&%g124w%)NT3pzQtrs`d=Gsm?^tiD4kf;qki8 zZn_Z&&2!Wr@UMhm9Zc6Au@{>UBpkGDS#JEOVI0@WT+V9rD4rzJS8uQOaF}F2r1@wi zwau+I<$mM=jBU6LH?+07_MIokLpFNjaV&$3V;hAwuM(}|RP3bFWU<=*yxM2h2dsb5 zcYQaQABLTwu+b1gH)4~CM_PV6U4mXgb=22fl#k}{o*54B(^gVuTNG)(&Aj_)Xeh&A zNT6I91GXR2{e{j9@u0KDOt=C_-Lz|;t4!?B5G#g}fdJG;ki@93vCN?Lb8NI29&f8; z4#O+4aW=b7T1?vV)o2y@om<$C@w1TrC)PIjePOGZXq$J^7J@=%gYc=ZNqwUAOK?ew zS#dbr#4V&v)4-C}P)zKC*0oMp-~~sY2@=5zSE0E5?z8m04b+Ml(=FzX6LCB^u3s=- zpU2|4leFY0JfbbZ-!(j{@I}fasVby)|v7L&<;bnh^A+6Y0nwV!Bkq$XX~tIERSXbL7*rd zJOcgD<4nA*>x#|hbQO2KjxWF)#YD~w*e?q$4={7vj`v3oC)QY4kyqv!agaQ$ zUXt(r?vQR**1SbP>g+g5hHG&-!Z_;hdHw*mcf7QW`tYj!Jpv zRk_+e(?76&Z^W76SoX%OAG1xKwmbv+m(|!cdsCM5q{fHj)YvGidBbmtM8YSfgA)kx z&^#I+Xo^+x6ys+w4z6P}mtk_!U!nd`hfn=J0E5m5<|UEY^PIDY4Q_!!sO)b-)h`u6 zmGukpQEFEFCTy(5|1CY$WsaWEk6VDNp9FBltnZ`*hy+Sv%91B0@QMNhGr2-*8d^G4L2M>#s*yUp#fbs=R2O}t(!^<1xyNyb!W@x3G|Wn1k_p zRAi6{eHA}E&T#U&ItEi;S%>6OjI&_~&9i_Uq^^=(q%o$yywBkC3)v}dpXcm7J9m)2 zlo838GX29ZRHXRZ-fJvWp`4~O8&bRYSLjkoy(za{k#ykg^Sa$egzan{_26~<8V~uWL=G;sDPSCs_5^}A+6C5_A$SAEv>;~* zb4a}u?8Pc^9v%&-*c=}z!vg~2gWk)<#W4lo3solnt??(~SM|1HyPBEC05_RHThI3N zAf9$rli>Nf1^HgfZP#J(iIDiJ4qdxGm&W9WV#CP?vxcl2CcU-m!=}0eYW8zsgTqV3 z)00vB)#Z9gdE{(HO@zMt774QLI@?%|+aU>~O6J^^y_2`zG9i0#UKv|7 zTIyG{_cImDOW%W=oo)W_Y1A_OWijMwt1P=7)$HvuBoPGpq|gJ&X<@_LbLN;BKGa+vo$$!u3RzgOaRiAnkSipERAx3j28F`z6L#DD>5YHD z_#;QTg(c}VdF!kD7AOmBe?C~ekI3R@yrtkbv^?R7oUJ#wW?}Um+K7;!q+w(1gg`S2 z6BWR3X5w7BaV(2Lx>^TCtoh3*u7_NImdD?oUoxmg=YkdB`x(bTl*}&edc-SCvf(?7 z^?nj&Y}Y=Zsx*XV$x@=CUku+gr&5xnH!*Rei;NXjAt-Ms;zwErPi|5^vHe|?O(WhGV&XY&l<>iuM=r>dm0gp(XJN%cz zDWzgf%6c4wc){n64NXo8yVJoV&AnBn)koepnZMSV`8L#a5?4IiPYjv)9#2;AN1a3g zQ0+his(m5=H@(3m+Rfp{MDwz^Sk@D&RedaZDQC_~tlQqm(=#&MNYWfQoVX`z1Nm{v}@=7!oA z)dU$%%*X6-_XIRuPdX$2x%D(~^9DIjS^V#~qwPMSnOvPoOf>$XnfY%LR^|fB!`hC`V}@wLwMCJs+>+Hh)GbWgV5Rc2_2%O!7oo_?uXPT20m02SA=;Pu5`!Y 
zDxIz-T{Ft$eydc21egp(p8$~BU)iy;9<`siHZDdM+590-iT@!oUA#oL?WXiMJKARk zi}=r;Fc6uNC||9Mbup$*j%=!BQJUARr7IF8Qg^m{KJEU;0`o0OY*?}q@0Anf>UiOU zrO1w2RkDaQ98#8t{5w%>yTTb9vjp)M!Oa7$r7*{uIL{di<>?^T6nxVg} zq0N;k<`8oio8>W2^=BRD$D$(hQg}@g%Cy#Q#pufsE1xUE+BLH=OTr5BXR%Rg_s6ox8Q5LsE6xK9Q&57Vuj!=AV^7yH1c)Cj@l|g_OEFEUU3(S5+6CkD4Ub zy$lFzHDB#FUM2N%nl@H(b1`r52>0t~kI^QYs=lOnz2ndSZa&94XM&4`_)e#!CX4NE zp;d@VJxkW_B^zF~ymjRhMUi?VXT}fLLRRkJUGI<(qel>Zi#;J}>L8wuZ{p&ju0lWg* zgos1!CLSc~JiR;5pjP+L2b}*|?`xtxc#H$%|GYN508xsW&sB$dkPSMA02iP%k7+D) zOp-SG*YErb3^lXisU@>I%OzzA-YW?1W-Ck5A&2S$@{tzGU6Jp#mO7f7kXg?}Q{vX5 z-jlb8`x~%V+U<=b^-}SnS+jg>QkxBqQ=R41wFz(A@!}VZFXZCXvrHMf+%>Jwu`N?cn&?#UWwrIk63tvfp8$hG-xU9#CKX*I1hcf`P^!(rzS^A=l3R5iy!0w!vzc*AR{@=I3ANB{{HnuJ@qdhxQno(b$Urtefqsv z-cgIz(fL{0DJ8XhU@xcR0p66>NVTg(cPSm3dvD)Z{p~Hu>;b&({?C4Ogj=6%FAYk5 z^laLTa(eI`7Ic^cF{`NT2nZoDd=F0Z_n6kGKn>2)G=z!c4Q{0!X=;wmRvR0>x*y>& z>bWjlo>JF~djkFO6e9&;pXJwF0zd59%)NTF30(n~U2U4znK9LuJaCyt-FvxgLpVj+cz zFqHh$5(S*gOHv0*^n{^2-9ePzDGGlg6KMQIj1IVw^^N6P=8)5>QI$!{=BLS$Wd4w* zdf|s_o-8hnWe>F8L_SD;ihdiFQ_R_pJv>7xB);b+>=?=K`FJ7h9kNSB)cd5Iv1Fh_ z;+f9N#G3rg%O8%JL4nls@`<195|?xfLq3#$9?8reY-3>!skx=Y>FIw{HuIMldOYUD z={Stnp)u&%#_Zbfdhbkk&Z&HA`Esvfp{-(;Zyw`kUz_Nq&?j7z)N}>M*DD-ssvLT4 zxm+h|QT*%#R&UPf$wml2b$|Rd9$8xOn7#2YRvQdDmLe;Vw07uSiy0eS+Ey_&%ocfv zf1}YXq#FNbiXlkZRUsV}>tTXz`T5<|k7a7uI8S@QXCBQpHjL@yfpI^su^(jB=34sP z$bOxzwHd$bxfwhnew*8PLF0$}pEiA}$atOAiyjW`0q7_*h0wvZ@uUVm%k6N0aQzZf zJK7{h>8R`~H(~PYj%ONiesT1YNG*gNEV1g&DO6F{zTj|tqGpM98|BnmX=<~loLt?{ zt?fX^vP7mje0VVR)WG7q&J2a_{$NRk#m6XX$)d3zR`$0n&X+J~^{Cg@j&zr{?9aw= z-H*dsrp$%j8VTklp87=j?^VS{MY*DPtX9~s&4rjxwYRH&NcGY2jaa<_nid18Ql;Oy z)PY6y0xT$9kg+ELq13$%AWmZi3fDg|E(}sHJ0`hJanKj*T;&%mTvcI&FDP`0EpPG1 zx9h8&PsgvQ7QZq{jFfJUWTz|0E~+}q$4Lg$?h82{-ouTJngzy+!LCnZ1PPv^hr;I(OQK3)8bQ8%WEsir>jJI1rA2)^7FimH%4qC+<2_Ux`k}H-0o@t zQ_Ja;M-8Q}MuQamE&IeNL6I~qI;}sGXL?I_~n@}&hZ|H)vWfCsSXFHSyuk`>{bcish`Q3cb8RY=C>Rp3v*?KLy`ijf=lUa zLrF4@fp13P>E(VGK)FVz>z~`5zC9C|Sgy*xC1<(3<*9#okHw`#VhpBtYA0XUyp<@8 
zW1)9w@jx7}{LU)ks-7KUTT5M-=RRiUJhr`uiVh(NAB*g_2jdnG#icMu)`x*{n}+9g z=WuURM&5)^y(Zk}{$x!)j;3q`!pels_OY*o)5@#>gR*}vE*>32;i&TeVk||MF zlZA(9{h-%P+>@3p;`SWs2BCQkCLp@UhjQN8YI(SLX%aCpoy<**nAWwoKNv@!yk$g_ zxgggYx7wbaqk-Q}%CUWQoGmb^T|Oj6PPsQJ@bd!)Yqf%LY#?whE#^XkZT$$0QYAb1 zlnsm(G2*Vu7d>va#LOS*U;nyu!N6Y^HS)wOWG^^#I(g-j#G6Gs?Rv?E1h({4I-PpV z#P7s(o|K`K*KefEOjm^LyPLmnaMA8BksNwuiYJ>@U#=6z-fd>s zIvzIH?Yh_f!jR~L$Iy4#3<(*0@zVU+-(|9&(tU|Cs3-k>sdFEdzAmor@v?q6q42Rc zdCflHBCwa12x9RmT~^Y$ZGL`Mwa#A$T?6&bI}xKp?z7N&$0@WG_0FDk#UxdB1Z|JW z85Lv59vP|cNPBj@E{9uv0h{{w(M+12o9LN+OumfYP4Nw%S!|VT9@NjY-}=P+rJghY zMe!Z}QSs4;ax&K`N!OYxd0exj#?G)U$A^}+6UG87W=nnYZl00*YpNqsnGC*h(dvk0 zp$qfkNH;*l6|VN zIhmiLb0U7+|KQ9>AmwO%`eU`=KtT-U0*oJ%+cwJ8bcSqu&%63PK zRe9S2gHN@H9^fL(GCMC}-qxxsm7FjewZzH@C!G3aP$PXgn*GUUZn6tqU2UJNdfGJ< zKh|McY>L?dFy5aysxxyJfz%W7Hm3ZH#REU;h3YF!_8f0j)}W{SF8Sh?2sMuscdN+B zeM@-EUQDpqxmkzMU1>3g1q`-gQS?s!yc|Q5J|QnS51SXR^P99~z71EV3kVJOQI=v* ziuixn`|^LNyZ8SRrO~1yl`Pd=S+cYsWJytBY-8+eCE2r$>v=7w`K9ytb9$PV zxTi-dM~KRuH-}YAJdfJWG__RBCVF=2+QPu)v5&Uk;8v{oy(Moi&3?JI z;Z0hwXqWb_3~sK zOMJ~!Mfb<`wmFQk>|2I3c6YRSp|1l};wEY;q1g?Vp3S}W&y!?snIiWyLxG9SO`vY&9I;+}`1li5WYv4UQ5xuTtK zJ+yey>CK9}Uz8VgX4T{yDinvU5-7*Y`fHU858s@oWxJ$T>!SBhy%lxts<1CxsE82b zUy?4GfLCcwO`gkMS|5f}(zf?>R0rfgGxw(W88|w zchT}mk+PhlhIH^k;xe8`a;^0st^7-~7P_>x$hUD~*Tz(fL%i2iXSEh`kZPga)yYwW z@b%xQCZJRL_e|Ti%k{LaC<;#BDAyR$~h$(MYnzfwrWE7Oo%}dYj z!qZLRE?9`Zl*ypn#U5ELnEjQI2Z@IHV|x%&xmAN62qYEN&lNwjM~}8%PoPTSgUxwo z>$cS-{1#Cv4Kmlz^08;jBe;-y;b(U* zY~DxgLz-JGJ{DtP7vvNr%lyY+y0>-Fx#=aGlVifC^*-}-`y<%^r z;3aomrCbeeh3|P&TQAg<8+Y@*nxtHvEqH~g#oEQ=6st1qyu)6Q&_pucxW3po;NZHP zthe53cft{|NLA)bv7D3;m6z^lcdu@wk3uoJsFWB7j&g^Bg$RpQoe5Gl3Kc4Bsl1kk6 zqK4VLoyx!77SKEIJDyZVH{_vY+Ciq8Zt-|+H)jt{o6O!Q+UiDsgJ zAtU@-mi6(N68m%+cqHE>qo7QL_P`Tszf`f1*&hG>TL;IOzMTP;<9p%lC0mO}OUWxf zY>y#X0h;SXXn{WkC!#O8Jil(~RB0$yT*Gl3ZMhM?L3V!EH$1a?l=AHH_Q(E$0OMl! 
zgm@Nt|0hb(^Rf7B*AM0+-hQ)e-tXX)$z5yX3fNv{7blc)zo%#77+k~1wEa{bl?LCc zsFLaqiM`a>iCQV&p3#Q4J=A1WYgLsee2=(p)rAf$uR|iwv+X=j-Z)Y!|0@#5>`nS%s=)VVxjVS8h_x=i2+T7&={QKM%7yIbr#WWwe$BLxlAesB6c`?3VjyQ;+_LSpk%`snl+_$Tw%?*%=+X^8#|B8HHuj%bv|MA(6u^lEs zDL%Y&8rL)al0C>g4yG(a>yAfE?Nds51EfP6UKRj}onv7|{9@L1kU(3Y6kcrY3#{vyT zub}vaE)bvX4G1w;a=Famlr$LXy({*wp0}2;kNatg%cK{?cu8uxD$4dTlKL9A><#xh ztGA2LTdug>i|Z5}%08Z&tgg{}|0tybHHV_&H)oqm!9m~N*bucZ^R(4BPKOW{p~Po+ z{s_z=+!~Ua)~|MO{^e_xJJ}Jjpp{|#t0|{W`-%!XtjUHp@g6<^s>XX)^Py?CY5MqW z9{&FC+4AWZo3U%$h*|HQiR+Rr9Qsm>6-^gShTS!j747M5MlXlDRHCfXi7KvbdT?3}nF{lQhU6lVT>QMH(u|0Rz zdu;pPe=#55cI*+lj(kT7UHill8RpIXxJT9r$rB*8Oe%s|rE_X-0pj;zH$0j10HOgspGiX(E=^av_f> zUb3>~f_1R6N{@!tpyNsDL=WmBRGupGXo38SV;{{U3&W6J#LS9FY6?*7ps3u6u*w=V z4KJM3WR<&ZUcR&x@9VIT=F;P#0t-VgV??9oL=FerP0A$B&EDchU|;FO>XS(0Ef@aU zHzwE3S0};Nz4wPdxO9^rL7q+*K!-Y8*K^jozrc_?ScZ-Y=5Z8XCEwVA%-j2P`bIYC z7O}_E9vh8XsT&}k$Gh8kyPOz%C?V<7X-d{|c;7l{)*9m3?!OV<`}z8i_I4D4Ap9Oe zD-@4M=yY+BZObz=63eH`i~1KktEFg{AT(*ld?@Y;k#0X(C!B4ao|%KaO}q#>f=8G) z%F5HcUG|u=A>#W>J8^!d6Kt@16%mro^VnUU)ck%AbLf!yIDE~H7#`^B508%3wUVjF zVNAcK#b;ADLMw(tqOq4wj%N?Y24JzIYNCnf`SJwBet9@0Gc&NDBH#fR#jY-_u`T0% z#8&Yz{g#F0+sBL(i&geY+e3PJD-p$+&!pRu&9A~?DqyAWUpKjWlL)PCih=Au<;i}>!y?%S`JD-bf-&sQDBie^8|}n#TX)48 zHxnRY`ar}&cl{Kx1fbYbu6s(9$hV=^r;7jX_xKXo@1Y6L&0ZbQ*9LrwD9WS;R;B?tzEHzV!Z57u7j4*jjRF#+`eI}|MEOV|4 zJDWASjle9)kGMQefmDh0>%&8y-|2sLqz6iJ>G}NLEBK>04K^`e+*py96X`X; zQz2^wR>_gjMe$ohTWuVZ9=QWP)>GsCT1AkHyHIr97;lC0xkugFf!b*J+2LOEt^Sf5 z4*NuC4m9Z5wuk{mi9IDbA4>_=Om{$v-v zKLme1jl9j!!A<2JsSwndCfO9WOuV)bk|N{2nHHVt7V%DbgZ;c)OnVVw4Q)2CJY`^k z5Qh&M_Guj9{*WomwYV*xJH9NRyRn_+yWh9fG*+7z&UF)ZwX_|zj0)J(OIv?mnM>2H zgDZ(Fe#vc+U|T!e;(bVY2Xf-*^UikZj_ea=vuoWa6OP05SIhc- z>BdZ!7Tnx3QCe4L357r@kb8AK35I8RM4>2+eRCrFwNXFrO+7d;R4xNQy0sEsk!gKD zkBmSe9uwW*o^j|1AZoB!cc!Ryh;n8FZDzQ+?c3HqW^_!Iz7sidSTHX<=g0eEcERXf z$cKOhBE{hqIvGDY>YWMXVdlf19y^$#6$e$7!XTo-SVKy@zgp5Xzy z5*|%vem!59`SrWjS5~`Ea-@D%5acum-0hIESUkFu)lINEwCALi>%*IS!Hslt|DMF_ 
zK)s$;d(tk=Cd0MqD1Oy9=Y_I0w_u)V)|qzyGdk?7t2(CayJgi}URpx#dRUc;G41^* z7QdpVGgkZ-fc|TPKlH#+iP?cP&!Id%#KL|vc!z*}s`ZaG^2Q(zh1T!RHr{+$bRZUNHc zJELMxKT*yB8{$^q5Do}90m$OrpL$c^^5MZ)mLv{9)DGke)>TK?qjokW6h~W<=ouX& zHg+0h`m_50I~U7N#m=Ka*CRVjM`DAH@fkUD08#v$3TT>Vj|+`mD3Cq)xSR{{=d7|- ztf;ibQ?N@+vK}5gyB_y)M-|tcnIIsF{r+Fowx}Rq@zrj#N6Ti%k+mYqR)$8mk~Y+m zIoan@ia7VVz34w(#r0{T+Rc<5T z4aicf6Y6LLq^CU@X8}Yh3!d2tOsqcln)b1vg7Ic{E(0*WR&Kseh9v|*iTDt!=zEJt z*U|*l_Pp&?0lPOZ1ZcX(GBz~2>Dac5Ujo>|W_vj)qhS#|K#H#bvZPO_{jK=muWAXl zYdnCc9W2N9&UIr3lQ;sTc@h*1)spidoD^=-%kj!XLaZ!)8Dgx#CsT_*OR!y(%j-MJ z+`U_XrjbLE2PJ@RE0jQ%2eL179*P^4dxSAb3O~2oCGwCa$a~EdV~_f`5Fc`NCyyc{h});VDl4YpPel7Nqd>p?qsbMRkq@>n61Vg zqHzC@T>^`_#{AHs=vd~F2OD$)&~P#+WbP3Qf%C4L2rDN$bAN=+9bifb=WPkVzso-l z;uGb^;8@g7^F5YgA^cAP`?=FBgljDGZog=76H2ajp4i08`f-Pbs#`rwe8A^AXK zo#OX%<^w-`4hR49kAM4n62QFD9Y{XyJwJEyKg#Og#e%sV0q;Ma{l0^_e4ww`X^+MK zU>Epp^N?U(i-2KU7$ELbIhf?j!HY7_|D=@dFL08S=gfdM=X{O;t9>YNeLd>W5ibGx zJeg;1R52%E2Q1@Y>;Mhpv<=?h+>6uC)SJ+5JAO!6@B<5YOgk8!saLNaE6(?A!R^vN zo1)BQ`L#Vi*;jh_vxAk_PzX;CXjEncUQy2jZ%PZ0LP2?0LsH;>C=U5ZO1~RqL z-AY?O4tn*0=c;W$)s$O28I-DZL@I4PUcskn#G2p?vikx9=bO$igQ~0EsV-bL=m|wC zu+??|P5E-@vN96upwx+38dh{f>uOD;>{y(pw(DT^POmTS?FzOa>D}2`JZ3ic-DTTI zX8BPOs1wnj-YlOGPEWWC8l&OCE6bzfHuFn;@?P{pm&&yUEhUDBC>*O_X@N`D+5%6J zo0PhZ_AT&gPJ>dJY;yt$RJ0xbGGwg;3VEbR8MmFBo?tjrdt zM#_oc(V%v~BSw^2qz779)-dj7=_G?bkMuY|fA?TbFtLve*6a*=8=*Ghf!`^_>?PMn z2AyWCrb*fE+u4JYn+xsjJh6-C;az>87o!8SP}m$anR{ok@bVVFYon0S8>hddL%1L& z^PTtkNO! 
z@SZiNt<5!N?ca*hWeAPSPiV(pN5IqqKx(aj(?J>+gxL*i8RUBtrsVuB8#C>v7uuED zyZpx&Rclxg*c{jr_4CU!Goblk%jDTw#5JUOTjpNc0h#ckYVnH~J9_XA<3Xj1aIbZF zjOM#b%Tu`AvCW}D?tiH|%A|__H&v=u#r_+J4KSU#R12s})Z0k0oPfiDk2ZaS&49vW z5wrNm6mK6R<=jW9Xh0vB4%9}Px4ae0_FJ3BOnPA}7f_bnE1)37^vC!5f{GOd-(?D) zSKkn(8U!+3oUz&1A$>*~X4^nDGT`M(FL`oYyJ@ef_$lPu-w_S!5Wq zF`|ZLoQstk^n#(fP=ZJu7LF^Mo$jeATtu29q=kirmqBJ&&q&j2DC`CSF*DV0BSzi+ z#CfQ%fhCtwlK*n|axTct#QR93b0a`_rhp3061`uFQ3$%Qwkda^K*tGYJ>Zx1l3JD-9EgLbS+LXlH%*}_KGvwMw^p4ano zHb6(jZe|r=!Gl&exN&?NQ{H;t*(TY^X$@oTQr63YdO)jDskl>l%%V(%jk0BCncli9 zDAgcTn{s1B4YKl>RX|C8^G%($`VQHXNnB5H@oA0vcdxtTJEtqX=I%qrpHJ{ChdW9POY&!mbEb}1hLf%xqO|fSAk{o z&m{#AuE<*eX#mEVAxjE7?eO8leG-$9m&-G~mUh+UXI}H7?1dGm_e;+-Av)z8L;$7!rP#5T6`qgTiU5-M1?yf6vCmO zIj-%hznZKgnD<>fB|cKu6Ri>hIvBKsh(N-)3O~54SJE~3?R$%y3P3;HlI^!qL3`Ow zKt6EIKZM*hAj6YW^S*!6knkMW`I?^Bf&Q4WLv;YQ6dS~Du1!P~I3ST!P{tdZ9Z?&b zjt=Od27&^_gb?hTBFFB#XEojy+IL|~tJUElCswWp?=F2*pBwPyj#c{L3aHHUxaSM1 zfXi&}w1+mBjuTDp#q(!xz$vLZw`)lCJeIVRY7r-Fiu>Hcef#MZD;C9Tp|!G7Y_-_; zuily2QA4aI^J23fdR(k3{X7|N6u;Hfxf5J5os2=_F$BI%@!~ zyHrjFoU*L+^R~Z@v8;thqz~ios19O|1faPQlN;FGdR8f%qJ*(fi}eu8n49(xgTl(B zS|Rw#eVzoi%V!NxnkQX99ddTx4@iSau7S-i%@v-|d%}Ip=l)Ty-MbM;_;tv-{*Moe z!WW3sQeE2Iel?CBhmKK%Kg{2#df~`y{}3J?uHpGjLCCRF?kOaXYea}uw>m8!2K$Op z)*p&D(PAOo67cV23r60Xb$K;3fUX*gV-aDCgX9;U`qXhtV20%!8x#_-8FquM0n)F3 zDnp8s`?@1f zxodX`r~V9R{p?28&A@Vk9+Np$TRw(~m}>3WG*D$ww&ICst3)&P26;o^lo7`SFO{X$uCH6xSu@ zE<$3tzGSG}A(T$Vde87@T9z+``>sRe*%0>eSd7T4D<}TL1RANKS_E#9> z-&N&-2%Y>)FAJcof=0+?Q|F~ZxSN`q!1=tiQpNrFQ)43mA{1D5*C5IjppA*91s`=i zUpJ@>MT=BjvM)QoM9ekPmx?!WW|0rH$Ee6u2@C37J$CFXc3`VeEdp)%V*)Us>)A&+ zP<%@*NY&9yiC0mtTGu6a6q>MI_dzhK#UGVB&xdLomWw->zs@ zspwn3!5|!t9A!)1x|}1aX`Li8r4PLd`lA>|q}$hoaPX8#Y7=c45oVX%_hPh>$V9BQ zr;-oa{+4x>U0IIEw*H;Wal`ha0Ty(}#d{6+r!VYMD2RVHB?FZr``!>gVF+D;6ARLR z=o(GVo=&7Hu@>HZ(38&>@?c%BF|7Td^|$y3;tz~%ED0Y`a`zw42SUEOXh=qt@&dJ5w zne(@-(JTj{T?;4{2>eh5=tOXQx2=h&R*V|tOaU$rR+5V4TWYs-Xg^5XqzZYOvmGiU zXR#2fr`N$EKW{#{gG^)*3X!q*-b?EwXE|PJg>%1-Kbo~v|4nAQ$4x&k6><@3tS@B> 
zHin84*}ZF*{C2#X?!#1!XV)g@A}Hl{BcQNr;3H=P?Xi{$X@Z*E(ece#D|n5a74b5T zQ@uT$d1uy-~EtdG3C@FThqo%@kzjsmH`^aXA863Q6g|VGl_Ift>3F zET+)Fm;zGxo7{d5B+~qWcsJs+yr3iyxU2?hZvTSitacP>AxGVi4T06~T*&%rbT4Iv z0^0TtI3^N#Cr*F^vb=U3L&C$JONg?)u}7|;y8M|$dMueXKz4N93L%;7xtu4=-f0pT zAmu3e;3V_l{B*ePp9kJkBIm$Q2mG^+;2RKXVY9amgr5{1{MxC=cJ&=NTl|IIND;d$ zd{82{R2nIl{v%x#3(>B!a)_q_J3VRmrM^&Fb8JWT5#0=Y@KZwJMOF>gRFOPsjf_k^g9dcsN(jAz_ zF5Dq_Mdp(`+_616X9(3=u&3PXlSYA;_XI*RlybTiK7e3>K6KJhV5-c1OKoq!lbCLm3GKh(R=7&Law)n`uCU`08Qsz>n_4ZNSF3gXql!+*lD$v6 z2$#N;jU=ITx3gTzxDj+(eBHqsJG-@Lqx6V8b~%2x*8fvi?MA$ZT}_`m{YWe)i$z{3 zf#?9go-TPB?6z{MWzUjE84Gf;b=yV}XH7iE*>mAK+!vy+@10hy+I1V^v&d0iyA%FB z6ZFtb#oWf$-XR;V4)1Z2Q-e7&9*NAAF&zQifd$cmOV~UO0g8$I1i2IhRxJ&F|JF#Tbgz$H$ZxzD0>p zxNc{PXvG9Tg1JeP4z`J&Z#g`t2xL|H%40c_(3skDXU~@MnK7U&`qt^qu~|ps40WQZ2jZs4eWmW^HZeKF#nsKl zz!!(q zBw>1eNX3c`6?BNrY3q#j!JZI1X15fM z^V}cTou@)Ymbu1+@^sG{XtBsATFO5ac{7h{%be3E!#`u}l9AIa>(kh#<^feW5?y;z zD&$4a)f&rjt(0UHV?G>4*(k;uyQ~kZP8Y5KQCKN0xhRX&*~}AAUa^M@2dnI8+~S#& zN!*Av`Q7OP2iYu7R4u~p?h@_U9B&#hyGe${jjb5ZN*QNppE%fkf9$(U(#tOQp(-ya zsQF4xJ`9c)>Da_5o7QBjh>e{MyKAN@_Wf5pKbUSP83fn)1DE+KDkaX5_xo|3ihEbB^nAh!tQIy5PsUR z?z&#J*gBx2f{d(nq8TGa>)brSmSt?%ix6~}_ zHfG8hSXT-JjgR0@hX1Sef&J9D@C)k~MLs-8E3|$pg8FD5Kb0ZuKlBhAJ)hpQUZZj~ zXWH@xcNF$Qb5`aF@l?J`bse|hPwdw3K7hu&#Wpu{_S8OD)@ID+jmX@ia9fN^bf^Gouv9KQ?T=x@ER34NB7%D&Hr!{W2 z_olB^EDqJy$dma!6W~MAeyelO*&XJ87@V2!e0i9XSu2^|_3+T%6E%r?U5TXAUmPTJ z)NjM7_F^|q{KgMC?E%Q96Yi!6KA9ZN%rXf`mQpp$>FEnPYO&L4F*#3De)AhAyV@Y= zz)!C*iROM+v-tYnKja9?0r%j&d*6f<_@2%nKq=T1Sv`L1h@XS*f-s}_zi2){ti`KB zWR-qT&x9BMhynax5bf`Ijxd+K|DR?5hW~%>4)bE}257B~#;2V>rTRFC5d8QR)A$!9 zT{{CVZl#l|Tk4p1qWP=jSL5cdg%Y@{<_wq!S2EV28#YFp=T%y^G+zdS<&S z0NE8@vS~Kb(|g4&YVakebnJvLXz@RfaI;{7`v0I;&T3^F6ihSj`pqU$y8bL zUfKWR8|Qxk;p|)!#Pq0E&@2O3{!YN%D)gNER&Vimh%NfRR=JZ^1z^2Ar*CWhbV`mQ z7vTtfGWXk;(sy&97PWGV04`%sMVW4O^2yiQXRpu8}yYf1W|76Sa!(j7;jtiCjy@N%T=a}@pxc!M2uZ&BKaI@ch zQvH>F!|C?QsY73nl>G<#2LJV6*#{-n^m`I{^3sm6BPt+8$coVY)qoX|QZsVl59ZxO 
z1Au!m;<$p^4x}K{i+=h&?=!eH!^ekdzo!-ju=3krHo=su7v{lkpLWrl0AbF5WxyK9BF7o&Se4%qCKN7& z89n+HHu}Az$O-1cDleus0;&H1W{cK2dDI(F`BLWG;=fQ6*da-9OhXQ#G(c$RSEQ!3 zONB|Wil!DI`1iERi_C=$#C6;Qp&mUj+ijr(;nx6_pLvh?favAFca}HDI1Xc>N%jUsa1&L>34*YH1{^SFY2~OIW{8sNdAbGE93H$6R#bi15zZL z2RtZ(tMly~?vC@P^1>&EKzI3nZyENHVxZgtINSqRIfW#VM(nh6p7HnnPb|4UbHd>B zqlyAXwl?q_mIb?G%B!6b2RCvCybzP9Sd&_by_jlI*|Xh8osA#blCBZAR;Lf*q%kH_ z$3{l{a{X6K;`33i9s8z9$tW=`BS#on6aT7Sy+^FoIe)lWz&2f5fm%uVp_rT67Dab%^b#tRgB9i+C~K~mqb`f^=}d9l}Nd+N}`7E6Pzf7n%(6} z)o)!)owW-CYvTooeQeJoL}Wfj75h65oDHyzl_~E_oSux)?mZ*Qe~}HbBuXl#HWAEY z2C-cf|IeDQiXV-gr$o4K92kFH_Eup;}ILk-lK8snmvN!fp zv!Acnt}TxusJ-NFiK=bAd;#3n_1_-4NdWU*O|WMd0mp0Z zB;nyx1Km}T+vV>JaZF1Wu{lARQ79pYSZn(&Kb0R_i)`;z(%wq1R}z>d#kWkZvFsvP zoL~MJTUP$&+@{8MiTUxsG`nWnGX-8V<%~gk^h$dFQy)5>=VTxt#u->Ph2D~Tiq0KRPM722fS$Ny&|19AKW&triVwsytdZu^v8KJ> z1Y*mcXk9li)g5(QQZh9C@xDHL?adf{X}F!&h0osWk+qdRNRY`mAuji?iwkr_2GckZ zYhDA^)Lqv1vUxafeQ2*A<3Vw0Ft+nb1#BjJcHkq2WtC)|6b)poQstn0Vy5L;5gTp^ zs8`sxy$fr>-T~AFr?#{1w3^X_Jrx2})Pon?STRarHQmWO_w3f&hqmQ@&o%=ajDl z$Cmr49Qh{59cI7jZZ)nwqo14xJ=VFN)JKM+?WWQ(1k8ZP-G$25oG^MQLx}`5K44G={w!UCuNRS&IW47xS#2b zD7@Ev^0mntrrLS+=&3!kKUZJLq{C5WNgZ5Mk@B_(soz5MHnlJQ<|Y4f2DN z_VE_D%ug$R9@<7prP1b3tHtlXN%+%leh5@ec!O?gJ1|Y(C|}uwnBK_tS}_W|%c!f( zNA*`VwPC$U*2qLs2n^r-TwAS#?;c~hVO_?t!zSF#uX2&_Xk_a3yz@WFJ{=DyL;etUU*ze@|i!hJJ(a7 z-LGf3LxCk6u3Wequ@#vGfN*~JccI>{z*w=qk6#OGo{BX7c?%V{C1O5dT#CiYTKOGT zDAq*d0(rDbaW!TbpLvtKZVs!;O0>vL@J;hyC#GX&tGp(CNt=`^aul)C02-RO-l~uC z?$G3+tPyQqyFiP!#W!*{s`$@?6dxWfF9!;A;%ZG#r{g{0z%--$Ghw9>XXV;yj!nW% zMVQ$W#r#Q#m+*Rv{Fn88p88Gc*JWnVKI$_%WdZFzyd{s4D+p`U3leGMY)aATme&3K6YI}c$iFZNlMdx>9K^11|IiyQHYu=#RZN9JgCxb zrNtL1q`8S_%iY@O?gdLFmzH8NZTJ3!1PQXt+hCre+^+uqDw=;WidLpD{jGd{?14;r zx2*J@^wwpacXZy8cApB6C}X^#Uep*mvV(;XE$MdnkEOq7f;}d;A_D-6idbrPBZlQ( z6ilPcTq?#Fi_V0dv~G-1YGYanN@p4OUE4p|cqe{9y>{#zPo*VA(VG@aWC#_LEQB#w zylwCJmjfl`L!;u4fN2uX@EaVjJ3uJw=vr1u=jkKQZm$OLe)M?0;ldUM`HOo|Vc>_i 
zgH9$galTf#s9;+QXM}xUv0b-eQ0!-pGt5uomNSvMtt6w16Byn}ZII7H6Wu@3}!;L_nnKn}$Nj@5!MgF@YYfM+lsjD5AH5*h} z2I6%S%7 zy)&2WIvdX+5+<;?!>^Cbyd^s4-!i!NJ?NxAV^~0`h;VdZROKY!M(q?&mP!+Zgby&B zmd2qN7Ox6_=K=`tD<53tFw-fJcFp#5_}V%aphRXX%=goQ{LgQkYyoOzbm3VC^IeT^aL5?!h=>nD%Gm8~|bRQAd zEW5O&?wh|5juCWO;7v*}J!Q7Zenms74g}r zs&TnG&l}*ARg|IUlNJ~!32yY5cRnr6E|8O6O9M7{0{`mvRVueloJJH~s&6Uzq4zqb zLDj!2PGEkk6Q$E$deB*la@5KAzG@g(evzw_6TP8ydBA2HZ)13&K!#Lh-5WrCVAyEs zjLAkmTaUtGnF=R} zyOfg&GO@PqvVDTJ4f?`#q@glyW8zXlIMc!o;}J`eTQY_}1GX(_z){xs!0bw0$cq!b zQ#Lr$zu%uV8;rF}trh>a?UNGt$6YzXEYql*cLV9*LbsX6W-|Sg$mg=$QGfC0y1pM7 zubmTRs#^Oc<)Kz9U&BKCZ+antCn(Hu#0*`k1vUP5GGw0;g|Q*xDXY@ zVLq8refa%&{^{I^Sk%Ir%uw#i+y0vEGLa+PI#@gV$z9n3dG40-A%paz#@t*7}n=p{E(MO^% z`r>P@f8%x&eI8DV(hviMy-weg?F#KphP7Z~@?A4;9uW6}M6}NC(4K1}*ygv6=HIcy zIr5`&JQwT~ieT7;yHX8ZkH+D(`AoajuOJ673)(bIHkUi|cmzY_r3q1w5@&b;UMz*-V zr4$|x4+g<*ONb_eY6{9AGpj*9$>2EwkarPNj2T;po|>w2;zz)o@NEfIH*#6z8O5Y- z+>)l4*Q(Tc%VCVF!+4K&&gQ28%69Nl9JS6r7an2YoLzBz`awzcBi-85gp7Vw@fZ54 z$$^pk2GF1)V<}gR&ifyGaVJ1}8-sxzbx{=DxmrYiy!R86)aD4<{1FK+WVA|5TE*M4 z+?9A@OLh!Kaxyhrz!+uc)VHV|q5w&`;xu3D!_zZ2T9lGZJI``xJfh@#AdXG53VO|?( ze?+$KE~|zj&3~K;>viWt7hDu=EcvQ)D&?04(mJa+;G}u!ag$Z?p{6`io#Q}_zx4d; zm*Sl;o1t2PqdiX-&Kaeh5j{kyrwv@|JN6rO2dwl4mTC*(lr4xiuyeCZgS-mO1(zH0 zAMIf~osK38jsruGpk4wh14;!S3|B^wvP>>d7bm05-D=O0GPTYZOTR-%CzAb>x`a&Y zW;*3uhOZS*e)z#prED0ZQCwtnf1NMXjx&{}_o1Tk4K;Z!qe3sY>nCf&1A7hx?lL z`BE|z7Dg{4?gVQ9f9-*Uw1&Gl)l}_3&mk|SJ59ONST=kl;LansXZ$)&IDISL)#QeD zN=B7dpP_%Ho4;O2B551Oj%k(c-1MBiqQ>+?@16b^@gQkR|1Kr zoYvZsK=qBSv~7)YqFcP*FL{LzF(m`bb`B}aVbP)6qgI@X`rUmAo~xB5>*1aGot^h0 z!bP?&Iq6eWCk8}3n3X8My`4E`#JKSt4(NfM?5w;LHlh^UL(gWiE6ln}KV;Of&~De> zK!%LJb_!+r!;D=JJ}W21U3BDIP6l$k!(nmBa(lH4+WMmC?YrP0!pNKZ8xK}wN-NK1 zixy88n}>R{xz}cu@3PFjNTOipCH;vN!dLe7nUj>%?qQ%B-vT8Ha&0NLpq%R2tKSi6 zX5jO=F>rX|_)MJlfgs~?O3L5L%woCP=GOY(bTbEC0fTPy9$E*w>=cMzaU4H;9>PQ+ zOX^Gc{}84_ObaT%a;3-8zt+yqEL!^6@Bo={5~pHmUh5Hq+N8_es6W3z)qck(+gckIbO+c0%`*JW`*bY7F4r?a}un za=bfolDs@4+hV$EAi00S!;4?D{8;#Ikn;CweXyF#EI_+mC#f!pe}troP>7b-5{EUC 
zbcgqueslOcaigcS$gg)knXc5x(^SVH?((>l3Bkj8VXXifzKk^IzrgW5&DSfy8_ZTwq8-3hc=<<3giP#IG1rbu&q)s)oLeJEj+)d#ZZ+`P^%M@KA0z@qavVt1HiZ52CwHl)q@aGN29a%`_x> zlM1OOaF@}o?U``rVqpH8T3HoO%&U^5bFvkd>S5A@jGdw8D>y~V?!{S5Z(uBg{0uGI zC!Q0S@9Ce4%kC{`V}s2)1~k%v3k>T@qL=;$a5i3hHEM1I64GQ!bzC3eYDm+cES4KB z11T%MRmCE=4Dv5qDAw`4e%THT*ECm6fn=)xIu%iW$B>e56N!&e$}QY@+c&QA5rBw~ zl5E0a^Pk<4DTFKKeyrqJHwk&g?WmW2hu)Je;9JJEPfl(nvlP1x4; z?$fT=-Y(pXT5{-XZ;lbNX0z%5E=Lr29y|)&ePEf~ zMW{T#f(PJbPoq+RqnB7^3tf&O&XXm4)dW>SDwCRc;9iBEW=Ux!yC66wL~qMi(bvRB z>@VU&IPP7t2gXrrs(t*{3R39+1%wLnits%qudz_tm)Ld>GSQPfYwS`W^QEZ&{8ACn z<~V>Pay)l-Mihv3mwhrJJI++*a*C%e&qxjQYz1%3EM}$?CKDlVoqe{?`PvfYm(8nY z`xcGa9zcefu2m1G^3G&Tb^pSULJ_uIgz1dVtUJZ}Qx8`-Gyr^=9HmhoU(Xqw^e9TW zD0zcrgV8=P141T*01?Ru-*atcq}g~A>-4ROO{%R|vCBrlG>hKxrEB@=9B`DDvpsfG zWA$uJ$UF1M*}F{S`oysP)m{g!$_7lp<*uZSDOs2xc0`AHn?J$>L6Ys zKf=hANwJt>1m+KO@I1BenM#-)KPi2V8RX)R-<=qj@US=m5w7s@`ytfw%$4!1W#C23 zln=KRFymA|+8cX+a7TI}GjedY+Wv6h&3p)CdpdKU19PtD)6R{oo+LSI!Z{ulK@--) zVjcE~9@{fxKn&hqOhx|^gHu2Zjt?tK05K3?4SIJNCp97RB4;pXYs+Sq5GDr4EBMk2 z5G$cxy&41M_wg-V2)F}w^~B;(@;=`a$0(V1r;((CM!@!@0UNZ`oxZwqz<0BtaVAP> z`w`L{0CMkw1mRCBe1>p{W{z|UGqwXf5{!EFvv#RX->~~-%Zsww1a*ZFliqDSHB!j! 
zDMtd=o6JjdaTgmY>4pLJ_2Eag6X)f(%72hj5PQ6wl=7HfyST&Scg(<<`Ss~f!X=Bl z2rvA!k~}_gG;C(pBh6tk6wh?CaXMbD`-JoqP4}pmegfRPEDVH#^!Y9?A6t24+veX; zCNtq|_@$y2WM=CMFh+EVZ>XP8hpC05Ah;lZAiC4?CU-Ra0{o(*32Pavg_bwuQZofaQ;T zoFuVOlP%cE(r)^Iu4)pdd>v`7rU*H7nS(ZEkm2xJ9*W=&3V?B=1t@7J)$rcbaG)6O_ua|-tU%%pH=MMx&r`J zi>X-n+V~#Ojiz(^ZR-j-U{>VnoH4I(1s4 zoL$wf#dm%ALjX7fb~8;$%E$Q0ho0#B8O=i?LmD@ddu*DodZwyf2G`RJ3ePl{3De6| z2^UJ6%HX&yQF$|;G0al^O%ooHFLS$!*4{{TRY8Ag^te@;+uW}T)KHKxP{}KUAm#s_ zII~+eRbox7;!WLtAwO&1LS&cmV?%^%bU!ZUq3glOE`Lu~CI1~4|6>EZQZ-3Z5Psq) zIRt7h7aA|TQd{`)y@8H5y(3q}0TVm*Lc6J`%4i!mc9fz$%d}wZDGzR&YQFDnjHJzn z7>}Ze@RX^vEtT2%)K%K>=01^@+^ez!OKyfHLdmtUS3oa67a#!qVl6K|TielZc@jQ5 z%W@AHBtUy-;LE~^K53`&&3xk&q4G_if@2RKp3!Whu#;@UG51d--UX*>%KV|~Z&Sl1 z-nEYNbaE%eFZWgOwyIWs9x|F+`l7F%35vpwkDk|Kq%2cY(?l~9$;W6JY*TX*!32+m zkn(M7va>6aPYf{EWlu+^cKSeolgTM`CjB>XE29?T#Qp zW^K*D>A*StP#3ZCM9R(8tduXQbKn#OVN$GSgo?FN zxjb*_;-1+TVsW!1^ypBZlDpG0zsMvqlqOWpt&~q`FI_b0v?L=7$W4K#u6Am)f%8W* zX(z&q{l7O-Z=R3l_FcAV!oqljMf6G?ZxM zWYz6eTJLmBT-E?{rkziq5t=W z+9JXApLvWbrEN;SU2|oaMJ+~KYk}aWp$x^B^l_t?wI!1%E=~t$&0Lex6gL9Xz#Z%L zcD9F*KPBz_>|NYFGs2RlvX^fA@2I0&GRh!xzRy{=2oIlg2}Zj402@6sBbU zd4){SO-$)bc4IKFSy{Q4=NpRYNRvEVykQ@|nb<8k>I*XPvx;9HtBrm<45>t)TRr0t=t#bO z_9{z?p&1?8ExJv#nG3Y9^b=WLg`M`IJd+(5o}$^;{Gq|q`>0fC` zSGd2DJSD(<19VV}D>@)x_|1+!AOHQ%)P!*6N00Z(T5eQbb$1=St3UCfKw*#-92AJB zL#sN!U12mF6vK^gRU-D8R{o(GuXRl61wCAIO~5LWbxRX#&-a>Sgc`6~Ssa*#gztLZ z)<%(nwv~q*C8#)$X}lZ-j?$KZ%+4ffz_h#^w|QkWzL^QiQXzAyzcJb#TYo3wBKAA7eU9|!xEjQU>Z zR@=TzJNEM=XEtQr4F%`H;SWm%=qZ@`ZL=wC1Z!Rdb59H9PHL4$3>2-Qm|~JzOF=-! 
z`P64VlXixopzixymFF+E+vN$WF1|9mm|ymjLH}dT=9$PjgIzd{4F+oZW>(xqG$5+r zxP!(QpZ%H2Y`m3UIALa|jtHVVrCQ@V)rMfngVKBU!I-hW4}EU;j3vVs~Xhz8ZOg`xPGH#6$oPMAHp5Ks1$ zT~go6KueukHM+fI*P?nB)iIyjVIesNYj_n^sx%C+5JA@em`=phBT{oh`Avgn5 z)S#zz&!#ryf}=oBbMBFOstX1!!oV@ST%8`17fN<{*oEof=iE@_I7X5r^X6P{EhD?*?+p-Rwir|DU7k8;B%Q^`8mS2IhrMH zz)E{ks%KGj#{0hUCff-9=F$XT#!kxCfIrt=gD$I3WQdivq}>Fwdo;85wn(7A)S%K7 z&lOM-;e6^<*?B!e#3V+uPo3l7t>ez=H7jr0*GLSqj<`~*#b%CD-OV5X!;Ebx(#GVHC+gOpVxFBR$7l^3tuB$SnnLC60*PjFK)qpgZ^Xd?3_K)=i8wor)^R{`UypTzVtT|mR$tq z+idbb#pnG{e10>i2Oww~B-u|5F@n2od<@z)_MtJE_TI$UJXYL}iKOMKlX)-b!I@Q=jZM6cFL>btuaZ5oQr?E^`2r+4 zdFtK;F$|(okE=)u)!^Z84LpOEiDGRYuC+wOuJhq4aSPENK3xRSDFrcPUHuE542p1y zjA*4fiCzd(1c6XIR6xOW*;+mNa5Nvn19fIh^XG@9X821Vo$G&P5=e8RFTU%oy$J#i zv+YthG)cmlP0$Y%dytLR!A{!wGj9OOTJ@F~Tl#Kf>r*~fM><8V+)P!>je}kQq;h%V z6L$^`u}V#p%?k%mvr2vvEg>oyX9@Gw9nr%$~vO)Yk~flvdzQSiJd9EO z9hWXjK|-raa}L6;ZQo^>f3?bb?_D?P0pH|fc!b*p)ohVDPfsB%1H=6_Xw3Ltr`%T}vq?>i-! z&;NW??&DO^9%6eGYL#s@xz7a&6?!#MPfFU{P;R|;%_c%2LJLhr?L0o1k289TmSj#M zy8~*6sxC{i$?JQY&x$MktiTjHvPd>ZyG@tMZ1U(78z*Q4<7_q8;#2u@HHs#^JNhXh zhKe^><}BG^FtBFvlgh63X@gK8=F#hXz~j<`xV%Va^xW$`Wa!lmz7w~Va-ZYlbn_(L z*vqn9so_rKXZ7N`u(>Tan?|>eIeKUA@{Uo9gKy>tyzeIku>q)@jJ|%R2W-GI)?#P( zdx#qlRG}TRR@Q!%po%9O(J5pn+={IW@brJ&7d^9kJfwbMBx;6Yv`0=dgpP zT0G}d3W4BVAn>|+?XUX3)PBx8Eso0-w{m;1VvuVG+E)f|Yp z$>Z_c=j($YO|Mddkl;qKx7hv(Ohx|BX_g$r@nD7Ws zAC?{MV` ztG5H%8wUbScJO^BxM8Oh{(rJ6(3A3cm4IeVcEGO88y66OI)=V6CXnr}{lYCs-+b{~ z(shfSsVjKv{|M2fqeO*wL-i4;(Sd2S$&mfd1t`oN%dW3e5;!MhR`0yBbe6GGw9Lbx&y7Dy&+I=+59*dNBJ`95(Q-N@J(%^R4h%@I zmy`|^WrTF>&h^ymp4zX>*{2kCvxL)I1?esPh61v{UwhL1SOIFB1BqSyiYvD`&?oF; zGVJDmk|ETSnf0)Q1;`LG1ZDwGgDrL$IqtxH{D0fvi?3!}R0-JsVS~?x(D?55%spcP zx12o1GFK0q(hFamhcW-BBxJ;()5*`1 zfpG}@`bqy#=J$2G?IHo08e7}(49g^6Av2cwkcUjfnR~Y2&`$on5rYqJ#Jf9_w0MLU z|FIE&h&Va?Z~Po=#ij)|2mAfA^Zx~*{Kq%}{&Pa?0Z)_(p~&>58aia^zoc&exB}2Y z&^Mg2*OyGT`JbHp<7+^Ev@aq7X37pVBOiJMosiyLa3UjT?T31mmm%*{K=gXZoIZQH z7#H$?^JmnpfnAMn?e9zwq~#@<)~p0kp1#wHC;$E~@eBt#*R4dF%!|8FYIkG*BL|1r 
z(Ac3<-0Y5%haF<(Jy#O=!xOi1uc!6e!~}h;-~1d~&mP%xulGv=k8znqwIU!p>y+C0 zwd@27u!H>ECFl^Ri);hM=?4!Wic$o19;EcbuA_5pw#-( z2V@Y8tm9j%t>(v2Gs5=<`@1s@?Y1^n0Mgr)rd`l?uUs;6V7(kSM<~`XePH&E^>f%FY7&RD`9mJ~+R3V#uDJyW|E>+wO4^qrTC zLdJyz2kueQ>q_TNKCgJphJPe3x^&3-+WE8n$Irj@-0{}ZrW^T?6R6ApN9^} z#wh~{O&7l-0`CQ99+HWIoL?a?Q%rP{1K0Pire7DpmgcZd)B8(6Q%2Rrilay;+vQ9Kuw~LzQQ;ow;an!cCMB)Nbya}H zXqebt3r@5;m!>XU7Z6bvG)juaVv)-zaTV{?xvsY=QYmgT-GCiXJ}fNEYp!S_*P`)6 z?-=4Ohe|h}-y(UyJmB)ugu(KQ-RCm_c~is0vd!AUNHGp$CkVsp_yVrRX>`tK7H{k& zx4EXP9F7~~)?9dwAe3>@59u%>FaRafi8xene&TWT4TBKe%SNqfsSvZK5D>3?!W%@} z^E9p|+pzDygJCsqHoMiTQw|r{Vw==F~_4E37I%MLkKZ zvTr7}-qPjqrm)Y6N)fhT ztePWxQU*AUe;L@B4{jpOH!75XCOJ;eUa;c=&f9HrE-HcH6*e^KaTwFFu1gx{*$EUh zQ~T?TGH?P?>~XSHI;4zfh%>HT*bk5g05UvH0C2^`%r~WPE;aH+?XXCC<&L8S86Sl& z4Y&`b*!5(hxFr}X2=T>{(g2p6wTQ7;twD@L_sqD>c`j5EhrWLO32>0!R0_XBYM$l^ z9krx}jxjIjE}sXz2nb+IKUR7gHiKxlE7F2O!luKk{CvVhnuBTpg9k$hn5jeXYiUNt zw(Py5sn2!o78azSPM|FiIqZjcA>(>orWI(@w4z4mFRfmu+H0Mxw#rEGd(?9t3z23``$VtDcVb+{MQ&? z6n_iRBU%XPV*qy@)ln0!tz{08u<@%mN07HbBCe6(Y6qHB6J9`}579_Iv)D`pgm@ zcff4Y0ult`{4DuU5?D<{tcMOLmo8~4jhci^Bw_nyne zNqJA!6grPKu-H8+ee>qcmvcElH#Wnq`S3_I&I7yS4gkA7j$lK^Eee26CQGsWdE+_^ z)qNuXd&dB9y+*z@o;9VJ#pJnI4)mUa;b?V{Ob z0cOLXbKv6JOp)(DKP~$9LO@z!mNe4&y{mP(#*)%(*T~(WNVGC$ef`@YbMLBy($}`1 z>37|x>(-uOqf^&0y&Zb9$Gd|Vg|*ffUMQN)KEath2SmI#V|1V0Uc_deeaiX#(Elv>VkyY|I(FVx6FmFZ0_ZfK{aH zlPqyk6)LKzG_CKFrIL(~jAWW1X!}g~z3s+kWO-n7!aINH0`sE^`PMFGjq&S1 z&dS%*Fg>v8PI;p&Z9vqkuBN7T%c;Po)c9@ta#wz`Y~B!!JXJP)AS@%ExKVKCjef#u z75Ne2V5?_}0{{Z)F8)d&`W5Ez_UZ zDIw}y&v;3NG%HuOcq4(o>virzby$KqrpyQNMKSW_bUZ7ipI!+eH(8A&c^H;Z`9 z6XGK$0thX;1uwbznHJ_Qj#cX#X=0X9#$|D`xMDiRDbhOXDaap7_hv&BkU_UUS%N|}bp+t(}_w+cs&I$1QgA8eKPo_Krm;Ax!9GgzAN@dCsYVQXo1l0T=ApYs;rmy0hq< zf-94kXhcfcB?Stn#g1-8g*|siyxWYsJ*5=1|PWvvrBt_r;2yfW>N;(@S&bco|T96@yj-~J^SwGwR}@4>~ifX*|GZ9lGh*fy0x|U zW=1fPq-M?LK*_%u{ zy41APmfG;yyLqWSB)&)zUmJbOPFhf)W$tpLZ5kOCdl{Yjz6N*Me|6a=Kvsk!8(B%< zZz(i9_Bo#PdAp{i&AjIMNrb@2#;=V@0IZy9=SEl!AuWxIEXJdU2i?c;#WyGX!!P#Q 
z2{gZ~8j27riO_Y6a4_k~ZyYHv@LseC*L53fd^FzrT=G11l!fK;4aMGL`mw(BO>Hq! zUj3pzu|6sGw{IvSXB5OQU%q_BZMw5q&2V}bj<^FR%LS|Kj&=Z=X29c=dkV%X1tuVC z)ue^3-%w8h4H@<^KS#*N7YGq@krY_$5!%YJ*jmC+Rua|0S|S6Xcls-gH20N`{!*IX zjQ67=v`pE;Fp=r{Jss%Bwg`Ri~o$5uc@LIXM4pHwuOP#_gY{=)1G6Hyl=esnKor^ohz-u;ontMtHN7Yd%^x zASUB&bmoS^@!pEeuA*`3qXLWzYUWlm5i7Bxo$&_ael0Ode_lN{@3iI>0Kkw7nZ%U`fjkE^eG&f}<``TJByztx2=y2uzqZ<+^6LT_mu zK3d^l^*$-+uMAR4EBVNM8xJx`<9 zz=C)5FCKm=j-)r@&qj>&OAqlU{n*aFjrP*{kgBjQ%Yj{<6K-^g_!#DoXuir^seW&^ z^281O1da`+5VL_o%Y&G^pkgh1WYu?dfCCv2H8lUBezXmQq0=|blPz=z6ciNhTJgoe zFgB*1tbCBenf?NzJ;OQ4=>yh@uH)H9swsr~W8b}d_bQfmtu@-YS7Bg6^(D}U9C(yu z>}fk&6Mr#AGeZ##PX6A8Tvkd)k=_c&Led|8{l*)kXWC&dgB95m_=uk9@-_n9@uAt2 zT-dnohnWOQ?eLk0@F-mcyVYXvZfuB^N*cupB=Mks#MbaNT(*FsnS+UNMbi&NqiMO| zCYa)DnbrG9Aeau_J0bORz5;$6hKrSCs%G&}^^?bb82>9Ulqw>jWPJFUvTwpf(81GCoNwueHa?#P!PRdDdyoT7=4c((e;ZJrCr8{YWI9`ex z%Hn}CIb~GrMI_q_!gWKX2o3xfCw;pb&w^2PpCXi43 z{)v>``&ZGoaz=)}x>K@N-Fe5yQ>gdlG@7pb4N-XM@O0QWLEPEynjMY|{zi-0Z`tl2 z@!2FSabXl~e$rD5jE{&F(5GC57QT!`4OBap=qCL7%o-8q-K!QH)@DVOH|a>}olR{1 zGLVX5i?e%($h#+^*G(v0B6p~%-Oj|4E?kyxdtGa|^|?t-fC^ex2s@P}bTapdDZdE! 
z&TVfzdzT;K6!zDm>b z&QTSBaJ;M7&8~EziX5WFCdC@861JKUP$y>s1#=@TI-p{UQ#1LUP9)nKyD?Z)rx9tU zAE%t_vwS4ZXy$8y%iuD`zG*wpbKAs<(ll8Idt8H&MT^^f=)<(|<;`zlRxB!_$h5Fi zR@bT;UP!KYqt*~B*0ZFNGNT4AVwn_Rm$|n#B_mADo$_NSh#e%c?So4Ke}yDT!McZT zjJxVEUcRZg*qDW%vaVPBWbqh}uU^M22FC2TlH;f3g@m3n;6@`mW)a0knFO)a2HT(VqWK3XStWo71h?pQ{_;Qebwr)8g70~*2x6bgpJ zD;*6M|M~Syi1kR-lW>01hfk?kGb0>}W#39xR{n`;MS$7yk;dUNd_gfLBRL7q0@3n zA&U>*E|rI0N>XLdv+bf3vHYi1&WzWlt=_2%&@ZR({&@ZN?Uydcol3n{n4Wsimudre z0_Eq1YvJsxMiKcygDqT2B|r5=cW!|R4`SZTm{acb#y&Y(nMduqyRn9TlnKIJr8kmEjY6y;oZ zia{Kf4yXe)Z!Um=3u*qqB_d%N0-rO#RLrF$MZYm*w+n2UP=~_4FgKy(&2K7xaf-Y} z*XfO{h2H%7^lLH1zY|P5iJLFqDdWx-_gILX?#Y_J>4gDYWxODLby2nI(Ac+QB7X3{ z4gK!&R4_(lSgf-MD&6%dT~8a1fvKt0P}x>=Q3r?+)}?wi%n0aABqN*uU}g`Q`AYZ{l6;PU<3w$KX0N+IqBP8B*Nz8ddS?w0-S*iPip;LXOd2s z?n^3ReF=f6;mlnWm-xzJ95(Z`Bk2tWOPP9nxNGO^O{2GBCWe+$@OgVTIA3O-g zXf{Z>As;b{gr`wEy*`9uIH?wB_i7+x2EZox^zI$4RZH=US{Cj$Fso)lx?UYSavbN0 zi|CV}&i>6lky|TpEsZS7C z1o>wXh%EGN!RP_E<}A8f2l>MC#VPPpnE)$%WE~x@epa)Ni0ahMEufv}sV>hnAqk!r3 zvq(OIckF!jnS2?^x?9q$E0O|O_Vjz~DPmxll3JEgX=-{qAe~(SYM+`q17_qT0X9`7 z^ZH=T+ef{h-&7k?qX)u+ST@(vJpr1x7l4*ei0T3c zs^>lhu(Ve_A&)XiQ2l(<3(%gTt5IXD%KXouK{fVkS~;!&HV_{2anUsqzWbHh4U29o zO3%)|)A)g}2e3wGLVzOVIU5CSK=q&yfZC`GNfqQiXKAb&ywXP0_G3i4$nt040Fx`a zA2TVnC}<@)ej5Gk<5Yh!$49WTVh$MlE;?-k)pNHNiroi2<$O1BJV-~7LMXTPBq-8n z=pu;j00CEo0w0u#g)toPvTBn4{C70kY;3fe$lYbUsSVqU_~eo!*6<}zGxS}9iFPDnEl@8cl4`ud27r#1U*7`QWhj0k! 
zOjG15{rN2L?c29q*{0P1U8J)@gYi;Y14#ECkR(iS7bQX~?+7YsF6LQ~Y2Mv7$r`_- zG~&n5)gKu9G2d$LY3F%KB^5kMj!iALt$s>T46xn?JY;U}6@cKi7fJB7V9yjgL@c+6 zii%nY%<@Cp*$-gMiF}r%(t~ZEP| zdIH+Es_jHfApT_ASJ5+8{8DQo!GQU-SC(U8cPGCh!LE_*y)aO3R)V^uhyVg(_ImR@@jrd?c&4cr-4Y&> zJl6m1_~Z&_Jz1g6`9K_iQ2o90xx4+g;9i_0=?33B;@WC$b>BUt0AMxs!ndS&soI7n zGoJv9pTJ5gwvO9GKfA}1U)K) zyFDtyAYKx8$BAd$gjO7ghU|}yLULmbicJKqP)yCyl8o2(@09!$7#6{WLWnOKTA4UG z6DPf3hVk=UsJ&`ew%}>G4#b8Y8^a3 z%b%RbQD)Y$N5zGOBG7;-FF z@ZO*Sn45Kf@ZelYr(zCEI~_aLcGq1_gy(sdN@rB(C@=DYR(!#Zd}w+p|2AF{EGE zPlv`9){?;<0|VY-9}!c%7tHe8+7JW4v*A6*s7(KxH=hDJY%c{2KS)i05%euy9rShs zR#{ zZEIi9Ep2WA6S4SQx}=I3NbCF$z#>jf2wqyX_4$>(@tPp|=g2lOgLgND>b=WN8GKa8 z2){s!1~3_mdHidEU?x_fLhs2>0l7{i*J^8OL=c~%J;zVsvIDE;V^b?oLcopWpr$I; z*Vn%&-YsAIPA<8nstXd-V-h9Ep{1Hl9HiRWwDmtZm`4?wF1qj7;lNm zLZHMH>F|-Dnsghui=<^Nh|1eNL$=02nji>Bz_mdn*PgD6i1fY)WF(UdwS_3Ev>$96 z5t~1HMJ-7XY`8}rXN)qwqX{1X=y7L`xm<01z4+B|9>ZeS*IUcpb47Asp4R;;B$QO5 zrYdop*&`7Y6DX0_mL$O44Nb=Dp;3|e6v$hA3ru+lRQ7?AnoP)jSQK=CDapIW=mOs#V>C46>0pVW zk`EpnH^c+jIt%x436iR(mO>v9V_?`|x93FE1lC*x1SoD$6` zR>y52;7%Wt(lk&;WhWH^yj=^LfB3!@W08`NZbcE<a|S;Wb7uLBM|%%)7L!d3MaA$H=*s4kh5=VZGY2+1uhb-c)P{ z%DfTCXZe7&>>YGGTsNQ{_^{nI6}fcCYS18GryX;G|91HS_UvE$~*kQm{@H&&y{@iOapS`W3xEs6U`a9U{}gVYrr8!j%D>2xC9M)&G1m1 zIar-gxc*@4Q(z8h0Xdb!rG5z^5EEvTn(nB@?twCm?{ zuYL>~!2`<^O9_=l;L33?_Dmd`(=>q=@1o6a4MaiIjc?=|NvG=XC_|2Uwu&JQ*M=vK zybS||dQMKx`?yb=SvwnilEAn!Ncj}H#MS`VV!G)e0^Zl9ZJYsy-6_`7plACET0}k9 zDv4P&fH9=`)yv;HXW9cQ{-E-SBAvxzON0kKWFafn9xor17e)+4(s@v*9M`S*m z_dEH8b)CoMK-cpfcm}CxcywsG^1w+}pM8Ri$Lc_}A~7WgG$riHf8LdU{EmXjDoTyw z7Op|PT{Ld`h6VLA0=QZzXF-m3+SbdETa#4NyFu45FBE312xlCmb@%Zix;F(cN*X%^ ziVXu-MR%yr;qREn5sGO7m!XjxP;>%_WPjs>?1{=61n^tOeuqNW6=LpC?f&U1(9xo# zEE_POde;wYQ2)-ROaP4u9RaN7L~{K}n;%C(7`!{F_wT_4^ZRsh@Mp(P{;~M~=-xqy z2a#CnNA6{Dz@OBR993>qt?s{DyPsy~pZ{U)rS(P3&HV2)oBDrW>yLQQL2CX#-yu0< zp`ImEh03tKIo!&UVw_CA8%uga&`U>8E|_Y*yNC^>zhlq`;7>p-`8PaUS-*c`$pPBq zPsQeU8(D#R>CyjAcoVv3t`S0{Fnb9vNF?OqS149?6IZ7I05yqkglX+VO)`YPgHVdy 
zl-oqk9$p;sdhTQ8+VJSEP-S9;z$4nqM!6q+DTDl_N=rvR-J-4Ko_DygS6iBve79Ix z@W)5||4R-2c&9%kiBKA_FJTO5W=*4T8{DnpK^6uS>a0?T&U%ksoH9sT86G3K9{1Zf z$eqPex$RC$_@A{9g)7Z$g};l=%D=(v1ok8up5M9~o1n2`0hBE5X?QXZ(0I(7iNEtV zJ3OurYq{30IeI4&I`x=6F5$iA9Db`J@B1P&taj~Lr&OHLu8OyTzB&3W-`nH|;q=3b zj#Gpi1CNYHz~<57%K;b)>)ZzJp8ewc{LoI4|c|J(0L{!iTJpKAvy#=Cj& zp&RWjIGhBs_pzv0&VRis5FW+9JOzl=|DT07yz^H{M3XVMb)V^9;Gg^rju8qG$}TgT%qTmA$jZ*18UN4W z9I5M$@Be;pw|jrz9^st#=lywq-sAb&@6YS)dtFxS>}mYd$BrF4D|v&JkJ1MTr=!M)a)!Aj7-f9k5RG;?SGP+EKC!Q?jt=T3Im|nCcqa=vvq^ zSQ^>^MZk4kV?#?=1sTvCQ*(1&N)}Oe1}5MVwT!O5sihsr)`XH(5cn=(X=i8-`~`}E z-*Sq;FE!wYnTdg!eZPjyZAw-FW)?06rpv$uF++Vb5YQU53h0EK(C z_Pb!KuWN3&-$$Shv{QN@8v{d|{UR2iE6}prhBmgQAWK*w@CXJ@=ml00N+to|GW0*I z5Nv$F7fW3WSew^@A=1MdHZ|D4UroZ8o>@Y~R9!?^TvtRnaN?+kVds9QR8&*t%B%weO0)v*Ew{!Q!90>xiUHnddL)W8l{WhhLz z*kRp+mWvyj8k@kM46NdQFD-Q8m0<<8Cb|Y7`~Ca(iVm;J{yf5G(*^_r+B>vphcCk` zSvUw8nnR%mpGVkp0NniN_a4lK%>mN?Oml*)svg{vulGew`246xo8n zHu^Bg963yY4xyxIr)y&j@Q*#X99a)W7|=P`ueJAp-(K~vYcFO9vM>aO20Y2e&|KHf z^!DLa0G0*Az41XkXs7-NHh?-XX#I9yp#1YY1Rgm~*nHg9H3!4GNXf!s4$O?8f$44F zqcQX&yvPUyFj^L-uNdL52198<5PE^$c8@(kIT4GCY@0j}@24~%5L zy>FiZ6d&>ABZ~fXXMm3AS^zi|wA8bO{-G8H+Zr0kfJA^1m|9wap+gnW2i6W)Kaiy@ zAP@edjzd@U>rAl#48{sN!vJV?&CLzXK~O0MJk`q3#uUKHo~-!h+V!s&1no^s?F<#I zboZcb5Abr}#zSOeWa?mO03$Le(E%dVTnJ#sdyB3(ljM1zh#} zd6)(JKC?IiHGhS;+n2q7Nu*<7Mad!ruuZ8yDbfG|02Tg+1d*<}sWH^P(FfEus6-Ql ziV0IdUl)K~urM_+fa;e=O0;ibyFWQc=y?A{(ZdBt3Jgtv+}nTbVKlKGfhKs_&jTio z{~MAACQ*J@@-V{;f*&Y){)h$y*MkIXY;>X40USS8Aizyw3thrL-U_@e3y15_yMfkV z3e|!A3qv&r7YxC_Z(!`VdH6Tfkoc~BHZIsI{JHj70Uji}KM{Xq(0kPZPX^p%`t5sQ zwf9V*->dn@EG5{OkKopSTE+{Qo3j8vzzPbR+FI$_>6`pd(z9@Kz~mzgW$bViALh+) zQ{k{#0Ivt9t-s&M=Gy1YM?mMV67<#pmcEnpP>BzRm7Wdo=^NAjZ@0#w>WR+2AN8BA zGcYjxr)tWt`Y(lQ_Nv1f>YMdLq*&_l!^rQN$9Jl z{FBqqVdB~E^#3U552{m873V;$IMAXFYyr5I!_5gesrUR%hiL~09PO1IrX4tq{XDb- zx03%Z#}r`keRb}C_4WVP-TQF0|M%?!n9Jq|+6Pea`}@Hkpw55Q!4IF2-&Q^VXR`&^ z&Ja)$=zmj3_(oX%LeAP4TA2gZ{$4w}mIgp;fOi|#uAK?A^*z5jVCMiIfLIV<{{TN? 
zfZcCpXdq^2VF9lJ{51mE06{w7gT0BN4fNK7s=#NE5%6m-dRrtxw`VbEPrs=`tGv7Tq8ByE8FyhzI}}`5}IHHYV16 z)&*m52Y&d&?2Y4q=^bWp%s-!Yzzi%LbnSi{=WrbSd6P}Y*jNXyW$Oa_3l2PLKfRs+ zbo{WMebD*;roQl}=NG1T zKN=4>8l?6ax9owB;Yfz_P4nMzmVdE^z{&m{%i(}IlE2Xqjxc}@_F4Tz77(nPzl}UN z$o{nal96H70qt!efx6)iWb9X*{ffiw_u=;obi@1n`?r3F+<(4I`^ii zUUpc4{Rw5_|B(P5u(x1u#eP>N!k6~nZ=wG(vIJ~9+mm>J9mpgD1_DFhQSQH3<}fqC zb=U)%hH>sgLgy%lHJr?UB9ZfB<30j1M@pPMAc;aSmB=~)i0kbRr^&$g4BfqW8{>s+D-y-mT zwP1lu4H)Hf!*>K85-iM2Ebt)DVcLiDh@VHcu)y{=9f5#9ZMz7_$p0Bh@|~{o&zBk8 ztOri;-+~U@Cw>?>KcAfeRVBY$At<|4{6R1J{5YWjlLfyoG+>6;zh7v;X52@qxqq2QKn{BEj|R9^gMM zxMXBljuKqIvCsWGPWaE4RqR|`3~(U)mat-HW?(tOar+a>D_Af8CL`y^y_gCUL%%Pt zU|xlPzr2FY$ZyLlWy`O7Ob+(0eDB8ih0mP&@NEZd*MP?X0^B3{gP6wSlx{X{hX_xe_UnA!m+<=>}ynBU-EPxJ5@ z`E9jB!O#E@qR2 z1L;$MD^P#Np`%!&qWF2$CGO=6*g0Bs}NcAx;4o z+`ct3EI|NQ{WjketmsGZI2Jg#V7TOle?J6E7WkWQhd~Kf@qQu}2UC-N5Gd{UHbfg5 z{120RD#3qI?y|G9{(@Z|X$He1I6pZq|Bw$B&OiS&vlRI%!;cW)`_2Eb*M9!dypape z_uw*{8D4ORIWoijGe0>9f4M>S7plCWr2&viW-o)QzPYZgt*JgBmkwp7`c~C9bTGA3 z1M*ZcF)*>P0TrR&)%U(L0~>4hzkhu<8Mr76okpmDgw_OJmOV(>2rc_nS@*kk9Bg2J z-luYCQQ^AT;WdSq{o!P!znwA%3LE(UBk}_syqN~W-a+OTxV+t;7g$yys3s05KL0`D zqaz;40k>M0b~iha&`zm?M-&IH-;yB{;DD{_H}mMko`B{8aiY2b}G|aL3F7*A@OcHV&jv z-d_j!Guhy_8T^_5T%-f2h64M)$i)BKNdK2@d>+A;t_#nIhf95|Iz`wrd{;vpmiQ0gmncP5s+`F(JYx_P% z`-@$>aPP!_yxrzs;I80=ZVB6yqj09q0uRd`Ql8*i#9@yGTo}M%{P%k-;9CH{(FlHh zwSi^-CRgzv^?R|f{K_#zdk6gh2;D#HM-JFPzJJ6I@Stx_1cKx37v8V~yy|`5w?B^I z@9+H6qoNi9_M4a*gF#@xjko7gWnq#9S<(wa6N&?;bg?iA>Dn6pfeZk@JE-qu-=Cqb ze{_(Al?kTW!btK!VjQB#gEX>-NfPc@{98!ULPS*GT0@6kLC08HQBGY~7{n=-moy0E>rvoZLUp8Q}16^c zLhJ}-?}(z`*RBrrlnH+15AYb+VSna^MnLu=_&2zi+dp;)fC_XlvS16ny~xvEB3~uo z@HtDU%=(of97x+EI=ff&hdTkEi@#=w(3e2=$0rUPb7TlzkG(_re&rpI|DRd>YX<=y*ua4ZluX8=AQ%$Vudc>okzFmM2}r7>{b@*6e>%~uY+ z>#qgzU)#DYXfL_@9;1d~e~+Q>g9ZMc>>gL&hsEBiew{}kECHOKZSDw6gMpzHpr!x? 
zz~{a8_HXz$X%UkxV7lyyH0ar^(AnF6(gEG+>p}@{FZnxQ>p+*v6q*~}5}I88yQcOQ z>Dy+$s{y?TZ2{Wkes2sN0FVN^V}U~GAOI4e-vbix*&2E>=@}W31I;x2kjZHlaG|%5nP=be2gP8c^#9PAMto724WJTfS@TxvXDZt z0DdSW2QXC`J+@MN$&MmfQH-fw5VABYPwJ+wb3RUvt~CG9D%D%_8Wef%7|8?v^L zdHmWLB3E;Po3pbs4l{1SHIb83y!xaFoS+YN3Hnsl>vM{W;}mGbSKTjC$&O5|5WKwP zmQexfubJ($*ZVjFmh|Ia=KuJjo>-*+X`Em`mmIk^;m$UW0)@oA@s4ESy3Q6dp(&h( zkWbtc(inVqP`R+g5|U&oJkn(j&?oO+={}hK#Ck`l!C@id*@Wzu>xi$sD#17EH8wpR3q?zOYudZc(qD(N~WA(EhVfu~G|J5g}>VB}pHOMJm%o(X5Q<1-mT=W;A?CuwFl8>K=@u^av36Wch+a&oX{2TDqX$ z*9k5^ey`H;kywr(9+QFB=Z<3<1!`A2aixXj z2;#*Bcil|6n=73Bkm<|%tf)L|CP9_&xu`-aFk7Ezf9^0TS9)>B!HtmDFIq}PcOh+% zj#9Qjm4)m}dUWP8+F5`xp*SNJ%no=Hxy{rY$q(YXP*FAFzV)}aH9RxndpQ-mwdtv+ zDx8JgP4Pk>T5h~2PO)oBjnIbcuIP)FF!1~ywSxMZsh!r79^R~3H^Lfw+Xs5HT^|}A zuo4V7j8P_uM{;opCEBVY>aiiHW@*(&#yY*Q{$j}+M#?bdtVB3B-gF_vRz-n|^nna= zCY*`&i#;}<`XFw{dvUnI4d0L7PUK@NYYDR*ShRG$_;bfZvFHoy8xQTbeK9*+D~TnV zsKdu)Mq46?g0H2C4!p2toOq{P34$kFPE)W7qM)i1W0MM4UA=;g%+;$N^|Qkx=#&Y8~# zl&$i$W3C#xr!GUT`FF*u2!9GNJ%NA$w5}Iz7Xoknt`lP3V#MbFetv#hos+L+*5-s!t4m2^T0KKZ;C zA4g={*&`w#Xl&46s9e4s@IoX$w8w1}(GSm=fH6sq=*p$cTxj)p%=NW)LptrEbK{IR zY92FAk8I?%b&u*u5XC=G1s(9uM099Nn;Bj~HLSoVN&EV0(T@|aJ4||QUn;kyIWGvJ zJ8|_^0Ne%<1);*SKZrynqpFf|TBMIgU2+Ps!KhuA1}-l?W3)D^7BeCoKc4A~i;|9G zTmSNQ69e1aRRoa%$1^8uQPuY)p$h8Icp3ch@*UTgopp#cxFPZJ^fMPP1?(OYjK37yfe|g<;1Wk$l z%Qc0k;^+E?S~ma|#E*#i(srC7rCH~GJ+Tv_&=ode4+rauJ&A6XstWadT(@Z9%gY}Q zZ8ee5W|oN%+N#yvltN(?_uZW@=<#6u0NU=Xkua7yGjKXeLMkVexnaAD=shl-Zt*y5vvl{!E8RN29Dt?#Pj)cRRT3vsy8;-dk;Sh;gJ zv&;(}+UT|ZWrjh_sN5M<)Hp7@x6{sq7Nrc%@m>Q7Ky&W*Duv}|48&u%5@M)mQ?bRb zh}NgUiMnSGCPW7?S=a9`otAUn5|PW>%s9=GBynLKyZr*RPyD3(OQTgIDNhu95zsUH ziy%D5Unqa2EP!sIl;s1btdl$E<>E{3E(Vr77EUN_Smt88p*oR_P_&BgS!uO8OehZ3NSXHliyIU=5&n4Uy#b85r-NwNAI-< zU3YJf$}AUeChO07Z+42Yi5nqiPY(c5105e`wcK0~{@#>*Bz)*(`T@k& ztW_G&nCxF?bl&bWbp6g^tb<2xn}5j1J0rH@)80m%ik)m_W(;rVt{b$Soho@v-y6_& zMu!RdG<9CM5)`Ay2doC(s=ZBZw*zfQ1|zEACbza^{Ar6R6~7!zfMFY6HxoN(XLqli zY6VR{-o18o)9Fu$Oy`M&+S`z6M&jY3?ZGbbBmy3Tmgs{jeI{cQJcH(_OOka--;v~w 
z2oywkau)P9&^ALo(+f_c#xdb(t`YL*+fd@bTEa!BFz@jjp_0>ej_;CZVW0#jA{xyK z!;v&k0${q)P3@Lcs5g|vdCm2mn#APdM2GM*Tk{>YC}hH!3aS&90Ey#n;!|14tSFyt zb2}$&(Xkn2tu2*A}6uN%Msus8tTn4-hT4nAEUW4*nd*azA za-UUe7jjEVhedSyMr!gSC&TgRAPS6X@-qq|baqHH6g~znEEl7Fy%Is4?O|#cg$_#~{p@cJ3peJrSQe+YNY?|MfC#HC z`#Ivz+iN7mPBSmmxZPs&$T56SZWP@nviO2pU9j;4V=a~HqTy+PA)n6ZeYQ6XP~>v& zQCgL<<_aSe5)}@&%qjuxQa;rYefgF$pXA5$t-095j-!~)ZCPgF%i}UsXU3arA_J#e zi&}1i7`n}X)g_G~WpZ5=be^}gRHBh*QV<0|n^c(=cp#OIkRwQ_Tb5{O#8

@@ -16,6 +17,7 @@ Easy, fast, and cheap LLM serving for everyone --- *Latest News* 🔥 + - [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing). - [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/). - [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). @@ -46,6 +48,7 @@ Easy, fast, and cheap LLM serving for everyone --- + ## About vLLM is a fast and easy-to-use library for LLM inference and serving. @@ -75,6 +78,7 @@ vLLM is flexible and easy to use with: - Multi-LoRA support vLLM seamlessly supports most popular open-source models on HuggingFace, including: + - Transformer-like LLMs (e.g., Llama) - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3) - Embedding Models (e.g., E5-Mistral) @@ -91,6 +95,7 @@ pip install vllm ``` Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more. + - [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html) - [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html) - [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html) @@ -107,6 +112,7 @@ vLLM is a community project. 
Our compute resources for development and testing a Cash Donations: + - a16z - Dropbox - Sequoia Capital @@ -114,6 +120,7 @@ Cash Donations: - ZhenFund Compute Resources: + - AMD - Anyscale - AWS diff --git a/RELEASE.md b/RELEASE.md index 9352e7ef706c6..db0d51afc7be1 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -60,9 +60,10 @@ Please note: **No feature work allowed for cherry picks**. All PRs that are cons Before each release, we perform end-to-end performance validation to ensure no regressions are introduced. This validation uses the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) on PyTorch CI. **Current Coverage:** + * Models: Llama3, Llama4, and Mixtral * Hardware: NVIDIA H100 and AMD MI300x -* *Note: Coverage may change based on new model releases and hardware availability* +* _Note: Coverage may change based on new model releases and hardware availability_ **Performance Validation Process:** @@ -71,11 +72,13 @@ Request write access to the [pytorch/pytorch-integration-testing](https://github **Step 2: Review Benchmark Setup** Familiarize yourself with the benchmark configurations: + * [CUDA setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/cuda) * [ROCm setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/rocm) **Step 3: Run the Benchmark** Navigate to the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) and configure: + * **vLLM branch**: Set to the release branch (e.g., `releases/v0.9.2`) * **vLLM commit**: Set to the RC commit hash diff --git a/benchmarks/README.md b/benchmarks/README.md index 3b10963c3e014..644517235b122 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -4,7 +4,7 @@ This README guides you through running benchmark tests with the extensive datasets supported on vLLM. 
It’s a living document, updated as new features and datasets become available. -**Dataset Overview** +## Dataset Overview @@ -81,9 +81,10 @@ become available. **Note**: HuggingFace dataset's `dataset-name` should be set to `hf` ---- +## 🚀 Example - Online Benchmark +
-🚀 Example - Online Benchmark +Show more
@@ -109,7 +110,7 @@ vllm bench serve \ If successful, you will see the following output -``` +```text ============ Serving Benchmark Result ============ Successful requests: 10 Benchmark duration (s): 5.78 @@ -133,11 +134,11 @@ P99 ITL (ms): 8.39 ================================================== ``` -**Custom Dataset** +### Custom Dataset If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl -``` +```json {"prompt": "What is the capital of India?"} {"prompt": "What is the capital of Iran?"} {"prompt": "What is the capital of China?"} @@ -166,7 +167,7 @@ vllm bench serve --port 9001 --save-result --save-detailed \ You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`. -**VisionArena Benchmark for Vision Language Models** +### VisionArena Benchmark for Vision Language Models ```bash # need a model with vision capability here @@ -184,7 +185,7 @@ vllm bench serve \ --num-prompts 1000 ``` -**InstructCoder Benchmark with Speculative Decoding** +### InstructCoder Benchmark with Speculative Decoding ``` bash VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ @@ -201,13 +202,13 @@ vllm bench serve \ --num-prompts 2048 ``` -**Other HuggingFaceDataset Examples** +### Other HuggingFaceDataset Examples ```bash vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests ``` -**`lmms-lab/LLaVA-OneVision-Data`** +`lmms-lab/LLaVA-OneVision-Data`: ```bash vllm bench serve \ @@ -221,7 +222,7 @@ vllm bench serve \ --num-prompts 10 ``` -**`Aeala/ShareGPT_Vicuna_unfiltered`** +`Aeala/ShareGPT_Vicuna_unfiltered`: ```bash vllm bench serve \ @@ -234,7 +235,7 @@ vllm bench serve \ --num-prompts 10 ``` -**`AI-MO/aimo-validation-aime`** +`AI-MO/aimo-validation-aime`: ``` bash vllm bench serve \ @@ -245,7 +246,7 @@ vllm bench serve \ --seed 42 ``` 
-**`philschmid/mt-bench`** +`philschmid/mt-bench`: ``` bash vllm bench serve \ @@ -255,7 +256,7 @@ vllm bench serve \ --num-prompts 80 ``` -**Running With Sampling Parameters** +### Running With Sampling Parameters When using OpenAI-compatible backends such as `vllm`, optional sampling parameters can be specified. Example client command: @@ -273,25 +274,29 @@ vllm bench serve \ --num-prompts 10 ``` -**Running With Ramp-Up Request Rate** +### Running With Ramp-Up Request Rate The benchmark tool also supports ramping up the request rate over the duration of the benchmark run. This can be useful for stress testing the server or finding the maximum throughput that it can handle, given some latency budget. Two ramp-up strategies are supported: + - `linear`: Increases the request rate linearly from a start value to an end value. - `exponential`: Increases the request rate exponentially. The following arguments can be used to control the ramp-up: + - `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`). - `--ramp-up-start-rps`: The request rate at the beginning of the benchmark. - `--ramp-up-end-rps`: The request rate at the end of the benchmark.
+## 📈 Example - Offline Throughput Benchmark +
-📈 Example - Offline Throughput Benchmark +Show more
@@ -305,15 +310,15 @@ vllm bench throughput \ If successful, you will see the following output -``` +```text Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s Total num prompt tokens: 5014 Total num output tokens: 1500 ``` -**VisionArena Benchmark for Vision Language Models** +### VisionArena Benchmark for Vision Language Models -``` bash +```bash vllm bench throughput \ --model Qwen/Qwen2-VL-7B-Instruct \ --backend vllm-chat \ @@ -325,13 +330,13 @@ vllm bench throughput \ The `num prompt tokens` now includes image token counts -``` +```text Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s Total num prompt tokens: 14527 Total num output tokens: 1280 ``` -**InstructCoder Benchmark with Speculative Decoding** +### InstructCoder Benchmark with Speculative Decoding ``` bash VLLM_WORKER_MULTIPROC_METHOD=spawn \ @@ -349,15 +354,15 @@ vllm bench throughput \ "prompt_lookup_min": 2}' ``` -``` +```text Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s Total num prompt tokens: 261136 Total num output tokens: 204800 ``` -**Other HuggingFaceDataset Examples** +### Other HuggingFaceDataset Examples -**`lmms-lab/LLaVA-OneVision-Data`** +`lmms-lab/LLaVA-OneVision-Data`: ```bash vllm bench throughput \ @@ -370,7 +375,7 @@ vllm bench throughput \ --num-prompts 10 ``` -**`Aeala/ShareGPT_Vicuna_unfiltered`** +`Aeala/ShareGPT_Vicuna_unfiltered`: ```bash vllm bench throughput \ @@ -382,7 +387,7 @@ vllm bench throughput \ --num-prompts 10 ``` -**`AI-MO/aimo-validation-aime`** +`AI-MO/aimo-validation-aime`: ```bash vllm bench throughput \ @@ -394,7 +399,7 @@ vllm bench throughput \ --num-prompts 10 ``` -**Benchmark with LoRA Adapters** +Benchmark with LoRA adapters: ``` bash # download dataset @@ -413,20 +418,22 @@ vllm bench throughput \
+## 🛠️ Example - Structured Output Benchmark +
-🛠️ Example - Structured Output Benchmark +Show more
Benchmark the performance of structured output generation (JSON, grammar, regex). -**Server Setup** +### Server Setup ```bash vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests ``` -**JSON Schema Benchmark** +### JSON Schema Benchmark ```bash python3 benchmarks/benchmark_serving_structured_output.py \ @@ -438,7 +445,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \ --num-prompts 1000 ``` -**Grammar-based Generation Benchmark** +### Grammar-based Generation Benchmark ```bash python3 benchmarks/benchmark_serving_structured_output.py \ @@ -450,7 +457,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \ --num-prompts 1000 ``` -**Regex-based Generation Benchmark** +### Regex-based Generation Benchmark ```bash python3 benchmarks/benchmark_serving_structured_output.py \ @@ -461,7 +468,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \ --num-prompts 1000 ``` -**Choice-based Generation Benchmark** +### Choice-based Generation Benchmark ```bash python3 benchmarks/benchmark_serving_structured_output.py \ @@ -472,7 +479,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \ --num-prompts 1000 ``` -**XGrammar Benchmark Dataset** +### XGrammar Benchmark Dataset ```bash python3 benchmarks/benchmark_serving_structured_output.py \ @@ -485,14 +492,16 @@ python3 benchmarks/benchmark_serving_structured_output.py \
+## 📚 Example - Long Document QA Benchmark +
-📚 Example - Long Document QA Benchmark +Show more
Benchmark the performance of long document question-answering with prefix caching. -**Basic Long Document QA Test** +### Basic Long Document QA Test ```bash python3 benchmarks/benchmark_long_document_qa_throughput.py \ @@ -504,7 +513,7 @@ python3 benchmarks/benchmark_long_document_qa_throughput.py \ --repeat-count 5 ``` -**Different Repeat Modes** +### Different Repeat Modes ```bash # Random mode (default) - shuffle prompts randomly @@ -537,14 +546,16 @@ python3 benchmarks/benchmark_long_document_qa_throughput.py \
+## 🗂️ Example - Prefix Caching Benchmark +
-🗂️ Example - Prefix Caching Benchmark +Show more
Benchmark the efficiency of automatic prefix caching. -**Fixed Prompt with Prefix Caching** +### Fixed Prompt with Prefix Caching ```bash python3 benchmarks/benchmark_prefix_caching.py \ @@ -555,7 +566,7 @@ python3 benchmarks/benchmark_prefix_caching.py \ --input-length-range 128:256 ``` -**ShareGPT Dataset with Prefix Caching** +### ShareGPT Dataset with Prefix Caching ```bash # download dataset @@ -572,14 +583,16 @@ python3 benchmarks/benchmark_prefix_caching.py \
+## ⚡ Example - Request Prioritization Benchmark +
-⚡ Example - Request Prioritization Benchmark +Show more
Benchmark the performance of request prioritization in vLLM. -**Basic Prioritization Test** +### Basic Prioritization Test ```bash python3 benchmarks/benchmark_prioritization.py \ @@ -590,7 +603,7 @@ python3 benchmarks/benchmark_prioritization.py \ --scheduling-policy priority ``` -**Multiple Sequences per Prompt** +### Multiple Sequences per Prompt ```bash python3 benchmarks/benchmark_prioritization.py \ diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md index c479ff1aa29c0..9aad51df6e003 100644 --- a/benchmarks/auto_tune/README.md +++ b/benchmarks/auto_tune/README.md @@ -3,6 +3,7 @@ This script automates the process of finding the optimal server parameter combination (`max-num-seqs` and `max-num-batched-tokens`) to maximize throughput for a vLLM server. It also supports additional constraints such as E2E latency and prefix cache hit rate. ## Table of Contents + - [Prerequisites](#prerequisites) - [Configuration](#configuration) - [How to Run](#how-to-run) @@ -52,7 +53,7 @@ You must set the following variables at the top of the script before execution. 1. **Configure**: Edit the script and set the variables in the [Configuration](#configuration) section. 2. **Execute**: Run the script. Since the process can take a long time, it is highly recommended to use a terminal multiplexer like `tmux` or `screen` to prevent the script from stopping if your connection is lost. -``` +```bash cd bash auto_tune.sh ``` @@ -64,6 +65,7 @@ bash auto_tune.sh Here are a few examples of how to configure the script for different goals: ### 1. Maximize Throughput (No Latency Constraint) + - **Goal**: Find the best `max-num-seqs` and `max-num-batched-tokens` to get the highest possible throughput for 1800 input tokens and 20 output tokens. - **Configuration**: @@ -76,6 +78,7 @@ MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number ``` #### 2. 
Maximize Throughput with a Latency Requirement + - **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms. - **Configuration**: @@ -88,6 +91,7 @@ MAX_LATENCY_ALLOWED_MS=500 ``` #### 3. Maximize Throughput with Prefix Caching and Latency Requirements + - **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms. - **Configuration**: @@ -109,7 +113,7 @@ After the script finishes, you will find the results in a new, timestamped direc - **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found. -``` +```text # Example result.txt content hash:a1b2c3d4... max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 10.0, e2el: 450.5, throughput: 9.8, goodput: 9.8 diff --git a/benchmarks/kernels/deepgemm/README.md b/benchmarks/kernels/deepgemm/README.md index 917e814010f89..41e68e047be82 100644 --- a/benchmarks/kernels/deepgemm/README.md +++ b/benchmarks/kernels/deepgemm/README.md @@ -8,7 +8,7 @@ Currently this just includes dense GEMMs and only works on Hopper GPUs. You need to install vLLM in your usual fashion, then install DeepGEMM from source in its own directory: -``` +```bash git clone --recursive https://github.com/deepseek-ai/DeepGEMM cd DeepGEMM python setup.py install @@ -17,7 +17,7 @@ uv pip install -e . ## Usage -``` +```console python benchmark_fp8_block_dense_gemm.py INFO 02-26 21:55:13 [__init__.py:207] Automatically detected platform cuda. 
===== STARTING FP8 GEMM BENCHMARK ===== diff --git a/csrc/quantization/cutlass_w8a8/Epilogues.md b/csrc/quantization/cutlass_w8a8/Epilogues.md index a30e1fdf3ac77..15a66913e97a3 100644 --- a/csrc/quantization/cutlass_w8a8/Epilogues.md +++ b/csrc/quantization/cutlass_w8a8/Epilogues.md @@ -86,6 +86,7 @@ D = s_a s_b \widehat A \widehat B ``` Epilogue parameters: + - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). @@ -135,7 +136,7 @@ That is precomputed and stored in `azp_with_adj` as a row-vector. Epilogue parameters: - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). - - Generally this will be per-tensor as the zero-points are per-tensor. + - Generally this will be per-tensor as the zero-points are per-tensor. - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). - `azp_with_adj` is the precomputed zero-point term ($` z_a J_a \widehat B `$), is per-channel (row-vector). - `bias` is the bias, is always per-channel (row-vector). @@ -152,7 +153,7 @@ That means the zero-point term $` z_a J_a \widehat B `$ becomes an outer product Epilogue parameters: - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). - - Generally this will be per-token as the zero-points are per-token. + - Generally this will be per-token as the zero-points are per-token. - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). - `azp_adj` is the precomputed zero-point adjustment term ($` \mathbf 1 \widehat B `$), is per-channel (row-vector). - `azp` is the zero-point (`z_a`), is per-token (column-vector). 
diff --git a/docs/cli/README.md b/docs/cli/README.md index dfb6051a8c8a6..b1371c82a4c4d 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -6,13 +6,13 @@ toc_depth: 4 The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with: -``` +```bash vllm --help ``` Available Commands: -``` +```bash vllm {chat,complete,serve,bench,collect-env,run-batch} ``` diff --git a/docs/configuration/tpu.md b/docs/configuration/tpu.md index 005b7f78f4407..0ff0cdda380e9 100644 --- a/docs/configuration/tpu.md +++ b/docs/configuration/tpu.md @@ -40,6 +40,7 @@ Although the first compilation can take some time, for all subsequent server lau Use `VLLM_XLA_CACHE_PATH` environment variable to write to shareable storage for future deployed nodes (like when using autoscaling). #### Reducing compilation time + This initial compilation time ranges significantly and is impacted by many of the arguments discussed in this optimization doc. Factors that influence the length of time to compile are things like model size and `--max-num-batch-tokens`. Other arguments you can tune are things like `VLLM_TPU_MOST_MODEL_LEN`. ### Optimize based on your data @@ -71,12 +72,15 @@ The fewer tokens we pad, the less unnecessary computation TPU does, the better p However, you need to be careful to choose the padding gap. If the gap is too small, it means the number of buckets is large, leading to increased warmup (precompile) time and higher memory to store the compiled graph. Too many compiled graphs may lead to HBM OOM. Conversely, an overly large gap yields no performance improvement compared to the default exponential padding. 
-**If possible, use the precision that matches the chip’s hardware acceleration** +#### Quantization + +If possible, use the precision that matches the chip’s hardware acceleration: - v5e has int4/int8 hardware acceleration in the MXU - v6e has int4/int8 hardware acceleration in the MXU -Supported quantized formats and features in vLLM on TPU [Jul '25] +Supported quantized formats and features in vLLM on TPU [Jul '25]: + - INT8 W8A8 - INT8 W8A16 - FP8 KV cache @@ -84,11 +88,13 @@ Supported quantized formats and features in vLLM on TPU [Jul '25] - [WIP] AWQ - [WIP] FP4 W4A8 -**Don't set TP to be less than the number of chips on a single-host deployment** +#### Parallelization + +Don't set TP to be less than the number of chips on a single-host deployment. Although it’s common to do this with GPUs, don't try to fragment 2 or 8 different workloads across 8 chips on a single host. If you need 1 or 4 chips, just create an instance with 1 or 4 chips (these are partial-host machine types). -### Tune your workloads! +### Tune your workloads Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case. @@ -99,6 +105,7 @@ Although we try to have great default configs, we strongly recommend you check o The auto-tuner provides a profile of optimized configurations as its final step. However, interpreting this profile can be challenging for new users. We plan to expand this section in the future with more detailed guidance. In the meantime, you can learn how to collect a TPU profile using vLLM's native profiling tools [here](../examples/offline_inference/profiling_tpu.md). This profile can provide valuable insights into your workload's performance. #### SPMD + More details to come. **Want us to cover something that isn't listed here? Open up an issue please and cite this doc. 
We'd love to hear your questions or tips.** diff --git a/docs/contributing/ci/failures.md b/docs/contributing/ci/failures.md index 573efb3b05f6e..d7e2dfbca8760 100644 --- a/docs/contributing/ci/failures.md +++ b/docs/contributing/ci/failures.md @@ -20,19 +20,19 @@ the failure? - **Use this title format:** - ``` + ```text [CI Failure]: failing-test-job - regex/matching/failing:test ``` - **For the environment field:** - ``` - Still failing on main as of commit abcdef123 + ```text + Still failing on main as of commit abcdef123 ``` - **In the description, include failing tests:** - ``` + ```text FAILED failing/test.py:failing_test1 - Failure description FAILED failing/test.py:failing_test2 - Failure description https://github.com/orgs/vllm-project/projects/20 diff --git a/docs/contributing/ci/update_pytorch_version.md b/docs/contributing/ci/update_pytorch_version.md index 699d0531ac768..3a6026d450a67 100644 --- a/docs/contributing/ci/update_pytorch_version.md +++ b/docs/contributing/ci/update_pytorch_version.md @@ -106,6 +106,7 @@ releases (which would take too much time), they can be built from source to unblock the update process. ### FlashInfer + Here is how to build and install it from source with `torch2.7.0+cu128` in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271): ```bash @@ -121,6 +122,7 @@ public location for immediate installation, such as [this FlashInfer wheel link] team if you want to get the package published there. 
### xFormers + Similar to FlashInfer, here is how to build and install xFormers from source: ```bash @@ -138,7 +140,7 @@ uv pip install --system \ ### causal-conv1d -``` +```bash uv pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' ``` diff --git a/docs/contributing/deprecation_policy.md b/docs/contributing/deprecation_policy.md index ff69cbae08b23..904ef4ca058c0 100644 --- a/docs/contributing/deprecation_policy.md +++ b/docs/contributing/deprecation_policy.md @@ -31,7 +31,7 @@ Features that fall under this policy include (at a minimum) the following: The deprecation process consists of several clearly defined stages that span multiple Y releases: -**1. Deprecated (Still On By Default)** +### 1. Deprecated (Still On By Default) - **Action**: Feature is marked as deprecated. - **Timeline**: A removal version is explicitly stated in the deprecation @@ -46,7 +46,7 @@ warning (e.g., "This will be removed in v0.10.0"). - GitHub Issue (RFC) for feedback - Documentation and use of the `@typing_extensions.deprecated` decorator for Python APIs -**2.Deprecated (Off By Default)** +### 2. Deprecated (Off By Default) - **Action**: Feature is disabled by default, but can still be re-enabled via a CLI flag or environment variable. Feature throws an error when used without @@ -55,7 +55,7 @@ re-enabling. while signaling imminent removal. Ensures any remaining usage is clearly surfaced and blocks silent breakage before full removal. -**3. Removed** +### 3. Removed - **Action**: Feature is completely removed from the codebase. - **Note**: Only features that have passed through the previous deprecation diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index 13c3bc2c7e031..7c18b464b576c 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -112,13 +112,13 @@ vllm bench serve \ In practice, you should set the `--duration` argument to a large value. 
Whenever you want the server to stop profiling, run: -``` +```bash nsys sessions list ``` to get the session id in the form of `profile-XXXXX`, then run: -``` +```bash nsys stop --session=profile-XXXXX ``` diff --git a/docs/contributing/vulnerability_management.md b/docs/contributing/vulnerability_management.md index e20b10f8f7b32..847883f742974 100644 --- a/docs/contributing/vulnerability_management.md +++ b/docs/contributing/vulnerability_management.md @@ -32,9 +32,9 @@ We prefer to keep all vulnerability-related communication on the security report on GitHub. However, if you need to contact the VMT directly for an urgent issue, you may contact the following individuals: -- Simon Mo - simon.mo@hey.com -- Russell Bryant - rbryant@redhat.com -- Huzaifa Sidhpurwala - huzaifas@redhat.com +- Simon Mo - <simon.mo@hey.com> +- Russell Bryant - <rbryant@redhat.com> +- Huzaifa Sidhpurwala - <huzaifas@redhat.com> ## Slack Discussion diff --git a/docs/deployment/frameworks/anything-llm.md b/docs/deployment/frameworks/anything-llm.md index d6b28a358cc3d..e62a33b2085ca 100644 --- a/docs/deployment/frameworks/anything-llm.md +++ b/docs/deployment/frameworks/anything-llm.md @@ -19,9 +19,9 @@ vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096 - Download and install [Anything LLM desktop](https://anythingllm.com/desktop). 
- On the bottom left of open settings, AI Providers --> LLM: - - LLM Provider: Generic OpenAI - - Base URL: http://{vllm server host}:{vllm server port}/v1 - - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ` + - LLM Provider: Generic OpenAI + - Base URL: http://{vllm server host}:{vllm server port}/v1 + - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ` ![](../../assets/deployment/anything-llm-provider.png) @@ -30,9 +30,9 @@ vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096 ![](../../assets/deployment/anything-llm-chat-without-doc.png) - Click the upload button: - - upload the doc - - select the doc and move to the workspace - - save and embed + - upload the doc + - select the doc and move to the workspace + - save and embed ![](../../assets/deployment/anything-llm-upload-doc.png) diff --git a/docs/deployment/frameworks/chatbox.md b/docs/deployment/frameworks/chatbox.md index 15f92ed1e34df..cbca6e6282fc6 100644 --- a/docs/deployment/frameworks/chatbox.md +++ b/docs/deployment/frameworks/chatbox.md @@ -19,11 +19,11 @@ vllm serve qwen/Qwen1.5-0.5B-Chat - Download and install [Chatbox desktop](https://chatboxai.app/en#download). 
- On the bottom left of settings, Add Custom Provider - - API Mode: `OpenAI API Compatible` - - Name: vllm - - API Host: `http://{vllm server host}:{vllm server port}/v1` - - API Path: `/chat/completions` - - Model: `qwen/Qwen1.5-0.5B-Chat` + - API Mode: `OpenAI API Compatible` + - Name: vllm + - API Host: `http://{vllm server host}:{vllm server port}/v1` + - API Path: `/chat/completions` + - Model: `qwen/Qwen1.5-0.5B-Chat` ![](../../assets/deployment/chatbox-settings.png) diff --git a/docs/deployment/frameworks/dify.md b/docs/deployment/frameworks/dify.md index a3063194fb513..35f02c33cb02b 100644 --- a/docs/deployment/frameworks/dify.md +++ b/docs/deployment/frameworks/dify.md @@ -34,11 +34,11 @@ docker compose up -d - In the top-right user menu (under the profile icon), go to Settings, then click `Model Provider`, and locate the `vLLM` provider to install it. - Fill in the model provider details as follows: - - **Model Type**: `LLM` - - **Model Name**: `Qwen/Qwen1.5-7B-Chat` - - **API Endpoint URL**: `http://{vllm_server_host}:{vllm_server_port}/v1` - - **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat` - - **Completion Mode**: `Completion` + - **Model Type**: `LLM` + - **Model Name**: `Qwen/Qwen1.5-7B-Chat` + - **API Endpoint URL**: `http://{vllm_server_host}:{vllm_server_port}/v1` + - **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat` + - **Completion Mode**: `Completion` ![](../../assets/deployment/dify-settings.png) diff --git a/docs/deployment/frameworks/haystack.md b/docs/deployment/frameworks/haystack.md index a18d68142cabb..70b4b48d4543e 100644 --- a/docs/deployment/frameworks/haystack.md +++ b/docs/deployment/frameworks/haystack.md @@ -1,7 +1,5 @@ # Haystack -# Haystack - [Haystack](https://github.com/deepset-ai/haystack) is an end-to-end LLM framework that allows you to build applications powered by LLMs, Transformer models, vector search and more. 
Whether you want to perform retrieval-augmented generation (RAG), document search, question answering or answer generation, Haystack can orchestrate state-of-the-art embedding models and LLMs into pipelines to build end-to-end NLP applications and solve your use case. It allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. diff --git a/docs/deployment/frameworks/retrieval_augmented_generation.md b/docs/deployment/frameworks/retrieval_augmented_generation.md index 96dd99e7118b6..d5f2ec302b6cd 100644 --- a/docs/deployment/frameworks/retrieval_augmented_generation.md +++ b/docs/deployment/frameworks/retrieval_augmented_generation.md @@ -3,6 +3,7 @@ [Retrieval-augmented generation (RAG)](https://en.wikipedia.org/wiki/Retrieval-augmented_generation) is a technique that enables generative artificial intelligence (Gen AI) models to retrieve and incorporate new information. It modifies interactions with a large language model (LLM) so that the model responds to user queries with reference to a specified set of documents, using this information to supplement information from its pre-existing training data. This allows LLMs to use domain-specific and/or updated information. Use cases include providing chatbot access to internal company data or generating responses based on authoritative sources. Here are the integrations: + - vLLM + [langchain](https://github.com/langchain-ai/langchain) + [milvus](https://github.com/milvus-io/milvus) - vLLM + [llamaindex](https://github.com/run-llama/llama_index) + [milvus](https://github.com/milvus-io/milvus) diff --git a/docs/deployment/integrations/production-stack.md b/docs/deployment/integrations/production-stack.md index 497f9f1a92a5d..fae392589c060 100644 --- a/docs/deployment/integrations/production-stack.md +++ b/docs/deployment/integrations/production-stack.md @@ -140,11 +140,12 @@ The core vLLM production stack configuration is managed with YAML. 
Here is the e ``` In this YAML configuration: + * **`modelSpec`** includes: - * `name`: A nickname that you prefer to call the model. - * `repository`: Docker repository of vLLM. - * `tag`: Docker image tag. - * `modelURL`: The LLM model that you want to use. + * `name`: A nickname that you prefer to call the model. + * `repository`: Docker repository of vLLM. + * `tag`: Docker image tag. + * `modelURL`: The LLM model that you want to use. * **`replicaCount`**: Number of replicas. * **`requestCPU` and `requestMemory`**: Specifies the CPU and memory resource requests for the pod. * **`requestGPU`**: Specifies the number of GPUs required. diff --git a/docs/deployment/k8s.md b/docs/deployment/k8s.md index f244b0858eb6e..cad801a4312cc 100644 --- a/docs/deployment/k8s.md +++ b/docs/deployment/k8s.md @@ -5,7 +5,7 @@ Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine le - [Deployment with CPUs](#deployment-with-cpus) - [Deployment with GPUs](#deployment-with-gpus) - [Troubleshooting](#troubleshooting) - - [Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated"](#startup-probe-or-readiness-probe-failure-container-log-contains-keyboardinterrupt-terminated) + - [Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated"](#startup-probe-or-readiness-probe-failure-container-log-contains-keyboardinterrupt-terminated) - [Conclusion](#conclusion) Alternatively, you can deploy vLLM to Kubernetes using any of the following: diff --git a/docs/design/metrics.md b/docs/design/metrics.md index 52cd320dd4e11..ba34c7dca0017 100644 --- a/docs/design/metrics.md +++ b/docs/design/metrics.md @@ -361,7 +361,7 @@ instances in Prometheus. 
We use this concept for the `vllm:cache_config_info` metric: -``` +```text # HELP vllm:cache_config_info Information of the LLMEngine CacheConfig # TYPE vllm:cache_config_info gauge vllm:cache_config_info{block_size="16",cache_dtype="auto",calculate_kv_scales="False",cpu_offload_gb="0",enable_prefix_caching="False",gpu_memory_utilization="0.9",...} 1.0 @@ -686,7 +686,7 @@ documentation for this option states: The metrics were added and show up in an OpenTelemetry trace as: -``` +```text -> gen_ai.latency.time_in_scheduler: Double(0.017550230026245117) -> gen_ai.latency.time_in_model_forward: Double(3.151565277099609) -> gen_ai.latency.time_in_model_execute: Double(3.6468167304992676) diff --git a/docs/design/p2p_nccl_connector.md b/docs/design/p2p_nccl_connector.md index 082dff15ef2c8..94af8bedd24d2 100644 --- a/docs/design/p2p_nccl_connector.md +++ b/docs/design/p2p_nccl_connector.md @@ -5,6 +5,7 @@ An implementation of xPyD with dynamic scaling based on point-to-point communica ## Detailed Design ### Overall Process + As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow: 1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface. @@ -23,7 +24,7 @@ A simple HTTP service acts as the entry point for client requests and starts a b The Proxy/Router is responsible for selecting 1P1D based on the characteristics of the client request, such as the prompt, and generating a corresponding `request_id`, for example: -``` +```text cmpl-___prefill_addr_10.0.1.2:21001___decode_addr_10.0.1.3:22001_93923d63113b4b338973f24d19d4bf11-0 ``` @@ -70,6 +71,7 @@ pip install "vllm>=0.9.2" ## Run xPyD ### Instructions + - The following examples are run on an A800 (80GB) device, using the Meta-Llama-3.1-8B-Instruct model. - Pay attention to the setting of the `kv_buffer_size` (in bytes). The empirical value is 10% of the GPU memory size. This is related to the kvcache size. 
If it is too small, the GPU memory buffer for temporarily storing the received kvcache will overflow, causing the kvcache to be stored in the tensor memory pool, which increases latency. If it is too large, the kvcache available for inference will be reduced, leading to a smaller batch size and decreased throughput. - For Prefill instances, when using non-GET mode, the `kv_buffer_size` can be set to 1, as Prefill currently does not need to receive kvcache. However, when using GET mode, a larger `kv_buffer_size` is required because it needs to store the kvcache sent to the D instance. diff --git a/docs/design/prefix_caching.md b/docs/design/prefix_caching.md index 2d3c8412894a6..fcc014cf85164 100644 --- a/docs/design/prefix_caching.md +++ b/docs/design/prefix_caching.md @@ -18,10 +18,12 @@ In the example above, the KV cache in the first block can be uniquely identified * Block tokens: A tuple of tokens in this block. The reason to include the exact tokens is to reduce potential hash value collision. * Extra hashes: Other values required to make this block unique, such as LoRA IDs, multi-modality input hashes (see the example below), and cache salts to isolate caches in multi-tenant environments. -> **Note 1:** We only cache full blocks. +!!! note "Note 1" + We only cache full blocks. -> **Note 2:** The above hash key structure is not 100% collision free. Theoretically it’s still possible for the different prefix tokens to have the same hash value. To avoid any hash collisions **in a multi-tenant setup, we advise to use SHA256** as hash function instead of the default builtin hash. -SHA256 is supported since vLLM v0.8.3 and must be enabled with a command line argument. It comes with a performance impact of about 100-200ns per token (~6ms for 50k tokens of context). +!!! note "Note 2" + The above hash key structure is not 100% collision free. Theoretically it’s still possible for the different prefix tokens to have the same hash value. 
To avoid any hash collisions **in a multi-tenant setup, we advise to use SHA256** as hash function instead of the default builtin hash. + SHA256 is supported since vLLM v0.8.3 and must be enabled with a command line argument. It comes with a performance impact of about 100-200ns per token (~6ms for 50k tokens of context). **A hashing example with multi-modality inputs** In this example, we illustrate how prefix caching works with multi-modality inputs (e.g., images). Assuming we have a request with the following messages: @@ -92,7 +94,8 @@ To improve privacy in shared environments, vLLM supports isolating prefix cache With this setup, cache sharing is limited to users or requests that explicitly agree on a common salt, enabling cache reuse within a trust group while isolating others. -> **Note:** Cache isolation is not supported in engine V0. +!!! note + Cache isolation is not supported in engine V0. ## Data Structure diff --git a/docs/design/torch_compile.md b/docs/design/torch_compile.md index ea5d8ac212f7a..2d76e7f3adc5c 100644 --- a/docs/design/torch_compile.md +++ b/docs/design/torch_compile.md @@ -8,7 +8,7 @@ Throughout the example, we will run a common Llama model using v1, and turn on d In the very verbose logs, we can see: -``` +```console INFO 03-07 03:06:55 [backends.py:409] Using cache directory: ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0 for vLLM's torch.compile ``` @@ -75,7 +75,7 @@ Every submodule can be identified by its index, and will be processed individual In the very verbose logs, we can also see: -``` +```console DEBUG 03-07 03:52:37 [backends.py:134] store the 0-th graph for shape None from inductor via handle ('fpegyiq3v3wzjzphd45wkflpabggdbjpylgr7tta4hj6uplstsiw', '~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/inductor_cache/iw/ciwzrk3ittdqatuzwonnajywvno3llvjcs2vfdldzwzozn3zi3iy.py') DEBUG 03-07 03:52:39 [backends.py:134] store the 1-th graph for shape None from inductor via handle 
('f7fmlodmf3h3by5iiu2c4zarwoxbg4eytwr3ujdd2jphl4pospfd', '~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/inductor_cache/ly/clyfzxldfsj7ehaluis2mca2omqka4r7mgcedlf6xfjh645nw6k2.py') ... @@ -93,7 +93,7 @@ One more detail: you can see that the 1-th graph and the 15-th graph have the sa If we already have the cache directory (e.g. run the same code for the second time), we will see the following logs: -``` +```console DEBUG 03-07 04:00:45 [backends.py:86] Directly load the 0-th graph for shape None from inductor via handle ('fpegyiq3v3wzjzphd45wkflpabggdbjpylgr7tta4hj6uplstsiw', '~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/inductor_cache/iw/ciwzrk3ittdqatuzwonnajywvno3llvjcs2vfdldzwzozn3zi3iy.py') ``` diff --git a/docs/features/compatibility_matrix.md b/docs/features/compatibility_matrix.md index 259a447984cb0..930265b8f9840 100644 --- a/docs/features/compatibility_matrix.md +++ b/docs/features/compatibility_matrix.md @@ -36,9 +36,9 @@ th:not(:first-child) { | Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search | |---|---|---|---|---|---|---|---|---|---|---|---|---|---|---| -| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | | -| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | | -| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | | +| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | +| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | +| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | | [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | | CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | [pooling](../models/pooling_models.md) | ✅\* | ✅\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | diff --git a/docs/features/lora.md b/docs/features/lora.md index ea1b495138c1b..a4e05dae11c2e 100644 --- 
a/docs/features/lora.md +++ b/docs/features/lora.md @@ -119,6 +119,7 @@ export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True ``` ### Using API Endpoints + Loading a LoRA Adapter: To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary @@ -156,6 +157,7 @@ curl -X POST http://localhost:8000/v1/unload_lora_adapter \ ``` ### Using Plugins + Alternatively, you can use the LoRAResolver plugin to dynamically load LoRA adapters. LoRAResolver plugins enable you to load LoRA adapters from both local and remote sources such as local file system and S3. On every request, when there's a new model name that hasn't been loaded yet, the LoRAResolver will try to resolve and load the corresponding LoRA adapter. You can set up multiple LoRAResolver plugins if you want to load LoRA adapters from different sources. For example, you might have one resolver for local files and another for S3 storage. vLLM will load the first LoRA adapter that it finds. diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index d4c8852206bba..b8677f11a1d3c 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -588,7 +588,9 @@ Full example: /bin/bash`. + If Ray is running inside containers, run the commands in the remainder of this guide *inside the containers*, not on the host. To open a shell inside a container, connect to a node and use `docker exec -it <container_name> /bin/bash`. Once a Ray cluster is running, use vLLM as you would in a single-node setting. All resources across the Ray cluster are visible to vLLM, so a single `vllm` command on a single node is sufficient. 
diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md index d79b6fc590189..280b3322b11c3 100644 --- a/docs/serving/expert_parallel_deployment.md +++ b/docs/serving/expert_parallel_deployment.md @@ -31,11 +31,12 @@ vLLM provides three communication backends for EP: Enable EP by setting the `--enable-expert-parallel` flag. The EP size is automatically calculated as: -``` +```text EP_SIZE = TP_SIZE × DP_SIZE ``` Where: + - `TP_SIZE`: Tensor parallel size (always 1 for now) - `DP_SIZE`: Data parallel size - `EP_SIZE`: Expert parallel size (computed automatically) diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 4eb2ea2731817..dfed15d4ace97 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -206,6 +206,7 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai We support both [Vision](https://platform.openai.com/docs/guides/vision)- and [Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters; see our [Multimodal Inputs](../features/multimodal_inputs.md) guide for more information. + - *Note: `image_url.detail` parameter is not supported.* Code example: diff --git a/docs/usage/security.md b/docs/usage/security.md index 76140434dcb36..d54e2bb37ec07 100644 --- a/docs/usage/security.md +++ b/docs/usage/security.md @@ -13,15 +13,18 @@ All communications between nodes in a multi-node vLLM deployment are **insecure The following options control inter-node communications in vLLM: #### 1. **Environment Variables:** - - `VLLM_HOST_IP`: Sets the IP address for vLLM processes to communicate on + +- `VLLM_HOST_IP`: Sets the IP address for vLLM processes to communicate on #### 2. 
**KV Cache Transfer Configuration:** - - `--kv-ip`: The IP address for KV cache transfer communications (default: 127.0.0.1) - - `--kv-port`: The port for KV cache transfer communications (default: 14579) + +- `--kv-ip`: The IP address for KV cache transfer communications (default: 127.0.0.1) +- `--kv-port`: The port for KV cache transfer communications (default: 14579) #### 3. **Data Parallel Configuration:** - - `data_parallel_master_ip`: IP of the data parallel master (default: 127.0.0.1) - - `data_parallel_master_port`: Port of the data parallel master (default: 29500) + +- `data_parallel_master_ip`: IP of the data parallel master (default: 127.0.0.1) +- `data_parallel_master_port`: Port of the data parallel master (default: 29500) ### Notes on PyTorch Distributed @@ -41,18 +44,21 @@ Key points from the PyTorch security guide: ### Security Recommendations #### 1. **Network Isolation:** - - Deploy vLLM nodes on a dedicated, isolated network - - Use network segmentation to prevent unauthorized access - - Implement appropriate firewall rules + +- Deploy vLLM nodes on a dedicated, isolated network +- Use network segmentation to prevent unauthorized access +- Implement appropriate firewall rules #### 2. **Configuration Best Practices:** - - Always set `VLLM_HOST_IP` to a specific IP address rather than using defaults - - Configure firewalls to only allow necessary ports between nodes + +- Always set `VLLM_HOST_IP` to a specific IP address rather than using defaults +- Configure firewalls to only allow necessary ports between nodes #### 3. 
**Access Control:** - - Restrict physical and network access to the deployment environment - - Implement proper authentication and authorization for management interfaces - - Follow the principle of least privilege for all system components + +- Restrict physical and network access to the deployment environment +- Implement proper authentication and authorization for management interfaces +- Follow the principle of least privilege for all system components ## Security and Firewalls: Protecting Exposed vLLM Systems diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 498ff3da0ca31..38399c6633bdb 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -148,7 +148,7 @@ are not yet supported. vLLM V1 supports logprobs and prompt logprobs. However, there are some important semantic differences compared to V0: -**Logprobs Calculation** +##### Logprobs Calculation Logprobs in V1 are now returned immediately once computed from the model’s raw output (i.e. before applying any logits post-processing such as temperature scaling or penalty @@ -157,7 +157,7 @@ probabilities used during sampling. Support for logprobs with post-sampling adjustments is in progress and will be added in future updates. -**Prompt Logprobs with Prefix Caching** +##### Prompt Logprobs with Prefix Caching Currently prompt logprobs are only supported when prefix caching is turned off via `--no-enable-prefix-caching`. In a future release, prompt logprobs will be compatible with prefix caching, but a recomputation will be triggered to recover the full prompt logprobs even upon a prefix cache hit. See details in [RFC #13414](gh-issue:13414). @@ -165,7 +165,7 @@ Currently prompt logprobs are only supported when prefix caching is turned off v As part of the major architectural rework in vLLM V1, several legacy features have been deprecated. -**Sampling features** +##### Sampling features - **best_of**: This feature has been deprecated due to limited usage. 
See details at [RFC #13361](gh-issue:13361). - **Per-Request Logits Processors**: In V0, users could pass custom @@ -173,11 +173,11 @@ As part of the major architectural rework in vLLM V1, several legacy features ha feature has been deprecated. Instead, the design is moving toward supporting **global logits processors**, a feature the team is actively working on for future releases. See details at [RFC #13360](gh-pr:13360). -**KV Cache features** +##### KV Cache features - **GPU <> CPU KV Cache Swapping**: with the new simplified core architecture, vLLM V1 no longer requires KV cache swapping to handle request preemptions. -**Structured Output features** +##### Structured Output features - **Request-level Structured Output Backend**: Deprecated, alternative backends (outlines, guidance) with fallbacks are supported now. diff --git a/examples/offline_inference/disaggregated-prefill-v1/README.md b/examples/offline_inference/disaggregated-prefill-v1/README.md index 9cbdb19820f56..abf6883f8d3ef 100644 --- a/examples/offline_inference/disaggregated-prefill-v1/README.md +++ b/examples/offline_inference/disaggregated-prefill-v1/README.md @@ -5,6 +5,6 @@ This example contains scripts that demonstrate disaggregated prefill in the offl ## Files - `run.sh` - A helper script that will run `prefill_example.py` and `decode_example.py` sequentially. - - Make sure you are in the `examples/offline_inference/disaggregated-prefill-v1` directory before running `run.sh`. + - Make sure you are in the `examples/offline_inference/disaggregated-prefill-v1` directory before running `run.sh`. - `prefill_example.py` - A script which performs prefill only, saving the KV state to the `local_storage` directory and the prompts to `output.txt`. - `decode_example.py` - A script which performs decode only, loading the KV state from the `local_storage` directory and the prompts from `output.txt`. 
diff --git a/examples/offline_inference/openai_batch/README.md b/examples/offline_inference/openai_batch/README.md index 631fde91fcd08..3c6f6c7a6c588 100644 --- a/examples/offline_inference/openai_batch/README.md +++ b/examples/offline_inference/openai_batch/README.md @@ -19,9 +19,9 @@ We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` e ## Pre-requisites * The examples in this document use `meta-llama/Meta-Llama-3-8B-Instruct`. - - Create a [user access token](https://huggingface.co/docs/hub/en/security-tokens) - - Install the token on your machine (Run `huggingface-cli login`). - - Get access to the gated model by [visiting the model card](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and agreeing to the terms and conditions. + * Create a [user access token](https://huggingface.co/docs/hub/en/security-tokens) + * Install the token on your machine (Run `huggingface-cli login`). + * Get access to the gated model by [visiting the model card](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and agreeing to the terms and conditions. ## Example 1: Running with a local file @@ -105,7 +105,7 @@ To integrate with cloud blob storage, we recommend using presigned urls. * [Create an S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html). * The `awscli` package (Run `pip install awscli`) to configure your credentials and interactively use s3. - - [Configure your credentials](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-quickstart.html). + * [Configure your credentials](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-quickstart.html). * The `boto3` python package (Run `pip install boto3`) to generate presigned urls. 
### Step 1: Upload your input script diff --git a/examples/others/lmcache/README.md b/examples/others/lmcache/README.md index 95a6bf995b2fd..759be55d6f1c5 100644 --- a/examples/others/lmcache/README.md +++ b/examples/others/lmcache/README.md @@ -28,16 +28,20 @@ to run disaggregated prefill and benchmark the performance. ### Components #### Server Scripts + - `disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh` - Launches individual vLLM servers for prefill/decode, and also launches the proxy server. - `disagg_prefill_lmcache_v1/disagg_proxy_server.py` - FastAPI proxy server that coordinates between prefiller and decoder - `disagg_prefill_lmcache_v1/disagg_example_nixl.sh` - Main script to run the example #### Configuration + - `disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml` - Configuration for prefiller server - `disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml` - Configuration for decoder server #### Log Files + The main script generates several log files: + - `prefiller.log` - Logs from the prefill server - `decoder.log` - Logs from the decode server - `proxy.log` - Logs from the proxy server diff --git a/examples/others/logging_configuration.md b/examples/others/logging_configuration.md index 916ab5fd1c871..7c8bdd199a72d 100644 --- a/examples/others/logging_configuration.md +++ b/examples/others/logging_configuration.md @@ -8,11 +8,11 @@ of logging configurations that range from simple-and-inflexible to more-complex-and-more-flexible. 
- No vLLM logging (simple and inflexible) - - Set `VLLM_CONFIGURE_LOGGING=0` (leaving `VLLM_LOGGING_CONFIG_PATH` unset) + - Set `VLLM_CONFIGURE_LOGGING=0` (leaving `VLLM_LOGGING_CONFIG_PATH` unset) - vLLM's default logging configuration (simple and inflexible) - - Leave `VLLM_CONFIGURE_LOGGING` unset or set `VLLM_CONFIGURE_LOGGING=1` + - Leave `VLLM_CONFIGURE_LOGGING` unset or set `VLLM_CONFIGURE_LOGGING=1` - Fine-grained custom logging configuration (more complex, more flexible) - - Leave `VLLM_CONFIGURE_LOGGING` unset or set `VLLM_CONFIGURE_LOGGING=1` and + - Leave `VLLM_CONFIGURE_LOGGING` unset or set `VLLM_CONFIGURE_LOGGING=1` and set `VLLM_LOGGING_CONFIG_PATH=` ## Logging Configuration Environment Variables diff --git a/pyproject.toml b/pyproject.toml index a65267942d47e..dfad5d2cdf319 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -156,16 +156,6 @@ markers = [ "optional: optional tests that are automatically skipped, include --optional to run them", ] -[tool.pymarkdown] -plugins.md004.style = "sublist" # ul-style -plugins.md007.indent = 4 # ul-indent -plugins.md007.start_indented = true # ul-indent -plugins.md013.enabled = false # line-length -plugins.md041.enabled = false # first-line-h1 -plugins.md033.enabled = false # inline-html -plugins.md046.enabled = false # code-block-style -plugins.md024.allow_different_nesting = true # no-duplicate-headers - [tool.ty.src] root = "./vllm" respect-ignore-files = true diff --git a/tools/ep_kernels/README.md b/tools/ep_kernels/README.md index f1479146f053c..273e0f378e343 100644 --- a/tools/ep_kernels/README.md +++ b/tools/ep_kernels/README.md @@ -1,6 +1,9 @@ +# Expert parallel kernels + Large-scale cluster-level expert parallel, as described in the [DeepSeek-V3 Technical Report](http://arxiv.org/abs/2412.19437), is an efficient way to deploy sparse MoE models with many experts. 
However, such deployment requires many components beyond a normal Python package, including system package support and system driver support. It is impossible to bundle all these components into a Python package. Here we break down the requirements in 2 steps: + 1. Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this. 2. Configure NVIDIA driver to enable IBGDA. This step requires root access, and must be done on the host machine. @@ -8,15 +11,15 @@ Here we break down the requirements in 2 steps: All scripts accept a positional argument as workspace path for staging the build, defaulting to `$(pwd)/ep_kernels_workspace`. -# Usage +## Usage -## Single-node +### Single-node ```bash bash install_python_libraries.sh ``` -## Multi-node +### Multi-node ```bash bash install_python_libraries.sh diff --git a/vllm/plugins/lora_resolvers/README.md b/vllm/plugins/lora_resolvers/README.md index 7e7c55f5c69c7..48f27dddea07e 100644 --- a/vllm/plugins/lora_resolvers/README.md +++ b/vllm/plugins/lora_resolvers/README.md @@ -6,7 +6,8 @@ via the LoRAResolver plugin framework. Note that `VLLM_ALLOW_RUNTIME_LORA_UPDATING` must be set to true to allow LoRA resolver plugins to work, and `VLLM_PLUGINS` must be set to include the desired resolver plugins. -# lora_filesystem_resolver +## lora_filesystem_resolver + This LoRA Resolver is installed with vLLM by default. To use, set `VLLM_PLUGIN_LORA_CACHE_DIR` to a local directory. 
When vLLM receives a request for a LoRA adapter `foobar` it doesn't currently recognize, it will look in that local directory From 76080cff79b5b56e3d8b6a2fb9b9c5b4c4633c67 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 29 Jul 2025 19:45:18 -0700 Subject: [PATCH 032/224] [DOC] Fix path of v1 related figures (#21868) Signed-off-by: Chen Zhang Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../design/{v1 => }/metrics/intervals-1.png | Bin .../design/{v1 => }/metrics/intervals-2.png | Bin .../design/{v1 => }/metrics/intervals-3.png | Bin .../{v1 => }/prefix_caching/example-time-1.png | Bin .../{v1 => }/prefix_caching/example-time-3.png | Bin .../{v1 => }/prefix_caching/example-time-4.png | Bin .../{v1 => }/prefix_caching/example-time-5.png | Bin .../{v1 => }/prefix_caching/example-time-6.png | Bin .../{v1 => }/prefix_caching/example-time-7.png | Bin .../design/{v1 => }/prefix_caching/free.png | Bin .../design/{v1 => }/prefix_caching/overview.png | Bin .../design/{v1 => }/tpu/most_model_len.png | Bin docs/configuration/tpu.md | 2 +- docs/design/metrics.md | 6 +++--- docs/design/prefix_caching.md | 16 ++++++++-------- 15 files changed, 12 insertions(+), 12 deletions(-) rename docs/assets/design/{v1 => }/metrics/intervals-1.png (100%) rename docs/assets/design/{v1 => }/metrics/intervals-2.png (100%) rename docs/assets/design/{v1 => }/metrics/intervals-3.png (100%) rename docs/assets/design/{v1 => }/prefix_caching/example-time-1.png (100%) rename docs/assets/design/{v1 => }/prefix_caching/example-time-3.png (100%) rename docs/assets/design/{v1 => }/prefix_caching/example-time-4.png (100%) rename docs/assets/design/{v1 => }/prefix_caching/example-time-5.png (100%) rename docs/assets/design/{v1 => }/prefix_caching/example-time-6.png (100%) rename docs/assets/design/{v1 => }/prefix_caching/example-time-7.png (100%) rename docs/assets/design/{v1 => }/prefix_caching/free.png (100%) rename docs/assets/design/{v1 => 
}/prefix_caching/overview.png (100%) rename docs/assets/design/{v1 => }/tpu/most_model_len.png (100%) diff --git a/docs/assets/design/v1/metrics/intervals-1.png b/docs/assets/design/metrics/intervals-1.png similarity index 100% rename from docs/assets/design/v1/metrics/intervals-1.png rename to docs/assets/design/metrics/intervals-1.png diff --git a/docs/assets/design/v1/metrics/intervals-2.png b/docs/assets/design/metrics/intervals-2.png similarity index 100% rename from docs/assets/design/v1/metrics/intervals-2.png rename to docs/assets/design/metrics/intervals-2.png diff --git a/docs/assets/design/v1/metrics/intervals-3.png b/docs/assets/design/metrics/intervals-3.png similarity index 100% rename from docs/assets/design/v1/metrics/intervals-3.png rename to docs/assets/design/metrics/intervals-3.png diff --git a/docs/assets/design/v1/prefix_caching/example-time-1.png b/docs/assets/design/prefix_caching/example-time-1.png similarity index 100% rename from docs/assets/design/v1/prefix_caching/example-time-1.png rename to docs/assets/design/prefix_caching/example-time-1.png diff --git a/docs/assets/design/v1/prefix_caching/example-time-3.png b/docs/assets/design/prefix_caching/example-time-3.png similarity index 100% rename from docs/assets/design/v1/prefix_caching/example-time-3.png rename to docs/assets/design/prefix_caching/example-time-3.png diff --git a/docs/assets/design/v1/prefix_caching/example-time-4.png b/docs/assets/design/prefix_caching/example-time-4.png similarity index 100% rename from docs/assets/design/v1/prefix_caching/example-time-4.png rename to docs/assets/design/prefix_caching/example-time-4.png diff --git a/docs/assets/design/v1/prefix_caching/example-time-5.png b/docs/assets/design/prefix_caching/example-time-5.png similarity index 100% rename from docs/assets/design/v1/prefix_caching/example-time-5.png rename to docs/assets/design/prefix_caching/example-time-5.png diff --git a/docs/assets/design/v1/prefix_caching/example-time-6.png 
b/docs/assets/design/prefix_caching/example-time-6.png similarity index 100% rename from docs/assets/design/v1/prefix_caching/example-time-6.png rename to docs/assets/design/prefix_caching/example-time-6.png diff --git a/docs/assets/design/v1/prefix_caching/example-time-7.png b/docs/assets/design/prefix_caching/example-time-7.png similarity index 100% rename from docs/assets/design/v1/prefix_caching/example-time-7.png rename to docs/assets/design/prefix_caching/example-time-7.png diff --git a/docs/assets/design/v1/prefix_caching/free.png b/docs/assets/design/prefix_caching/free.png similarity index 100% rename from docs/assets/design/v1/prefix_caching/free.png rename to docs/assets/design/prefix_caching/free.png diff --git a/docs/assets/design/v1/prefix_caching/overview.png b/docs/assets/design/prefix_caching/overview.png similarity index 100% rename from docs/assets/design/v1/prefix_caching/overview.png rename to docs/assets/design/prefix_caching/overview.png diff --git a/docs/assets/design/v1/tpu/most_model_len.png b/docs/assets/design/tpu/most_model_len.png similarity index 100% rename from docs/assets/design/v1/tpu/most_model_len.png rename to docs/assets/design/tpu/most_model_len.png diff --git a/docs/configuration/tpu.md b/docs/configuration/tpu.md index 0ff0cdda380e9..a2941c80bd27c 100644 --- a/docs/configuration/tpu.md +++ b/docs/configuration/tpu.md @@ -47,7 +47,7 @@ This initial compilation time ranges significantly and is impacted by many of th #### max model len vs. most model len -![most_model_len](../assets/design/v1/tpu/most_model_len.png) +![most_model_len](../assets/design/tpu/most_model_len.png) If most of your requests are shorter than the maximum model length but you still need to accommodate occasional longer requests, setting a high maximum model length can negatively impact performance. In these cases, you can try introducing most model len by specifying the `VLLM_TPU_MOST_MODEL_LEN` environment variable. 
diff --git a/docs/design/metrics.md b/docs/design/metrics.md index ba34c7dca0017..1f65331d3c0a9 100644 --- a/docs/design/metrics.md +++ b/docs/design/metrics.md @@ -223,7 +223,7 @@ And the calculated intervals are: Put another way: -![Interval calculations - common case](../../assets/design/v1/metrics/intervals-1.png) +![Interval calculations - common case](../assets/design/metrics/intervals-1.png) We explored the possibility of having the frontend calculate these intervals using the timing of events visible by the frontend. However, @@ -238,13 +238,13 @@ When a preemption occurs during decode, since any already generated tokens are reused, we consider the preemption as affecting the inter-token, decode, and inference intervals. -![Interval calculations - preempted decode](../../assets/design/v1/metrics/intervals-2.png) +![Interval calculations - preempted decode](../assets/design/metrics/intervals-2.png) When a preemption occurs during prefill (assuming such an event is possible), we consider the preemption as affecting the time-to-first-token and prefill intervals. -![Interval calculations - preempted prefill](../../assets/design/v1/metrics/intervals-3.png) +![Interval calculations - preempted prefill](../assets/design/metrics/intervals-3.png) ### Frontend Stats Collection diff --git a/docs/design/prefix_caching.md b/docs/design/prefix_caching.md index fcc014cf85164..9941837bf1652 100644 --- a/docs/design/prefix_caching.md +++ b/docs/design/prefix_caching.md @@ -125,7 +125,7 @@ There are two design points to highlight: As a result, we will have the following components when the KV cache manager is initialized: -![Component Overview](../../assets/design/v1/prefix_caching/overview.png) +![Component Overview](../assets/design/prefix_caching/overview.png) * Block Pool: A list of KVCacheBlock. * Free Block Queue: Only store the pointers of head and tail blocks for manipulations. @@ -195,7 +195,7 @@ As can be seen, block 3 is a new full block and is cached. 
However, it is redund When a request is finished, we free all its blocks if no other requests are using them (reference count = 0). In this example, we free request 1 and block 2, 3, 4, 8 associated with it. We can see that the freed blocks are added to the tail of the free queue in the *reverse* order. This is because the last block of a request must hash more tokens and is less likely to be reused by other requests. As a result, it should be evicted first. -![Free queue after a request is freed](../../assets/design/v1/prefix_caching/free.png) +![Free queue after a request is freed](../assets/design/prefix_caching/free.png) ### Eviction (LRU) @@ -211,24 +211,24 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens), **Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 3 of 4 tokens. -![Example Time 1](../../assets/design/v1/prefix_caching/example-time-1.png) +![Example Time 1](../assets/design/prefix_caching/example-time-1.png) **Time 3: Request 0 makes the block 3 full and asks for a new block to keep decoding.** We cache block 3 and allocate block 4. -![Example Time 3](../../assets/design/v1/prefix_caching/example-time-3.png) +![Example Time 3](../assets/design/prefix_caching/example-time-3.png) **Time 4: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens. -![Example Time 4](../../assets/design/v1/prefix_caching/example-time-4.png) +![Example Time 4](../assets/design/prefix_caching/example-time-4.png) **Time 5: Request 0 is finished and free.** Blocks 2, 3 and 4 are added to the free queue in the reverse order (but block 2 and 3 are still cached). Block 0 and 1 are not added to the free queue because they are being used by Request 1. 
-![Example Time 5](../../assets/design/v1/prefix_caching/example-time-5.png) +![Example Time 5](../assets/design/prefix_caching/example-time-5.png) **Time 6: Request 1 is finished and free.** -![Example Time 6](../../assets/design/v1/prefix_caching/example-time-6.png) +![Example Time 6](../assets/design/prefix_caching/example-time-6.png) **Time 7: Request 2 comes in with the 29 prompt tokens, where the first 12 tokens are the same as request 0\.** Note that even though the block order in the free queue was `7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0`, the cache hit blocks (i.e., 0, 1, 2) are touched and removed from the queue before allocation, so the free queue becomes `7 - 8 - 9 - 4 - 3 - 6 - 5`. As a result, the allocated blocks are 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, 3 (evicted). -![Example Time 7](../../assets/design/v1/prefix_caching/example-time-7.png) +![Example Time 7](../assets/design/prefix_caching/example-time-7.png) From fb58e3a651f7321eb882ff28018a918b31726c82 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 29 Jul 2025 22:45:41 -0400 Subject: [PATCH 033/224] [Docs] Update docker.md with HF_TOKEN, new model, and podman fix (#21856) --- docs/deployment/docker.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md index e500751896b34..5f6cfcb00a37a 100644 --- a/docs/deployment/docker.md +++ b/docs/deployment/docker.md @@ -10,23 +10,23 @@ The image can be used to run OpenAI compatible server and is available on Docker ```bash docker run --runtime nvidia --gpus all \ -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=" \ + --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ -p 8000:8000 \ --ipc=host \ vllm/vllm-openai:latest \ - --model mistralai/Mistral-7B-v0.1 + --model Qwen/Qwen3-0.6B ``` This image can also be used with other container engines such as [Podman](https://podman.io/). 
```bash -podman run --gpus all \ +podman run --device nvidia.com/gpu=all \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ -p 8000:8000 \ --ipc=host \ - vllm/vllm-openai:latest \ - --model mistralai/Mistral-7B-v0.1 + docker.io/vllm/vllm-openai:latest \ + --model Qwen/Qwen3-0.6B ``` You can add any other [engine-args](../configuration/engine_args.md) you need after the image tag (`vllm/vllm-openai:latest`). From b917da442b820245f537602d752e7146e66dd37a Mon Sep 17 00:00:00 2001 From: Csrayz <33659823+Csrayz@users.noreply.github.com> Date: Wed, 30 Jul 2025 10:46:31 +0800 Subject: [PATCH 034/224] Expose PyTorch profiler configuration to environment variables (#21803) Signed-off-by: Csrayz <33659823+Csrayz@users.noreply.github.com> --- docs/contributing/profiling.md | 7 ++++++- vllm/envs.py | 29 +++++++++++++++++++++++++++++ vllm/v1/worker/gpu_worker.py | 15 +++++++++++++-- vllm/v1/worker/xpu_worker.py | 13 ++++++++++++- 4 files changed, 60 insertions(+), 4 deletions(-) diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index 7c18b464b576c..74627e9062167 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -5,7 +5,12 @@ ## Profile with PyTorch Profiler -We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/` +We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/`. 
Additionally, you can control the profiling content by specifying the following environment variables: + +- `VLLM_TORCH_PROFILER_RECORD_SHAPES=1` to enable recording Tensor Shapes, off by default +- `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1` to record memory, off by default +- `VLLM_TORCH_PROFILER_WITH_STACK=1` to enable recording stack information, on by default +- `VLLM_TORCH_PROFILER_WITH_FLOPS=1` to enable recording FLOPs, off by default The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set. diff --git a/vllm/envs.py b/vllm/envs.py index 9b6d8c8be242a..50cb3b7d1b7aa 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -80,6 +80,10 @@ if TYPE_CHECKING: VLLM_PLUGINS: Optional[list[str]] = None VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None VLLM_TORCH_PROFILER_DIR: Optional[str] = None + VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False + VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False + VLLM_TORCH_PROFILER_WITH_STACK: bool = True + VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False VLLM_USE_TRITON_AWQ: bool = False VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_SKIP_P2P_CHECK: bool = False @@ -629,6 +633,31 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os .path.expanduser(os.getenv("VLLM_TORCH_PROFILER_DIR", "."))), + # Enable torch profiler to record shapes if set + # VLLM_TORCH_PROFILER_RECORD_SHAPES=1. If not set, torch profiler will + # not record shapes. + "VLLM_TORCH_PROFILER_RECORD_SHAPES": + lambda: bool(os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0") != "0"), + + # Enable torch profiler to profile memory if set + # VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1. If not set, torch profiler + # will not profile memory. 
+ "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": + lambda: bool( + os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0") != "0"), + + # Enable torch profiler to profile stack if set + # VLLM_TORCH_PROFILER_WITH_STACK=1. If not set, torch profiler WILL + # profile stack by default. + "VLLM_TORCH_PROFILER_WITH_STACK": + lambda: bool(os.getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0"), + + # Enable torch profiler to profile flops if set + # VLLM_TORCH_PROFILER_WITH_FLOPS=1. If not set, torch profiler will + # not profile flops. + "VLLM_TORCH_PROFILER_WITH_FLOPS": + lambda: bool(os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0"), + # If set, vLLM will use Triton implementations of AWQ. "VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))), diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index d9d1f14f0554c..0f46ed223ab88 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -71,12 +71,23 @@ class Worker(WorkerBase): torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR logger.info("Profiling enabled. 
Traces will be saved to: %s", torch_profiler_trace_dir) + logger.debug( + "Profiler config: record_shapes=%s," + "profile_memory=%s,with_stack=%s,with_flops=%s", + envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, + envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, + envs.VLLM_TORCH_PROFILER_WITH_STACK, + envs.VLLM_TORCH_PROFILER_WITH_FLOPS, + ) self.profiler = torch.profiler.profile( activities=[ torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA, ], - with_stack=True, + record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, + profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, + with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, + with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, on_trace_ready=torch.profiler.tensorboard_trace_handler( torch_profiler_trace_dir, use_gzip=True)) else: @@ -209,7 +220,7 @@ class Worker(WorkerBase): @torch.inference_mode() def determine_available_memory(self) -> int: - """Profiles the peak memory usage of the model to determine how much + """Profiles the peak memory usage of the model to determine how much memory can be used for KV cache without OOMs. The engine will first conduct a profiling of the existing memory usage. diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index c7885694f7a38..2a7e0625b2f87 100644 --- a/vllm/v1/worker/xpu_worker.py +++ b/vllm/v1/worker/xpu_worker.py @@ -41,12 +41,23 @@ class XPUWorker(Worker): torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR logger.info("Profiling enabled. 
Traces will be saved to: %s", torch_profiler_trace_dir) + logger.debug( + "Profiler config: record_shapes=%s," + "profile_memory=%s,with_stack=%s,with_flops=%s", + envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, + envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, + envs.VLLM_TORCH_PROFILER_WITH_STACK, + envs.VLLM_TORCH_PROFILER_WITH_FLOPS, + ) self.profiler = torch.profiler.profile( activities=[ torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.XPU, ], - with_stack=True, + record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, + profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, + with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, + with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, on_trace_ready=torch.profiler.tensorboard_trace_handler( torch_profiler_trace_dir, use_gzip=True)) else: From fdde18229ef32d5872596ce9b004dabf310edbde Mon Sep 17 00:00:00 2001 From: Areeb Syed Date: Wed, 30 Jul 2025 09:05:21 +0530 Subject: [PATCH 035/224] [Bugfix] Fix shape mismatch assertion error when loading Gemma3n model with BitsAndBytes quantization (#21808) Signed-off-by: sydarb --- vllm/model_executor/models/gemma3n.py | 31 +++++++++++++++++++++------ 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index 168665cc29655..d0880103d4e86 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -167,22 +167,33 @@ class Gemma3nAltUp(nn.Module): class Gemma3nLaurelBlock(nn.Module): """Learned Augmented Residual Layer""" - def __init__(self, hidden_size: int, laurel_rank: int, rms_norm_eps: float, - prefix: str): + def __init__( + self, + hidden_size: int, + laurel_rank: int, + rms_norm_eps: float, + *, + quant_config: Optional[QuantizationConfig] = None, + prefix: str, + ) -> None: super().__init__() self.linear_left = ColumnParallelLinear( hidden_size, laurel_rank, bias=False, + quant_config=quant_config, prefix=f"{prefix}.linear_left", 
return_bias=False, ) - self.linear_right = RowParallelLinear(laurel_rank, - hidden_size, - bias=False, - prefix=f"{prefix}.linear_right", - return_bias=False) + self.linear_right = RowParallelLinear( + laurel_rank, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.linear_right", + return_bias=False, + ) self.post_laurel_norm = RMSNorm( hidden_size=hidden_size, eps=rms_norm_eps, @@ -417,6 +428,7 @@ class Gemma3nDecoderLayer(nn.Module): hidden_size=config.hidden_size, laurel_rank=config.laurel_rank, rms_norm_eps=config.rms_norm_eps, + quant_config=quant_config, prefix=f"{prefix}.laurel", ) @@ -427,6 +439,7 @@ class Gemma3nDecoderLayer(nn.Module): config.hidden_size, config.hidden_size_per_layer_input, bias=False, + quant_config=quant_config, prefix=f"{prefix}.per_layer_input_gate", return_bias=False, ) @@ -434,6 +447,7 @@ class Gemma3nDecoderLayer(nn.Module): config.hidden_size_per_layer_input, config.hidden_size, bias=False, + quant_config=quant_config, prefix=f"{prefix}.per_layer_projection", return_bias=False, ) @@ -547,6 +561,7 @@ class Gemma3nTextModel(nn.Module): bias=False, gather_output=True, return_bias=False, + quant_config=quant_config, prefix=f"{prefix}.per_layer_model_projection", ) self.per_layer_projection_norm = RMSNorm( @@ -566,6 +581,7 @@ class Gemma3nTextModel(nn.Module): bias=False, gather_output=True, return_bias=False, + quant_config=quant_config, prefix=f"{prefix}.{idx-1}.altup_projections", ) for idx in range(1, self.config.altup_num_inputs) ]) @@ -576,6 +592,7 @@ class Gemma3nTextModel(nn.Module): bias=False, gather_output=True, return_bias=False, + quant_config=quant_config, prefix=f"{prefix}.{idx-1}.altup_unembed_projections", ) for idx in range(1, self.config.altup_num_inputs) ]) From b7b23da4d25add19411821fa5f784529d4de8732 Mon Sep 17 00:00:00 2001 From: MingzhenHan Date: Wed, 30 Jul 2025 11:35:33 +0800 Subject: [PATCH 036/224] [Bugfix] Fix comment typo of get_num_common_prefix_blocks() (#21827) Signed-off-by: 
MingzhenHan --- vllm/v1/core/kv_cache_coordinator.py | 4 ++-- vllm/v1/core/single_type_kv_cache_manager.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py index 0cce2ec81e08a..258805843e227 100644 --- a/vllm/v1/core/kv_cache_coordinator.py +++ b/vllm/v1/core/kv_cache_coordinator.py @@ -130,10 +130,10 @@ class KVCacheCoordinator(ABC): Args: request_id: The request ID. - block_hashes: The block hashes of the request. + num_running_requests: The number of requests in the RUNNING state. Returns: - The number of common prefix blocks. + list[int]: The number of common prefix blocks. """ num_blocks_per_group = [ manager.get_num_common_prefix_blocks(request_id, diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index e8a44c7773a71..714f49494c9a1 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -181,7 +181,7 @@ class SingleTypeKVCacheManager(ABC): Args: request_id: The request ID. - block_hashes: The block hashes of the request. + num_running_requests: The number of requests in the RUNNING state. Returns: The number of common prefix blocks. 
From 44bc46da6008c04d351d8fd0bf026bff8ab57dab Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 30 Jul 2025 11:36:04 +0800 Subject: [PATCH 037/224] [Bugfix] Actually disable processing cache when API server is scaled out (#21839) Signed-off-by: DarkLight1337 --- vllm/entrypoints/cli/serve.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index a69363e3d98fe..7dcba2cccdb52 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -140,11 +140,16 @@ def run_multi_api_server(args: argparse.Namespace): num_api_servers = args.api_server_count assert num_api_servers > 0 + orig_disable_mm_preprocessor_cache = args.disable_mm_preprocessor_cache + # set_process_title("ProcManager") if num_api_servers > 1: setup_multiprocess_prometheus() + # Not compatible with API server scale-out + args.disable_mm_preprocessor_cache = True + listen_address, sock = setup_server(args) engine_args = vllm.AsyncEngineArgs.from_cli_args(args) @@ -161,11 +166,9 @@ def run_multi_api_server(args: argparse.Namespace): "with api_server_count > 1") if model_config.is_multimodal_model and not ( - model_config.disable_mm_preprocessor_cache): - logger.warning( - "Multi-model preprocessor cache will be disabled for" - " api_server_count > 1") - model_config.disable_mm_preprocessor_cache = True + orig_disable_mm_preprocessor_cache): + logger.warning("Multi-model preprocessor cache will be disabled " + "for api_server_count > 1") executor_class = Executor.get_class(vllm_config) log_stats = not engine_args.disable_log_stats From 1b0a15553420e5459d9a8512a3f1bd7d4117db08 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 29 Jul 2025 23:50:46 -0400 Subject: [PATCH 038/224] [Perf] Using `__nv_fp8_e4m3` instead of `c10::e4m3` for `per_token_group_quant` (#21867) Signed-off-by: yewentao256 --- csrc/quantization/fp8/per_token_group_quant.cu | 6 
++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/csrc/quantization/fp8/per_token_group_quant.cu b/csrc/quantization/fp8/per_token_group_quant.cu index 2609054f2072b..f5b40e35b6e5a 100644 --- a/csrc/quantization/fp8/per_token_group_quant.cu +++ b/csrc/quantization/fp8/per_token_group_quant.cu @@ -1,12 +1,10 @@ #include -#include #include "../per_token_group_quant_8bit.h" #include -#include -#include +#include #include @@ -199,7 +197,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input, VLLM_DISPATCH_FLOATING_TYPES( input.scalar_type(), "per_token_group_quant_8bit", ([&] { if (dst_type == at::ScalarType::Float8_e4m3fn) { - LAUNCH_KERNEL(scalar_t, c10::Float8_e4m3fn); + LAUNCH_KERNEL(scalar_t, __nv_fp8_e4m3); } else if (dst_type == at::ScalarType::Char) { LAUNCH_KERNEL(scalar_t, int8_t); } From 65f311ce5906941840fb5e16e29e798e7d35cf65 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Wed, 30 Jul 2025 11:56:03 +0800 Subject: [PATCH 039/224] [Frontend] Add LLM.reward specific to reward models (#21720) Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 81 ++++++++++++------- examples/offline_inference/basic/embed.py | 3 +- examples/offline_inference/basic/reward.py | 53 ++++++++++++ tests/conftest.py | 4 + tests/models/language/pooling/test_reward.py | 2 +- .../pooling/test_truncation_control.py | 6 +- vllm/entrypoints/llm.py | 60 +++++++++++++- 7 files changed, 174 insertions(+), 35 deletions(-) create mode 100644 examples/offline_inference/basic/reward.py diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index f1200103171e9..1fbbba7ace5e1 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -45,14 +45,14 @@ Each pooling model in vLLM supports one or more of these tasks according to [Pooler.get_supported_tasks][vllm.model_executor.layers.pooler.Pooler.get_supported_tasks], enabling the corresponding APIs: -| Task | APIs | -|------------|--------------------| -| `encode` | 
`encode` | -| `embed` | `embed`, `score`\* | -| `classify` | `classify` | -| `score` | `score` | +| Task | APIs | +|------------|--------------------------------------| +| `encode` | `LLM.reward(...)` | +| `embed` | `LLM.embed(...)`, `LLM.score(...)`\* | +| `classify` | `LLM.classify(...)` | +| `score` | `LLM.score(...)` | -\* The `score` API falls back to `embed` task if the model does not support `score` task. +\* The `LLM.score(...)` API falls back to `embed` task if the model does not support `score` task. ### Pooler Configuration @@ -66,11 +66,11 @@ you can override some of its attributes via the `--override-pooler-config` optio If the model has been converted via `--convert` (see above), the pooler assigned to each task has the following attributes by default: -| Task | Pooling Type | Normalization | Softmax | -|------------|----------------|---------------|---------| -| `encode` | `ALL` | ❌ | ❌ | -| `embed` | `LAST` | ✅︎ | ❌ | -| `classify` | `LAST` | ❌ | ✅︎ | +| Task | Pooling Type | Normalization | Softmax | +|------------|--------------|---------------|---------| +| `reward` | `ALL` | ❌ | ❌ | +| `embed` | `LAST` | ✅︎ | ❌ | +| `classify` | `LAST` | ❌ | ✅︎ | When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults. @@ -83,21 +83,6 @@ which takes priority over both the model's and Sentence Transformers's defaults. The [LLM][vllm.LLM] class provides various methods for offline inference. See [configuration][configuration] for a list of options when initializing the model. -### `LLM.encode` - -The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. -It returns the extracted hidden states directly, which is useful for reward models. 
- -```python -from vllm import LLM - -llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", runner="pooling") -(output,) = llm.encode("Hello, my name is") - -data = output.outputs.data -print(f"Data: {data!r}") -``` - ### `LLM.embed` The [embed][vllm.LLM.embed] method outputs an embedding vector for each prompt. @@ -106,7 +91,7 @@ It is primarily designed for embedding models. ```python from vllm import LLM -llm = LLM(model="intfloat/e5-mistral-7b-instruct", runner="pooling") +llm = LLM(model="intfloat/e5-small", runner="pooling") (output,) = llm.embed("Hello, my name is") embeds = output.outputs.embedding @@ -154,6 +139,46 @@ print(f"Score: {score}") A code example can be found here: +### `LLM.reward` + +The [reward][vllm.LLM.reward] method is available to all reward models in vLLM. +It returns the extracted hidden states directly. + +```python +from vllm import LLM + +llm = LLM(model="internlm/internlm2-1_8b-reward", runner="pooling", trust_remote_code=True) +(output,) = llm.reward("Hello, my name is") + +data = output.outputs.data +print(f"Data: {data!r}") +``` + +A code example can be found here: + +### `LLM.encode` + +The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. +It returns the extracted hidden states directly. + +!!! note + Please use one of the more specific methods or set the task directly when using `LLM.encode`: + + - For embeddings, use `LLM.embed(...)` or `pooling_task="embed"`. + - For classification logits, use `LLM.classify(...)` or `pooling_task="classify"`. + - For rewards, use `LLM.reward(...)` or `pooling_task="reward"`. + - For similarity scores, use `LLM.score(...)`. 
+ +```python +from vllm import LLM + +llm = LLM(model="intfloat/e5-small", runner="pooling") +(output,) = llm.encode("Hello, my name is", pooling_task="embed") + +data = output.outputs.data +print(f"Data: {data!r}") +``` + ## Online Serving Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs: diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py index 526753bcef22f..158836728beed 100644 --- a/examples/offline_inference/basic/embed.py +++ b/examples/offline_inference/basic/embed.py @@ -12,10 +12,9 @@ def parse_args(): parser = EngineArgs.add_cli_args(parser) # Set example specific arguments parser.set_defaults( - model="intfloat/e5-mistral-7b-instruct", + model="intfloat/e5-small", runner="pooling", enforce_eager=True, - max_model_len=1024, ) return parser.parse_args() diff --git a/examples/offline_inference/basic/reward.py b/examples/offline_inference/basic/reward.py new file mode 100644 index 0000000000000..aa173cf96f5bc --- /dev/null +++ b/examples/offline_inference/basic/reward.py @@ -0,0 +1,53 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from argparse import Namespace + +from vllm import LLM, EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def parse_args(): + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + # Set example specific arguments + parser.set_defaults( + model="internlm/internlm2-1_8b-reward", + runner="pooling", + enforce_eager=True, + max_model_len=1024, + trust_remote_code=True, + ) + return parser.parse_args() + + +def main(args: Namespace): + # Sample prompts. + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + # Create an LLM. 
+ # You should pass runner="pooling" for reward models + llm = LLM(**vars(args)) + + # Generate rewards. The output is a list of PoolingRequestOutput. + outputs = llm.reward(prompts) + + # Print the outputs. + print("\nGenerated Outputs:\n" + "-" * 60) + for prompt, output in zip(prompts, outputs): + rewards = output.outputs.data + rewards_trimmed = ( + (str(rewards[:16])[:-1] + ", ...]") if len(rewards) > 16 else rewards + ) + print(f"Prompt: {prompt!r} \nReward: {rewards_trimmed} (size={len(rewards)})") + print("-" * 60) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/tests/conftest.py b/tests/conftest.py index e4df6ebf2c260..67f0e7424038c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1053,6 +1053,10 @@ class VllmRunner: req_outputs = self.llm.encode(prompts) return [req_output.outputs.data for req_output in req_outputs] + def reward(self, prompts: list[str]) -> list[list[float]]: + req_outputs = self.llm.reward(prompts) + return [req_output.outputs.data for req_output in req_outputs] + def score( self, text_1: Union[str, list[str]], diff --git a/tests/models/language/pooling/test_reward.py b/tests/models/language/pooling/test_reward.py index 3b7fab3ba5c99..a5f7dca76d822 100644 --- a/tests/models/language/pooling/test_reward.py +++ b/tests/models/language/pooling/test_reward.py @@ -95,7 +95,7 @@ def test_prm_models( monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False") with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.encode(math_step_prompts) + vllm_outputs = vllm_model.reward(math_step_prompts) with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model: hf_model = step_reward_patch_hf_model(hf_model) diff --git a/tests/models/language/pooling/test_truncation_control.py b/tests/models/language/pooling/test_truncation_control.py index dc2bf21ef63bc..c6ef899958a07 100644 --- a/tests/models/language/pooling/test_truncation_control.py +++ 
b/tests/models/language/pooling/test_truncation_control.py @@ -28,7 +28,7 @@ def test_smaller_truncation_size(vllm_runner, with vllm_runner(model_name, runner="pooling", max_model_len=max_model_len) as vllm_model: - vllm_output = vllm_model.llm.encode( + vllm_output = vllm_model.llm.embed( input_str, truncate_prompt_tokens=truncate_prompt_tokens) prompt_tokens = vllm_output[0].prompt_token_ids @@ -43,7 +43,7 @@ def test_max_truncation_size(vllm_runner, with vllm_runner(model_name, runner="pooling", max_model_len=max_model_len) as vllm_model: - vllm_output = vllm_model.llm.encode( + vllm_output = vllm_model.llm.embed( input_str, truncate_prompt_tokens=truncate_prompt_tokens) prompt_tokens = vllm_output[0].prompt_token_ids @@ -61,7 +61,7 @@ def test_bigger_truncation_size(vllm_runner, model_name, runner="pooling", max_model_len=max_model_len) as vllm_model: - llm_output = vllm_model.llm.encode( + llm_output = vllm_model.llm.embed( input_str, truncate_prompt_tokens=truncate_prompt_tokens) assert llm_output == f"""truncate_prompt_tokens value diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index adef350931f3d..842a22ccebaa4 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1037,7 +1037,7 @@ class LLM: truncate_prompt_tokens: Optional[int] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - pooling_task: PoolingTask = "encode", + pooling_task: Optional[PoolingTask] = None, tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> list[PoolingRequestOutput]: """Apply pooling to the hidden states corresponding to the input @@ -1069,6 +1069,25 @@ class LLM: considered legacy and may be deprecated in the future. You should instead pass them via the `inputs` parameter. 
""" + if pooling_task is None: + if "embed" in self.supported_tasks: + pooling_task = "embed" + else: + pooling_task = "encode" + + logger.warning_once( + "`LLM.encode` is currently using `pooling_task = %s`.\n" + "Please use one of the more specific methods or set the " + "task directly when using `LLM.encode`:\n" + " - For embeddings, use `LLM.embed(...)` " + "or `pooling_task=\"embed\"`.\n" + " - For classification logits, use `LLM.classify(...)` " + "or `pooling_task=\"classify\"`.\n" + " - For rewards, use `LLM.reward(...)` " + "or `pooling_task=\"reward\"`\n" + " - For similarity scores, use `LLM.score(...)`.", + pooling_task) + model_config = self.llm_engine.model_config runner_type = model_config.runner_type if runner_type != "pooling": @@ -1207,6 +1226,45 @@ class LLM: return [ClassificationRequestOutput.from_base(item) for item in items] + def reward( + self, + prompts: Union[PromptType, Sequence[PromptType]], + /, + *, + truncate_prompt_tokens: Optional[int] = None, + use_tqdm: Union[bool, Callable[..., tqdm]] = True, + pooling_params: Optional[Union[PoolingParams, + Sequence[PoolingParams]]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, + ) -> list[PoolingRequestOutput]: + """ + Generate rewards for each prompt. + + Args: + prompts: The prompts to the LLM. You may pass a sequence of prompts + for batch inference. See [PromptType][vllm.inputs.PromptType] + for more details about the format of each prompts. + use_tqdm: If `True`, shows a tqdm progress bar. + If a callable (e.g., `functools.partial(tqdm, leave=False)`), + it is used to create the progress bar. + If `False`, no progress bar is created. + lora_request: LoRA request to use for generation, if any. + pooling_params: The pooling parameters for pooling. If None, we + use the default pooling parameters. + Returns: + A list of `PoolingRequestOutput` objects containing the + pooled hidden states in the same order as the input prompts. 
+ """ + + return self.encode( + prompts, + use_tqdm=use_tqdm, + lora_request=lora_request, + pooling_params=pooling_params, + truncate_prompt_tokens=truncate_prompt_tokens, + pooling_task="encode", + ) + def _embedding_score( self, tokenizer: AnyTokenizer, From 05cbbe20c55d957f18c12f7eb11cf551504e657d Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Wed, 30 Jul 2025 11:56:14 +0800 Subject: [PATCH 040/224] [XPU] use `ZE_AFFINITY_MASK` for device select on xpu (#21815) Signed-off-by: Kunshang Ji --- vllm/platforms/xpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 1d0bb3654929b..d8a663f2f0c4a 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -30,7 +30,7 @@ class XPUPlatform(Platform): # see https://github.com/ray-project/ray/blob/6a5eb5865eeb9ccf058a79b44f107e327e360673/python/ray/_private/accelerators/intel_gpu.py#L20 # noqa: E501 ray_device_key: str = "GPU" dist_backend: str = "ccl" # ccl | xccl - device_control_env_var: str = "ONEAPI_DEVICE_SELECTOR" + device_control_env_var: str = "ZE_AFFINITY_MASK" @classmethod def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, From e3bc17ceead9af80851b61a65010613eb34511e8 Mon Sep 17 00:00:00 2001 From: Tao He Date: Wed, 30 Jul 2025 12:30:44 +0800 Subject: [PATCH 041/224] Add @sighingnow as maintainer of qwen's related files. 
(#21895) Signed-off-by: Tao He --- .github/CODEOWNERS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index a3b2713430eb5..fb9f44353cec8 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -61,3 +61,7 @@ mkdocs.yaml @hmellor /vllm/v1/worker/^xpu @jikunshang /vllm/platforms/xpu.py @jikunshang /docker/Dockerfile.xpu @jikunshang + +# Qwen-specific files +/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow +/vllm/model_executor/models/qwen* @sighingnow From 16f32505275687c01823b87134ce2d93f89407ad Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 30 Jul 2025 12:53:08 +0800 Subject: [PATCH 042/224] [CI/Build] Fix pre-commit failure in docs (#21897) Signed-off-by: DarkLight1337 --- docs/design/fused_moe_modular_kernel.md | 63 +++++++++++++++++-------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/docs/design/fused_moe_modular_kernel.md b/docs/design/fused_moe_modular_kernel.md index 0943454d64292..3ef1232051b07 100644 --- a/docs/design/fused_moe_modular_kernel.md +++ b/docs/design/fused_moe_modular_kernel.md @@ -1,6 +1,7 @@ # Fused MoE Modular Kernel ## Introduction + FusedMoEModularKernel is implemented [here](gh-file:/vllm/model_executor/layers/fused_moe/modular_kernel.py) Based on the format of the input activations, FusedMoE implementations are broadly classified into 2 types. @@ -31,7 +32,8 @@ As can be seen from the diagrams, there are a lot of operations and there can be The rest of the document will focus on the Contiguous / Non-Batched case. Extrapolating to the Batched case should be straight-forward. -## ModularKernel Components: +## ModularKernel Components + FusedMoEModularKernel splits the FusedMoE operation into 3 parts, 1. TopKWeightAndReduce @@ -39,6 +41,7 @@ FusedMoEModularKernel splits the FusedMoE operation into 3 parts, 3. 
FusedMoEPermuteExpertsUnpermute ### TopKWeightAndReduce + The TopK Weight Application and Reduction components happen right after the Unpermute operation and before the All2All Combine. Note that the `FusedMoEPermuteExpertsUnpermute` is responsible for the Unpermute and `FusedMoEPrepareAndFinalize` is responsible for the All2All Combine. There is value in doing the TopK Weight Application and Reduction in the `FusedMoEPermuteExpertsUnpermute`. But some implementations choose to do it in `FusedMoEPrepareAndFinalize`. In order to enable this flexibility, we have a TopKWeightAndReduce abstract class. Please find the implementations of TopKWeightAndReduce [here](gh-file:vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py). @@ -50,12 +53,14 @@ The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExperts * `FusedMoEPermuteExpertsUnpermute::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceContiguous` / `TopKWeightAndReduceNaiveBatched` / `TopKWeightAndReduceDelegate` if the `FusedMoEPermuteExpertsUnpermute` implementation needs the `FusedMoEPrepareAndFinalize::finalize()` to do the weight application and reduction. ### FusedMoEPrepareAndFinalize + The `FusedMoEPrepareAndFinalize` abstract class exposes `prepare` and `finalize` functions. The `prepare` function is responsible for input activation Quantization and All2All Dispatch. The `finalize` function is responsible for invoking the All2All Combine. Additionally the `finalize` function may or may not do the TopK weight application and reduction (Please refer to the TopKWeightAndReduce section) ![](../assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png "FusedMoEPrepareAndFinalize Blocks") ### FusedMoEPermuteExpertsUnpermute + The `FusedMoEPermuteExpertsUnpermute` class is where the crux of the MoE operations happen. 
The `FusedMoEPermuteExpertsUnpermute` abstract class exposes a few important functions, * apply() @@ -63,6 +68,7 @@ The `FusedMoEPermuteExpertsUnpermute` class is where the crux of the MoE operati * finalize_weight_and_reduce_impl() #### apply() + The `apply` method is where the implementations perform * Permute @@ -74,50 +80,56 @@ The `apply` method is where the implementations perform * Maybe TopK Weight Application + Reduction #### workspace_shapes() + The core FusedMoE implementation performs a series of operations. It would be inefficient to create output memory for each of these operations separately. To that effect, implementations are required to declare 2 workspace shapes, the workspace datatype and the FusedMoE output shape as outputs of the workspace_shapes() method. This information is used to allocate the workspace tensors and the output tensor in `FusedMoEModularKernel::forward()` and passed on to the `FusedMoEPermuteExpertsUnpermute::apply()` method. The workspaces could then be used as intermediate buffers in the FusedMoE implementation. #### finalize_weight_and_reduce_impl() + It is sometimes efficient to perform TopK weight application and Reduction inside the `FusedMoEPermuteExpertsUnpermute::apply()`. Find an example [here](https://github.com/vllm-project/vllm/pull/20228). We have a `TopKWeightAndReduce` abstract class to facilitate such implementations. Please refer to the TopKWeightAndReduce section. `FusedMoEPermuteExpertsUnpermute::finalize_weight_and_reduce_impl()` returns the `TopKWeightAndReduce` object that the implementation wants the `FusedMoEPrepareAndFinalize::finalize()` to use. ![](../assets/design/fused_moe_modular_kernel/fused_experts_blocks.png "FusedMoEPermuteExpertsUnpermute Blocks") ### FusedMoEModularKernel + `FusedMoEModularKernel` is composed of the `FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` objects. 
`FusedMoEModularKernel` pseudocode/sketch, -``` -FusedMoEModularKernel::__init__(self, - prepare_finalize: FusedMoEPrepareAndFinalize, - fused_experts: FusedMoEPermuteExpertsUnpermute): +```py +class FusedMoEModularKernel: + def __init__(self, + prepare_finalize: FusedMoEPrepareAndFinalize, + fused_experts: FusedMoEPermuteExpertsUnpermute): - self.prepare_finalize = prepare_finalize - self.fused_experts = fused_experts + self.prepare_finalize = prepare_finalize + self.fused_experts = fused_experts -FusedMoEModularKernel::forward(self, DP_A): + def forward(self, DP_A): - Aq, A_scale, _, _, _ = self.prepare_finalize.prepare(DP_A, ...) + Aq, A_scale, _, _, _ = self.prepare_finalize.prepare(DP_A, ...) - workspace13_shape, workspace2_shape, _, _ = self.fused_experts.workspace_shapes(...) + workspace13_shape, workspace2_shape, _, _ = self.fused_experts.workspace_shapes(...) - # allocate workspaces - workspace_13 = torch.empty(workspace13_shape, ...) - workspace_2 = torch.empty(workspace2_shape, ...) + # allocate workspaces + workspace_13 = torch.empty(workspace13_shape, ...) + workspace_2 = torch.empty(workspace2_shape, ...) - # execute fused_experts - fe_out = self.fused_experts.apply(Aq, A_scale, workspace13, workspace2, ...) + # execute fused_experts + fe_out = self.fused_experts.apply(Aq, A_scale, workspace_13, workspace_2, ...) - # war_impl is an object of type TopKWeightAndReduceNoOp if the fused_experts implementations performs the TopK Weight Application and Reduction. - war_impl = self.fused_experts.finalize_weight_and_reduce_impl() + # war_impl is an object of type TopKWeightAndReduceNoOp if the fused_experts implementations + # performs the TopK Weight Application and Reduction. + war_impl = self.fused_experts.finalize_weight_and_reduce_impl() - output = self.prepare_finalize.finalize(fe_out, war_impl,...) - - return output + output = self.prepare_finalize.finalize(fe_out, war_impl,...)
+ + return output ``` ## How-To ### How To Add a FusedMoEPrepareAndFinalize Type + Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & Combine implementation / kernel. For example, * PplxPrepareAndFinalize type is backed by Pplx All2All kernels, @@ -125,9 +137,11 @@ Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & C * DeepEPLLPrepareAndFinalize type is backed by DeepEP Low-Latency All2All kernels. #### Step 1: Add an All2All manager + The purpose of the All2All Manager is to setup the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](gh-file:vllm/distributed/device_communicators/all2all.py). #### Step 2: Add a FusedMoEPrepareAndFinalize Type + This section describes the significance of the various functions exposed by the `FusedMoEPrepareAndFinalize` abstract class. `FusedMoEPrepareAndFinalize::prepare()`: The prepare method implements the Quantization and All2All Dispatch. Typically the Dispatch function from the relevant All2All Manager is invoked. @@ -145,6 +159,7 @@ This section describes the significance of the various functions exposed by the We suggest picking an already existing `FusedMoEPrepareAndFinalize` implementation that matches your All2All implementation closely and using it as a reference. ### How To Add a FusedMoEPermuteExpertsUnpermute Type + FusedMoEPermuteExpertsUnpermute performs the core of the FusedMoE operations. The various functions exposed by the abstract class and their significance is as follows, `FusedMoEPermuteExpertsUnpermute::activation_formats()`: Return the supported Input and Output activation formats. i.e. Contiguous / Batched format. 
@@ -159,12 +174,14 @@ implementations that input `FusedMoEActivationFormat.Standard` support chunking `FusedMoEPermuteExpertsUnpermute::apply`: Refer to `FusedMoEPermuteExpertsUnpermute` section above. ### FusedMoEModularKernel Initialization + `FusedMoEMethodBase` class has 2 methods that are collectively responsible in creating the `FusedMoEModularKernel` object. They are, * select_gemm_impl, and * init_prepare_finalize #### select_gemm_impl + The `select_gemm_impl` method is undefined in the base class. It is the responsibility of the derived class to implement a method that constructs a valid/appropriate `FusedMoEPermuteExpertsUnpermute` object. Please refer to the implementations in, @@ -176,12 +193,14 @@ Please refer to the implementations in, dervied classes. #### init_prepare_finalize + Based on the input and env settings, the `init_prepare_finalize` method creates the appropriate `FusedMoEPrepareAndFinalize` object. The method then queries `select_gemm_impl` for the appropriate `FusedMoEPermuteExpertsUnpermute` object and builds the `FusedMoEModularKernel` object Please take a look at [init_prepare_finalize](https://github.com/vllm-project/vllm/blob/1cbf951ba272c230823b947631065b826409fa62/vllm/model_executor/layers/fused_moe/layer.py#L188). **Important**: The `FusedMoEMethodBase` derived classes use the `FusedMoEMethodBase::fused_experts` object in their `apply` methods. When settings permit the construction of a valid `FusedMoEModularKernel` object, we override `FusedMoEMethodBase::fused_experts` with it. This essentially makes the derived classes agnostic to what FusedMoE implementation is used. ### How To Unit Test + We have `FusedMoEModularKernel` unit tests at [test_modular_kernel_combinations.py](gh-file:tests/kernels/moe/test_modular_kernel_combinations.py). 
The unit test iterates through all combinations of `FusedMoEPrepareAndFinalize` and `FusedMoEPremuteExpertsUnpermute` types and if they are @@ -196,18 +215,21 @@ If you are adding some `FusedMoEPrepareAndFinalize` / `FusedMoEPermuteExpertsUnp Doing this will add the new implementation to the test suite. ### How To Check `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` Compatibility + The unit test file [test_modular_kernel_combinations.py](gh-file:tests/kernels/moe/test_modular_kernel_combinations.py) can also be executed as a standalone script. Example: `python3 -m tests.kernels.moe.test_modular_kernel_combinations --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts` As a side-effect, this script can be used to test `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` compatibility. When invoked with incompatible types, the script will error. ### How To Profile + Please take a look at [profile_modular_kernel.py](gh-file:tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py) The script can be used to generate Torch traces for a single `FusedMoEModularKernel::forward()` call for any compatible `FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types. Example: `python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts` ## FusedMoEPrepareAndFinalize Implementations + The following table lists the `FusedMoEPrepareAndFinalize` implementations at the time of writing, | Implementation | Type | Comments | @@ -220,6 +242,7 @@ The following table lists the `FusedMoEPrepareAndFinalize` implementations at th | BatchedPrepareAndFinalize | Batched | A reference prepare/finalize class that reorganizes the tokens into expert batched format, i.e. E x max_num_tokens x K. (Doesn’t use any all2all kernels. 
This is primarily used in unit testing) | ## FusedMoEPermuteExpertsUnpermute + The following table lists the `FusedMoEPermuteExpertsUnpermute` implementations at the time of writing, | Implementation | Type | Comment | From 4cd7fe6ceaf5ad7d8ac2ba5597cd964c6db7e306 Mon Sep 17 00:00:00 2001 From: Ricardo Decal Date: Tue, 29 Jul 2025 22:07:28 -0700 Subject: [PATCH 043/224] [Docs] Expand introduction to Ray in Multi-node deployment section (#21584) Signed-off-by: Ricardo Decal --- docs/serving/distributed_serving.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/serving/distributed_serving.md b/docs/serving/distributed_serving.md index 9304976572788..08d889a00d2cf 100644 --- a/docs/serving/distributed_serving.md +++ b/docs/serving/distributed_serving.md @@ -58,7 +58,17 @@ vllm serve gpt2 \ ## Multi-node deployment -If a single node lacks sufficient GPUs to hold the model, deploy vLLM across multiple nodes. Multi-node deployments require Ray as the runtime engine. Ensure that every node provides an identical execution environment, including the model path and Python packages. Using container images is recommended because they provide a convenient way to keep environments consistent and to hide host heterogeneity. +If a single node lacks sufficient GPUs to hold the model, deploy vLLM across multiple nodes. Ensure that every node provides an identical execution environment, including the model path and Python packages. Using container images is recommended because they provide a convenient way to keep environments consistent and to hide host heterogeneity. + +### What is Ray? + +Ray is a distributed computing framework for scaling Python programs. Multi-node vLLM deployments require Ray as the runtime engine. + +vLLM uses Ray to manage the distributed execution of tasks across multiple nodes and control where execution happens. 
+ +Ray also offers high-level APIs for large-scale [offline batch inference](https://docs.ray.io/en/latest/data/working-with-llms.html) and [online serving](https://docs.ray.io/en/latest/serve/llm/serving-llms.html) that can leverage vLLM as the engine. These APIs add production-grade fault tolerance, scaling, and distributed observability to vLLM workloads. + +For details, see the [Ray documentation](https://docs.ray.io/en/latest/index.html). ### Ray cluster setup with containers From 6f8d26188200385fa994526b10b5858a3da1ede7 Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Tue, 29 Jul 2025 22:57:03 -0700 Subject: [PATCH 044/224] Update vLLM Benchmark Suite for Xeon based on 0.9.2 release (#21486) Signed-off-by: Tsai, Louie --- .../convert-results-json-to-markdown.py | 1 + .../scripts/run-performance-benchmarks.sh | 2 +- .../tests/serving-tests-cpu-snc2.json | 209 +++++++++++++++++ .../tests/serving-tests-cpu-snc3.json | 211 ++++++++++++++++++ .../tests/serving-tests-cpu.json | 15 ++ 5 files changed, 437 insertions(+), 1 deletion(-) create mode 100644 .buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json create mode 100644 .buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 05623879c0c2c..554256b4bdb8b 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -44,6 +44,7 @@ serving_column_mapping = { "test_name": "Test name", "gpu_type": "GPU", "completed": "# of req.", + "max_concurrency": "# of max concurrency.", "request_throughput": "Tput (req/s)", "total_token_throughput": "Total Token Tput (tok/s)", "output_throughput": "Output Tput (tok/s)", diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh 
b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index b515ee43934d1..2c57666a81aa3 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -33,7 +33,7 @@ check_gpus() { check_cpus() { # check the number of CPUs and NUMA Node and GPU type. - declare -g numa_count=$(python3 -c "from numa import info;numa_size = info.get_num_configured_nodes(); print(numa_size)") + declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}') if [[ $numa_count -gt 0 ]]; then echo "NUMA found." echo $numa_count diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json new file mode 100644 index 0000000000000..a144b4420fbf1 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json @@ -0,0 +1,209 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + 
"VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + 
"VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_tp4_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + 
"server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + } +] diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json new file mode 100644 index 0000000000000..e6e69b63b74df --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json @@ -0,0 +1,211 @@ +[ + { + "test_name": "serving_llama8B_pp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_pp3_sharegpt", + "qps_list": [1, 4, 16, "inf"], + 
"server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp2pp6_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "max_concurrency": 60, + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_pp1_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + 
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_pp3_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL:": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_tp2pp3_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + 
"VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "max_concurrency": 1000, + "num_prompts": 1000 + } + } +] diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json index 22f71c993ff33..ce1f924de387f 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json @@ -6,6 +6,7 @@ "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { @@ -18,6 +19,8 @@ "disable_log_stats": "", "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { @@ -36,6 +39,7 @@ "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { @@ -48,6 +52,8 @@ "disable_log_stats": "", "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { @@ -66,6 +72,7 @@ "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 
120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { @@ -78,6 +85,8 @@ "disable_log_stats": "", "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { @@ -96,6 +105,7 @@ "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { @@ -109,6 +119,8 @@ "disable_log_stats": "", "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { @@ -129,6 +141,7 @@ "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { @@ -142,6 +155,8 @@ "disable_log_stats": "", "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { From 2ca5f82c2a8152ba67eaa033fbdb479d28f4cc3b Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 30 Jul 2025 14:54:18 +0800 Subject: [PATCH 045/224] [Misc] Remove redundant config definitions (#21891) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/aimv2.py | 22 +- vllm/model_executor/models/dbrx.py | 14 +- vllm/model_executor/models/exaone.py | 8 +- vllm/model_executor/models/exaone4.py | 6 +- vllm/model_executor/models/keye.py | 3 - vllm/model_executor/models/minimax_vl_01.py | 7 +- vllm/model_executor/models/mpt.py | 8 +- vllm/model_executor/models/ovis.py | 13 +- vllm/transformers_utils/config.py | 28 +- vllm/transformers_utils/configs/__init__.py | 30 +- vllm/transformers_utils/configs/cohere2.py | 195 ------------ vllm/transformers_utils/configs/dbrx.py | 280 ------------------ vllm/transformers_utils/configs/exaone.py | 190 ------------ 
vllm/transformers_utils/configs/exaone4.py | 252 ---------------- .../configs/minimax_text_01.py | 70 ----- .../configs/minimax_vl_01.py | 71 ----- vllm/transformers_utils/configs/mpt.py | 180 ----------- vllm/transformers_utils/configs/nvlm_d.py | 31 -- vllm/transformers_utils/configs/ovis.py | 184 ------------ vllm/transformers_utils/configs/skyworkr1v.py | 54 ---- vllm/transformers_utils/configs/solar.py | 247 --------------- vllm/transformers_utils/configs/telechat2.py | 64 ---- .../transformers_utils/processors/__init__.py | 7 + 23 files changed, 54 insertions(+), 1910 deletions(-) delete mode 100644 vllm/transformers_utils/configs/cohere2.py delete mode 100644 vllm/transformers_utils/configs/dbrx.py delete mode 100644 vllm/transformers_utils/configs/exaone.py delete mode 100644 vllm/transformers_utils/configs/exaone4.py delete mode 100644 vllm/transformers_utils/configs/minimax_text_01.py delete mode 100644 vllm/transformers_utils/configs/minimax_vl_01.py delete mode 100644 vllm/transformers_utils/configs/mpt.py delete mode 100644 vllm/transformers_utils/configs/nvlm_d.py delete mode 100644 vllm/transformers_utils/configs/ovis.py delete mode 100644 vllm/transformers_utils/configs/skyworkr1v.py delete mode 100644 vllm/transformers_utils/configs/solar.py delete mode 100644 vllm/transformers_utils/configs/telechat2.py diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py index b13d863ebb744..d2307bb464bdb 100644 --- a/vllm/model_executor/models/aimv2.py +++ b/vllm/model_executor/models/aimv2.py @@ -8,6 +8,7 @@ from typing import Optional import torch import torch.nn as nn +from transformers import PretrainedConfig from vllm.attention.layer import MultiHeadAttention from vllm.distributed import get_tensor_model_parallel_world_size @@ -20,13 +21,12 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from 
vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.transformers_utils.configs.ovis import AIMv2Config class AIMv2SwiGLUFFN(nn.Module): - def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig, - prefix: str): + def __init__(self, config: PretrainedConfig, + quant_config: QuantizationConfig, prefix: str): super().__init__() hidden_features = config.intermediate_size in_features = config.hidden_size @@ -57,7 +57,7 @@ class AIMv2SwiGLUFFN(nn.Module): class AIMv2PatchEmbed(nn.Module): - def __init__(self, config: AIMv2Config): + def __init__(self, config: PretrainedConfig): super().__init__() self.proj = nn.Conv2d( config.num_channels, @@ -75,7 +75,7 @@ class AIMv2PatchEmbed(nn.Module): class AIMv2ViTPreprocessor(nn.Module): - def __init__(self, config: AIMv2Config): + def __init__(self, config: PretrainedConfig): super().__init__() num_patches = (config.image_size // config.patch_size)**2 @@ -93,8 +93,8 @@ class AIMv2ViTPreprocessor(nn.Module): class AIMv2Attention(nn.Module): - def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig, - prefix: str): + def __init__(self, config: PretrainedConfig, + quant_config: QuantizationConfig, prefix: str): super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -141,8 +141,8 @@ class AIMv2Attention(nn.Module): class AIMv2Block(nn.Module): - def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig, - prefix: str): + def __init__(self, config: PretrainedConfig, + quant_config: QuantizationConfig, prefix: str): super().__init__() self.attn = AIMv2Attention(config, quant_config=quant_config, @@ -163,7 +163,7 @@ class AIMv2Transformer(nn.Module): def __init__( self, - config: AIMv2Config, + config: PretrainedConfig, quant_config: QuantizationConfig, *, require_post_norm: Optional[bool] = None, @@ -193,7 +193,7 @@ class AIMv2Transformer(nn.Module): class AIMv2Model(torch.nn.Module): def __init__(self, - config: AIMv2Config, + 
config: PretrainedConfig, quant_config: QuantizationConfig, *, require_post_norm: Optional[bool] = None, diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 7a4dd69443ad7..360c7e66bf5ce 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -6,6 +6,7 @@ from typing import Optional, Union import torch import torch.nn as nn +from transformers import PretrainedConfig from vllm.attention import Attention from vllm.config import CacheConfig, VllmConfig @@ -24,7 +25,6 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.configs.dbrx import DbrxConfig from .interfaces import SupportsPP from .utils import (AutoWeightsLoader, is_pp_missing_parameter, @@ -39,7 +39,7 @@ class DbrxRouter(nn.Module): def __init__( self, - config: DbrxConfig, + config: PretrainedConfig, params_dtype: Optional[torch.dtype] = None, ): super().__init__() @@ -63,7 +63,7 @@ class DbrxExperts(FusedMoE): def __init__( self, - config: DbrxConfig, + config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, params_dtype: Optional[torch.dtype] = None, prefix: str = "", @@ -138,7 +138,7 @@ class DbrxMoE(nn.Module): def __init__( self, - config: DbrxConfig, + config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, params_dtype: Optional[torch.dtype] = None, prefix: str = "", @@ -169,7 +169,7 @@ class DbrxAttention(nn.Module): def __init__( self, - config: DbrxConfig, + config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -249,7 +249,7 @@ class DbrxFusedNormAttention(nn.Module): def __init__( self, - config: DbrxConfig, + config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: 
Optional[QuantizationConfig] = None, prefix: str = "", @@ -284,7 +284,7 @@ class DbrxBlock(nn.Module): def __init__( self, - config: DbrxConfig, + config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index aaf105ec2552a..8052b6bb82348 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -30,6 +30,7 @@ from typing import Any, Optional, Union import torch from torch import nn +from transformers import PretrainedConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -49,7 +50,6 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.configs.exaone import ExaoneConfig from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, @@ -99,7 +99,7 @@ class ExaoneAttention(nn.Module): def __init__( self, - config: ExaoneConfig, + config: PretrainedConfig, hidden_size: int, num_heads: int, num_kv_heads: int, @@ -194,7 +194,7 @@ class ExaoneBlockAttention(nn.Module): def __init__( self, - config: ExaoneConfig, + config: PretrainedConfig, hidden_size: int, num_heads: int, num_kv_heads: int, @@ -236,7 +236,7 @@ class ExaoneDecoderLayer(nn.Module): def __init__( self, - config: ExaoneConfig, + config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index 97aeb6fd7b172..3d6ce3e8895fb 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ 
-26,6 +26,7 @@ from typing import Any, Optional, Union import torch from torch import nn +from transformers import PretrainedConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -45,7 +46,6 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.configs.exaone4 import Exaone4Config from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, @@ -96,7 +96,7 @@ class Exaone4Attention(nn.Module): def __init__( self, - config: Exaone4Config, + config: PretrainedConfig, hidden_size: int, num_heads: int, num_kv_heads: int, @@ -224,7 +224,7 @@ class Exaone4DecoderLayer(nn.Module): def __init__( self, - config: Exaone4Config, + config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 36e57b5e4f46a..892d970aaade0 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -980,9 +980,6 @@ class KeyeMultiModalDataParser(MultiModalDataParser): class KeyeProcessingInfo(BaseProcessingInfo): - def get_hf_config(self): - return self.ctx.get_hf_config(PretrainedConfig) - def get_hf_processor( self, *, diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py index 9aba82cb115ed..62a7d37ec9d33 100644 --- a/vllm/model_executor/models/minimax_vl_01.py +++ b/vllm/model_executor/models/minimax_vl_01.py @@ -5,7 +5,7 @@ from typing import Literal, Optional, TypedDict, Union, cast import torch import torch.nn as nn -from transformers import BatchFeature +from transformers import BatchFeature, PretrainedConfig from vllm.config 
import VllmConfig from vllm.jsontree import json_map_leaves @@ -17,7 +17,6 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.configs.minimax_vl_01 import MiniMaxVL01Config from .clip import CLIPVisionModel from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP @@ -90,8 +89,8 @@ class MiniMaxVL01DummyInputsBuilder(LlavaDummyInputsBuilder): class MiniMaxVL01ProcessingInfo(LlavaNextProcessingInfo): - def get_hf_config(self): - return self.ctx.get_hf_config(MiniMaxVL01Config) + def get_hf_config(self): # Need to override the config type + return self.ctx.get_hf_config(PretrainedConfig) def get_hf_processor(self, **kwargs: object): hf_processor = self.ctx.get_hf_processor(**kwargs) diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 0878ada34d1d8..c243f575ae54a 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -8,6 +8,7 @@ from typing import Optional, Union import torch import torch.nn as nn +from transformers import PretrainedConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -25,7 +26,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.configs.mpt import MPTConfig from .interfaces import SupportsPP from .utils import (AutoWeightsLoader, is_pp_missing_parameter, @@ -50,7 +50,7 @@ class MPTAttention(nn.Module): def __init__( self, - config: MPTConfig, + config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: 
str = "", @@ -144,7 +144,7 @@ class MPTMLP(nn.Module): def __init__( self, - config: MPTConfig, + config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() @@ -176,7 +176,7 @@ class MPTBlock(nn.Module): def __init__( self, - config: MPTConfig, + config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index 111628d8d18cb..c8b528048b557 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -25,7 +25,7 @@ import torch import torch.nn as nn from torch import Tensor from torch.nn.functional import gumbel_softmax, pad, softmax -from transformers import BaseImageProcessor, BatchFeature +from transformers import BaseImageProcessor, BatchFeature, PretrainedConfig from vllm.config import VllmConfig from vllm.model_executor.layers.linear import ReplicatedLinear @@ -48,8 +48,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.configs.ovis import (BaseVisualTokenizerConfig, - OvisConfig) from vllm.transformers_utils.processors.ovis import OvisProcessor from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP @@ -83,7 +81,7 @@ class VisualTokenizer(torch.nn.Module): def __init__( self, - config: BaseVisualTokenizerConfig, + config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ): @@ -107,7 +105,7 @@ class VisualTokenizer(torch.nn.Module): def _init_backbone( self, - config: BaseVisualTokenizerConfig, + config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> nn.Module: @@ -247,9 +245,6 @@ class VisualEmbedding(torch.nn.Embedding): class 
OvisProcessingInfo(BaseProcessingInfo): - def get_hf_config(self): - return self.ctx.get_hf_config(OvisConfig) - def get_hf_processor(self, **kwargs): return self.ctx.get_hf_processor( OvisProcessor, @@ -417,7 +412,7 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - self.config: OvisConfig = config + self.config: PretrainedConfig = config self.llm = init_vllm_registered_model( vllm_config=vllm_config.with_hf_config(config.get_text_config()), prefix=maybe_prefix(prefix, "llm"), diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 04ff08825bbc5..40a6a9118e53e 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -29,19 +29,13 @@ from vllm import envs from vllm.logger import init_logger # yapf conflicts with isort for this block # yapf: disable -from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config, - DbrxConfig, DeepseekVLV2Config, - EAGLEConfig, Exaone4Config, - ExaoneConfig, JAISConfig, +from vllm.transformers_utils.configs import (ChatGLMConfig, DeepseekVLV2Config, + EAGLEConfig, JAISConfig, KimiVLConfig, MedusaConfig, - MiniMaxText01Config, - MiniMaxVL01Config, MllamaConfig, - MLPSpeculatorConfig, MPTConfig, + MllamaConfig, MLPSpeculatorConfig, Nemotron_Nano_VL_Config, - NemotronConfig, NVLM_D_Config, - OvisConfig, RWConfig, - SkyworkR1VChatConfig, SolarConfig, - Telechat2Config, UltravoxConfig) + NemotronConfig, RWConfig, + UltravoxConfig) # yapf: enable from vllm.transformers_utils.configs.mistral import adapt_config_dict from vllm.transformers_utils.utils import check_gguf_file @@ -77,28 +71,16 @@ _CONFIG_REGISTRY_OVERRIDE_HF: dict[str, type[PretrainedConfig]] = { _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = { "chatglm": ChatGLMConfig, - "cohere2": Cohere2Config, - "dbrx": DbrxConfig, "deepseek_vl_v2": DeepseekVLV2Config, "kimi_vl": KimiVLConfig, 
"Llama_Nemotron_Nano_VL": Nemotron_Nano_VL_Config, - "mpt": MPTConfig, "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) "jais": JAISConfig, "mlp_speculator": MLPSpeculatorConfig, "medusa": MedusaConfig, "eagle": EAGLEConfig, - "exaone": ExaoneConfig, - "exaone4": Exaone4Config, - "minimax_text_01": MiniMaxText01Config, - "minimax_vl_01": MiniMaxVL01Config, "nemotron": NemotronConfig, - "NVLM_D": NVLM_D_Config, - "ovis": OvisConfig, - "solar": SolarConfig, - "skywork_chat": SkyworkR1VChatConfig, - "telechat": Telechat2Config, "ultravox": UltravoxConfig, **_CONFIG_REGISTRY_OVERRIDE_HF } diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 89303213a27e1..0fcb2beb8c7db 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,13 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Model configs may be defined in this directory for the following reasons: + +- There is no configuration file defined by HF Hub or Transformers library. +- There is a need to override the existing config to support vLLM. +""" from vllm.transformers_utils.configs.chatglm import ChatGLMConfig -from vllm.transformers_utils.configs.cohere2 import Cohere2Config -from vllm.transformers_utils.configs.dbrx import DbrxConfig from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config from vllm.transformers_utils.configs.eagle import EAGLEConfig -from vllm.transformers_utils.configs.exaone import ExaoneConfig -from vllm.transformers_utils.configs.exaone4 import Exaone4Config # RWConfig is for the original tiiuae/falcon-40b(-instruct) and # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. 
@@ -15,36 +17,21 @@ from vllm.transformers_utils.configs.falcon import RWConfig from vllm.transformers_utils.configs.jais import JAISConfig from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig from vllm.transformers_utils.configs.medusa import MedusaConfig -from vllm.transformers_utils.configs.minimax_text_01 import MiniMaxText01Config -from vllm.transformers_utils.configs.minimax_vl_01 import MiniMaxVL01Config from vllm.transformers_utils.configs.mllama import MllamaConfig from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig from vllm.transformers_utils.configs.moonvit import MoonViTConfig -from vllm.transformers_utils.configs.mpt import MPTConfig from vllm.transformers_utils.configs.nemotron import NemotronConfig from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config -from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config -from vllm.transformers_utils.configs.ovis import OvisConfig -from vllm.transformers_utils.configs.skyworkr1v import SkyworkR1VChatConfig -from vllm.transformers_utils.configs.solar import SolarConfig -from vllm.transformers_utils.configs.telechat2 import Telechat2Config from vllm.transformers_utils.configs.ultravox import UltravoxConfig __all__ = [ "ChatGLMConfig", - "Cohere2Config", - "DbrxConfig", "DeepseekVLV2Config", - "MPTConfig", "RWConfig", "JAISConfig", "MedusaConfig", "EAGLEConfig", - "ExaoneConfig", - "Exaone4Config", - "MiniMaxText01Config", - "MiniMaxVL01Config", "MllamaConfig", "MLPSpeculatorConfig", "MoonViTConfig", @@ -52,10 +39,5 @@ __all__ = [ "NemotronConfig", "NemotronHConfig", "Nemotron_Nano_VL_Config", - "NVLM_D_Config", - "OvisConfig", - "SkyworkR1VChatConfig", - "SolarConfig", - "Telechat2Config", "UltravoxConfig", ] diff --git a/vllm/transformers_utils/configs/cohere2.py b/vllm/transformers_utils/configs/cohere2.py deleted file mode 100644 index e547a9c281cff..0000000000000 --- 
a/vllm/transformers_utils/configs/cohere2.py +++ /dev/null @@ -1,195 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# ruff: noqa - -# Adapted from -# https://github.com/huggingface/transformers/blob/main/src/transformers/models/cohere2/configuration_cohere2.py -from transformers import PretrainedConfig -from transformers.modeling_rope_utils import rope_config_validation - - -class Cohere2Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere - model according to the specified arguments, defining the model architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. Instantiating a configuration - with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model. - - - Args: - vocab_size (`int`, *optional*, defaults to 256000): - Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`CohereModel`] - hidden_size (`int`, *optional*, defaults to 8192): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 22528): - Dimension of the MLP representations. - logit_scale (`float`, *optional*, defaults to 0.0625): - The scaling factor for the output logits. - num_hidden_layers (`int`, *optional*, defaults to 40): - Number of hidden layers in the Transformer decoder. - num_attention_heads (`int`, *optional*, defaults to 64): - Number of attention heads for each attention layer in the Transformer decoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. 
If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to - `num_attention_heads`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 8192): - The maximum sequence length that this model might ever be used with. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-05): - The epsilon used by the layer normalization. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*, defaults to 0): - Padding token id. - bos_token_id (`int`, *optional*, defaults to 5): - Beginning of stream token id. - eos_token_id (`int`, *optional*, defaults to 255001): - End of stream token id. - tie_word_embeddings (`bool`, *optional*, defaults to `True`): - Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. 
- Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. 
Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE - attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): - Whether to use a bias in the query, key, value and output projection layers during self-attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - sliding_window (`int`, *optional*, defaults to 4096): - Size of the sliding window attention context. - sliding_window_pattern (`int`, *optional*, defaults to 4): - Pattern for the sliding window attention. - cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`. - - ```python - >>> from transformers import Cohere2Model, Cohere2Config - - >>> # Initializing a Cohere Nextmodel configuration - >>> configuration = Cohere2Config() - - >>> # Initializing a model from the Cohere2 configuration - >>> model = Cohere2Model(configuration) # doctest: +SKIP - - >>> # Accessing the model configuration - >>> configuration = model.config # doctest: +SKIP - ``` - """ - - model_type = "cohere2" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=256000, - hidden_size=8192, - intermediate_size=22528, - logit_scale=0.0625, - num_hidden_layers=40, - num_attention_heads=64, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=8192, - initializer_range=0.02, - layer_norm_eps=1e-5, - use_cache=True, - pad_token_id=0, - bos_token_id=5, - eos_token_id=255001, - tie_word_embeddings=True, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - sliding_window=4096, - sliding_window_pattern=4, - cache_implementation="hybrid", - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = 
hidden_size - self.logit_scale = logit_scale - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.sliding_window = sliding_window - self.sliding_window_pattern = sliding_window_pattern - # Need to specify head_dim in the config so it can be used in the attention forward functions - self.head_dim = hidden_size // num_attention_heads - self.cache_implementation = cache_implementation - - # Validate the correctness of rotary position embeddings parameters - rope_config_validation(self) - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - -__all__ = ["Cohere2Config"] diff --git a/vllm/transformers_utils/configs/dbrx.py b/vllm/transformers_utils/configs/dbrx.py deleted file mode 100644 index 7dbda99f85a4e..0000000000000 --- a/vllm/transformers_utils/configs/dbrx.py +++ /dev/null @@ -1,280 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# yapf: disable -# ruff: noqa: E501 -# coding=utf-8 -# Copied from -# https://huggingface.co/databricks/dbrx-base/blob/main/configuration_dbrx.py -"""Dbrx configuration.""" - -from typing import Any, Optional - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - -logger = logging.get_logger(__name__) - -DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {} # type: ignore - - 
-class DbrxAttentionConfig(PretrainedConfig): - """Configuration class for Dbrx Attention. - - [`DbrxAttention`] class. It is used to instantiate attention layers - according to the specified arguments, defining the layers architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - Args: - attn_pdrop (`float`, *optional*, defaults to 0.0): - The dropout probability for the attention layers. - clip_qkv (`float`, *optional*, defaults to None): - If not `None`, clip the queries, keys, and values in the attention layer to this value. - kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads. - rope_theta (float): The base frequency for rope. - """ - - def __init__( - self, - attn_pdrop: float = 0, - clip_qkv: Optional[float] = None, - kv_n_heads: int = 1, - rope_theta: float = 10000.0, - **kwargs: Any, - ): - super().__init__(**kwargs) - self.attn_pdrop = attn_pdrop - self.clip_qkv = clip_qkv - self.kv_n_heads = kv_n_heads - self.rope_theta = rope_theta - - for k in ["model_type"]: - if k in kwargs: - kwargs.pop(k) - if len(kwargs) != 0: - raise ValueError(f"Found unknown {kwargs=}") - - @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path: str, **kwargs: Any - ) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict( - pretrained_model_name_or_path, **kwargs - ) - - if config_dict.get("model_type") == "dbrx": - config_dict = config_dict["attn_config"] - - if ( - "model_type" in config_dict - and hasattr(cls, "model_type") - and config_dict["model_type"] != cls.model_type - ): - logger.warning( - "You are using a model of type %s to instantiate a model of " - "type %s. 
This is not supported for all configurations of " - "models and can yield errors.", - config_dict["model_type"], cls.model_type) - - return cls.from_dict(config_dict, **kwargs) - - -class DbrxFFNConfig(PretrainedConfig): - """Configuration class for Dbrx FFN. - - [`DbrxFFN`] class. It is used to instantiate feedforward layers according to - the specified arguments, defining the layers architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - Args: - ffn_act_fn (dict, optional): A dict specifying activation function for the FFN. - The dict should have a key 'name' with the value being the name of - the activation function along with any additional keyword arguments. - ffn_hidden_size (int, optional): The hidden size of the feedforward network. - moe_num_experts (int, optional): The number of experts in the mixture of experts layer. - moe_top_k (int, optional): The number of experts to use in the mixture of experts layer. - moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer. - moe_loss_weight (float, optional): The loss weight for the mixture of experts layer. - moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights. - uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment. - This should only be used for benchmarking purposes. 
- """ - - def __init__( - self, - ffn_act_fn: Optional[dict] = None, - ffn_hidden_size: int = 3584, - moe_num_experts: int = 4, - moe_top_k: int = 1, - moe_jitter_eps: Optional[float] = None, - moe_loss_weight: float = 0.01, - moe_normalize_expert_weights: Optional[float] = 1, - uniform_expert_assignment: bool = False, - **kwargs: Any, - ): - super().__init__() - if ffn_act_fn is None: - ffn_act_fn = {"name": "silu"} - self.ffn_act_fn = ffn_act_fn - self.ffn_hidden_size = ffn_hidden_size - self.moe_num_experts = moe_num_experts - self.moe_top_k = moe_top_k - self.moe_jitter_eps = moe_jitter_eps - self.moe_loss_weight = moe_loss_weight - self.moe_normalize_expert_weights = moe_normalize_expert_weights - self.uniform_expert_assignment = uniform_expert_assignment - - for k in ["model_type"]: - if k in kwargs: - kwargs.pop(k) - if len(kwargs) != 0: - raise ValueError(f"Found unknown {kwargs=}") - - @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path: str, **kwargs: Any - ) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict( - pretrained_model_name_or_path, **kwargs - ) - - if config_dict.get("model_type") == "dbrx": - config_dict = config_dict["ffn_config"] - - if ( - "model_type" in config_dict - and hasattr(cls, "model_type") - and config_dict["model_type"] != cls.model_type - ): - logger.warning( - "You are using a model of type %s to instantiate a model of " - "type %s. This is not supported for all " - "configurations of models and can yield errors.", config_dict["model_type"], cls.model_type) - - return cls.from_dict(config_dict, **kwargs) - - -class DbrxConfig(PretrainedConfig): - """Configuration class for Dbrx. - - [`DbrxModel`]. It is used to instantiate a Dbrx model according to the - specified arguments, defining the model architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - d_model (`int`, *optional*, defaults to 6144): - Dimensionality of the embeddings and hidden states. - n_heads (`int`, *optional*, defaults to 48): - Number of attention heads for each attention layer in the Transformer encoder. - n_layers (`int`, *optional*, defaults to 40): - Number of hidden layers in the Transformer encoder. - max_seq_len (`int`, *optional*, defaults to 32768): - The maximum sequence length of the model. - vocab_size (`int`, *optional*, defaults to 100352): - Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by - the `inputs_ids` passed when calling [`DbrxModel`]. - resid_pdrop (`float`, *optional*, defaults to 0.0): - The dropout probability applied to the attention output before combining with residual. - emb_pdrop (`float`, *optional*, defaults to 0.0): - The dropout probability for the embedding layer. - attn_config (`dict`, *optional*): - A dictionary used to configure the model's attention module. - ffn_config (`dict`, *optional*): - A dictionary used to configure the model's FFN module. - use_cache (`bool`, *optional*, defaults to `False`): - Whether or not the model should return the last key/values attentions (not used by all models). - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - output_router_logits (`bool`, *optional*, defaults to `False`): - Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss. - router_aux_loss_coef (`float`, *optional*, defaults to 0.001): - The aux loss factor for the total loss. 
- - - Example: - ```python - >>> from transformers import DbrxConfig, DbrxModel - - >>> # Initializing a Dbrx configuration - >>> configuration = DbrxConfig() - - >>> # Initializing a model (with random weights) from the configuration - >>> model = DbrxModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ``` - """ - - model_type = "dbrx" - attribute_map = { - "num_attention_heads": "n_heads", - "hidden_size": "d_model", - "num_hidden_layers": "n_layers", - "max_position_embeddings": "max_seq_len", - } - - def __init__( - self, - d_model: int = 2048, - n_heads: int = 16, - n_layers: int = 24, - max_seq_len: int = 2048, - vocab_size: int = 32000, - resid_pdrop: float = 0.0, - emb_pdrop: float = 0.0, - attn_config: Optional[DbrxAttentionConfig] = None, - ffn_config: Optional[DbrxFFNConfig] = None, - use_cache: bool = True, - initializer_range: float = 0.02, - output_router_logits: bool = False, - router_aux_loss_coef: float = 0.05, - **kwargs: Any, - ): - if attn_config is None: - self.attn_config = DbrxAttentionConfig() - elif isinstance(attn_config, dict): - self.attn_config = DbrxAttentionConfig(**attn_config) - else: - self.attn_config = attn_config - - if ffn_config is None: - self.ffn_config = DbrxFFNConfig() - elif isinstance(ffn_config, dict): - self.ffn_config = DbrxFFNConfig(**ffn_config) - else: - self.ffn_config = ffn_config - - self.d_model = d_model - self.n_heads = n_heads - self.n_layers = n_layers - self.max_seq_len = max_seq_len - self.vocab_size = vocab_size - self.resid_pdrop = resid_pdrop - self.emb_pdrop = emb_pdrop - self.use_cache = use_cache - self.initializer_range = initializer_range - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - - tie_word_embeddings = kwargs.pop("tie_word_embeddings", False) - if tie_word_embeddings: - raise ValueError( - "tie_word_embeddings is not supported for Dbrx models." 
- ) - - super().__init__( - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/vllm/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py deleted file mode 100644 index 7450904a15caf..0000000000000 --- a/vllm/transformers_utils/configs/exaone.py +++ /dev/null @@ -1,190 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Copied from -# https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py -# Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Exaone model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - -logger = logging.get_logger(__name__) - -EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: dict[str, str] = {} - - -class ExaoneConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a :class: - `~transformers.ExaoneModel`. It is used to instantiate a GPT Lingvo model - according to the specified arguments, defining the model architecture. - Instantiating a configuration with the defaults will yield a similar - configuration to that of the Exaone - - Configuration objects inherit from {class}`~transformers.PretrainedConfig` - and can be used to control the model outputs. 
Read the documentation from : - class:`~transformers.PretrainedConfig` for more information. - - Args: - vocab_size ({obj}`int`, `optional`, defaults to 50257): - Vocabulary size of the GPT Lingvo model. Defines the number of - different tokens that can be represented by the {obj}`inputs_ids` - passed when calling {class}`~transformers.ExaoneModel`. Vocabulary - size of the model. - Defines the different tokens that can be represented by the - `inputs_ids` passed to the forward method of :class: - `~transformers.EXAONEModel`. - hidden_size ({obj}`int`, `optional`, defaults to 2048): - Dimensionality of the encoder layers and the pooler layer. - num_layers ({obj}`int`, `optional`, defaults to 24): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the - Transformer decoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to - implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi - Head Attention (MHA), if `num_key_value_heads=1 the model will use - Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, - each group key and value head should be constructed by meanpooling - all the original heads within that group. For more details checkout - [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not - specified, will default to `num_attention_heads`. - rotary_pct (`float`, *optional*, defaults to 0.25): - percentage of hidden dimensions to allocate to rotary embeddings - intermediate_size ({obj}`int`, `optional`, defaults to 8192): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in - the Transformer encoder. 
- activation_function ({obj}`str` or {obj}`function`, `optional`, - defaults to {obj}`"gelu_new"`): - The non-linear activation function (function or string) in the - encoder and pooler. If string, {obj}`"gelu"`, {obj}`"relu"`, - {obj}`"selu"` and {obj}`"gelu_new"` are supported. - embed_dropout ({obj}`float`, `optional`, defaults to 0.0): - The dropout probabilitiy for all fully connected layers in the - embeddings, encoder, and pooler. - attention_dropout ({obj}`float`, `optional`, defaults to 0.0): - The dropout ratio for the attention probabilities. - max_position_embeddings ({obj}`int`, `optional`, defaults to 2048): - The maximum sequence length that this model might ever be used with. - Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). - type_vocab_size ({obj}`int`, `optional`, defaults to 2): - The vocabulary size of the {obj}`token_type_ids` passed when calling - {class}`~transformers.EXAONEModel`. - initializer_range ({obj}`float`, `optional`, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for - initializing all weight matrices. - layer_norm_epsilon ({obj}`float`, `optional`, defaults to 1e-5): - The epsilon used by the layer normalization layers. - use_cache ({obj}`bool`, `optional`, defaults to {obj}`True`): - Whether or not the model should return the last key/values - attentions (not used by all models). - Only relevant if ``config.is_decoder=True``. - gradient_checkpointing ({obj}`bool`, `optional`, - defaults to {obj}`False`): - If True, use gradient checkpointing to save memory at the expense - of slower backward pass. 
- Example:: - - >>> from transformers import ExoneModel, ExaoneConfig - - >>> # Initializing a EXAONE configuration - >>> configuration = ExaoneConfig() - - >>> # Initializing a model from configuration - >>> model = ExoneModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - """ - - model_type = "exaone" - keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = {"num_hidden_layers": "num_layers"} - - def __init__( - self, - vocab_size=102400, - max_position_embeddings=2048, - hidden_size=2048, - num_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - intermediate_size=None, - activation_function="silu", - rotary_pct=0.25, - resid_dropout=0.0, - embed_dropout=0.0, - attention_dropout=0.0, - layer_norm_epsilon=1e-6, - initializer_range=0.02, - use_cache=True, - bos_token_id=0, - eos_token_id=2, - tie_word_embeddings=True, - **kwargs, - ): - super().__init__( - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.num_layers = num_layers - self.num_attention_heads = num_attention_heads - self.num_hidden_layers = num_layers - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - if intermediate_size: - self.intermediate_size = intermediate_size - else: - self.intermediate_size = hidden_size * 4 - self.activation_function = activation_function - self.resid_dropout = resid_dropout - self.embed_dropout = embed_dropout - self.attention_dropout = attention_dropout - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.use_cache = use_cache - self.rotary_pct = rotary_pct - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - self.use_logit_cap = kwargs.pop("use_logit_cap", 
False) - self.ln_no_scale = kwargs.pop("ln_no_scale", False) - self.use_gated = kwargs.pop("use_gated", False) - self.use_emb_norm = kwargs.pop("use_emb_norm", False) - self.use_rotary_pos = kwargs.pop("use_rotary_pos", False) - self.rotary_type = kwargs.pop("rotary_type", None) - self.scaling_factor = kwargs.pop("scaling_factor", 1) - self.use_absolute_pos = kwargs.pop("use_absolute_pos", True) - self.use_extra_logit = kwargs.pop("use_extra_logit", True) - self.rotary_expand_length = kwargs.pop("rotary_expand_length", None) - self.rotary_base = kwargs.pop("rotary_base", 10000.0) - self.use_qkv_fuse = kwargs.pop("use_qkv_fuse", False) - self.rescale_before_lm_head = kwargs.pop("rescale_before_lm_head", - (rotary_pct == 0.25)) - if self.use_rotary_pos: - self.use_absolute_pos = False diff --git a/vllm/transformers_utils/configs/exaone4.py b/vllm/transformers_utils/configs/exaone4.py deleted file mode 100644 index a22ebaa6bd6bb..0000000000000 --- a/vllm/transformers_utils/configs/exaone4.py +++ /dev/null @@ -1,252 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# ruff: noqa: E501 - -# Copied from -# https://github.com/lgai-exaone/transformers/blob/add-exaone4/src/transformers/models/exaone4/configuration_exaone4.py -# Copyright 2025 The LG CNS Gen AI Solution Delivery Team. -# Copyright 2025 The LG AI Research and HuggingFace Inc. team. All rights reserved. -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -from transformers.configuration_utils import (PretrainedConfig, - layer_type_validation) -from transformers.utils import logging - -logger = logging.get_logger(__name__) - - -def check_is_sliding(config, layer_idx): - """ - Check if the current layer is a sliding window attention (local attention) layer. - """ - if config.sliding_window is None: - return False - if config.layer_types is not None: - return config.layer_types[layer_idx] == "sliding_attention" - if isinstance(config.sliding_window_pattern, int): - return ((layer_idx + 1) % config.sliding_window_pattern) != 0 - elif isinstance(config.sliding_window_pattern, str): - assert isinstance(config.sliding_window, int), ( - f"Sliding window must be positive integer, but got {config.sliding_window}" - ) - return (layer_idx != config.num_hidden_layers - 1 - and config.sliding_window_pattern[layer_idx % len( - config.sliding_window_pattern)] == "L") - else: - logger.warning_once( - "Sliding window is set, but none of `sliding_window_pattern` or `layer_types` is set. " - "Defaulting to use 'full_attention' for all layers.") - return False - - -class Exaone4Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Exaone4Model`]. It is used to - instantiate a EXAONE 4.0 model according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the EXAONE-4.0-Instruct [LGAI-EXAONE/EXAONE-4.0-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-Instruct) - NOTE: `EXAONE-4.0-Instruct` is a placeholder model ID. The exact model ID will be updated in the future. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model - outputs. Read the documentation from [`PretrainedConfig`] for more information. 
- - Args: - vocab_size (`int`, *optional*, defaults to 102400): - Vocabulary size of the EXAONE 4.0 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Exaone4Model`]. - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to `hidden_size * 4`): - Dimensionality of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer decoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to - `num_attention_heads`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 2048): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 32768 for EXAONE 3.5). - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-05): - The epsilon used by the layer normalization layers. 
- use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if ``config.is_decoder=True``. - bos_token_id (`int`, *optional*, defaults to 0): - Beginning of stream token id. - eos_token_id (`int`, *optional*, defaults to 2): - End of stream token id. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. 
Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - sliding_window (`int`, *optional*): - The size of the sliding window for the sliding window attention. - sliding_window_pattern (`str`, *optional*): - The pattern to use for sliding window attention. Can be one of: - - `None`: No sliding window attention is used - - `int`: Every `sliding_window` layers, use global attention, else use local attention. - - `str`: A sequence of "L" (local attention) and "G" (global attention) characters that defines the - attention pattern. The pattern starts from layer 0 and repeats every `sliding_window` layers. The - final layer always uses global attention regardless of the pattern. - For instance, sliding_window_pattern="LLLG" same as sliding_window=4, which means: - - Layer 0, 1, 2: local attention, - - Layer 3: global attention, - ...(repeated) - layer_types (`list`, *optional*): - Attention pattern for each layer. Prioritized over `sliding_window_pattern`. 
- - Example: - - ```python - >>> from transformers import Exaone4Model, Exaone4Config - - >>> # Initializing a EXAONE configuration - >>> configuration = Exaone4Config() - - >>> # Initializing a model from configuration - >>> model = Exaone4Model(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "exaone4" - keys_to_ignore_at_inference = ["past_key_values"] - # Default tensor parallel plan for base model `LlamaModel` - base_model_tp_plan = { - "layers.*.self_attn.q_proj": "colwise", - "layers.*.self_attn.k_proj": "colwise", - "layers.*.self_attn.v_proj": "colwise", - "layers.*.self_attn.o_proj": "rowwise", - "layers.*.mlp.gate_proj": "colwise", - "layers.*.mlp.up_proj": "colwise", - "layers.*.mlp.down_proj": "rowwise", - } - base_model_pp_plan = { - "embed_tokens": (["input_ids"], ["inputs_embeds"]), - "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), - "norm": (["hidden_states"], ["hidden_states"]), - } - - def __init__( - self, - vocab_size=102400, - hidden_size=4096, - intermediate_size=None, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - bos_token_id=0, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_dropout=0.0, - sliding_window=None, - sliding_window_pattern=None, - layer_types=None, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - if intermediate_size: - self.intermediate_size = intermediate_size - else: - self.intermediate_size = hidden_size * 4 - self.hidden_act = hidden_act - self.max_position_embeddings = 
max_position_embeddings - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.attention_dropout = attention_dropout - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - self.sliding_window = sliding_window - self.sliding_window_pattern = sliding_window_pattern - - self.layer_types = layer_types - if self.layer_types is None: - self.layer_types = [ - "sliding_attention" - if check_is_sliding(self, i) else "full_attention" - for i in range(self.num_hidden_layers) - ] - layer_type_validation(self.layer_types) - - super().__init__(bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs) - - -__all__ = ["Exaone4Config"] diff --git a/vllm/transformers_utils/configs/minimax_text_01.py b/vllm/transformers_utils/configs/minimax_text_01.py deleted file mode 100644 index e3b63dfa00371..0000000000000 --- a/vllm/transformers_utils/configs/minimax_text_01.py +++ /dev/null @@ -1,70 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" MiniMaxText01 model configuration""" - -from transformers.configuration_utils import PretrainedConfig - - -class MiniMaxText01Config(PretrainedConfig): - model_type = "MiniMaxText01" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=4096 * 32, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=None, - bos_token_id=None, - eos_token_id=None, - tie_word_embeddings=False, - rope_theta=1e6, - sliding_window=None, - attention_dropout=0.0, - num_experts_per_tok=2, - num_local_experts=8, - output_router_logits=False, - router_aux_loss_coef=0.001, - router_jitter_noise=0.0, - **kwargs, - ): - self.vocab_size = vocab_size 
- self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.attention_dropout = attention_dropout - - self.num_experts_per_tok = num_experts_per_tok - self.num_local_experts = num_local_experts - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - self.router_jitter_noise = router_jitter_noise - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/vllm/transformers_utils/configs/minimax_vl_01.py b/vllm/transformers_utils/configs/minimax_vl_01.py deleted file mode 100644 index c62497192cc2a..0000000000000 --- a/vllm/transformers_utils/configs/minimax_vl_01.py +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""MiniMaxVL01 model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.models.auto import CONFIG_MAPPING - -from .minimax_text_01 import MiniMaxText01Config - - -class MiniMaxVL01Config(PretrainedConfig): - model_type = "minimax_vl_01" - - def __init__( - self, - vision_config=None, - text_config=None, - ignore_index=-100, - image_token_index=32000, - projector_hidden_act="gelu", - vision_feature_select_strategy="default", - vision_feature_layer=-2, - image_grid_pinpoints=None, - tie_word_embeddings=False, - 
image_seq_length=576, - **kwargs, - ): - self.ignore_index = ignore_index - self.image_token_index = image_token_index - self.projector_hidden_act = projector_hidden_act - self.image_seq_length = image_seq_length - - if vision_feature_select_strategy not in ["default", "full"]: - raise ValueError("vision_feature_select_strategy should " + - "be one of 'default', 'full'." + - f"Got: {vision_feature_select_strategy}") - - self.vision_feature_select_strategy = vision_feature_select_strategy - self.vision_feature_layer = vision_feature_layer - image_grid_pinpoints = ( - image_grid_pinpoints if image_grid_pinpoints is not None else - [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]) - self.image_grid_pinpoints = image_grid_pinpoints - - if isinstance(vision_config, dict): - if "model_type" not in vision_config: - vision_config["model_type"] = "clip_vision_model" - vision_config = CONFIG_MAPPING[vision_config["model_type"]]( - **vision_config) - elif vision_config is None: - vision_config = CONFIG_MAPPING["clip_vision_model"]( - intermediate_size=4096, - hidden_size=1024, - patch_size=14, - image_size=336, - num_hidden_layers=24, - num_attention_heads=16, - vocab_size=32000, - projection_dim=768, - ) - - self.vision_config = vision_config - - if text_config is not None: - text_config = MiniMaxText01Config(**text_config) - else: - text_config = MiniMaxText01Config() - - self.text_config = text_config - - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py deleted file mode 100644 index 91316408dcd89..0000000000000 --- a/vllm/transformers_utils/configs/mpt.py +++ /dev/null @@ -1,180 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Copied from -# https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py -"""A HuggingFace-style model configuration.""" -import warnings 
-from typing import Any, Optional, Union - -from transformers import PretrainedConfig - -attn_config_defaults: dict = { - 'attn_type': 'multihead_attention', - 'attn_pdrop': 0.0, - 'attn_impl': 'triton', - 'qk_ln': False, - 'clip_qkv': None, - 'softmax_scale': None, - 'prefix_lm': False, - 'attn_uses_sequence_id': False, - 'alibi': False, - 'alibi_bias_max': 8 -} -ffn_config_defaults: dict = {'ffn_type': 'mptmlp'} -init_config_defaults: dict = { - 'name': 'kaiming_normal_', - 'fan_mode': 'fan_in', - 'init_nonlinearity': 'relu', - 'init_div_is_residual': True, - 'emb_init_std': None, - 'emb_init_uniform_lim': None, - 'init_std': None, - 'init_gain': 0.0 -} - - -class MPTConfig(PretrainedConfig): - model_type = 'mpt' - attribute_map = { - 'num_attention_heads': 'n_heads', - 'hidden_size': 'd_model', - 'num_hidden_layers': 'n_layers', - } - - # pylint: disable=dangerous-default-value - def __init__(self, - d_model: int = 2048, - n_heads: int = 16, - n_layers: int = 24, - expansion_ratio: int = 4, - max_seq_len: int = 2048, - vocab_size: int = 50368, - resid_pdrop: float = 0.0, - emb_pdrop: float = 0.0, - learned_pos_emb: bool = True, - attn_config: dict = attn_config_defaults, - ffn_config: dict = ffn_config_defaults, - init_device: str = 'cpu', - logit_scale: Optional[Union[float, str]] = None, - no_bias: bool = False, - embedding_fraction: float = 1.0, - norm_type: str = 'low_precision_layernorm', - use_cache: bool = False, - init_config: dict = init_config_defaults, - fc_type: str = 'torch', - verbose: Optional[int] = None, - **kwargs: Any): - self.d_model = d_model - self.n_heads = n_heads - self.n_layers = n_layers - self.expansion_ratio = expansion_ratio - self.max_seq_len = max_seq_len - self.vocab_size = vocab_size - self.resid_pdrop = resid_pdrop - self.emb_pdrop = emb_pdrop - self.learned_pos_emb = learned_pos_emb - self.attn_config = attn_config - self.ffn_config = ffn_config - self.init_device = init_device - self.logit_scale = logit_scale - self.no_bias = 
no_bias - self.embedding_fraction = embedding_fraction - self.norm_type = norm_type - self.use_cache = use_cache - self.init_config = init_config - self.fc_type = fc_type - if verbose is not None: - warnings.warn(DeprecationWarning( - 'verbose argument for MPTConfig is now ignored and ' - 'will be removed. Use python_log_level instead.'), - stacklevel=2) - if 'name' in kwargs: - del kwargs['name'] - if 'loss_fn' in kwargs: - del kwargs['loss_fn'] - if self.attn_config.get('alibi', False): - self.learned_pos_emb = False - warnings.warn( - f'alibi is turned on, setting `learned_pos_emb` ' - f'to {self.learned_pos_emb}`', - stacklevel=2) - super().__init__(**kwargs) - self._validate_config() - - def _set_config_defaults( - self, config: dict[str, Any], - config_defaults: dict[str, Any]) -> dict[str, Any]: - for (k, v) in config_defaults.items(): - if k not in config: - config[k] = v - return config - - def _validate_config(self) -> None: - self.attn_config = self._set_config_defaults(self.attn_config, - attn_config_defaults) - self.ffn_config = self._set_config_defaults(self.ffn_config, - ffn_config_defaults) - self.init_config = self._set_config_defaults(self.init_config, - init_config_defaults) - if self.d_model % self.n_heads != 0: - raise ValueError('d_model must be divisible by n_heads') - if any( - prob < 0 or prob > 1 for prob in - [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop - ]): - raise ValueError( - "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are " - "probabilities and must be between 0 and 1") - if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']: - raise ValueError( - f"Unknown attn_impl={self.attn_config['attn_impl']}") - if self.attn_config['prefix_lm'] and self.attn_config[ - 'attn_impl'] not in ['torch', 'triton']: - raise NotImplementedError( - 'prefix_lm only implemented with torch and triton attention.') - if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [ - 'torch', 'triton' - 
]: - raise NotImplementedError( - 'alibi only implemented with torch and triton attention.') - if self.attn_config['attn_uses_sequence_id'] and self.attn_config[ - 'attn_impl'] not in ['torch', 'triton']: - raise NotImplementedError( - 'attn_uses_sequence_id only implemented with torch ' - 'and triton attention.') - if self.embedding_fraction > 1 or self.embedding_fraction <= 0: - raise ValueError( - 'model.embedding_fraction must be between 0 (exclusive) ' - 'and 1 (inclusive)!') - if isinstance(self.logit_scale, - str) and self.logit_scale != 'inv_sqrt_d_model': - raise ValueError( - f"self.logit_scale={self.logit_scale!r} is not recognized as " - "an option; use numeric value or 'inv_sqrt_d_model'.") - if self.init_config.get('name', None) is None: - raise ValueError( - f"self.init_config={self.init_config!r} 'name' needs to be set." - ) - if not self.learned_pos_emb and (not self.attn_config['alibi']): - warnings.warn( - 'Positional information not being provided to the model.', - stacklevel=2) - if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp': - try: - # pylint: disable=import-outside-toplevel - import transformer_engine.pytorch as te - del te - except Exception as exc: - raise ImportError( - 'TransformerEngine import fail. `fc_type: te` requires ' - 'TransformerEngine be installed. 
' - 'The required version of transformer_engine also requires ' - 'FlashAttention v1.0.6 is installed:\n' - 'pip install flash-attn==1.0.6 --no-build-isolation \n' - 'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156' - ) from exc - if self.ffn_config['ffn_type'] == 'mptmlp': - self.ffn_config['fc_type'] = self.fc_type - elif self.ffn_config['ffn_type'] == 'te_ln_mlp': - self.ffn_config['bias'] = not self.no_bias diff --git a/vllm/transformers_utils/configs/nvlm_d.py b/vllm/transformers_utils/configs/nvlm_d.py deleted file mode 100644 index edfc506882ff5..0000000000000 --- a/vllm/transformers_utils/configs/nvlm_d.py +++ /dev/null @@ -1,31 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Adapted from -# https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py -# -------------------------------------------------------- -# NVLM-D -# Copyright (c) 2024 NVIDIA -# Licensed under Apache 2.0 License [see LICENSE for details] -# -------------------------------------------------------- -from transformers import Qwen2Config -from transformers.configuration_utils import PretrainedConfig - - -class NVLM_D_Config(PretrainedConfig): - model_type = 'NVLM_D' - is_composition = True - - def __init__(self, vision_config=None, llm_config=None, **kwargs): - super().__init__(**kwargs) - - # Handle vision_config initialization - if vision_config is None: - vision_config = {} - - # Handle llm_config initialization - if llm_config is None: - llm_config = {} - - self.vision_config = PretrainedConfig(**vision_config) - self.text_config = Qwen2Config(**llm_config) diff --git a/vllm/transformers_utils/configs/ovis.py b/vllm/transformers_utils/configs/ovis.py deleted file mode 100644 index 021d402a71f4c..0000000000000 --- a/vllm/transformers_utils/configs/ovis.py +++ /dev/null @@ -1,184 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# yapf: disable -# ruff: noqa: E501 -# copied from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py -# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py -from typing import Any, Optional, Union - -from transformers import AutoConfig, PretrainedConfig - - -class AIMv2Config(PretrainedConfig): - """This is the configuration class to store the configuration of an [`AIMv2Model`]. - - Instantiating a configuration with the defaults will yield a similar configuration - to that of the [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224). - - Args: - hidden_size: Dimension of the hidden representations. - intermediate_size: Dimension of the SwiGLU representations. - num_hidden_layers: Number of hidden layers in the Transformer. - num_attention_heads: Number of attention heads for each attention layer - in the Transformer. - num_channels: Number of input channels. - image_size: Image size. - patch_size: Patch size. - rms_norm_eps: Epsilon value used for the RMS normalization layer. - attention_dropout: Dropout ratio for attention probabilities. - projection_dropout: Dropout ratio for the projection layer after the attention. - qkv_bias: Whether to add a bias to the queries, keys and values. - use_bias: Whether to add a bias in the feed-forward and projection layers. - kwargs: Keyword arguments for the [`PretrainedConfig`]. 
- """ - - model_type: str = "aimv2" - - def __init__( - self, - hidden_size: int = 1024, - intermediate_size: int = 2816, - num_hidden_layers: int = 24, - num_attention_heads: int = 8, - num_channels: int = 3, - image_size: int = 224, - patch_size: int = 14, - rms_norm_eps: float = 1e-5, - attention_dropout: float = 0.0, - projection_dropout: float = 0.0, - qkv_bias: bool = False, - use_bias: bool = False, - **kwargs: Any, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.rms_norm_eps = rms_norm_eps - - self.projection_dropout = projection_dropout - self.qkv_bias = qkv_bias - self.use_bias = use_bias - - -IGNORE_ID = -100 -IMAGE_TOKEN_ID = -200 -IMAGE_TOKEN = "" -IMAGE_ATOM_ID = -300 -IMAGE_INDICATOR_IDS = [-301, -302, -303, -304, -305] - - -# ---------------------------------------------------------------------- -# Visual Tokenizer Configuration -# ---------------------------------------------------------------------- -class BaseVisualTokenizerConfig(PretrainedConfig): - - def __init__(self, - vocab_size=16384, - tokenize_function="softmax", - tau=1.0, - depths=None, - drop_cls_token=False, - backbone_config: Optional[Union[PretrainedConfig, - dict]] = None, - hidden_stride: int = 1, - **kwargs): - super().__init__(**kwargs) - self.vocab_size = vocab_size - self.tokenize_function = tokenize_function - self.tau = tau - if isinstance(depths, str): - depths = [int(x) for x in depths.split('|')] - self.depths = depths - self.backbone_kwargs = dict[str, Any]() - self.drop_cls_token = drop_cls_token - if backbone_config is not None: - assert isinstance(backbone_config, (PretrainedConfig, dict)), \ - f"expect `backbone_config` to be instance of PretrainedConfig or dict, 
but got {type(backbone_config)} type" - if not isinstance(backbone_config, PretrainedConfig): - model_type = backbone_config['model_type'] - if model_type != "aimv2": - backbone_config.pop('model_type') - backbone_config = AutoConfig.for_model(model_type, **backbone_config) - else: - backbone_config = AIMv2Config(**backbone_config) - self.backbone_config = backbone_config - self.hidden_stride = hidden_stride - - -class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig): - model_type = "aimv2_visual_tokenizer" - - def __init__(self, **kwargs): - super().__init__(**kwargs) - if self.drop_cls_token: - self.drop_cls_token = False - if self.depths: - assert len(self.depths) == 1 - self.backbone_kwargs['num_hidden_layers'] = self.depths[0] - - -class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig): - model_type = "siglip_visual_tokenizer" - - def __init__(self, **kwargs): - super().__init__(**kwargs) - if self.drop_cls_token: - self.drop_cls_token = False - if self.depths: - assert len(self.depths) == 1 - self.backbone_kwargs['num_hidden_layers'] = self.depths[0] - - -AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig) -AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig) - - -# ---------------------------------------------------------------------- -# Ovis Configuration -# ---------------------------------------------------------------------- -class OvisConfig(PretrainedConfig): - model_type = "ovis" - - def __init__(self, - llm_config: Optional[Union[PretrainedConfig, dict]] = None, - visual_tokenizer_config: Optional[Union[PretrainedConfig, - dict]] = None, - multimodal_max_length=8192, - hidden_size=None, - conversation_formatter_class=None, - llm_attn_implementation=None, - disable_tie_weight=False, - **kwargs): - super().__init__(**kwargs) - if llm_config is not None: - assert isinstance(llm_config, (PretrainedConfig, dict)), \ - f"expect `llm_config` to be instance of PretrainedConfig or dict, but got 
{type(llm_config)} type" - if not isinstance(llm_config, PretrainedConfig): - model_type = llm_config['model_type'] - llm_config.pop('model_type') - llm_config = AutoConfig.for_model(model_type, **llm_config) - - # map llm_config to text_config - self.text_config = llm_config - if visual_tokenizer_config is not None: - assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \ - f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type" - if not isinstance(visual_tokenizer_config, PretrainedConfig): - model_type = visual_tokenizer_config['model_type'] - visual_tokenizer_config.pop('model_type') - visual_tokenizer_config = AutoConfig.for_model( - model_type, **visual_tokenizer_config) - - self.visual_tokenizer_config = visual_tokenizer_config - self.multimodal_max_length = multimodal_max_length - self.hidden_size = hidden_size - self.conversation_formatter_class = conversation_formatter_class - self.llm_attn_implementation = llm_attn_implementation - self.disable_tie_weight = disable_tie_weight diff --git a/vllm/transformers_utils/configs/skyworkr1v.py b/vllm/transformers_utils/configs/skyworkr1v.py deleted file mode 100644 index 33a45220e3159..0000000000000 --- a/vllm/transformers_utils/configs/skyworkr1v.py +++ /dev/null @@ -1,54 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Adapted from -# https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/configuration_skywork_chat.py -# -------------------------------------------------------- -# SkyworkR1V -# Copyright (c) 2025 Skywork -# Licensed under The MIT License [see LICENSE for details] -# -------------------------------------------------------- -from transformers.configuration_utils import PretrainedConfig - - -class SkyworkR1VChatConfig(PretrainedConfig): - model_type = 'internvl_chat' - is_composition = True - - def __init__(self, - vision_config=None, - 
llm_config=None, - use_backbone_lora=0, - use_llm_lora=0, - select_layer=-1, - force_image_size=None, - downsample_ratio=0.5, - template=None, - dynamic_image_size=False, - use_thumbnail=False, - ps_version='v1', - min_dynamic_patch=1, - max_dynamic_patch=6, - **kwargs): - super().__init__(**kwargs) - - if vision_config is None: - vision_config = {} - - if llm_config is None: - llm_config = {} - - self.vision_config = PretrainedConfig(**vision_config) - self.text_config = PretrainedConfig(**llm_config) - - self.use_backbone_lora = use_backbone_lora - self.use_llm_lora = use_llm_lora - self.select_layer = select_layer - self.force_image_size = force_image_size - self.downsample_ratio = downsample_ratio - self.template = template - self.dynamic_image_size = dynamic_image_size - self.use_thumbnail = use_thumbnail - self.ps_version = ps_version # pixel shuffle version - self.min_dynamic_patch = min_dynamic_patch - self.max_dynamic_patch = max_dynamic_patch diff --git a/vllm/transformers_utils/configs/solar.py b/vllm/transformers_utils/configs/solar.py deleted file mode 100644 index a83dfa40b43a5..0000000000000 --- a/vllm/transformers_utils/configs/solar.py +++ /dev/null @@ -1,247 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Solar model configuration""" - -from transformers import PretrainedConfig -from transformers.utils import logging - -logger = logging.get_logger(__name__) - - -class SolarConfig(PretrainedConfig): - r""" - This is the configuration class to store - the configuration of a [`SolarModel`]. - It is used to instantiate an LLaMA model - according to the specified arguments, - defining the model architecture. - Instantiating a configuration with the - defaults will yield a similar - configuration to that of the LLaMA-7B. - Configuration objects inherit from [`PretrainedConfig`] - and can be used to control the model outputs. - Read the documentation from [`PretrainedConfig`] for more information. - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the LLaMA model. - Defines the number of different tokens - that can be represented by the `inputs_ids` - passed when calling [`SolarModel`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 11008): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer decoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer - in the Transformer decoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that - should be used to implement Grouped Query Attention. 
If - `num_key_value_heads=num_attention_heads`, - the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1` the model - will use Multi Query Attention (MQA) - otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, - each group key and value head should be constructed - by meanpooling all the original heads within that group. - For more details checkout [this paper] - (https://arxiv.org/pdf/2305.13245.pdf). - If it is not specified, will default to - `num_attention_heads`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) - in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 2048): - The maximum sequence length that this model might ever be used with. - Solar 1 supports up to 2048 tokens, - Solar 2 up to 4096, CodeSolar up to 16384. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of - the truncated_normal_initializer for initializing - all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return - the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*): - Padding token id. - bos_token_id (`int`, *optional*, defaults to 1): - Beginning of stream token id. - eos_token_id (`int`, *optional*, defaults to 2): - End of stream token id. - pretraining_tp (`int`, *optional*, defaults to 1): - Experimental feature. Tensor parallelism rank - used during pretraining. - Please refer to [this - document](https://huggingface.co/docs/ - transformers/main/ - perf_train_gpu_many#tensor-parallelism) - to understand more about it. This value is - necessary to ensure exact reproducibility - of the pretraining results. 
- Please refer to [this - issue](https://github.com/pytorch/pytorch/issues/76232). - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`dict`, *optional*): - Dictionary containing the scaling configuration for - the RoPE embeddings. - Currently supports two scaling - strategies: linear and dynamic. - Their scaling factor must be a float greater than 1. - The expected format is - `{"type": strategy name, "factor": scaling factor}`. - When using this flag, don't update - `max_position_embeddings` to the expected new maximum. - See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/ - dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking - API changes in future versions. - attention_bias (`bool`, *optional*, defaults to `False`): - Whether to use a bias in the query, key, value - and output projection layers during self-attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - mlp_bias (`bool`, *optional*, defaults to `False`): - Whether to use a bias in up_proj, down_proj and gate_proj - layers in the MLP layers. - sliding_window (`int`, *optional*, defaults to 2047): - Sliding window attention window size. If not specified, - will default to `2047`. 
- ```python - >>> from transformers import SolarModel, SolarConfig - >>> # Initializing a Solar-pro style configuration - >>> configuration = SolarConfig() - >>> # Initializing a model from the Solar-pro style configuration - >>> model = SolarModel(configuration) - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "solar" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - pretraining_tp=1, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - sliding_window=2047, - bskcn_1=None, - bskcn_2=None, - bskcn_3=None, - bskcn_4=None, - bskcn_tv=None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - self._rope_scaling_validation() - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - self.sliding_window = sliding_window - self.bskcn_1 = bskcn_1 if bskcn_1 is not None else [12, 20, 32, 44] - self.bskcn_2 = bskcn_2 
if bskcn_2 is not None else [20, 32] - self.bskcn_3 = bskcn_3 if bskcn_3 is not None else [16, 24, 36, 48] - self.bskcn_4 = bskcn_4 if bskcn_4 is not None else [28, 40] - self.bskcn_tv = bskcn_tv if bskcn_tv is not None else [0.9, 0.8] - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. - """ - if self.rope_scaling is None: - return - - if (not isinstance(self.rope_scaling, dict) - or len(self.rope_scaling) != 2): - raise ValueError( - "`rope_scaling` must be a dictionary with two fields," - " `type` and `factor`, " - f"got {self.rope_scaling}") - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in [ - "linear", - "dynamic", - ]: - raise ValueError(f"`rope_scaling`'s type field must be one of " - f"['linear', 'dynamic'], got {rope_scaling_type}") - if (rope_scaling_factor is None - or not isinstance(rope_scaling_factor, float) - or rope_scaling_factor <= 1.0): - raise ValueError( - f"`rope_scaling`'s factor field must be a float > 1," - f" got {rope_scaling_factor}") diff --git a/vllm/transformers_utils/configs/telechat2.py b/vllm/transformers_utils/configs/telechat2.py deleted file mode 100644 index 050a7851d143f..0000000000000 --- a/vllm/transformers_utils/configs/telechat2.py +++ /dev/null @@ -1,64 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# adapted from https://www.modelscope.cn/models/TeleAI/TeleChat2-3B/resolve/master/configuration_telechat2.py -""" Telechat configuration compatible with LlamaConfig. 
""" - -from transformers.configuration_utils import PretrainedConfig - - -class Telechat2Config(PretrainedConfig): - - model_type = "telechat" - keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = { - "num_hidden_layers": "n_layer", - "num_attention_heads": "n_head", - "intermediate_size": "ffn_hidden_size", - "rms_norm_eps": "layer_norm_epsilon" - } - - def __init__( - self, - vocab_size=160256, - hidden_size=4096, - n_layer=30, - n_head=32, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - use_cache=True, - bos_token_id=1, - eos_token_id=2, - apply_residual_connection_post_layernorm=False, - hidden_dropout=0.0, - attention_dropout=0.0, - ffn_hidden_size=12288, - training_seqlen=8192, - logn=True, - embed_layernorm=False, - hidden_act="silu", - **kwargs, - ): - self.vocab_size = vocab_size - n_embed = kwargs.pop("n_embed", None) - self.hidden_size = hidden_size if n_embed is None else n_embed - self.n_layer = n_layer - self.n_head = n_head - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.use_cache = use_cache - self.apply_residual_connection_post_layernorm = ( - apply_residual_connection_post_layernorm) - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.logn = logn - self.training_seqlen = training_seqlen - self.embed_layernorm = embed_layernorm - self.num_key_value_heads = kwargs.pop("num_key_value_heads", None) - self.ffn_hidden_size = ffn_hidden_size - self.hidden_act = hidden_act - super().__init__(bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - **kwargs) diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py index 14d15f2bc1673..eca4d7c884dd3 100644 --- a/vllm/transformers_utils/processors/__init__.py +++ b/vllm/transformers_utils/processors/__init__.py @@ -1,5 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Multi-modal processors may be defined in this directory for the following +reasons: + +- There is no processing file defined by HF Hub or Transformers library. +- There is a need to override the existing processor to support vLLM. +""" from vllm.transformers_utils.processors.deepseek_vl2 import ( DeepseekVLV2Processor) From 02f82fe4386b3e84eb0f06bfaf7744c5b4fdba4f Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 30 Jul 2025 14:58:57 +0800 Subject: [PATCH 046/224] [Doc] Update Intern-S1 info (#21908) Signed-off-by: Jee Jee Li --- docs/models/supported_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index e2172051cd186..5a9823bb6bae7 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -595,7 +595,7 @@ See [this page](generative_models.md) for more information on how to use generat | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ | -| `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ + VE+ | `internlm/Intern-S1`, etc. | | ✅︎ | ✅︎ | +| `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ + VE+ | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ | | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ | From 30ef30ed5af77829771aec485e0f41d05d4d9880 Mon Sep 17 00:00:00 2001 From: Kebe Date: Wed, 30 Jul 2025 15:37:59 +0800 Subject: [PATCH 047/224] [CI] rollback lint-and-deploy pipeline using amd machine (#21912) Signed-off-by: Kebe --- .github/workflows/lint-and-deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index d5736c0aee208..74a7a3a3530f5 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -7,7 +7,7 @@ permissions: jobs: lint-and-deploy: - runs-on: ubuntu-24.04-arm + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 From 547795232de307e0bff5e779530f01d7e6f4a9ad Mon Sep 17 00:00:00 2001 From: Varun Vinayak Shenoy Date: Wed, 30 Jul 2025 00:44:15 -0700 Subject: [PATCH 048/224] [Tests] Fixing bug inside MultiModalProfiler. 
(#21842) Signed-off-by: Varun Shenoy --- .../multimodal/processing/test_mllama4.py | 67 +++++++++++++++++++ tests/models/registry.py | 4 +- 2 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 tests/models/multimodal/processing/test_mllama4.py diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py new file mode 100644 index 0000000000000..f3871b60c3f64 --- /dev/null +++ b/tests/models/multimodal/processing/test_mllama4.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for mllama's multimodal preprocessing and profiling.""" +import pytest +from torch import prod +from transformers import Llama4Config + +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.profiling import MultiModalProfiler + +from ...utils import build_model_context + + +@pytest.mark.parametrize("model_id", ["meta-llama/Llama-Guard-4-12B"]) +@pytest.mark.parametrize("max_model_len", [4096, 8192, 25600, 131072]) +def test_profiling(model_id: str, max_model_len: int): + model_config_kwargs = { + "max_model_len": max_model_len, + } + ctx = build_model_context( + model_id, + model_config_kwargs=model_config_kwargs, + limit_mm_per_prompt={"image": 1}, + ) + + mm_config = ctx.get_mm_config() + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + profiler = MultiModalProfiler(processor) + + decoder_dummy_data = profiler.get_decoder_dummy_data( + max_model_len, + mm_counts=mm_config.limit_per_prompt, + ) + dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs( + max_model_len, + mm_counts=mm_config.limit_per_prompt, + ) + + hf_config = ctx.get_hf_config(Llama4Config) + + mm_kwargs = processor.apply( + prompt=dummy_mm_data.prompt, + mm_data=dummy_mm_data.mm_data, + hf_processor_mm_kwargs=dict(), + )["mm_kwargs"] + + image_size = hf_config.vision_config.image_size + patch_size = 
hf_config.vision_config.patch_size + downsample_ratio = int( + round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2))) + tokens_per_patch = ((image_size // patch_size)**2) // downsample_ratio + chunks_per_image = prod(mm_kwargs["patches_per_image"]) + total_num_patches = chunks_per_image * tokens_per_patch + num_tiles = mm_kwargs["aspect_ratios"][0][0] * mm_kwargs["aspect_ratios"][ + 0][1] # x-y seperator tokens + total_tokens = total_num_patches.item() + num_tiles.item( + ) + 3 # image start, image, image end + + profiled_tokens = profiler.get_mm_max_contiguous_tokens( + max_model_len, + mm_counts=mm_config.limit_per_prompt, + ) + + assert total_tokens == profiled_tokens["image"] + assert total_tokens == sum( + placeholder.length for placeholder in + decoder_dummy_data.multi_modal_placeholders["image"]) diff --git a/tests/models/registry.py b/tests/models/registry.py index 4fcd02efb6d0b..caa691039fce3 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -391,7 +391,9 @@ _MULTIMODAL_EXAMPLE_MODELS = { extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501 trust_remote_code=True), "Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501 - max_model_len=10240), + max_model_len=10240, + extras={"llama-guard-4": "meta-llama/Llama-Guard-4-12B"}, # noqa: E501 + ), "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf", extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501 "mistral-fp8": "nm-testing/pixtral-12b-FP8-dynamic"}), # noqa: E501 From fc91da549978347a3b5f5ebe6e8cbeae6148e012 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 30 Jul 2025 15:55:03 +0800 Subject: [PATCH 049/224] [Model] Remove DSV2 unused code (#21903) Signed-off-by: Jee Jee Li --- vllm/model_executor/models/deepseek_v2.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py 
index 79ddd3d0f6276..68a0a83d6204c 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -830,20 +830,6 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts): sampling_metadata) return logits - def make_empty_intermediate_tensors( - self, batch_size: int, dtype: torch.dtype, - device: torch.device) -> IntermediateTensors: - return IntermediateTensors({ - "hidden_states": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), - "residual": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), - }) - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: stacked_params_mapping = [ From 533db0935da051ac793e8b22afbcb9ae9fa4255b Mon Sep 17 00:00:00 2001 From: Peter Pan Date: Wed, 30 Jul 2025 16:15:43 +0800 Subject: [PATCH 050/224] [benchmark] add max-concurrency in result table (#21095) Signed-off-by: Peter Pan --- benchmarks/benchmark_serving.py | 4 ++++ benchmarks/benchmark_serving_structured_output.py | 4 ++++ vllm/benchmarks/serve.py | 6 ++++++ 3 files changed, 14 insertions(+) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 53bd3247afbb6..3affa18ae3a4f 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -413,6 +413,10 @@ async def benchmark( print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + if max_concurrency is not None: + print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency)) + if request_rate != float("inf"): + print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate)) print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) print("{:<40} {:<10}".format("Total generated tokens:", 
metrics.total_output)) diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index d535cd5d7e1a6..2a22f122c78e6 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -555,6 +555,10 @@ async def benchmark( print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + if max_concurrency is not None: + print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency)) + if request_rate != float("inf"): + print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate)) print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 635363440c081..bd2b1e5990c83 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -486,6 +486,12 @@ async def benchmark( print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + if max_concurrency is not None: + print("{:<40} {:<10}".format("Maximum request concurrency:", + max_concurrency)) + if request_rate != float('inf'): + print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", + request_rate )) print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) From 5bbaf492a6238ff517249e73151ae9989f7bea9e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 30 Jul 2025 16:32:39 +0800 Subject: [PATCH 051/224] [Doc] Update partial support (#21916) Signed-off-by: DarkLight1337 --- docs/features/compatibility_matrix.md | 7 ++++--- 1 file changed, 4 
insertions(+), 3 deletions(-) diff --git a/docs/features/compatibility_matrix.md b/docs/features/compatibility_matrix.md index 930265b8f9840..5b08b3810776c 100644 --- a/docs/features/compatibility_matrix.md +++ b/docs/features/compatibility_matrix.md @@ -41,17 +41,18 @@ th:not(:first-child) { | [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | | [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | | CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | -| [pooling](../models/pooling_models.md) | ✅\* | ✅\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | +| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | enc-dec | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | | logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | | async output | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | | multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | -| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194) | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | +| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)^ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | | best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | | beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | -\* Chunked prefill and prefix caching are only applicable to last-token pooling. +\* Chunked prefill and prefix caching are only applicable to last-token pooling. +^ LoRA is only applicable to the language backbone of multimodal models. 
[](){ #feature-x-hardware } From 5c8fe389d6fb2b8776d4113d8334d8dd09f78733 Mon Sep 17 00:00:00 2001 From: Hongsheng Liu Date: Wed, 30 Jul 2025 20:11:58 +0800 Subject: [PATCH 052/224] [Docs] Fix the example code of streaming chat completions in reasoning (#21825) Signed-off-by: wangzi <3220100013@zju.edu.cn> Co-authored-by: wangzi <3220100013@zju.edu.cn> Co-authored-by: Zi Wang <66560864+BruceW-07@users.noreply.github.com> --- docs/features/reasoning_outputs.md | 13 ++++++------- ...enai_chat_completion_with_reasoning_streaming.py | 13 ++++++------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 6b84eca275309..04b943efbbbb4 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -123,13 +123,12 @@ OpenAI Python client library does not officially support `reasoning_content` att printed_content = False for chunk in stream: - reasoning_content = None - content = None - # Check the content is reasoning_content or content - if hasattr(chunk.choices[0].delta, "reasoning_content"): - reasoning_content = chunk.choices[0].delta.reasoning_content - elif hasattr(chunk.choices[0].delta, "content"): - content = chunk.choices[0].delta.content + # Safely extract reasoning_content and content from delta, + # defaulting to None if attributes don't exist or are empty strings + reasoning_content = ( + getattr(chunk.choices[0].delta, "reasoning_content", None) or None + ) + content = getattr(chunk.choices[0].delta, "content", None) or None if reasoning_content is not None: if not printed_reasoning_content: diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py index 5a91929770945..7d1ea37714599 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py +++ 
b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py @@ -51,13 +51,12 @@ def main(): printed_content = False for chunk in stream: - reasoning_content = None - content = None - # Check the content is reasoning_content or content - if hasattr(chunk.choices[0].delta, "reasoning_content"): - reasoning_content = chunk.choices[0].delta.reasoning_content - elif hasattr(chunk.choices[0].delta, "content"): - content = chunk.choices[0].delta.content + # Safely extract reasoning_content and content from delta, + # defaulting to None if attributes don't exist or are empty strings + reasoning_content = ( + getattr(chunk.choices[0].delta, "reasoning_content", None) or None + ) + content = getattr(chunk.choices[0].delta, "content", None) or None if reasoning_content is not None: if not printed_reasoning_content: From 13986365a9e669a8aa1abb308d48dfd276a4f97b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 30 Jul 2025 14:42:51 +0200 Subject: [PATCH 053/224] Add @patrickvonplaten as maintainer of mistral's related files. 
(#21928) Signed-off-by: Patrick von Platen --- .github/CODEOWNERS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index fb9f44353cec8..5bc944296763d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -65,3 +65,11 @@ mkdocs.yaml @hmellor # Qwen-specific files /vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow /vllm/model_executor/models/qwen* @sighingnow + +# Mistral-specific files +/vllm/model_executor/models/mistral*.py @patrickvonplaten +/vllm/model_executor/models/mixtral*.py @patrickvonplaten +/vllm/model_executor/models/voxtral*.py @patrickvonplaten +/vllm/model_executor/models/pixtral*.py @patrickvonplaten +/vllm/transformers_utils/configs/mistral.py @patrickvonplaten +/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten From b876860c6214d03279e79e0babb7eb4e3e286cbd Mon Sep 17 00:00:00 2001 From: Eric Curtin Date: Wed, 30 Jul 2025 14:22:00 +0100 Subject: [PATCH 054/224] [Hardware][CPU] Build fix for ARM without BF16 (#21848) Signed-off-by: Eric Curtin --- csrc/cpu/quant.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp index c1f7c64ea2f49..6e120b8d20a7e 100644 --- a/csrc/cpu/quant.cpp +++ b/csrc/cpu/quant.cpp @@ -16,12 +16,14 @@ struct KernelVecType { using cvt_vec_type = vec_op::FP32Vec16; }; +#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT) template <> struct KernelVecType { using load_vec_type = vec_op::BF16Vec16; using azp_adj_load_vec_type = vec_op::INT32Vec16; using cvt_vec_type = vec_op::FP32Vec16; }; +#endif template <> struct KernelVecType { From d979dd6bebb1857052e6beae682e5186f8447fde Mon Sep 17 00:00:00 2001 From: aladerran <108529629+aladerran@users.noreply.github.com> Date: Wed, 30 Jul 2025 21:27:57 +0800 Subject: [PATCH 055/224] [Feature][EPLB] Add eplb support for Qwen3 (#20815) Signed-off-by: aladerran --- vllm/model_executor/models/qwen3_moe.py | 166 ++++++++++++++++++++---- 1 file changed, 142 insertions(+), 
24 deletions(-) diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 12899c28016b9..ca14fd06574ec 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen3MoE model compatible with HuggingFace weights.""" -from collections.abc import Iterable +import typing +from collections.abc import Callable, Iterable from typing import Any, Optional, Union import torch @@ -31,8 +32,9 @@ from transformers import PretrainedConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.distributed import (get_ep_group, get_pp_group, + get_tensor_model_parallel_world_size) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE @@ -50,8 +52,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP -from .utils import (AutoWeightsLoader, extract_layer_index, +from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP +from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -101,23 +103,47 @@ class Qwen3MoeSparseMoeBlock(nn.Module): config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + enable_eplb: bool = False, ): super().__init__() self.tp_size = 
get_tensor_model_parallel_world_size() + self.ep_group = get_ep_group().device_group + self.ep_rank = self.ep_group.rank() + self.ep_size = self.ep_group.size() + self.n_routed_experts = config.num_experts + if self.tp_size > config.num_experts: raise ValueError( f"Tensor parallel size {self.tp_size} is greater than " f"the number of experts {config.num_experts}.") - self.experts = FusedMoE(num_experts=config.num_experts, + # Load balancing settings. + vllm_config = get_current_vllm_config() + parallel_config = vllm_config.parallel_config + self.enable_eplb = enable_eplb + + self.n_logical_experts = self.n_routed_experts + self.n_redundant_experts = parallel_config.num_redundant_experts + self.n_physical_experts = (self.n_logical_experts + + self.n_redundant_experts) + self.n_local_physical_experts = self.n_physical_experts // self.ep_size + + self.physical_expert_start = (self.ep_rank * + self.n_local_physical_experts) + self.physical_expert_end = (self.physical_expert_start + + self.n_local_physical_experts) + + self.experts = FusedMoE(num_experts=self.n_routed_experts, top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.moe_intermediate_size, reduce_results=False, renormalize=config.norm_topk_prob, quant_config=quant_config, - prefix=f"{prefix}.experts") + prefix=f"{prefix}.experts", + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts) self.gate = ReplicatedLinear(config.hidden_size, config.num_experts, @@ -246,6 +272,7 @@ class Qwen3MoeDecoderLayer(nn.Module): cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + enable_eplb: bool = False, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -277,7 +304,8 @@ class Qwen3MoeDecoderLayer(nn.Module): (layer_idx + 1) % config.decoder_sparse_step == 0): self.mlp = Qwen3MoeSparseMoeBlock(config=config, quant_config=quant_config, - prefix=f"{prefix}.mlp") + 
prefix=f"{prefix}.mlp", + enable_eplb=enable_eplb) else: self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, @@ -323,6 +351,9 @@ class Qwen3MoeModel(nn.Module): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + parallel_config = vllm_config.parallel_config + enable_eplb = parallel_config.enable_eplb + self.num_redundant_experts = parallel_config.num_redundant_experts self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -336,7 +367,8 @@ class Qwen3MoeModel(nn.Module): lambda prefix: Qwen3MoeDecoderLayer(config=config, cache_config=cache_config, quant_config=quant_config, - prefix=prefix), + prefix=prefix, + enable_eplb=enable_eplb), prefix=f"{prefix}.layers", ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -382,7 +414,8 @@ class Qwen3MoeModel(nn.Module): ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", ckpt_up_proj_name="up_proj", - num_experts=self.config.num_experts) + num_experts=self.config.num_experts, + num_redundant_experts=self.num_redundant_experts) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: @@ -433,27 +466,51 @@ class Qwen3MoeModel(nn.Module): weight_loader(param, loaded_weight, shard_id) break else: + is_expert_weight = False for mapping in expert_params_mapping: param_name, weight_name, expert_id, shard_id = mapping if weight_name not in name: continue - name = name.replace(weight_name, param_name) - # Skip layers on other devices. 
- if is_pp_missing_parameter(name, self): + + # Anyway, this is an expert weight and should not be + # attempted to load as other weights later + is_expert_weight = True + + # Do not modify `name` since the loop may continue here + # Instead, create a new variable + name_mapped = name.replace(weight_name, param_name) + + if is_pp_missing_parameter(name_mapped, self): continue + # Skip loading extra parameters for GPTQ/modelopt models. - if name.endswith( - ignore_suffixes) and name not in params_dict: + if name_mapped.endswith( + ignore_suffixes + ) and name_mapped not in params_dict: continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, - loaded_weight, - name, - shard_id=shard_id, - expert_id=expert_id) - break + + param = params_dict[name_mapped] + # We should ask the weight loader to return success or not + # here since otherwise we may skip experts with other + # available replicas. + weight_loader = typing.cast(Callable[..., bool], + param.weight_loader) + success = weight_loader(param, + loaded_weight, + name_mapped, + shard_id=shard_id, + expert_id=expert_id, + return_success=True) + if success: + name = name_mapped + break else: + if is_expert_weight: + # We've checked that this is an expert weight + # However it's not mapped locally to this rank + # So we simply skip it + continue + # Skip loading extra parameters for GPTQ/modelopt models. 
if name.endswith( ignore_suffixes) and name not in params_dict: @@ -482,7 +539,8 @@ class Qwen3MoeModel(nn.Module): return loaded_params -class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): +class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, + MixtureOfExperts): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -514,6 +572,66 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + # Set MoE hyperparameters + self.expert_weights = [] + + self.moe_layers: list[FusedMoE] = [] + example_layer = None + for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + + assert isinstance(layer, Qwen3MoeDecoderLayer) + if isinstance(layer.mlp, Qwen3MoeSparseMoeBlock): + example_layer = layer.mlp + self.moe_layers.append(layer.mlp.experts) + + if example_layer is None: + raise RuntimeError("No Qwen3MoE layer found in the model.layers.") + + self.num_moe_layers = len(self.moe_layers) + self.num_expert_groups = 1 + self.num_shared_experts = 0 + self.num_logical_experts = example_layer.n_logical_experts + self.num_physical_experts = example_layer.n_physical_experts + self.num_local_physical_experts = example_layer.n_local_physical_experts + self.num_routed_experts = example_layer.n_routed_experts + self.num_redundant_experts = example_layer.n_redundant_experts + + def set_eplb_state( + self, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + for layer_idx, layer in enumerate(self.moe_layers): + # Register the expert weights. 
+ self.expert_weights.append(layer.get_expert_weights()) + layer.set_eplb_state( + moe_layer_idx=layer_idx, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) + + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = (num_physical_experts - + self.num_logical_experts) + for layer in self.model.layers: + if isinstance(layer.mlp, Qwen3MoeSparseMoeBlock): + moe = layer.mlp + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) From fcfd1eb9c556e295eb5708eb0f5e6ae775807775 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 30 Jul 2025 21:36:34 +0800 Subject: [PATCH 056/224] [Doc] Remove vLLM prefix and add citation for PagedAttention (#21910) Signed-off-by: DarkLight1337 --- .../paged_attention}/k_vecs.png | Bin .../paged_attention}/key.png | Bin .../paged_attention}/logits_vec.png | Bin .../paged_attention}/q_vecs.png | Bin .../paged_attention}/query.png | Bin .../paged_attention}/v_vec.png | Bin .../paged_attention}/value.png | Bin docs/design/paged_attention.md | 29 ++++++++++++------ docs/design/plugin_system.md | 2 +- docs/design/torch_compile.md | 2 +- 10 files changed, 22 insertions(+), 11 deletions(-) rename docs/assets/{kernel => design/paged_attention}/k_vecs.png (100%) rename docs/assets/{kernel => design/paged_attention}/key.png (100%) rename docs/assets/{kernel => design/paged_attention}/logits_vec.png (100%) rename 
docs/assets/{kernel => design/paged_attention}/q_vecs.png (100%) rename docs/assets/{kernel => design/paged_attention}/query.png (100%) rename docs/assets/{kernel => design/paged_attention}/v_vec.png (100%) rename docs/assets/{kernel => design/paged_attention}/value.png (100%) diff --git a/docs/assets/kernel/k_vecs.png b/docs/assets/design/paged_attention/k_vecs.png similarity index 100% rename from docs/assets/kernel/k_vecs.png rename to docs/assets/design/paged_attention/k_vecs.png diff --git a/docs/assets/kernel/key.png b/docs/assets/design/paged_attention/key.png similarity index 100% rename from docs/assets/kernel/key.png rename to docs/assets/design/paged_attention/key.png diff --git a/docs/assets/kernel/logits_vec.png b/docs/assets/design/paged_attention/logits_vec.png similarity index 100% rename from docs/assets/kernel/logits_vec.png rename to docs/assets/design/paged_attention/logits_vec.png diff --git a/docs/assets/kernel/q_vecs.png b/docs/assets/design/paged_attention/q_vecs.png similarity index 100% rename from docs/assets/kernel/q_vecs.png rename to docs/assets/design/paged_attention/q_vecs.png diff --git a/docs/assets/kernel/query.png b/docs/assets/design/paged_attention/query.png similarity index 100% rename from docs/assets/kernel/query.png rename to docs/assets/design/paged_attention/query.png diff --git a/docs/assets/kernel/v_vec.png b/docs/assets/design/paged_attention/v_vec.png similarity index 100% rename from docs/assets/kernel/v_vec.png rename to docs/assets/design/paged_attention/v_vec.png diff --git a/docs/assets/kernel/value.png b/docs/assets/design/paged_attention/value.png similarity index 100% rename from docs/assets/kernel/value.png rename to docs/assets/design/paged_attention/value.png diff --git a/docs/design/paged_attention.md b/docs/design/paged_attention.md index ef525e8c60412..fb991a35caf30 100644 --- a/docs/design/paged_attention.md +++ b/docs/design/paged_attention.md @@ -1,7 +1,7 @@ -# vLLM Paged Attention +# Paged Attention 
!!! warning - This document is being kept in the vLLM documentation for historical purposes. + This is a historical document based on the [original paper for vLLM](https://arxiv.org/abs/2309.06180). It no longer describes the code used in vLLM today. Currently, vLLM utilizes its own implementation of a multi-head query @@ -140,7 +140,7 @@ const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; ```
- ![](../../assets/kernel/query.png){ align="center" alt="query" width="70%" } + ![](../assets/design/paged_attention/query.png){ align="center" alt="query" width="70%" }
Each thread defines its own `q_ptr` which points to the assigned @@ -149,7 +149,7 @@ and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains total of 128 elements divided into 128 / 4 = 32 vecs.
- ![](../../assets/kernel/q_vecs.png){ align="center" alt="q_vecs" width="70%" } + ![](../assets/design/paged_attention/q_vecs.png){ align="center" alt="q_vecs" width="70%" }
```cpp @@ -188,7 +188,7 @@ points to key token data based on `k_cache` at assigned block, assigned head and assigned token.
- ![](../../assets/kernel/key.png){ align="center" alt="key" width="70%" } + ![](../assets/design/paged_attention/key.png){ align="center" alt="key" width="70%" }
The diagram above illustrates the memory layout for key data. It @@ -203,7 +203,7 @@ elements for one token) that will be processed by 2 threads (one thread group) separately.
- ![](../../assets/kernel/k_vecs.png){ align="center" alt="k_vecs" width="70%" } + ![](../assets/design/paged_attention/k_vecs.png){ align="center" alt="k_vecs" width="70%" }
```cpp @@ -362,15 +362,15 @@ later steps. Now, it should store the normalized softmax result of ## Value
- ![](../../assets/kernel/value.png){ align="center" alt="value" width="70%" } + ![](../assets/design/paged_attention/value.png){ align="center" alt="value" width="70%" }
- ![](../../assets/kernel/logits_vec.png){ align="center" alt="logits_vec" width="50%" } + ![](../assets/design/paged_attention/logits_vec.png){ align="center" alt="logits_vec" width="50%" }
- ![](../../assets/kernel/v_vec.png){ align="center" alt="v_vec" width="70%" } + ![](../assets/design/paged_attention/v_vec.png){ align="center" alt="v_vec" width="70%" }
Now we need to retrieve the value data and perform dot multiplication @@ -499,3 +499,14 @@ for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { Finally, we need to iterate over different assigned head positions and write out the corresponding accumulated result based on the `out_ptr`. + +## Citation + +```bibtex +@inproceedings{kwon2023efficient, + title={Efficient Memory Management for Large Language Model Serving with PagedAttention}, + author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica}, + booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles}, + year={2023} +} +``` diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index 23a05ac719ce2..ca1c2c2305d91 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -1,4 +1,4 @@ -# vLLM's Plugin System +# Plugin System The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM. diff --git a/docs/design/torch_compile.md b/docs/design/torch_compile.md index 2d76e7f3adc5c..47ac4958dbf7f 100644 --- a/docs/design/torch_compile.md +++ b/docs/design/torch_compile.md @@ -1,4 +1,4 @@ -# vLLM's `torch.compile` integration +# `torch.compile` integration In vLLM's V1 architecture, `torch.compile` is enabled by default and is a critical part of the framework. This document gives a simple walk-through example to show how to understand the `torch.compile` usage. 
From da3e0bd6e53f12bb18d518940e8150ba023956aa Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Wed, 30 Jul 2025 21:51:58 +0800 Subject: [PATCH 057/224] [Bugfix] we should use metavar is not choices (#21902) Signed-off-by: rongfu.leng --- vllm/entrypoints/openai/cli_args.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 2d19e16883aa2..282493e543552 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -194,7 +194,9 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`""" # Special case: Tool call parser shows built-in options. valid_tool_parsers = list(ToolParserManager.tool_parsers.keys()) - frontend_kwargs["tool_call_parser"]["choices"] = valid_tool_parsers + parsers_str = ",".join(valid_tool_parsers) + frontend_kwargs["tool_call_parser"]["metavar"] = ( + f"{{{parsers_str}}} or name registered in --tool-parser-plugin") frontend_group = parser.add_argument_group( title="Frontend", From bf668b5bf56644db8e90cd0d385b62cc15a4657a Mon Sep 17 00:00:00 2001 From: Yan Pashkovsky Date: Wed, 30 Jul 2025 15:03:23 +0100 Subject: [PATCH 058/224] [Feature] Support multiple api keys in server (#18548) Signed-off-by: Yan Pashkovsky --- docs/getting_started/quickstart.md | 1 + vllm/entrypoints/openai/api_server.py | 12 +++---- vllm/entrypoints/openai/cli_args.py | 46 +++++++++++++-------------- 3 files changed, 30 insertions(+), 29 deletions(-) diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index 74235db16a15d..3a93497fab137 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -126,6 +126,7 @@ curl http://localhost:8000/v1/models ``` You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` to enable the server to check for API key in the header. 
+You can pass multiple keys after `--api-key`, and the server will accept any of the keys passed; this can be useful for key rotation. ### OpenAI Completions API with vLLM diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index c375c8755108c..05d9a69a65f83 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1239,9 +1239,9 @@ class AuthenticationMiddleware: 2. The request path doesn't start with /v1 (e.g. /health). """ - def __init__(self, app: ASGIApp, api_token: str) -> None: + def __init__(self, app: ASGIApp, tokens: list[str]) -> None: self.app = app - self.api_token = api_token + self.api_tokens = {f"Bearer {token}" for token in tokens} def __call__(self, scope: Scope, receive: Receive, send: Send) -> Awaitable[None]: @@ -1255,7 +1255,7 @@ class AuthenticationMiddleware: headers = Headers(scope=scope) # Type narrow to satisfy mypy. if url_path.startswith("/v1") and headers.get( - "Authorization") != f"Bearer {self.api_token}": + "Authorization") not in self.api_tokens: response = JSONResponse(content={"error": "Unauthorized"}, status_code=401) return response(scope, receive, send) @@ -1303,7 +1303,7 @@ class ScalingMiddleware: """ Middleware that checks if the model is currently scaling and returns a 503 Service Unavailable response if it is. - + This middleware applies to all HTTP requests and prevents processing when the model is in a scaling state. 
""" @@ -1512,8 +1512,8 @@ def build_app(args: Namespace) -> FastAPI: status_code=HTTPStatus.BAD_REQUEST) # Ensure --api-key option from CLI takes precedence over VLLM_API_KEY - if token := args.api_key or envs.VLLM_API_KEY: - app.add_middleware(AuthenticationMiddleware, api_token=token) + if tokens := [key for key in (args.api_key or [envs.VLLM_API_KEY]) if key]: + app.add_middleware(AuthenticationMiddleware, tokens=tokens) if args.enable_request_id_headers: app.add_middleware(XRequestIdMiddleware) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 282493e543552..dfbc9cde3d5b1 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -85,22 +85,22 @@ class FrontendArgs: """Allowed methods.""" allowed_headers: list[str] = field(default_factory=lambda: ["*"]) """Allowed headers.""" - api_key: Optional[str] = None - """If provided, the server will require this key to be presented in the - header.""" + api_key: Optional[list[str]] = None + """If provided, the server will require one of these keys to be presented in + the header.""" lora_modules: Optional[list[LoRAModulePath]] = None """LoRA modules configurations in either 'name=path' format or JSON format - or JSON list format. Example (old format): `'name=path'` Example (new - format): `{\"name\": \"name\", \"path\": \"lora_path\", + or JSON list format. Example (old format): `'name=path'` Example (new + format): `{\"name\": \"name\", \"path\": \"lora_path\", \"base_model_name\": \"id\"}`""" chat_template: Optional[str] = None - """The file path to the chat template, or the template in single-line form + """The file path to the chat template, or the template in single-line form for the specified model.""" chat_template_content_format: ChatTemplateContentFormatOption = "auto" """The format to render message content within a chat template. * "string" will render the content as a string. 
Example: `"Hello World"` -* "openai" will render the content as a list of dictionaries, similar to OpenAI +* "openai" will render the content as a list of dictionaries, similar to OpenAI schema. Example: `[{"type": "text", "text": "Hello world!"}]`""" response_role: str = "assistant" """The role name to return if `request.add_generation_prompt=true`.""" @@ -117,40 +117,40 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`""" root_path: Optional[str] = None """FastAPI root_path when app is behind a path based routing proxy.""" middleware: list[str] = field(default_factory=lambda: []) - """Additional ASGI middleware to apply to the app. We accept multiple - --middleware arguments. The value should be an import path. If a function - is provided, vLLM will add it to the server using - `@app.middleware('http')`. If a class is provided, vLLM will + """Additional ASGI middleware to apply to the app. We accept multiple + --middleware arguments. The value should be an import path. If a function + is provided, vLLM will add it to the server using + `@app.middleware('http')`. If a class is provided, vLLM will add it to the server using `app.add_middleware()`.""" return_tokens_as_token_ids: bool = False - """When `--max-logprobs` is specified, represents single tokens as - strings of the form 'token_id:{token_id}' so that tokens that are not + """When `--max-logprobs` is specified, represents single tokens as + strings of the form 'token_id:{token_id}' so that tokens that are not JSON-encodable can be identified.""" disable_frontend_multiprocessing: bool = False - """If specified, will run the OpenAI frontend server in the same process as + """If specified, will run the OpenAI frontend server in the same process as the model serving engine.""" enable_request_id_headers: bool = False - """If specified, API server will add X-Request-Id header to responses. + """If specified, API server will add X-Request-Id header to responses. 
Caution: this hurts performance at high QPS.""" enable_auto_tool_choice: bool = False - """If specified, exclude tool definitions in prompts when + """If specified, exclude tool definitions in prompts when tool_choice='none'.""" exclude_tools_when_tool_choice_none: bool = False - """Enable auto tool choice for supported models. Use `--tool-call-parser` + """Enable auto tool choice for supported models. Use `--tool-call-parser` to specify which parser to use.""" tool_call_parser: Optional[str] = None - """Select the tool call parser depending on the model that you're using. - This is used to parse the model-generated tool call into OpenAI API format. - Required for `--enable-auto-tool-choice`. You can choose any option from + """Select the tool call parser depending on the model that you're using. + This is used to parse the model-generated tool call into OpenAI API format. + Required for `--enable-auto-tool-choice`. You can choose any option from the built-in parsers or register a plugin via `--tool-parser-plugin`.""" tool_parser_plugin: str = "" - """Special the tool parser plugin write to parse the model-generated tool - into OpenAI API format, the name register in this plugin can be used in + """Special the tool parser plugin write to parse the model-generated tool + into OpenAI API format, the name register in this plugin can be used in `--tool-call-parser`.""" log_config_file: Optional[str] = envs.VLLM_LOGGING_CONFIG_PATH """Path to logging config JSON file for both vllm and uvicorn""" max_log_len: Optional[int] = None - """Max number of prompt characters or prompt ID numbers being printed in + """Max number of prompt characters or prompt ID numbers being printed in log. 
The default of None means unlimited.""" disable_fastapi_docs: bool = False """Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint.""" From e91d3c9cda69b9770241c79fbf94f81f5576e7f4 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 30 Jul 2025 22:05:04 +0800 Subject: [PATCH 059/224] [misc] skip p2p check by default (#21904) --- vllm/envs.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 50cb3b7d1b7aa..ec4b0888d0f40 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -668,12 +668,14 @@ environment_variables: dict[str, Callable[[], Any]] = { (os.environ.get("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "0").strip().lower() in ("1", "true")), - # By default, vLLM will check the peer-to-peer capability itself, - # in case of broken drivers. See https://github.com/vllm-project/vllm/blob/a9b15c606fea67a072416ea0ea115261a2756058/vllm/distributed/device_communicators/custom_all_reduce_utils.py#L101-L108 for details. # noqa - # If this env var is set to 1, vLLM will skip the peer-to-peer check, - # and trust the driver's peer-to-peer capability report. + # We assume drivers can report p2p status correctly. + # If the program hangs when using custom allreduce, + # potentially caused by a bug in the driver (535 series), + # it might be helpful to set VLLM_SKIP_P2P_CHECK=0 + # so that vLLM can verify if p2p is actually working. + # See https://github.com/vllm-project/vllm/blob/a9b15c606fea67a072416ea0ea115261a2756058/vllm/distributed/device_communicators/custom_all_reduce_utils.py#L101-L108 for details. # noqa "VLLM_SKIP_P2P_CHECK": - lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "0") == "1", + lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "1") == "1", # List of quantization kernels that should be disabled, used for testing # and performance comparisons.
Currently only affects MPLinearKernel From 0271c2ff2fd15bd1a7c19484572a81e056e75620 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 30 Jul 2025 10:15:02 -0400 Subject: [PATCH 060/224] [Test] Add Benchmark and Unit Test for `per_token_group_quant` (#21860) Signed-off-by: yewentao256 --- .../benchmark_per_token_group_quant.py | 159 ++++++++++++++++++ .../test_per_token_group_quant.py | 31 +++- 2 files changed, 189 insertions(+), 1 deletion(-) create mode 100644 benchmarks/kernels/benchmark_per_token_group_quant.py diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py b/benchmarks/kernels/benchmark_per_token_group_quant.py new file mode 100644 index 0000000000000..1ccb5e08b3d57 --- /dev/null +++ b/benchmarks/kernels/benchmark_per_token_group_quant.py @@ -0,0 +1,159 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import math +from contextlib import contextmanager +from typing import Callable +from unittest.mock import patch + +import torch + +from vllm.model_executor.layers.quantization.utils import fp8_utils, int8_utils +from vllm.platforms import current_platform + + +@contextmanager +def _triton_mode(): + """Temporarily force the Triton fallback path""" + with patch("vllm.platforms.current_platform.is_cuda", return_value=False): + yield + + +def _time_cuda( + fn: Callable[[], tuple[torch.Tensor, torch.Tensor]], + warmup_iters: int, + bench_iters: int, +) -> float: + # warmup + for _ in range(warmup_iters): + fn() + torch.cuda.synchronize() + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for _ in range(bench_iters): + fn() + end.record() + torch.cuda.synchronize() + + return start.elapsed_time(end) / bench_iters # ms/iter + + +def _run_single( + shape: tuple[int, int], + group_size: int, + dtype: str, + *, + column_major: bool = False, + scale_ue8m0: bool 
= False, + warmup_iters: int, + bench_iters: int, +) -> None: + num_tokens, hidden_dim = shape + + device = torch.device("cuda") + torch.manual_seed(42) + x = torch.randn(num_tokens, hidden_dim, device=device, dtype=torch.bfloat16) * 8 + + if dtype == "fp8": + + def cuda_impl(): + return fp8_utils.per_token_group_quant_fp8( + x, + group_size, + column_major_scales=column_major, + use_ue8m0=scale_ue8m0, + ) + + def triton_impl(): + with _triton_mode(): + return fp8_utils.per_token_group_quant_fp8( + x, + group_size, + column_major_scales=column_major, + use_ue8m0=scale_ue8m0, + ) + elif dtype == "int8": + + def cuda_impl(): + return int8_utils.per_token_group_quant_int8(x, group_size) + + def triton_impl(): + with _triton_mode(): + return int8_utils.per_token_group_quant_int8(x, group_size) + else: + raise ValueError("dtype must be 'fp8' or 'int8'") + + cuda_ms = _time_cuda(cuda_impl, warmup_iters, bench_iters) + triton_ms = _time_cuda(triton_impl, warmup_iters, bench_iters) + + speedup = triton_ms / cuda_ms if cuda_ms else math.inf + + cfg_desc = ( + f"shape={shape} gs={group_size:<3} col_major={column_major:<5} " + f"ue8m0={scale_ue8m0:<5} dtype={dtype}" + ) + print( + f"{cfg_desc:55} | CUDA {cuda_ms:7.3f} ms | Triton {triton_ms:7.3f} ms | " + f"speed-up ×{speedup:5.2f}" + ) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--warmup-iters", type=int, default=10) + parser.add_argument("--bench-iters", type=int, default=100) + parser.add_argument("--dtype", choices=["fp8", "int8", "both"], default="both") + return parser.parse_args() + + +if __name__ == "__main__": + if not current_platform.is_cuda(): + raise RuntimeError("CUDA device is required to run this benchmark.") + + args = parse_args() + warmup_iters, bench_iters = args.warmup_iters, args.bench_iters + + shapes = [(32, 128), (64, 256), (16, 512)] + group_sizes = [64, 128] + + dtypes = ["fp8", "int8"] if args.dtype == "both" else [args.dtype] + + header = ( + 
"Configuration".ljust(55) + + " | " + + "CUDA (ms)".center(12) + + " | " + + "Triton (ms)".center(13) + + " | " + + "Speed-up" + ) + print(header) + print("-" * len(header)) + + for dtype in dtypes: + for shape in shapes: + for gs in group_sizes: + if dtype == "fp8": + for col_major in (False, True): + for ue8m0 in (False, True): + _run_single( + shape, + gs, + dtype, + column_major=col_major, + scale_ue8m0=ue8m0, + warmup_iters=warmup_iters, + bench_iters=bench_iters, + ) + else: # INT8 has no col-major / ue8m0 switches + _run_single( + shape, + gs, + dtype, + warmup_iters=warmup_iters, + bench_iters=bench_iters, + ) diff --git a/tests/kernels/quantization/test_per_token_group_quant.py b/tests/kernels/quantization/test_per_token_group_quant.py index f826983fe94e1..07f17d1efe641 100644 --- a/tests/kernels/quantization/test_per_token_group_quant.py +++ b/tests/kernels/quantization/test_per_token_group_quant.py @@ -5,7 +5,7 @@ from unittest.mock import patch import pytest import torch -from vllm.model_executor.layers.quantization.utils import fp8_utils +from vllm.model_executor.layers.quantization.utils import fp8_utils, int8_utils @pytest.mark.parametrize("shape", [(32, 128), (64, 256), (16, 512)]) @@ -42,3 +42,32 @@ def test_per_token_group_quant_fp8(shape, column_major: bool, assert torch.allclose(out_q.float(), ref_q.float(), atol=0.15, rtol=0.15) assert torch.allclose(scale, ref_s, atol=0.01, rtol=0.01) + + +@pytest.mark.parametrize("shape", [(32, 128), (64, 256), (16, 512)]) +@pytest.mark.parametrize("group_size", [64, 128]) +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") +def test_per_token_group_quant_int8(shape, group_size: int): + device = "cuda" + + torch.manual_seed(42) + num_tokens, hidden_dim = shape + + x = (torch.randn( + (num_tokens, hidden_dim), device=device, dtype=torch.bfloat16) * 8) + + # cuda path + out_q, scale = int8_utils.per_token_group_quant_int8( + x, + group_size, + ) + + # triton ref + with 
patch("vllm.platforms.current_platform.is_cuda", return_value=False): + ref_q, ref_s = int8_utils.per_token_group_quant_int8( + x, + group_size, + ) + + assert torch.allclose(out_q.float(), ref_q.float(), atol=0.15, rtol=0.15) + assert torch.allclose(scale, ref_s, atol=0.01, rtol=0.01) From 0e40b2607317515bd4e847490ebd77e88f92dc1d Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 30 Jul 2025 22:17:14 +0800 Subject: [PATCH 061/224] [CI/Build] Only run markdownlint in CI (#21892) Signed-off-by: DarkLight1337 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .github/workflows/matchers/markdownlint.json | 17 +++++++++++++++++ .github/workflows/pre-commit.yml | 1 + .pre-commit-config.yaml | 3 ++- 3 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/matchers/markdownlint.json diff --git a/.github/workflows/matchers/markdownlint.json b/.github/workflows/matchers/markdownlint.json new file mode 100644 index 0000000000000..fe094a9badb25 --- /dev/null +++ b/.github/workflows/matchers/markdownlint.json @@ -0,0 +1,17 @@ +{ + "problemMatcher": [ + { + "owner": "markdownlint", + "pattern": [ + { + "regexp": "^([^:]*):(\\d+):?(\\d+)?\\s([\\w-\\/]*)\\s(.*)$", + "file": 1, + "line": 2, + "column": 3, + "code": 4, + "message": 5 + } + ] + } + ] +} \ No newline at end of file diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 8e694d18134ef..835e91d91ae94 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -17,6 +17,7 @@ jobs: with: python-version: "3.12" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" + - run: echo "::add-matcher::.github/workflows/matchers/markdownlint.json" - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 with: diff --git 
a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 045096cb86369..612b290e88d46 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,8 +38,9 @@ repos: - repo: https://github.com/igorshubovych/markdownlint-cli rev: v0.45.0 hooks: - - id: markdownlint-fix + - id: markdownlint exclude: '.*\.inc\.md' + stages: [manual] # Only run in CI - repo: https://github.com/rhysd/actionlint rev: v1.7.7 hooks: From 36ede4598949092be3b61418a5141cbe730d1098 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 30 Jul 2025 15:18:02 +0100 Subject: [PATCH 062/224] Reduce time wasted in GitHub Actions using `concurrency` (#21919) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 4 ++++ .github/workflows/pre-commit.yml | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 74a7a3a3530f5..2b1086b7faf43 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -2,6 +2,10 @@ name: Lint and Deploy Charts on: pull_request +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + permissions: contents: read diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 835e91d91ae94..195579f206a2f 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -5,6 +5,10 @@ on: push: branches: [main] +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + permissions: contents: read From 8f4a1c9a04b36cb7527e67f1fea96c4f05ed0e03 Mon Sep 17 00:00:00 2001 From: Ruixiang Tan <819464715@qq.com> Date: Wed, 30 Jul 2025 22:20:43 +0800 Subject: [PATCH 063/224] [Misc] Improve code readability of KVCacheManager (#21673) Signed-off-by: tanruixiang Signed-off-by: Ruixiang Tan 
<819464715@qq.com> Signed-off-by: GitHub --- tests/v1/core/test_kv_cache_utils.py | 4 ++-- vllm/v1/core/block_pool.py | 2 +- vllm/v1/core/kv_cache_coordinator.py | 9 ++++++--- vllm/v1/core/kv_cache_manager.py | 5 +---- vllm/v1/core/kv_cache_utils.py | 8 -------- vllm/v1/core/single_type_kv_cache_manager.py | 12 ++++++++---- 6 files changed, 18 insertions(+), 22 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index e9c6f1f95cd71..bff3724d95e68 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -112,9 +112,9 @@ def test_kv_cache_block(): assert block.block_hash is None # Test reference count manipulation - block.incr_ref() + block.ref_cnt += 1 assert block.ref_cnt == 1 - block.decr_ref() + block.ref_cnt -= 1 assert block.ref_cnt == 0 # Test block hash setting and resetting diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index 5bf4d3a2acb45..ad9854dd29c38 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -276,7 +276,7 @@ class BlockPool: # candidate), so remove it. if block.ref_cnt == 0 and not block.is_null: self.free_block_queue.remove(block) - block.incr_ref() + block.ref_cnt += 1 def free_blocks(self, ordered_blocks: Iterable[KVCacheBlock]) -> None: """Free a list of blocks. The blocks should be ordered by their diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py index 258805843e227..f3a16d64e19fd 100644 --- a/vllm/v1/core/kv_cache_coordinator.py +++ b/vllm/v1/core/kv_cache_coordinator.py @@ -126,14 +126,17 @@ class KVCacheCoordinator(ABC): def get_num_common_prefix_blocks(self, request_id: str, num_running_requests: int) -> list[int]: """ - Get the number of common prefix blocks for a request. + Get the number of common prefix blocks for all requests in the RUNNING + state for each kv cache group. Args: request_id: The request ID. 
- num_running_requests: The number of requests in the RUNNING state. + num_running_requests: The total number of requests in the RUNNING + state. Returns: - list[int]: The number of common prefix blocks. + list[int]: The number of common prefix blocks for all requests in + the RUNNING state for each kv cache group. """ num_blocks_per_group = [ manager.get_num_common_prefix_blocks(request_id, diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index e820a0ad6d5d0..ce333dbe61a19 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -170,10 +170,6 @@ class KVCacheManager: self.block_size, request) self.req_to_block_hashes[request.request_id] = block_hashes - if self.log_stats: - assert self.prefix_cache_stats is not None - self.prefix_cache_stats.requests += 1 - # NOTE: When all tokens hit the cache, we must recompute the last token # to obtain logits. Thus, set max_cache_hit_length to prompt_length - 1. # This can trigger recomputation of an entire block, rather than just @@ -187,6 +183,7 @@ class KVCacheManager: if self.log_stats: assert self.prefix_cache_stats is not None + self.prefix_cache_stats.requests += 1 self.prefix_cache_stats.queries += request.num_tokens self.prefix_cache_stats.hits += num_new_computed_tokens diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 3a72ac271afa6..25520eb655111 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -154,14 +154,6 @@ class KVCacheBlock: # Whether the block is a null block that should never be cached. is_null: bool = False - # TODO(Jialin): For performance, let callers handle ref_cnt bumps to - # avoid function calls. 
- def incr_ref(self): - self.ref_cnt += 1 - - def decr_ref(self): - self.ref_cnt -= 1 - @property def block_hash(self) -> Optional[BlockHashWithGroupId]: return self._block_hash diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 714f49494c9a1..8f310023a8cd3 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import itertools from abc import ABC, abstractmethod from collections import defaultdict from typing import Callable @@ -177,14 +178,17 @@ class SingleTypeKVCacheManager(ABC): def get_num_common_prefix_blocks(self, request_id: str, num_running_requests: int) -> int: """ - Get the number of common prefix blocks for a request. + Get the number of common prefix blocks for all requests in the RUNNING + state. Args: request_id: The request ID. - num_running_requests: The number of requests in the RUNNING state. + num_running_requests: The total number of requests in the RUNNING + state. Returns: - The number of common prefix blocks. + The number of common prefix blocks for all requests in the RUNNING + state. """ raise NotImplementedError @@ -264,7 +268,7 @@ class FullAttentionManager(SingleTypeKVCacheManager): computed_blocks: tuple[list[KVCacheBlock], ...] = tuple( [] for _ in range(len(kv_cache_group_ids))) max_num_blocks = max_length // kv_cache_spec.block_size - for i, block_hash in zip(range(max_num_blocks), block_hashes): + for block_hash in itertools.islice(block_hashes, max_num_blocks): # block_hashes is a chain of block hashes. If a block hash is not # in the cached_block_hash_to_id, the following block hashes are # not computed yet for sure. 
From ff08e51940a77d2dd14a6c512bec4613d060b4fa Mon Sep 17 00:00:00 2001 From: "Po-Han Huang (NVIDIA)" <53919306+nvpohanh@users.noreply.github.com> Date: Wed, 30 Jul 2025 22:33:40 +0800 Subject: [PATCH 064/224] [NVIDIA] Fix Llama4 Scout FP4 functionality issues (#21499) Signed-off-by: Po-Han Huang --- vllm/model_executor/layers/fused_moe/layer.py | 15 +- .../layers/quantization/modelopt.py | 2 - vllm/model_executor/models/llama4.py | 270 +++++++++++++----- 3 files changed, 218 insertions(+), 69 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 254cd2e10b8fb..e16fc13c945cf 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -874,6 +874,14 @@ class FusedMoE(torch.nn.Module): elif shard_id == "w2": param_data[expert_id] = loaded_weight + def _load_w13_weight_scale(self, shard_dim: int, + loaded_weight: torch.Tensor, + param: torch.Tensor, tp_rank: int): + shard_size = param.shape[shard_dim] + loaded_weight = loaded_weight.narrow(shard_dim, shard_size * tp_rank, + shard_size) + param.copy_(loaded_weight) + def _load_model_weight_or_group_weight_scale(self, shard_dim: int, expert_data: torch.Tensor, @@ -1123,7 +1131,12 @@ class FusedMoE(torch.nn.Module): "weight_scale_2" in weight_name if uses_weight_scale_2 else "weight_scale" in weight_name) or "input_scale" in weight_name - if per_tensor_conditions: + if "w13_weight_scale" in weight_name: + self._load_w13_weight_scale(shard_dim=shard_dim, + loaded_weight=loaded_weight, + param=param, + tp_rank=self.tp_rank) + elif per_tensor_conditions: self._load_per_tensor_weight_scale( shard_id=shard_id, param=param, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 38866586ae29e..8fbc3231d86c3 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ 
-778,8 +778,6 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase): # Swizzle the weight blockscale. # contracting dimension is input dimension # block_size = 16; - assert (layer.weight_scale.shape[1] % 16 == 0), ( - "Expected weight_scale.dim(1) to be divisible by 16") assert (layer.weight_scale.dtype == torch.float8_e4m3fn), ( "Weight Block scale must be represented as FP8-E4M3") swizzled_weight_scale = swizzle_blockscale(layer.weight_scale) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index fab1c163ac288..470e701d98013 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -342,34 +342,94 @@ class Llama4Model(LlamaModel): expert_params_mapping: list[tuple[str, str, int, str]], fused: bool = True, ) -> bool: + """ + Load MoE expert weights. + + Args: + name: The name of the weight to load. + loaded_weight: The weight to load. + params_dict: The dictionary of module parameters. + loaded_params: The set of already loaded parameters. + expert_params_mapping: The mapping of expert parameters. Must be + generated by FusedMoE.make_expert_params_mapping(). + fused: Whether the expert weights are fused into a single weight + tensor or are separate weight tensors for each expert. + When fused is True, loaded_weight should have shape of: + [num_experts, hidden_in, hidden_out] for gate/up/down proj and + [hidden_out, hidden_in] for the others like router. + When fused is False, loaded_weight should have shape of: + [hidden_out, hidden_in]. + + Returns: + True if loaded_weight is one of MoE weights and the MoE expert + weights are loaded successfully, False otherwise. + """ + + # Whether the MoE expert weights are loaded successfully. 
expert_param_loaded = False - if "experts.gate_up_proj" in name: - loaded_weight = loaded_weight.chunk(2, dim=-1) + + # If fused is True, the loaded weight is in the layout of: + # [num_experts, hidden_in, hidden_out], so we must transpose the last + # two dimensions to match the expected layout of the parameters. + if fused and loaded_weight.ndim == 3: + loaded_weight = loaded_weight.transpose(-1, -2) + + # If the gate_proj and up_proj weights are fused into a single + # weight tensor, we need to split the weight tensor into a tuple + # of two weight tensors along the hidden_out dimension. + if "experts.gate_up_proj" in name: + loaded_weight = loaded_weight.chunk(2, dim=-2) + + # Iterate over all the expert parameters and load the weights if we find + # a match in weight name. for (param_name, weight_name, expert_id, shard_id) in expert_params_mapping: + + # Get a view of the loaded_weight to avoid modifying the original + # one across iterations. new_loaded_weight = loaded_weight + + # If expert weights are fused into a single weight tensor, remove + # the expert index from the expected weight name. if fused: + # The string between e_str and proj_str is the expert index. e_str, _, proj_str, _ = weight_name.split('.') weight_name = f"{e_str}.{proj_str}" param_name = f"{param_name}weight" + + # Skip if the current weight is not one of the MoE weights. if weight_name not in name: continue + + # Replace the weight name with the parameter name. full_param_name = name.replace(weight_name, param_name) - # Skip layers on other devices. + + # Skip if the current weight corresponds to a parameter that + # does not exist on the current PP (pipeline parallel) rank. if is_pp_missing_parameter(name, self): continue + + # Skip if the current weight is for the bias. 
if ((name.endswith(".bias") or name.endswith("_bias")) and name not in params_dict): continue + param = params_dict[full_param_name] weight_loader = param.weight_loader + if fused: + # If the parameter is for w13 together, the corresponding weight + # will be a tuple, so we must select the correct weight + # depending on the shard id, which is either "w1" or "w3". if "w13" in full_param_name: + assert shard_id in ["w1", "w3"] shard_idx = 0 if shard_id == "w1" else 1 new_loaded_weight = new_loaded_weight[shard_idx] - new_loaded_weight = new_loaded_weight.transpose(-1, -2) + + # If EP (expert parallel) is enabled, update expert_id to the + # starting expert index for the current EP rank and extract the + # corresponding expert weights. layer_idx = extract_layer_index(name) - # EP mapping expert_map = self.layers[ layer_idx].feed_forward.experts.expert_map if expert_map is not None: @@ -382,6 +442,9 @@ class Llama4Model(LlamaModel): else: # TODO: add EP support for non fused weights pass + + # Load the weight into the module parameter with corresponding + # shard id and expert id. weight_loader(param, new_loaded_weight, full_param_name, @@ -390,10 +453,13 @@ class Llama4Model(LlamaModel): loaded_params.add(full_param_name) expert_param_loaded = True + return expert_param_loaded def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + # Name mapping from the parameter name to the shard name and + # corresponding shard id. stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -402,26 +468,43 @@ class Llama4Model(LlamaModel): (".gate_up_proj", ".gate_proj", 0), (".gate_up_proj", ".up_proj", 1), ] + # Indicate whether the expert weights are fused into a single weight + # tensor. fused_experts_params = False + # Expert parameter mapping for the case where the expert weights are + # not fused into a single weight tensor. 
expert_params_mapping = FusedMoE.make_expert_params_mapping( ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", ckpt_up_proj_name="up_proj", num_experts=self.num_experts) + # Expert parameter mapping for the case where the expert weights are + # fused into a single weight tensor. expert_params_mapping_fused = FusedMoE.make_expert_params_mapping( ckpt_gate_proj_name="gate_up_proj", ckpt_down_proj_name="down_proj", ckpt_up_proj_name="gate_up_proj", num_experts=1) + # All the module parameters. params_dict = dict(self.named_parameters()) + # The module parameters that have been loaded. loaded_params: set[str] = set() + + # Iterate over all the weights and load them into module parameters. for name, loaded_weight in weights: + + # If the name contains "experts.gate_up_proj" or "experts.down_proj" + # without the expert indices, it means the expert weights are fused + # into a single weight tensor across all experts. if "experts.gate_up_proj" in name or "experts.down_proj" in name: fused_experts_params = True expert_params_mapping = expert_params_mapping_fused + + # If kv cache quantization scales exist and the weight name + # corresponds to one of the kv cache quantization scales, load + # them. if (self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name))): - # Loading kv cache quantization scales param = params_dict[scale_name] weight_loader = getattr(param, "weight_loader", default_weight_loader) @@ -430,84 +513,119 @@ class Llama4Model(LlamaModel): weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue + + # Iterate over stacked_params_mapping to check if the current weight + # is one of the stacked parameters. If so, load the weight with the + # corresponding shard id. Note that MoE weights are handled + # separately in the else block. 
for param_name, weight_name, shard_id in stacked_params_mapping: + # Skip if the current weight is not one of the stacked + # parameters or if the current weight is a MoE weight. if weight_name not in name or "experts" in name: continue - # This check is for ModelOpt ckpts with kv cache quant enabled + + # For ModelOpt checkpoints, we need to rename the self_attn + # weight/weight_scale names except for kv cache scales. if not (name.endswith( (".k_scale", ".v_scale")) and "self_attn" in name): name = name.replace(weight_name, param_name) + + # Skip if the current weight corresponds to a parameter that + # does not exist on the current PP (pipeline parallel) rank. if is_pp_missing_parameter(name, self): continue - if name.endswith("scale") and "expert" not in name: - # Remapping the name of FP8 kv-scale. + + # Remap kv cache scale names for ModelOpt checkpoints. + # TODO: ModelOpt should implement get_cache_scale() such that + # kv cache scale name remapping can be done there. + if name.endswith("scale"): name = maybe_remap_kv_scale_name(name, params_dict) if name is None: continue + + # Load the weight into the module parameter with corresponding + # shard id and exit the for loop and the else block. param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) + if weight_loader == default_weight_loader: weight_loader(param, loaded_weight) else: weight_loader(param, loaded_weight, shard_id) + loaded_params.add(name) break + + # Handle normal (non-stacked) weights and MoE weights. else: - moe_loaded = self.load_moe_expert_weights( - name, - loaded_weight, - params_dict, - loaded_params, - expert_params_mapping, - fused=fused_experts_params) + # First, try to load MoE weights using load_moe_expert_weights. + # If successful, move on to next loaded weight. 
+ if self.load_moe_expert_weights(name, + loaded_weight, + params_dict, + loaded_params, + expert_params_mapping, + fused=fused_experts_params): + continue - if not moe_loaded: - if is_pp_missing_parameter(name, self): - continue + # Skip if the current weight corresponds to a parameter that + # does not exist on the current PP (pipeline parallel) rank. + if is_pp_missing_parameter(name, self): + continue - # Handle flat expert scale parameters that - # don't match per-expert patterns - if ("experts." in name and ("w13_input_scale" in name - or "w13_weight_scale" in name - or "w2_input_scale" in name - or "w2_weight_scale" in name)): - # These are flat expert scales that apply to all experts - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - - # Check for MoE-specific loading support via - # attribute instead of expensive runtime reflection - supports_moe = getattr(weight_loader, - 'supports_moe_loading', False) - - if supports_moe: - # This is a MoE weight loader - if "w13_" in name: - shard_id = "w1" - elif "w2_" in name: - shard_id = "w2" - else: - shard_id = "w1" - - weight_loader(param, - loaded_weight, - name, - shard_id=shard_id, - expert_id=0) - else: - # Regular weight loader (handles both - # param.weight_loader and default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - continue + # Handle flat expert scale parameters that don't match + # per-expert patterns, i.e. one weight scale tensor for all + # experts. + scale_names = [ + "w13_input_scale", "w13_weight_scale", "w2_input_scale", + "w2_weight_scale" + ] + if ("experts." 
in name and any(scale_name in name + for scale_name in scale_names)): param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) + + # If weight loader supports special moe loading, use it to + # avoid expensive runtime reflection + if getattr(weight_loader, 'supports_moe_loading', False): + # Map the weight name to the corresponding shard id. + shard_id = "w2" if "w2_" in name else "w1" + + # Transpose if weight scales are FP8 block scales with + # three dimensions: + # [num_experts, hidden_in, hidden_out]. + if name.endswith("weight_scale") \ + and loaded_weight.dtype == torch.float8_e4m3fn \ + and loaded_weight.ndim == 3: + loaded_weight = loaded_weight.transpose(-1, -2) + + # Load the weight into the module parameter with + # corresponding shard id and expert id. + weight_loader(param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=0) + + else: + # Regular weight loader (handles both + # param.weight_loader and default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + continue + + # Handle normal (non-stacked, non-MoE) weights. + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + + # Finally, return the set of loaded parameters. return loaded_params @@ -560,23 +678,43 @@ class Llama4ForCausalLM(LlamaForCausalLM): loaded_weight: torch.Tensor, ) -> tuple[str, torch.Tensor]: - def permute(w: torch.Tensor, n_heads: int): + # Helper function to permute the weight's channels + def permute(w: torch.Tensor, n_heads: int, is_weight_scale: bool): + + # Calculate the expected shape of the weight. + # Do not rely on w's shape, as it may be in another layout. attn_in = self.config.head_dim * n_heads attn_out = self.config.hidden_size + # If the weight is FP4 packed as uint8, we need to divide attn_out + # by 2. 
+ if w.dtype == torch.uint8 and w.shape[1] * 2 == attn_out: + attn_out = attn_out // 2 + + # If the weight is a weight scale, we need to divide attn_out by + # block size, which is currently 16. + elif w.dtype == torch.float8_e4m3fn and is_weight_scale \ + and w.shape[1] * 16 == attn_out: + attn_out = attn_out // 16 + return w.view(n_heads, attn_in // n_heads // 2, 2, attn_out).transpose(1, 2).reshape(attn_in, attn_out) modules = name.split(".") - # rotary embeds should be sliced - if ("wk" in modules or "k_proj" in modules) \ - and modules[-1] == "weight": - loaded_weight = permute(loaded_weight, - self.config.num_key_value_heads) - elif ("wq" in modules or "q_proj" in modules) \ - and modules[-1] == "weight": - loaded_weight = permute(loaded_weight, - self.config.num_attention_heads) + # Permute Q/K weights and weight block scales for rotary embedding + is_weight = modules[-1] == "weight" + is_nvfp4_weight_scale = (modules[-1] == "weight_scale" and + loaded_weight.dtype == torch.float8_e4m3fn) + + if is_weight or is_nvfp4_weight_scale: + if ("wk" in modules or "k_proj" in modules): + loaded_weight = permute(loaded_weight, + self.config.num_key_value_heads, + is_nvfp4_weight_scale) + elif ("wq" in modules or "q_proj" in modules): + loaded_weight = permute(loaded_weight, + self.config.num_attention_heads, + is_nvfp4_weight_scale) return name, loaded_weight From 88edf5994c123314cc3b18621352dd118bec2b99 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 30 Jul 2025 15:35:08 +0100 Subject: [PATCH 065/224] [Docs] Reduce the size of the built docs (#21920) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- mkdocs.yaml | 7 +++++++ requirements/docs.txt | 1 + 2 files changed, 8 insertions(+) diff --git a/mkdocs.yaml b/mkdocs.yaml index 78f1c5b77cd07..e5b7454003310 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -67,6 +67,13 @@ plugins: exclude: - argparse/* - examples/* + - minify: + minify_html: true 
+ minify_js: true + minify_css: true + cache_safe: true + js_files: [docs/mkdocs/javascript/*.js] + css_files: [docs/mkdocs/stylesheets/*.css] # For API reference generation - api-autonav: modules: ["vllm"] diff --git a/requirements/docs.txt b/requirements/docs.txt index 9e56c9573b33b..4d4fc7da6816d 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -6,6 +6,7 @@ mkdocs-gen-files mkdocs-awesome-nav mkdocs-glightbox mkdocs-git-revision-date-localized-plugin +mkdocs-minify-plugin python-markdown-math regex ruff From 6e599eebe8655dab75462a8a165f6d811d0d845f Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 30 Jul 2025 22:35:47 +0800 Subject: [PATCH 066/224] [Bugfix] Fix OOM tests in initialization test (#21921) Signed-off-by: Isotr0py <2037008807@qq.com> --- tests/models/test_initialization.py | 14 ++++++++------ vllm/model_executor/models/glm4_1v.py | 1 + 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index d5441540176e8..4c7da24fca32a 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -33,12 +33,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") - # FIXME: Possible memory leak in the previous tests? 
- if model_arch in ("Glm4vForConditionalGeneration", - "GraniteSpeechForConditionalGeneration", - "KimiVLForConditionalGeneration"): - pytest.skip("Avoid OOM") - if model_arch in ("Llama4ForCausalLM", "EagleLlama4ForCausalLM"): from vllm.model_executor.models.llama4 import Llama4ForCausalLM from vllm.model_executor.models.registry import ModelRegistry @@ -87,6 +81,14 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, "num_hidden_layers": 1, }) + # e.g.: Qwen/Qwen2-Audio-7B-Instruct + if hasattr(hf_config, "audio_config"): + hf_config.audio_config.update({ + "num_layers": 1, + "num_hidden_layers": 1, + "encoder_layers": 1, + }) + return hf_config # Avoid calling model.forward() diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 1fd65cc9099b7..ae1bf22c704e5 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -1275,6 +1275,7 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, vllm_config=vllm_config, prefix=maybe_prefix(prefix, ""), architectures=["Glm4ForCausalLM"], + hf_config=self.config.get_text_config(), ) self.make_empty_intermediate_tensors = ( From 366f6b3a4d92ee0b2df8e5620a88ddf57afc3681 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 30 Jul 2025 23:42:05 +0800 Subject: [PATCH 067/224] [Bugfix] Fix multi-api server not working for text models (#21933) Signed-off-by: DarkLight1337 --- vllm/config.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 8e8c1198833c2..012a791a3c872 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -856,7 +856,7 @@ class ModelConfig: self.tokenizer = s3_tokenizer.dir def _init_multimodal_config(self) -> Optional["MultiModalConfig"]: - if self.registry.is_multimodal_model(self.architectures, self): + if self._model_info.supports_multimodal: return MultiModalConfig( limit_per_prompt=self.limit_mm_per_prompt, 
media_io_kwargs=self.media_io_kwargs, @@ -865,19 +865,6 @@ class ModelConfig: disable_mm_preprocessor_cache, interleave_mm_strings=self.interleave_mm_strings) - if self.limit_mm_per_prompt: - raise ValueError("`limit_mm_per_prompt` is only supported for " - "multimodal models.") - if self.mm_processor_kwargs: - raise ValueError("`mm_processor_kwargs` is only supported for " - "multimodal models.") - if self.disable_mm_preprocessor_cache: - raise ValueError("`disable_mm_preprocessor_cache` is only " - "supported for multimodal models.") - if self.interleave_mm_strings: - raise ValueError("`interleave_mm_strings` is only " - "supported for multimodal models.") - return None def _get_encoder_config(self): From ad510309ee10e5182b99ee94ddc5ace716c65050 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Wed, 30 Jul 2025 08:54:15 -0700 Subject: [PATCH 068/224] Override attention metadata for fast prefill in some KV sharing setups (#21590) Signed-off-by: Yong Hoon Shin --- tests/v1/e2e/test_kv_sharing_fast_prefill.py | 143 +++++++++++++++++++ vllm/config.py | 15 ++ vllm/engine/arg_utils.py | 6 + vllm/model_executor/models/gemma3n.py | 1 + vllm/v1/attention/backends/utils.py | 35 ++++- vllm/v1/worker/gpu_model_runner.py | 113 +++++++++++---- 6 files changed, 287 insertions(+), 26 deletions(-) create mode 100644 tests/v1/e2e/test_kv_sharing_fast_prefill.py diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py new file mode 100644 index 0000000000000..616fc7a860599 --- /dev/null +++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py @@ -0,0 +1,143 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import gc +import random +from typing import Optional, Union + +import pytest +import torch + +from vllm import LLM, SamplingParams +from vllm.config import CompilationConfig, CompilationLevel +from vllm.forward_context import 
get_forward_context +from vllm.model_executor.models.gemma3n import Gemma3nForConditionalGeneration +from vllm.model_executor.models.registry import ModelRegistry +from vllm.model_executor.models.utils import extract_layer_index +from vllm.sequence import IntermediateTensors + +from ...utils import fork_new_process_for_each_test + + +class TestGemma3nForConditionalGeneration(Gemma3nForConditionalGeneration): + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds, **kwargs) + attn_metadata = get_forward_context().attn_metadata + # attn_metadata is None during dummy runs + if (attn_metadata is not None + and self.cache_config.kv_sharing_fast_prefill): + assert isinstance(attn_metadata, dict) # true in V1 + # Gemma3n-E2B has 30 layers, with last 20 layers being + # cross-decoder layers. 
Check attention metadata is correct + for layer_name, metadata in attn_metadata.items(): + layer_idx = extract_layer_index(layer_name) + if layer_idx >= 20: + assert hasattr(metadata, 'logits_indices_padded') + assert hasattr(metadata, 'num_logits_indices') + else: + assert not hasattr(metadata, 'logits_indices_padded') + assert not hasattr(metadata, 'num_logits_indices') + + # Last layer will be a KV sharing layer + layer_attn_metadata = attn_metadata[ + self.model.language_model.layers[-1].self_attn.attn.layer_name] + logits_indices_padded = (layer_attn_metadata.logits_indices_padded) + assert logits_indices_padded is not None + num_logits_indices = layer_attn_metadata.num_logits_indices + assert num_logits_indices > 0 + # Reset hidden states to random values and + # only set logits at logits_indices to valid values + # Because logits_indices are the only positions that are used + # for output token sampling, this still produces same outputs + logits_hs = hidden_states[logits_indices_padded] + hidden_states = torch.randn_like(hidden_states) + gen_indices = logits_indices_padded[:num_logits_indices] + hidden_states[gen_indices] = logits_hs[:num_logits_indices] + + return hidden_states + + +@pytest.fixture +def test_prompts(): + """ + Adapted from tests/v1/e2e/test_spec_decode.py + """ + prompt_types = ["repeat", "sentence"] + # Setting higher num prompts increases the chance of numerics mismatch + # due to matrix multiplication numerics depending on batch dimension + num_prompts = 10 + prompts = [] + + random.seed(0) + random_prompt_type_choices = random.choices(prompt_types, k=num_prompts) + + for kind in random_prompt_type_choices: + word_choices = ["test", "temp", "hello", "where"] + word = random.choice(word_choices) + if kind == "repeat": + prompt = f"""please repeat the word '{word}' 10 times.""" + elif kind == "sentence": + prompt = f"""please give a ten-word sentence that + uses the word {word} at least once.""" + else: + raise ValueError(f"Unknown prompt 
type: {kind}") + prompts.append(prompt) + + return prompts + + +@fork_new_process_for_each_test +@pytest.mark.parametrize("enforce_eager", [True, False]) +def test_kv_sharing_fast_prefill( + monkeypatch: pytest.MonkeyPatch, + enforce_eager: bool, + test_prompts: list[str], +): + ModelRegistry.register_model("Gemma3nForConditionalGeneration", + TestGemma3nForConditionalGeneration) + sampling_params = SamplingParams(temperature=0.0, max_tokens=100) + compilation_config = CompilationConfig( + # This allows vLLM compilation backend to handle allocating and + # managing buffers for cudagraph + cudagraph_copy_inputs=True, + level=CompilationLevel.PIECEWISE + if not enforce_eager else CompilationLevel.NO_COMPILATION) + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + llm = LLM( + model="google/gemma-3n-E2B-it", + enforce_eager=enforce_eager, + compilation_config=compilation_config, + ) + ref_responses = llm.generate(test_prompts, sampling_params) + + del llm + gc.collect() + torch.cuda.empty_cache() + + llm = LLM(model="google/gemma-3n-E2B-it", + enforce_eager=enforce_eager, + compilation_config=compilation_config, + kv_sharing_fast_prefill=True) + optimized_responses = llm.generate(test_prompts, sampling_params) + + misses = 0 + + for ref_response, optimized_response in zip(ref_responses, + optimized_responses): + if ref_response.outputs[0].text != optimized_response.outputs[ + 0].text: + misses += 1 + + assert misses == 0 diff --git a/vllm/config.py b/vllm/config.py index 012a791a3c872..a330bafb76332 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1795,6 +1795,16 @@ class CacheConfig: num_cpu_blocks: Optional[int] = field(default=None, init=False) """The number of blocks to allocate for CPU memory.""" + kv_sharing_fast_prefill: bool = False + """This feature is work in progress and no prefill optimization takes place + with this flag enabled currently. + + In some KV sharing setups, e.g. 
YOCO (https://arxiv.org/abs/2405.05254), + some layers can skip tokens corresponding to prefill. This flag enables + attention metadata for eligible layers to be overridden with metadata + necessary for implementing this optimization in some models (e.g. Gemma3n) + """ + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, @@ -1836,6 +1846,11 @@ class CacheConfig: "GPU memory utilization must be less than 1.0. Got " f"{self.gpu_memory_utilization}.") + if self.kv_sharing_fast_prefill: + logger.warning_once( + "--kv-sharing-fast-prefill is currently work in progress " + "and not functional yet (i.e. no prefill savings)") + return self def _verify_cache_dtype(self) -> None: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6bdc3c361af34..ababa49a53ae4 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -445,6 +445,9 @@ class EngineArgs: # DEPRECATED enable_prompt_adapter: bool = False + kv_sharing_fast_prefill: bool = \ + CacheConfig.kv_sharing_fast_prefill + def __post_init__(self): # support `EngineArgs(compilation_config={...})` # without having to manually construct a @@ -697,6 +700,8 @@ class EngineArgs: **cache_kwargs["cpu_offload_gb"]) cache_group.add_argument("--calculate-kv-scales", **cache_kwargs["calculate_kv_scales"]) + cache_group.add_argument("--kv-sharing-fast-prefill", + **cache_kwargs["kv_sharing_fast_prefill"]) # Multimodal related configs multimodal_kwargs = get_kwargs(MultiModalConfig) @@ -1069,6 +1074,7 @@ class EngineArgs: prefix_caching_hash_algo=self.prefix_caching_hash_algo, cpu_offload_gb=self.cpu_offload_gb, calculate_kv_scales=self.calculate_kv_scales, + kv_sharing_fast_prefill=self.kv_sharing_fast_prefill, ) # Get the current placement group if Ray is initialized and diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index d0880103d4e86..a58b32793dbef 100644 --- a/vllm/model_executor/models/gemma3n.py +++
b/vllm/model_executor/models/gemma3n.py @@ -793,6 +793,7 @@ class Gemma3nForConditionalGeneration(nn.Module): del lora_config # Unused. super().__init__() self.config = config + self.cache_config = vllm_config.cache_config self.model = Gemma3nModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) self.logits_processor = LogitsProcessor( diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index d1599ba10b618..36bacf0cb36f8 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -3,8 +3,8 @@ import abc import functools from abc import abstractmethod -from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Generic, Optional, TypeVar +from dataclasses import dataclass, make_dataclass +from typing import TYPE_CHECKING, Any, ClassVar, Generic, Optional, TypeVar import numpy as np import torch @@ -508,3 +508,34 @@ def reorder_batch_to_split_decodes_and_prefills( modified_batch = True return modified_batch + + +KV_SHARING_FAST_PREFILL_METADATA_FIELDS = [ + ('logits_indices_padded', Optional[torch.Tensor], None), + ('num_logits_indices', int, 0), +] + + +def subclass_attention_metadata( + name_prefix: str, + metadata_cls: Any, + fields: list[tuple[str, Any, Any]], +) -> Any: + """ + Return a new subclass of `metadata_cls` with additional fields + """ + name: str = name_prefix + metadata_cls.__name__ # type: ignore + Wrapped = make_dataclass(name, fields, bases=(metadata_cls, )) + return Wrapped + + +def make_kv_sharing_fast_prefill_attention_metadata( + metadata_cls: Any, ) -> Any: + """ + Return a new subclass of `metadata_cls` for fast prefill + """ + return subclass_attention_metadata( + name_prefix="KVSharingFastPrefill", + metadata_cls=metadata_cls, + fields=KV_SHARING_FAST_PREFILL_METADATA_FIELDS, + ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3befb6adf2753..987ef22a1b7fb 100644 --- 
a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import dataclasses import gc import time from contextlib import contextmanager @@ -47,6 +48,7 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, from vllm.v1.attention.backends.mamba_selectors import get_mamba_attn_backend from vllm.v1.attention.backends.utils import ( AttentionMetadataBuilder, CommonAttentionMetadata, + make_kv_sharing_fast_prefill_attention_metadata, make_local_attention_virtual_batches) from vllm.v1.core.encoder_cache_manager import compute_encoder_budget from vllm.v1.kv_cache_interface import (AttentionSpec, @@ -320,6 +322,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # means this layer will perform attention using the keys and values # from the KV cache of `shared_kv_cache_layers[layer_name]`. self.shared_kv_cache_layers: dict[str, str] = {} + self.kv_sharing_fast_prefill_eligible_layers: set[str] = set() + + self.kv_sharing_fast_prefill_logits_indices = None + if self.cache_config.kv_sharing_fast_prefill: + self.kv_sharing_fast_prefill_logits_indices = torch.zeros( + self.max_num_tokens, dtype=torch.int32, device=self.device) def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None: """ @@ -735,6 +743,55 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): spec_decode_common_attn_metadata = None + use_spec_decode = len( + scheduler_output.scheduled_spec_decode_tokens) > 0 + if not use_spec_decode: + # NOTE(woosuk): Due to chunked prefills, the batch may contain + # partial requests. While we should not sample any token + # from these partial requests, we do so for simplicity. + # We will ignore the sampled tokens from the partial requests. + # TODO: Support prompt logprobs. 
+ logits_indices = query_start_loc[1:] - 1 + spec_decode_metadata = None + else: + # Get the number of draft tokens for each request. + # Iterate over the dictionary rather than all requests since not all + # requests have draft tokens. + num_draft_tokens = np.zeros(num_reqs, dtype=np.int32) + for req_id, draft_token_ids in ( + scheduler_output.scheduled_spec_decode_tokens.items()): + req_idx = self.input_batch.req_id_to_index[req_id] + num_draft_tokens[req_idx] = len(draft_token_ids) + + spec_decode_metadata = self._calc_spec_decode_metadata( + num_draft_tokens, cu_num_tokens) + logits_indices = spec_decode_metadata.logits_indices + + logits_indices_padded = None + if self.cache_config.kv_sharing_fast_prefill: + assert self.kv_sharing_fast_prefill_logits_indices is not None + num_logits = logits_indices.shape[0] + assert num_logits > 0 + self.kv_sharing_fast_prefill_logits_indices[:num_logits].copy_( + logits_indices) + # There might be leftover indices in logits_indices[num_logits:] + # from previous iterations, whose values may be greater than the + # batch size in the current iteration. To ensure indices are always + # valid, we fill the padded indices with the last index. + self.kv_sharing_fast_prefill_logits_indices[num_logits:].fill_( + logits_indices[-1].item()) + if (self.use_cuda_graph + and num_logits <= self.cudagraph_batch_sizes[-1]): + # Use piecewise CUDA graphs. + # Add padding to the batch size.
+ num_logits_padded = self.vllm_config.pad_for_cudagraph( + num_logits) + else: + num_logits_padded = num_logits + logits_indices_padded = ( + self.kv_sharing_fast_prefill_logits_indices[:num_logits_padded] + ) + attn_metadata: dict[str, Any] = {} # Prepare encoder attention metadata separately @@ -806,7 +863,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): common_attn_metadata=common_attn_metadata, )) + fast_prefill_metadata = attn_metadata_i + if (self.cache_config.kv_sharing_fast_prefill + and self.kv_sharing_fast_prefill_eligible_layers): + # Dynamically create a dataclass type that inherits + # from attention metadata type but includes additional + # fields logits_indices_padded and num_logits_indices + # which are required for prefill truncation + fast_prefill_metadata_type = ( + make_kv_sharing_fast_prefill_attention_metadata( + metadata_cls=type(attn_metadata_i), )) + fast_prefill_metadata = fast_prefill_metadata_type( + **dataclasses.asdict(attn_metadata_i), + logits_indices_padded=logits_indices_padded, + num_logits_indices=logits_indices.size(0), + ) + for layer_name in kv_cache_group_spec.layer_names: + if (self.cache_config.kv_sharing_fast_prefill and layer_name + in self.kv_sharing_fast_prefill_eligible_layers): + attn_metadata[layer_name] = fast_prefill_metadata + continue + attn_metadata[layer_name] = attn_metadata_i # Hack for now to fix chunked local attention + no hybrid kv cache @@ -838,30 +916,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): b.can_run_in_cudagraph(common_attn_metadata) for b in self.attn_metadata_builders) - use_spec_decode = len( - scheduler_output.scheduled_spec_decode_tokens) > 0 - if not use_spec_decode: - # NOTE(woosuk): Due to chunked prefills, the batch may contain - # partial requests. While we should not sample any token - # from these partial requests, we do so for simplicity. - # We will ignore the sampled tokens from the partial requests.
- # TODO: Support prompt logprobs. - logits_indices = query_start_loc[1:] - 1 - spec_decode_metadata = None - else: - # Get the number of draft tokens for each request. - # Iterate over the dictionary rather than all requests since not all - # requests have draft tokens. - num_draft_tokens = np.zeros(num_reqs, dtype=np.int32) - for req_id, draft_token_ids in ( - scheduler_output.scheduled_spec_decode_tokens.items()): - req_idx = self.input_batch.req_id_to_index[req_id] - num_draft_tokens[req_idx] = len(draft_token_ids) - - spec_decode_metadata = self._calc_spec_decode_metadata( - num_draft_tokens, cu_num_tokens) - logits_indices = spec_decode_metadata.logits_indices - # Hot-Swap lora model if self.lora_config: self.set_active_loras(self.input_batch, num_scheduled_tokens) @@ -1433,6 +1487,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): spec_decode_metadata, num_scheduled_tokens_np, spec_decode_common_attn_metadata) = ( self._prepare_inputs(scheduler_output)) + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -2814,6 +2869,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): kv_cache_config.kv_cache_groups, kv_caches, ) + attn_layers = get_layers_from_vllm_config(self.vllm_config, + Attention) + # Iterate in reversed order and add layers that re-use KV cache + # e.g. in YOCO-like KV sharing setups (e.g. 
Gemma3n) + for layer_name in reversed(attn_layers): + if layer_name in self.shared_kv_cache_layers: + self.kv_sharing_fast_prefill_eligible_layers.add( + layer_name) + else: + break bind_kv_cache(kv_caches, self.compilation_config.static_forward_context, From 5c765aec65d0f978cc2ad42164a5da2d3e0cf071 Mon Sep 17 00:00:00 2001 From: 633WHU Date: Wed, 30 Jul 2025 23:54:44 +0800 Subject: [PATCH 069/224] [Bugfix] Fix TypeError in scheduler when comparing mixed request_id types (#21816) Signed-off-by: chiliu Co-authored-by: chiliu --- tests/v1/engine/test_engine_core.py | 72 +++++++++++++++++++++++------ vllm/v1/engine/core.py | 5 ++ 2 files changed, 64 insertions(+), 13 deletions(-) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index bbdc73e9608a1..eb826bf06236f 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -236,7 +236,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): Test that the engine can handle multiple concurrent batches. """ - def make_request_with_max_tokens(req_id: int, + def make_request_with_max_tokens(req_id: str, max_tokens: int) -> EngineCoreRequest: request = make_request() request.request_id = req_id @@ -297,16 +297,16 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): assert engine_core.batch_queue is not None # Add two requests in a row. Each request have 12 prompt tokens. 
- req0 = make_request_with_max_tokens(0, 5) + req0 = make_request_with_max_tokens("0", 5) engine_core.add_request(req0) - req1 = make_request_with_max_tokens(1, 5) + req1 = make_request_with_max_tokens("1", 5) engine_core.add_request(req1) # Schedule Batch 1: (10, req0) assert engine_core.step_with_batch_queue()[0] is None assert engine_core.batch_queue.qsize() == 1 scheduler_output = engine_core.batch_queue.queue[-1][1] - assert scheduler_output.num_scheduled_tokens[0] == 10 + assert scheduler_output.num_scheduled_tokens["0"] == 10 # num_computed_tokens should have been updated immediately. assert engine_core.scheduler.requests[ req0.request_id].num_computed_tokens == 10 @@ -315,11 +315,11 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): assert engine_core.step_with_batch_queue()[0] is None assert engine_core.batch_queue.qsize() == 2 scheduler_output = engine_core.batch_queue.queue[-1][1] - assert scheduler_output.num_scheduled_tokens[0] == 2 - assert scheduler_output.num_scheduled_tokens[1] == 8 + assert scheduler_output.num_scheduled_tokens["0"] == 2 + assert scheduler_output.num_scheduled_tokens["1"] == 8 # num_computed_tokens should have been updated immediately. - assert engine_core.scheduler.requests[0].num_computed_tokens == 12 - assert engine_core.scheduler.requests[1].num_computed_tokens == 8 + assert engine_core.scheduler.requests["0"].num_computed_tokens == 12 + assert engine_core.scheduler.requests["1"].num_computed_tokens == 8 assert engine_core.scheduler.get_num_unfinished_requests() == 2 @@ -331,7 +331,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): engine_core.step_with_batch_queue() assert engine_core.batch_queue.qsize() == 2 scheduler_output = engine_core.batch_queue.queue[-1][1] - assert scheduler_output.num_scheduled_tokens[1] == 4 + assert scheduler_output.num_scheduled_tokens["1"] == 4 # Batch queue is full. Finish Batch 2. Get first token of req0. 
output = engine_core.step_with_batch_queue()[0].get(0) @@ -343,7 +343,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): engine_core.step_with_batch_queue() assert engine_core.batch_queue.qsize() == 2 scheduler_output = engine_core.batch_queue.queue[-1][1] - assert scheduler_output.num_scheduled_tokens[0] == 1 + assert scheduler_output.num_scheduled_tokens["0"] == 1 # Batch queue is full. Finish Batch 3. Get first token of req1. output = engine_core.step_with_batch_queue()[0].get(0) @@ -355,14 +355,14 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): engine_core.step_with_batch_queue() assert engine_core.batch_queue.qsize() == 2 scheduler_output = engine_core.batch_queue.queue[-1][1] - assert scheduler_output.num_scheduled_tokens[1] == 1 + assert scheduler_output.num_scheduled_tokens["1"] == 1 # Loop until req0 is finished. step = 0 req_id = 0 expected_num_tokens = [ - engine_core.scheduler.requests[0].num_tokens + 1, - engine_core.scheduler.requests[1].num_tokens + 1, + engine_core.scheduler.requests["0"].num_tokens + 1, + engine_core.scheduler.requests["1"].num_tokens + 1, ] while engine_core.scheduler.get_num_unfinished_requests() == 2: output = engine_core.step_with_batch_queue()[0] @@ -413,3 +413,49 @@ def test_engine_core_tp(monkeypatch: pytest.MonkeyPatch): get_worker_cache_config_field, args=("num_cpu_blocks", )) assert all(x is not None for x in num_gpu_blocks) assert all(x is not None for x in num_cpu_blocks) + + +@create_new_process_for_each_test() +def test_engine_core_invalid_request_id_type(monkeypatch: pytest.MonkeyPatch): + """Test that engine raises TypeError for non-string request_id.""" + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + engine_args = EngineArgs(model=MODEL_NAME) + vllm_config = engine_args.create_engine_config() + executor_class = Executor.get_class(vllm_config) + + with set_default_torch_num_threads(1): + engine_core = EngineCore(vllm_config=vllm_config, + 
executor_class=executor_class, + log_stats=True) + + # Test with UUID object (common mistake) + uuid_request = make_request() + uuid_request.request_id = uuid.uuid4() # UUID object instead of string + + with pytest.raises(TypeError, + match="request_id must be a string, got.*UUID"): + engine_core.add_request(uuid_request) + + # Test with integer + int_request = make_request() + int_request.request_id = 12345 + + with pytest.raises(TypeError, + match="request_id must be a string, got.*int"): + engine_core.add_request(int_request) + + # Test with None + none_request = make_request() + none_request.request_id = None + + with pytest.raises(TypeError, + match="request_id must be a string, got.*NoneType"): + engine_core.add_request(none_request) + + # Verify engine is still functional after errors + valid_request = make_request() + engine_core.add_request(valid_request) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 0 diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index cad93061e65b0..39fda521f36af 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -207,6 +207,11 @@ class EngineCore: def add_request(self, request: EngineCoreRequest): """Add request to the scheduler.""" + # Validate the request_id type. 
+ if not isinstance(request.request_id, str): + raise TypeError( + f"request_id must be a string, got {type(request.request_id)}") + if pooling_params := request.pooling_params: supported_pooling_tasks = [ task for task in self.get_supported_tasks() From 004203e95330ac9a878df8192619570b0770667e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 31 Jul 2025 00:10:41 +0800 Subject: [PATCH 070/224] [CI/Build] Fix registry tests (#21934) Signed-off-by: DarkLight1337 --- tests/models/registry.py | 16 +++++++---- vllm/model_executor/models/mpt.py | 20 ++++++------- vllm/model_executor/models/telechat2.py | 15 ++++++++-- vllm/transformers_utils/config.py | 5 ++-- vllm/transformers_utils/configs/__init__.py | 2 ++ vllm/transformers_utils/configs/nvlm_d.py | 31 +++++++++++++++++++++ 6 files changed, 70 insertions(+), 19 deletions(-) create mode 100644 vllm/transformers_utils/configs/nvlm_d.py diff --git a/tests/models/registry.py b/tests/models/registry.py index caa691039fce3..8fcff5a8c5113 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -170,8 +170,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { min_transformers_version="4.54"), "Ernie4_5_MoeForCausalLM": _HfExamplesInfo("baidu/ERNIE-4.5-21B-A3B-PT", min_transformers_version="4.54"), - "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501 - "Exaone4ForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-4.0-32B"), # noqa: E501 + "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", + trust_remote_code=True), + "Exaone4ForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-4.0-32B", + min_transformers_version="4.54"), "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501 "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base", @@ -199,8 +201,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "HunYuanMoEV1ForCausalLM": 
_HfExamplesInfo("tencent/Hunyuan-A13B-Instruct", trust_remote_code=True), + # TODO: Remove is_available_online once their config.json is fixed "HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124", - trust_remote_code=True), + trust_remote_code=True, + is_available_online=False), "HCXVisionForCausalLM": _HfExamplesInfo( "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", trust_remote_code=True), @@ -275,7 +279,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"), # noqa: E501 "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"), "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), - "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"), + "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct", + trust_remote_code=True), "TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B", trust_remote_code=True), "TeleFLMForCausalLM": _HfExamplesInfo("CofeAI/FLM-2-52B-Instruct-2407", @@ -449,7 +454,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { max_model_len=4096), "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B"), "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"), # noqa: E501 - "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"), + "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B", + trust_remote_code=True), "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"), # noqa: E501 "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501 trust_remote_code=True), diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index c243f575ae54a..8db52a69924c9 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -8,7 +8,7 @@ from typing import Optional, Union import torch import torch.nn as nn -from transformers import PretrainedConfig 
+from transformers import MptConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -50,7 +50,7 @@ class MPTAttention(nn.Module): def __init__( self, - config: PretrainedConfig, + config: MptConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -59,15 +59,15 @@ class MPTAttention(nn.Module): self.d_model = config.d_model self.total_num_heads = config.n_heads self.head_dim = self.d_model // self.total_num_heads - self.clip_qkv = config.attn_config["clip_qkv"] - self.qk_ln = config.attn_config["qk_ln"] - self.alibi_bias_max = config.attn_config["alibi_bias_max"] + self.clip_qkv = config.attn_config.clip_qkv + self.qk_ln = config.attn_config.qk_ln + self.alibi_bias_max = config.attn_config.alibi_bias_max if "kv_n_heads" in config.attn_config: - self.total_num_kv_heads = config.attn_config['kv_n_heads'] + self.total_num_kv_heads = config.attn_config.kv_n_heads else: self.total_num_kv_heads = self.total_num_heads - assert not config.attn_config["prefix_lm"] - assert config.attn_config["alibi"] + assert not config.attn_config.prefix_lm + assert config.attn_config.alibi # pylint: disable=invalid-name self.Wqkv = QKVParallelLinear( @@ -144,7 +144,7 @@ class MPTMLP(nn.Module): def __init__( self, - config: PretrainedConfig, + config: MptConfig, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() @@ -176,7 +176,7 @@ class MPTBlock(nn.Module): def __init__( self, - config: PretrainedConfig, + config: MptConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py index f0b31b1332fb1..49a7677151a94 100644 --- a/vllm/model_executor/models/telechat2.py +++ b/vllm/model_executor/models/telechat2.py @@ -37,9 +37,20 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, 
class TeleChat2Model(LlamaModel): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + hf_config = vllm_config.model_config.hf_config + + vllm_config.model_config.hf_config.attribute_map = { + "num_hidden_layers": "n_layer", + "num_attention_heads": "n_head", + "intermediate_size": "ffn_hidden_size", + "rms_norm_eps": "layer_norm_epsilon" + } + vllm_config.model_config.hf_config.hidden_act = "silu" + # 1. Initialize the LlamaModel with bias - vllm_config.model_config.hf_config.bias = True - vllm_config.model_config.hf_config.mlp_bias = True + hf_config.bias = True + hf_config.mlp_bias = True + super().__init__(vllm_config=vllm_config, prefix=prefix) # 2. Remove the bias from the qkv_proj and gate_up_proj based on config # Telechat2's gate_up_proj and qkv_proj don't have bias diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 40a6a9118e53e..4ce56cb3a6aac 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -34,8 +34,8 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, DeepseekVLV2Config, KimiVLConfig, MedusaConfig, MllamaConfig, MLPSpeculatorConfig, Nemotron_Nano_VL_Config, - NemotronConfig, RWConfig, - UltravoxConfig) + NemotronConfig, NVLM_D_Config, + RWConfig, UltravoxConfig) # yapf: enable from vllm.transformers_utils.configs.mistral import adapt_config_dict from vllm.transformers_utils.utils import check_gguf_file @@ -81,6 +81,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = { "medusa": MedusaConfig, "eagle": EAGLEConfig, "nemotron": NemotronConfig, + "NVLM_D": NVLM_D_Config, "ultravox": UltravoxConfig, **_CONFIG_REGISTRY_OVERRIDE_HF } diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 0fcb2beb8c7db..7c7d859e4a325 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -23,6 +23,7 @@ from vllm.transformers_utils.configs.moonvit import 
MoonViTConfig from vllm.transformers_utils.configs.nemotron import NemotronConfig from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config +from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config from vllm.transformers_utils.configs.ultravox import UltravoxConfig __all__ = [ @@ -39,5 +40,6 @@ __all__ = [ "NemotronConfig", "NemotronHConfig", "Nemotron_Nano_VL_Config", + "NVLM_D_Config", "UltravoxConfig", ] diff --git a/vllm/transformers_utils/configs/nvlm_d.py b/vllm/transformers_utils/configs/nvlm_d.py new file mode 100644 index 0000000000000..edfc506882ff5 --- /dev/null +++ b/vllm/transformers_utils/configs/nvlm_d.py @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Adapted from +# https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py +# -------------------------------------------------------- +# NVLM-D +# Copyright (c) 2024 NVIDIA +# Licensed under Apache 2.0 License [see LICENSE for details] +# -------------------------------------------------------- +from transformers import Qwen2Config +from transformers.configuration_utils import PretrainedConfig + + +class NVLM_D_Config(PretrainedConfig): + model_type = 'NVLM_D' + is_composition = True + + def __init__(self, vision_config=None, llm_config=None, **kwargs): + super().__init__(**kwargs) + + # Handle vision_config initialization + if vision_config is None: + vision_config = {} + + # Handle llm_config initialization + if llm_config is None: + llm_config = {} + + self.vision_config = PretrainedConfig(**vision_config) + self.text_config = Qwen2Config(**llm_config) From 4904e53c3277e92c881bf2a1442805bdc3da983f Mon Sep 17 00:00:00 2001 From: Chenguang Zheng <645327136@qq.com> Date: Thu, 31 Jul 2025 00:18:37 +0800 Subject: [PATCH 071/224] [Bugfix] SharedStorage Connector for V1 PD multimodal (#21611) Signed-off-by: 
fake0fan <645327136@qq.com> Signed-off-by: herotai214 Co-authored-by: herotai214 --- .../unit/test_shared_storage_connector.py | 215 ++++++++++++++++++ .../v1/shared_storage_connector.py | 41 +++- 2 files changed, 244 insertions(+), 12 deletions(-) create mode 100644 tests/v1/kv_connector/unit/test_shared_storage_connector.py diff --git a/tests/v1/kv_connector/unit/test_shared_storage_connector.py b/tests/v1/kv_connector/unit/test_shared_storage_connector.py new file mode 100644 index 0000000000000..ee3e71d3b8452 --- /dev/null +++ b/tests/v1/kv_connector/unit/test_shared_storage_connector.py @@ -0,0 +1,215 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import asdict +from typing import NamedTuple + +from PIL import Image + +from vllm import LLM, EngineArgs, SamplingParams +from vllm.assets.image import ImageAsset +from vllm.config import KVTransferConfig +from vllm.multimodal.utils import encode_image_base64 + +MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct" + +SAMPLING_PARAMS = SamplingParams(temperature=0.0, top_k=1, max_tokens=128) + +TEXT_PROMPTS = [ + "What's in the image(s)? Around 30 words. 
What's special in 2nd image?", + "The future of AI is", +] + + +class InputCase(NamedTuple): + text: str + img: list[Image] + expected_len: int + info: str + + +def _check_path_len(path): + """Return the latest length in path""" + return len(list(path.iterdir())) + + +def _list_path(path): + """Return the list of foldername (hashes generated) under the path""" + return list(path.iterdir()) + + +def run_test(tmp_path, processor, llm: LLM, question: str, + image_urls: list[Image], expected_len: int, info: str): + """ + One individual test to process the prompt and output base on 1 set of input + Then check if the length in the storage path matches the expected length + `info` introduces details or purpose of the individual test + """ + print(f"***info: {info}***") + print( + f"**Expected storage path length after llm generate: {expected_len}**") + process_prompt(processor, llm, question, image_urls) + + print(f"Path matched expected length: {_check_path_len(tmp_path)}") + print(f"Hashes under the storage path: {_list_path(tmp_path)}") + + assert _check_path_len(tmp_path) == expected_len, ( + f"Expect storage path length {expected_len} ;", + f"but end up {_check_path_len(tmp_path)} instead. ", f"Info: {info}") + + +def process_prompt(processor, llm: LLM, question: str, + image_urls: list[Image]): + """ + Form the prompt based on the text and image input, then llm generate output + """ + placeholders = [{ + "type": "image_url", + "image_url": { + "url": f"data:image;base64,{encode_image_base64(image_pil)}" + } + } for image_pil in image_urls] + + messages = [ + { + "role": "system", + "content": "You are a helpful assistant." 
+ }, + { + "role": "user", + "content": [ + *placeholders, + { + "type": "text", + "text": question + }, + ], + }, + ] + + prompt = processor.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + outputs = llm.generate( + { + "prompt": + prompt, + **({ + "multi_modal_data": { + "image": [*image_urls] + } + } if image_urls else {}) + }, + sampling_params=SAMPLING_PARAMS, + ) + + print("-" * 50) + print("Output:") + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + print("-" * 50) + + +def test_shared_storage_connector_hashes(tmp_path): + """ + Tests that SharedStorageConnector saves KV to the storage locations + with proper hashes; that are unique for inputs with identical text but + different images (same size), or same multiple images but different orders. + """ + # Using tmp_path as the storage path to store KV + print(f"KV storage path at: {str(tmp_path)}") + + # Configure the SharedStorageConnector + kv_transfer_config = KVTransferConfig( + kv_connector="SharedStorageConnector", + kv_role="kv_both", + kv_connector_extra_config={"shared_storage_path": str(tmp_path)}) + + engine_args = EngineArgs( + model=MODEL_NAME, + max_model_len=8192, + max_num_seqs=1, + kv_transfer_config=kv_transfer_config, + limit_mm_per_prompt={"image": 2}, + ) + + # don't put this import at the top level + # it will call torch.cuda.device_count() + from transformers import AutoProcessor # noqa: F401 + + # Create processor to handle the chat prompt + processor = AutoProcessor.from_pretrained(MODEL_NAME) + + # Prepare images for the tests + # Resize to the same size to check hashes correctness + image_1 = ImageAsset("stop_sign").pil_image.resize((1280, 720)) + image_2 = ImageAsset("cherry_blossom").pil_image.resize((1280, 720)) + + # Make sure that they are not the same picture + assert image_1 != image_2, "The images should not be identical" + + # Create the LLM instance + engine_args = asdict(engine_args) + llm = 
LLM(**engine_args) + + # Prepare the input cases + input_cases = [ + InputCase(text=TEXT_PROMPTS[0], + img=[image_1], + expected_len=1, + info="image_1 single input the first time."), + InputCase(text=TEXT_PROMPTS[0], + img=[image_2], + expected_len=2, + info=("image_2 single input the first time. " + "It is in same pixel size with image_1, yet it " + "should be able to form a new unique hash.")), + InputCase(text=TEXT_PROMPTS[0], + img=[image_1], + expected_len=2, + info=("image_1 single input the 2nd time. " + "It should not form aother new hash.")), + InputCase(text=TEXT_PROMPTS[0], + img=[image_2], + expected_len=2, + info=("image_2 single input the 2nd time. " + "It should not form aother new hash.")), + InputCase(text=TEXT_PROMPTS[0], + img=[image_1, image_2], + expected_len=3, + info="image_1 with image_2 input the first time."), + InputCase(text=TEXT_PROMPTS[0], + img=[image_2, image_1], + expected_len=4, + info="The image order is swapped. Should form new hash."), + InputCase(text=TEXT_PROMPTS[0], + img=[image_1, image_2], + expected_len=4, + info=("[image_1, image_2] input the 2nd time. " + "It should not form aother new hash.")), + InputCase(text=TEXT_PROMPTS[0], + img=[image_2, image_1], + expected_len=4, + info=("[image_2, image_1] input the 2nd time. 
" + "It should not form aother new hash.")), + InputCase(text=TEXT_PROMPTS[0], + img=[], + expected_len=5, + info="Pure text input test as a case-control"), + InputCase(text=TEXT_PROMPTS[0], + img=[], + expected_len=5, + info="Identical pure text input as a case-control"), + InputCase(text=TEXT_PROMPTS[1], + img=[], + expected_len=6, + info="Another pure text input as a case-control"), + ] + + # Run tests + for case_id, (text, img, expected_len, info) in enumerate(input_cases): + print("\n", "=" * 25, f"Below running input case: {case_id}", "=" * 25) + run_test(tmp_path, processor, llm, text, img, expected_len, info) + + print("All tests passed successfully!") diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 048748e6b8ecb..fd79387269d56 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -32,10 +32,11 @@ class ReqMeta: slot_mapping: torch.Tensor # Is store or load is_store: bool + mm_hashes: list[str] @staticmethod def make_meta(token_ids: list[int], block_ids: list[int], block_size: int, - is_store: bool) -> "ReqMeta": + is_store: bool, mm_hashes: list[str]) -> "ReqMeta": valid_num_tokens = align_to_block_size(len(token_ids), block_size) token_ids_tensor = torch.tensor(token_ids)[:valid_num_tokens] block_ids_tensor = torch.tensor(block_ids) @@ -48,6 +49,7 @@ class ReqMeta: token_ids=token_ids_tensor, slot_mapping=slot_mapping, is_store=is_store, + mm_hashes=mm_hashes, ) @@ -64,9 +66,11 @@ class SharedStorageConnectorMetadata(KVConnectorMetadata): block_ids: list[int], block_size: int, is_store: bool, + mm_hashes: list[str], ) -> None: self.requests.append( - ReqMeta.make_meta(token_ids, block_ids, block_size, is_store)) + ReqMeta.make_meta(token_ids, block_ids, block_size, is_store, + mm_hashes)) class 
SharedStorageConnector(KVConnectorBase_V1): @@ -169,7 +173,7 @@ class SharedStorageConnector(KVConnectorBase_V1): forward_context.virtual_engine] filename = self._generate_filename_debug( - layer_name, request.token_ids) + layer_name, request.token_ids, request.mm_hashes) kv_cache = safetensors.torch.load_file( filename)["kv_cache"].cuda() inject_kv_into_layer(kv_cache_layer, kv_cache, @@ -221,7 +225,7 @@ class SharedStorageConnector(KVConnectorBase_V1): for request in connector_metadata.requests: if request.is_store: filename = self._generate_filename_debug( - layer_name, request.token_ids) + layer_name, request.token_ids, request.mm_hashes) kv_cache = extract_kv_from_layer(kv_layer, request.slot_mapping) tensors = {"kv_cache": kv_cache.detach().cpu()} @@ -299,7 +303,8 @@ class SharedStorageConnector(KVConnectorBase_V1): meta.add_request(token_ids=new_req.prompt_token_ids, block_ids=new_req.block_ids[0], block_size=self._block_size, - is_store=False) + is_store=False, + mm_hashes=new_req.mm_hashes) total_need_load += 1 else: # NOTE: here, we set the store and load being exclusive, @@ -310,7 +315,8 @@ class SharedStorageConnector(KVConnectorBase_V1): meta.add_request(token_ids=new_req.prompt_token_ids, block_ids=new_req.block_ids[0], block_size=self._block_size, - is_store=True) + is_store=True, + mm_hashes=new_req.mm_hashes) cached_reqs = scheduler_output.scheduled_cached_reqs for i, req_id in enumerate(cached_reqs.req_ids): @@ -338,7 +344,8 @@ class SharedStorageConnector(KVConnectorBase_V1): meta.add_request(token_ids=token_ids, block_ids=block_ids, block_size=self._block_size, - is_store=False) + is_store=False, + mm_hashes=request.mm_hashes) total_need_load += 1 assert total_need_load == len(self._requests_need_load) @@ -359,20 +366,28 @@ class SharedStorageConnector(KVConnectorBase_V1): len(request.prompt_token_ids) - 1, self._block_size) foldername = self._generate_foldername_debug(torch.tensor( request.prompt_token_ids)[:num_tokens_to_check], + 
request.mm_hashes, create_folder=False) return os.path.exists(foldername) def _generate_foldername_debug( self, - input_ids: torch.Tensor, + token_ids: torch.Tensor, + mm_hashes: list[str], create_folder=False, ) -> str: """Generate a folder name based on the hash of the bytes of the input ids. """ - input_ids_bytes = input_ids.numpy().tobytes() - input_ids_hash = hashlib.md5(input_ids_bytes, + token_bytes = token_ids.numpy().tobytes() + # Add mm_hashes to the bytes being hashed to avoid path traversal and + # to create a canonical key. + if mm_hashes: + mm_str = "-".join(mm_hashes) + token_bytes += mm_str.encode('utf-8') + input_ids_hash = hashlib.md5(token_bytes, usedforsecurity=False).hexdigest() + foldername = os.path.join(self._storage_path, input_ids_hash) if create_folder: os.makedirs(foldername, exist_ok=True) @@ -381,12 +396,14 @@ class SharedStorageConnector(KVConnectorBase_V1): def _generate_filename_debug( self, layer_name: str, - input_ids: torch.Tensor, + token_ids: torch.Tensor, + mm_hashes: list[str], ) -> str: """Generate a file name based on the layer name and the hash of the bytes of the input ids. 
""" - foldername = self._generate_foldername_debug(input_ids, + foldername = self._generate_foldername_debug(token_ids, + mm_hashes=mm_hashes, create_folder=True) return os.path.join(foldername, f"{layer_name}.safetensors") From f4135232b9a8c4845f8961fb1cd17581c56ae2ce Mon Sep 17 00:00:00 2001 From: wxsm Date: Thu, 31 Jul 2025 00:41:51 +0800 Subject: [PATCH 072/224] feat(distributed): add `get_required_kvcache_layout` class method to kv connector api (#20433) Signed-off-by: wxsm --- tests/distributed/test_kvlayout.py | 72 +++++++++++++++++++ .../kv_transfer/kv_connector/base.py | 16 ++++- .../kv_transfer/kv_connector/factory.py | 37 +++++----- .../kv_transfer/kv_connector/utils.py | 19 ++--- .../kv_transfer/kv_connector/v1/base.py | 14 ++++ .../kv_connector/v1/multi_connector.py | 33 +++++++++ .../kv_connector/v1/nixl_connector.py | 23 +++++- 7 files changed, 186 insertions(+), 28 deletions(-) create mode 100644 tests/distributed/test_kvlayout.py diff --git a/tests/distributed/test_kvlayout.py b/tests/distributed/test_kvlayout.py new file mode 100644 index 0000000000000..d447876f6cc7c --- /dev/null +++ b/tests/distributed/test_kvlayout.py @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm.config import (DeviceConfig, KVTransferConfig, ModelConfig, + VllmConfig, set_current_vllm_config) +from vllm.distributed.kv_transfer.kv_connector.utils import ( + get_kv_connector_cache_layout) +from vllm.logger import init_logger + +logger = init_logger("test_expert_parallel") + + +def test_get_kv_connector_cache_layout_without_kv_connector(): + vllm_config = VllmConfig(device_config=DeviceConfig("cpu")) + with set_current_vllm_config(vllm_config): + # Test with default settings + layout = get_kv_connector_cache_layout() + assert layout == "NHD" + + +def test_get_kv_connector_cache_layout_with_lmcache_connector(): + kv_transfer_config = KVTransferConfig( + kv_connector="LMCacheConnectorV1", + 
kv_role="kv_both", + ) + vllm_config = VllmConfig(device_config=DeviceConfig("cpu"), + kv_transfer_config=kv_transfer_config) + with set_current_vllm_config(vllm_config): + # Test with default settings + layout = get_kv_connector_cache_layout() + assert layout == "NHD" + + +def test_get_kv_connector_cache_layout_with_nixl_connector(): + kv_transfer_config = KVTransferConfig( + kv_connector="NixlConnector", + kv_role="kv_both", + ) + model_config = ModelConfig() + vllm_config = VllmConfig(device_config=DeviceConfig("cpu"), + model_config=model_config, + kv_transfer_config=kv_transfer_config) + with set_current_vllm_config(vllm_config): + # Test with default settings + layout = get_kv_connector_cache_layout() + assert layout == "HND" + + +def test_get_kv_connector_cache_layout_with_multi_connector(): + kv_transfer_config = KVTransferConfig(kv_connector="MultiConnector", + kv_role="kv_both", + kv_connector_extra_config={ + "connectors": [{ + "kv_connector": + "SharedStorageConnector", + "kv_role": + "kv_both" + }, { + "kv_connector": + "NixlConnector", + "kv_role": + "kv_both" + }] + }) + model_config = ModelConfig() + vllm_config = VllmConfig(device_config=DeviceConfig("cpu"), + model_config=model_config, + kv_transfer_config=kv_transfer_config) + with set_current_vllm_config(vllm_config): + # Test with default settings + layout = get_kv_connector_cache_layout() + assert layout == "HND" diff --git a/vllm/distributed/kv_transfer/kv_connector/base.py b/vllm/distributed/kv_transfer/kv_connector/base.py index 181c33925da76..868b227fc8994 100644 --- a/vllm/distributed/kv_transfer/kv_connector/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/base.py @@ -9,7 +9,7 @@ The class provides two primary abstract methods: """ from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING, Optional, Union import torch @@ -124,5 +124,19 @@ class KVConnectorBase(ABC): raise NotImplementedError + @classmethod + def 
get_required_kvcache_layout( + cls, vllm_config: "VllmConfig") -> Optional[str]: + """ + Get the required KV cache layout for this connector. + Args: + vllm_config (VllmConfig): the vllm config. + + Returns: + str: the required KV cache layout. e.g. HND, or NHD. + None if the connector does not require a specific layout. + """ + return None + KVConnectorBaseType = Union[KVConnectorBase, KVConnectorBase_V1] diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index be9ce72dea67a..cf7cde2c43771 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -5,6 +5,7 @@ import importlib from typing import TYPE_CHECKING, Callable import vllm.envs as envs +from vllm.config import KVTransferConfig from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1, KVConnectorRole) @@ -41,14 +42,27 @@ class KVConnectorFactory: raise ValueError("Attempting to initialize a V0 Connector, " f"but found {envs.VLLM_USE_V1=}") - connector_name = config.kv_transfer_config.kv_connector - if connector_name not in cls._registry: - raise ValueError(f"Unsupported connector type: {connector_name}") - - connector_cls = cls._registry[connector_name]() + connector_cls = cls.get_connector_class(config.kv_transfer_config) assert issubclass(connector_cls, KVConnectorBase) return connector_cls(rank, local_rank, config) + @classmethod + def get_connector_class( + cls, kv_transfer_config: "KVTransferConfig" + ) -> type[KVConnectorBaseType]: + """Get the connector class by name.""" + connector_name = kv_transfer_config.kv_connector + if connector_name in cls._registry: + connector_cls = cls._registry[connector_name]() + else: + connector_module_path = kv_transfer_config.kv_connector_module_path + if connector_module_path is None: + raise ValueError( + f"Unsupported connector 
type: {connector_name}") + connector_module = importlib.import_module(connector_module_path) + connector_cls = getattr(connector_module, connector_name) + return connector_cls + @classmethod def create_connector_v1( cls, @@ -60,19 +74,10 @@ class KVConnectorFactory: f"but found {envs.VLLM_USE_V1=}") kv_transfer_config = config.kv_transfer_config - connector_name = kv_transfer_config.kv_connector - if connector_name in cls._registry: - connector_cls = cls._registry[connector_name]() - else: - connector_module_path = kv_transfer_config.kv_connector_module_path - if connector_module_path is None: - raise ValueError( - f"Unsupported connector type: {connector_name}") - connector_module = importlib.import_module(connector_module_path) - connector_cls = getattr(connector_module, connector_name) + connector_cls = cls.get_connector_class(kv_transfer_config) assert issubclass(connector_cls, KVConnectorBase_V1) logger.info("Creating v1 connector with name: %s and engine_id: %s", - connector_name, kv_transfer_config.engine_id) + connector_cls.__name__, kv_transfer_config.engine_id) # NOTE(Kuntai): v1 connector is explicitly separated into two roles. # Scheduler connector: # - Co-locate with scheduler process diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index 459a532989140..559c233947ce8 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -13,6 +13,8 @@ import torch import vllm.envs as envs from vllm import _custom_ops as ops from vllm.config import VllmConfig, get_current_vllm_config +from vllm.distributed.kv_transfer.kv_connector.factory import ( + KVConnectorFactory) from vllm.logger import init_logger from vllm.v1.outputs import ModelRunnerOutput @@ -103,15 +105,14 @@ def get_kv_connector_cache_layout(): # used for faster transfer. 
vllm_config = get_current_vllm_config() kv_config = vllm_config.kv_transfer_config - if kv_config is not None and vllm_config.model_config is None: - logger.warning_once("Unable to detect current VLLM config. " \ - "Defaulting to NHD kv cache layout.") - elif kv_config is not None: - use_mla = vllm_config.model_config.use_mla - if not use_mla and kv_config.kv_connector == "NixlConnector": - logger.info_once("NixlConnector detected. Setting KV cache " \ - "layout to HND for better xfer performance.") - return "HND" + if kv_config is not None: + connector_cls = KVConnectorFactory.get_connector_class(kv_config) + required_kvcache_layout = connector_cls.get_required_kvcache_layout( + vllm_config) + if required_kvcache_layout is not None: + return required_kvcache_layout + logger.info_once("Connectors do not specify a " \ + "kv cache layout, defaulting to NHD.") return "NHD" diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 8bbdd7e0621c6..7a2ccb58656fd 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -299,3 +299,17 @@ class KVConnectorBase_V1(ABC): returned by the engine. """ return False, None + + @classmethod + def get_required_kvcache_layout( + cls, vllm_config: "VllmConfig") -> Optional[str]: + """ + Get the required KV cache layout for this connector. + Args: + vllm_config (VllmConfig): the vllm config. + + Returns: + str: the required KV cache layout. e.g. HND, or NHD. + None if the connector does not require a specific layout. 
+ """ + return None diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index a2eaa0040191e..934a03a12ee5e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -202,3 +202,36 @@ class MultiConnector(KVConnectorBase_V1): self._requests_to_connector.pop(request.request_id, None) return async_saves > 0, kv_txfer_params + + @classmethod + def get_required_kvcache_layout( + cls, vllm_config: "VllmConfig") -> Optional[str]: + """ + Get the required KV cache layout for this connector. + Args: + vllm_config (VllmConfig): the vllm config. + + Returns: + str: the required KV cache layout. e.g. HND, or NHD. + None if the connector does not require a specific layout. + """ + ktcs = vllm_config.kv_transfer_config.kv_connector_extra_config.get( + "connectors") + assert ktcs is not None + layouts: set[str] = set() + temp_vllm_config = copy.copy(vllm_config) + for ktc in ktcs: + kv_transfer_config = KVTransferConfig(**ktc) + temp_vllm_config.kv_transfer_config = kv_transfer_config + required_kvcache_layout = KVConnectorFactory.get_connector_class( + kv_transfer_config).get_required_kvcache_layout( + temp_vllm_config) + if required_kvcache_layout is not None: + layouts.add(required_kvcache_layout) + + if len(layouts) > 1: + raise ValueError(f"KV cache layout mismatch: " + f"found {len(layouts)} different layouts " + f"({', '.join(layouts) })." 
+ f"All connectors must use the same layout.") + return next(iter(layouts), None) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 6d86ab7f7a4c2..e7fc2b118145c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -133,6 +133,25 @@ class NixlConnector(KVConnectorBase_V1): self.connector_worker = NixlConnectorWorker( vllm_config, self.engine_id) + ############################################################ + # Class Methods + ############################################################ + @classmethod + def get_required_kvcache_layout(cls, vllm_config: VllmConfig): + if vllm_config.model_config is None: + logger.warning_once("Unable to detect current VLLM config. " + "Fallback to default kv cache layout.") + return None + use_mla = vllm_config.model_config.use_mla + if use_mla: + # return None when we have mla + # as the layout should not matter in that case, + # which fallback to the default behavior. + return None + logger.info_once("NixlConnector setting KV cache " + "layout to HND for better xfer performance.") + return "HND" + ############################################################ # Scheduler Side Methods ############################################################ @@ -236,13 +255,13 @@ class NixlConnectorScheduler: """ For remote prefill, pull all prompt blocks from remote asynchronously relative to engine execution. - + Args: request (Request): the request object. num_computed_tokens (int): the number of locally computed tokens for this request Returns: - * the number of tokens that can be loaded from the + * the number of tokens that can be loaded from the external KV cache beyond what is already computed. * true if the external KV cache tokens will be loaded asynchronously (between scheduler steps). 
From 8f0d5167155247934d247eb10ae086108db8d473 Mon Sep 17 00:00:00 2001 From: wenxindongwork <161090399+wenxindongwork@users.noreply.github.com> Date: Wed, 30 Jul 2025 10:02:12 -0700 Subject: [PATCH 073/224] [TPU] Support Pathways in vLLM (#21417) Signed-off-by: wenxindongwork --- vllm/envs.py | 5 +++++ vllm/platforms/__init__.py | 18 ++++++++++++------ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index ec4b0888d0f40..19bc9156b2586 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -124,6 +124,7 @@ if TYPE_CHECKING: VLLM_V1_USE_OUTLINES_CACHE: bool = False VLLM_TPU_BUCKET_PADDING_GAP: int = 0 VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None + VLLM_TPU_USING_PATHWAYS: bool = False VLLM_USE_DEEP_GEMM: bool = False VLLM_USE_FLASHINFER_MOE_FP8: bool = False VLLM_USE_FLASHINFER_MOE_FP4: bool = False @@ -900,6 +901,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_TPU_MOST_MODEL_LEN": lambda: maybe_convert_int(os.environ.get("VLLM_TPU_MOST_MODEL_LEN", None)), + # Whether using Pathways + "VLLM_TPU_USING_PATHWAYS": + lambda: bool("proxy" in os.getenv("JAX_PLATFORMS", "").lower()), + # Allow use of DeepGemm kernels for fused moe ops. 
"VLLM_USE_DEEP_GEMM": lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))), diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index c13659f8a06e6..56edb8629e45b 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -1,11 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - import logging import traceback from itertools import chain from typing import TYPE_CHECKING, Optional +from vllm import envs from vllm.plugins import load_plugins_by_group from vllm.utils import resolve_obj_by_qualname, supports_xccl @@ -31,20 +31,26 @@ def vllm_version_matches_substr(substr: str) -> bool: def tpu_platform_plugin() -> Optional[str]: - is_tpu = False logger.debug("Checking if TPU platform is available.") + + # Check for Pathways TPU proxy + if envs.VLLM_TPU_USING_PATHWAYS: + logger.debug("Confirmed TPU platform is available via Pathways proxy.") + return "tpu_commons.platforms.tpu_jax.TpuPlatform" + + # Check for libtpu installation try: # While it's technically possible to install libtpu on a # non-TPU machine, this is a very uncommon scenario. Therefore, - # we assume that libtpu is installed if and only if the machine + # we assume that libtpu is installed only if the machine # has TPUs. 
+ import libtpu # noqa: F401 - is_tpu = True logger.debug("Confirmed TPU platform is available.") + return "vllm.platforms.tpu.TpuPlatform" except Exception as e: logger.debug("TPU platform is not available because: %s", str(e)) - - return "vllm.platforms.tpu.TpuPlatform" if is_tpu else None + return None def cuda_platform_plugin() -> Optional[str]: From 56bd537dde023f2d8372257255af45fa784ee739 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 30 Jul 2025 18:20:20 +0100 Subject: [PATCH 074/224] [Misc] Support more collective_rpc return types (#21845) Signed-off-by: Nick Hill --- tests/v1/engine/test_engine_core_client.py | 65 +++++++++++++++++++++- vllm/v1/engine/__init__.py | 9 ++- vllm/v1/engine/core.py | 6 +- vllm/v1/engine/core_client.py | 3 +- vllm/v1/serial_utils.py | 44 +++++++++++++++ 5 files changed, 121 insertions(+), 6 deletions(-) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 2ac6dc796bd10..f648c38a63f79 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -6,8 +6,9 @@ import os import signal import time import uuid +from dataclasses import dataclass from threading import Thread -from typing import Optional +from typing import Optional, Union from unittest.mock import MagicMock import pytest @@ -292,6 +293,68 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch): client.shutdown() +@dataclass +class MyDataclass: + message: str + + +# Dummy utility function to monkey-patch into engine core. +def echo_dc( + self, + msg: str, + return_list: bool = False, +) -> Union[MyDataclass, list[MyDataclass]]: + print(f"echo dc util function called: {msg}") + # Return dataclass to verify support for returning custom types + # (for which there is special handling to make it work with msgspec). 
+ return [MyDataclass(msg) for _ in range(3)] if return_list \ + else MyDataclass(msg) + + +@pytest.mark.asyncio(loop_scope="function") +async def test_engine_core_client_util_method_custom_return( + monkeypatch: pytest.MonkeyPatch): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + # Must set insecure serialization to allow returning custom types. + m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + + # Monkey-patch core engine utility function to test. + m.setattr(EngineCore, "echo_dc", echo_dc, raising=False) + + engine_args = EngineArgs(model=MODEL_NAME, enforce_eager=True) + vllm_config = engine_args.create_engine_config( + usage_context=UsageContext.UNKNOWN_CONTEXT) + executor_class = Executor.get_class(vllm_config) + + with set_default_torch_num_threads(1): + client = EngineCoreClient.make_client( + multiprocess_mode=True, + asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=True, + ) + + try: + # Test utility method returning custom / non-native data type. 
+ core_client: AsyncMPClient = client + + result = await core_client.call_utility_async( + "echo_dc", "testarg2", False) + assert isinstance(result, + MyDataclass) and result.message == "testarg2" + result = await core_client.call_utility_async( + "echo_dc", "testarg2", True) + assert isinstance(result, list) and all( + isinstance(r, MyDataclass) and r.message == "testarg2" + for r in result) + finally: + client.shutdown() + + @pytest.mark.parametrize( "multiprocessing_mode,publisher_config", [(True, "tcp"), (False, "inproc")], diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 79dc80d8fc547..810d03f32d726 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -123,6 +123,13 @@ class EngineCoreOutput( return self.finish_reason is not None +class UtilityResult: + """Wrapper for special handling when serializing/deserializing.""" + + def __init__(self, r: Any = None): + self.result = r + + class UtilityOutput( msgspec.Struct, array_like=True, # type: ignore[call-arg] @@ -132,7 +139,7 @@ class UtilityOutput( # Non-None implies the call failed, result should be None. 
failure_message: Optional[str] = None - result: Any = None + result: Optional[UtilityResult] = None class EngineCoreOutputs( diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 39fda521f36af..9f2fca6961388 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -36,7 +36,7 @@ from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType, ReconfigureDistributedRequest, ReconfigureRankType, - UtilityOutput) + UtilityOutput, UtilityResult) from vllm.v1.engine.mm_input_cache import MirroredProcessingCache from vllm.v1.engine.utils import EngineHandshakeMetadata, EngineZmqAddresses from vllm.v1.executor.abstract import Executor @@ -715,8 +715,8 @@ class EngineCoreProc(EngineCore): output = UtilityOutput(call_id) try: method = getattr(self, method_name) - output.result = method( - *self._convert_msgspec_args(method, args)) + result = method(*self._convert_msgspec_args(method, args)) + output.result = UtilityResult(result) except BaseException as e: logger.exception("Invocation of %s method failed", method_name) output.failure_message = (f"Call to {method_name} method" diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index acff5bf6823d9..fdf5a5de191c0 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -552,7 +552,8 @@ def _process_utility_output(output: UtilityOutput, if output.failure_message is not None: future.set_exception(Exception(output.failure_message)) else: - future.set_result(output.result) + assert output.result is not None + future.set_result(output.result.result) class SyncMPClient(MPClient): diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 03200c2c2f8ec..4b6a983252b0e 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses +import importlib import 
pickle from collections.abc import Sequence from inspect import isclass @@ -9,6 +10,7 @@ from types import FunctionType from typing import Any, Optional, Union import cloudpickle +import msgspec import numpy as np import torch import zmq @@ -22,6 +24,7 @@ from vllm.multimodal.inputs import (BaseMultiModalField, MultiModalFlatField, MultiModalKwargs, MultiModalKwargsItem, MultiModalSharedField, NestedTensors) +from vllm.v1.engine import UtilityResult logger = init_logger(__name__) @@ -46,6 +49,10 @@ def _log_insecure_serialization_warning(): "VLLM_ALLOW_INSECURE_SERIALIZATION=1") +def _typestr(t: type): + return t.__module__, t.__qualname__ + + class MsgpackEncoder: """Encoder with custom torch tensor and numpy array serialization. @@ -122,6 +129,18 @@ class MsgpackEncoder: for itemlist in mm._items_by_modality.values() for item in itemlist] + if isinstance(obj, UtilityResult): + result = obj.result + if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION or result is None: + return None, result + # Since utility results are not strongly typed, we also encode + # the type (or a list of types in the case it's a list) to + # help with correct msgspec deserialization. 
+ cls = result.__class__ + return _typestr(cls) if cls is not list else [ + _typestr(type(v)) for v in result + ], result + if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION: raise TypeError(f"Object of type {type(obj)} is not serializable" "Set VLLM_ALLOW_INSECURE_SERIALIZATION=1 to allow " @@ -237,8 +256,33 @@ class MsgpackDecoder: k: self._decode_nested_tensors(v) for k, v in obj.items() }) + if t is UtilityResult: + return self._decode_utility_result(obj) return obj + def _decode_utility_result(self, obj: Any) -> UtilityResult: + result_type, result = obj + if result_type is not None: + if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION: + raise TypeError("VLLM_ALLOW_INSECURE_SERIALIZATION must " + "be set to use custom utility result types") + assert isinstance(result_type, list) + if len(result_type) == 2 and isinstance(result_type[0], str): + result = self._convert_result(result_type, result) + else: + assert isinstance(result, list) + result = [ + self._convert_result(rt, r) + for rt, r in zip(result_type, result) + ] + return UtilityResult(result) + + def _convert_result(self, result_type: Sequence[str], result: Any): + mod_name, name = result_type + mod = importlib.import_module(mod_name) + result_type = getattr(mod, name) + return msgspec.convert(result, result_type, dec_hook=self.dec_hook) + def _decode_ndarray(self, arr: Any) -> np.ndarray: dtype, shape, data = arr # zero-copy decode. 
We assume the ndarray will not be kept around, From b9b753e7a7d95311186bbfc2b30b643a2f9e6ca1 Mon Sep 17 00:00:00 2001 From: Doug Smith Date: Wed, 30 Jul 2025 16:04:40 -0400 Subject: [PATCH 075/224] For VLLM_USE_PRECOMPILED, only compiled .so files should be extracted (#21964) --- setup.py | 79 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 35 deletions(-) diff --git a/setup.py b/setup.py index 58e5833f16ae1..bf3391e2db19e 100644 --- a/setup.py +++ b/setup.py @@ -371,40 +371,31 @@ class repackage_wheel(build_ext): raise SetupError( f"Failed to get vLLM wheel from {wheel_location}") from e - # During a docker build: determine correct filename, copy wheel. - if envs.VLLM_DOCKER_BUILD_CONTEXT: - dist_dir = "/workspace/dist" - os.makedirs(dist_dir, exist_ok=True) - # Determine correct wheel filename from METADATA - with zipfile.ZipFile(wheel_path, "r") as z: - metadata_file = next( - (n for n in z.namelist() - if n.endswith(".dist-info/METADATA")), - None, - ) - if not metadata_file: - raise RuntimeError( - "Could not find METADATA in precompiled wheel.") - metadata = z.read(metadata_file).decode() - version_line = next((line for line in metadata.splitlines() - if line.startswith("Version: ")), None) - if not version_line: - raise RuntimeError( - "Could not determine version from METADATA.") - version = version_line.split(": ")[1].strip() + # Set the dist_dir for Docker build context + dist_dir = ("/workspace/dist" + if envs.VLLM_DOCKER_BUILD_CONTEXT else "dist") + os.makedirs(dist_dir, exist_ok=True) - # Build correct filename using internal version - arch_tag = "cp38-abi3-manylinux1_x86_64" - corrected_wheel_name = f"vllm-{version}-{arch_tag}.whl" - final_wheel_path = os.path.join(dist_dir, corrected_wheel_name) - - print(f"Docker build context detected, copying precompiled wheel " - f"({version}) to {final_wheel_path}") - shutil.copy2(wheel_path, final_wheel_path) - return - - # Unzip the wheel when not in Docker context + 
# Extract only necessary compiled .so files from precompiled wheel with zipfile.ZipFile(wheel_path) as wheel: + # Get version from METADATA (optional, mostly useful for logging) + metadata_file = next((n for n in wheel.namelist() + if n.endswith(".dist-info/METADATA")), None) + if not metadata_file: + raise RuntimeError( + "Could not find METADATA in precompiled wheel.") + metadata = wheel.read(metadata_file).decode() + version_line = next((line for line in metadata.splitlines() + if line.startswith("Version: ")), None) + if not version_line: + raise RuntimeError( + "Could not determine version from METADATA.") + version = version_line.split(": ")[1].strip() + + print(f"Extracting precompiled kernels from vLLM wheel version: " + f"{version}") + + # List of compiled shared objects to extract files_to_copy = [ "vllm/_C.abi3.so", "vllm/_moe_C.abi3.so", @@ -413,6 +404,7 @@ class repackage_wheel(build_ext): "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", "vllm/cumem_allocator.abi3.so", ] + file_members = list( filter(lambda x: x.filename in files_to_copy, wheel.filelist)) compiled_regex = re.compile( @@ -430,9 +422,26 @@ class repackage_wheel(build_ext): if package_name not in package_data: package_data[package_name] = [] - wheel.extract(file) - if not file_name.endswith(".py"): - package_data[package_name].append(file_name) + output_base = (dist_dir + if envs.VLLM_DOCKER_BUILD_CONTEXT else ".") + target_path = os.path.join(output_base, file.filename) + os.makedirs(os.path.dirname(target_path), exist_ok=True) + with wheel.open(file.filename) as src, open(target_path, + "wb") as dst: + shutil.copyfileobj(src, dst) + + package_data[package_name].append(file_name) + + # Copy wheel into dist dir for Docker to consume (e.g., via --mount) + if envs.VLLM_DOCKER_BUILD_CONTEXT: + arch_tag = "cp38-abi3-manylinux1_x86_64" + corrected_wheel_name = f"vllm-{version}-{arch_tag}.whl" + final_wheel_path = os.path.join(dist_dir, corrected_wheel_name) + + print( + "Docker build context 
detected, copying precompiled wheel to " + f"{final_wheel_path}") + shutil.copy2(wheel_path, final_wheel_path) def _no_device() -> bool: From f12d9256b39f058b93c201cedc7ffd9e605e9db8 Mon Sep 17 00:00:00 2001 From: Ming Yang Date: Wed, 30 Jul 2025 13:15:06 -0700 Subject: [PATCH 076/224] [Misc] Use dracut on CentOS and skip clone if repo exists for EP kernel installation (#21635) Signed-off-by: Ming Yang --- tools/ep_kernels/configure_system_drivers.sh | 12 +++++- tools/ep_kernels/install_python_libraries.sh | 40 +++++++++++++++++++- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/tools/ep_kernels/configure_system_drivers.sh b/tools/ep_kernels/configure_system_drivers.sh index cf15c1daccaec..b8bd8b8f6f550 100644 --- a/tools/ep_kernels/configure_system_drivers.sh +++ b/tools/ep_kernels/configure_system_drivers.sh @@ -2,6 +2,16 @@ set -ex # turn on IBGDA echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"' | tee -a /etc/modprobe.d/nvidia.conf -update-initramfs -u + +if command -v update-initramfs &> /dev/null; then + # for Debian/Ubuntu + sudo update-initramfs -u +elif command -v dracut &> /dev/null; then + # for Fedora/CentOS + sudo dracut --force +else + echo "No supported initramfs update tool found." 
+ exit 1 +fi echo "Please reboot the system to apply the changes" diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh index 83643c084bf9a..9d1b2da3b4122 100644 --- a/tools/ep_kernels/install_python_libraries.sh +++ b/tools/ep_kernels/install_python_libraries.sh @@ -53,9 +53,45 @@ popd export CMAKE_PREFIX_PATH=$WORKSPACE/nvshmem_install:$CMAKE_PREFIX_PATH +is_git_dirty() { + local dir=$1 + pushd "$dir" > /dev/null + + if [ -d ".git" ] && [ -n "$(git status --porcelain 2>/dev/null)" ]; then + popd > /dev/null + return 0 # dirty (true) + else + popd > /dev/null + return 1 # clean (false) + fi +} + +# Function to handle git repository cloning with dirty/incomplete checks +clone_repo() { + local repo_url=$1 + local dir_name=$2 + local key_file=$3 + + if [ -d "$dir_name" ]; then + # Check if directory has uncommitted changes (dirty) + if is_git_dirty "$dir_name"; then + echo "$dir_name directory is dirty, skipping clone" + # Check if clone failed (directory exists but not a valid git repo or missing key files) + elif [ ! -d "$dir_name/.git" ] || [ ! 
-f "$dir_name/$key_file" ]; then + echo "$dir_name directory exists but clone appears incomplete, cleaning up and re-cloning" + rm -rf "$dir_name" + git clone "$repo_url" + else + echo "$dir_name directory exists and appears complete; manually update if needed" + fi + else + git clone "$repo_url" + fi +} + # build and install pplx, require pytorch installed pushd $WORKSPACE -git clone https://github.com/ppl-ai/pplx-kernels +clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" cd pplx-kernels # see https://github.com/pypa/pip/issues/9955#issuecomment-838065925 # PIP_NO_BUILD_ISOLATION=0 disables build isolation @@ -64,7 +100,7 @@ popd # build and install deepep, require pytorch installed pushd $WORKSPACE -git clone https://github.com/deepseek-ai/DeepEP +clone_repo "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py" cd DeepEP export NVSHMEM_DIR=$WORKSPACE/nvshmem_install PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e . From 287f527f5403bb42a32136cf6c802faeb92a09ef Mon Sep 17 00:00:00 2001 From: cascade Date: Wed, 30 Jul 2025 14:23:41 -0700 Subject: [PATCH 077/224] [Feature] Add async tensor parallelism for scaled mm (#20155) Signed-off-by: cascade812 --- tests/compile/test_async_tp.py | 143 ++++++++++++- vllm/compilation/collective_fusion.py | 244 ++++++++++++++++++++++- vllm/compilation/sequence_parallelism.py | 2 +- 3 files changed, 381 insertions(+), 8 deletions(-) diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py index 916ec2b83df4f..9a51e6b3514f4 100644 --- a/tests/compile/test_async_tp.py +++ b/tests/compile/test_async_tp.py @@ -22,6 +22,8 @@ from ..utils import (compare_two_settings, create_new_process_for_each_test, multi_gpu_test) from .backend import TestBackend +FP8_DTYPE = current_platform.fp8_dtype() + prompts = [ "Hello, my name is", "The president of the United States is", @@ -32,9 +34,10 @@ prompts = [ class TestMMRSModel(torch.nn.Module): - def __init__(self, hidden_size=16): + def 
__init__(self, hidden_size=16, dtype=torch.float16): super().__init__() self.hidden_size = hidden_size + self.dtype = dtype self.gate_proj = torch.nn.Parameter(torch.empty( (self.hidden_size * 2, hidden_size)), requires_grad=False) @@ -64,9 +67,10 @@ class TestMMRSModel(torch.nn.Module): class TestAGMMModel(torch.nn.Module): - def __init__(self, hidden_size=16): + def __init__(self, hidden_size=16, dtype=torch.float16): super().__init__() self.hidden_size = hidden_size + self.dtype = dtype self.weight = torch.nn.Parameter(torch.empty( (hidden_size, hidden_size)), requires_grad=False) @@ -91,8 +95,125 @@ class TestAGMMModel(torch.nn.Module): return [torch.ops.symm_mem.fused_all_gather_matmul.default] +class _BaseScaledMMModel(torch.nn.Module): + + def __init__(self, hidden_size=16, dtype=torch.float16): + super().__init__() + self.hidden_size = hidden_size + self.dtype = dtype + self.weight = torch.empty([hidden_size, hidden_size], dtype=FP8_DTYPE)\ + .contiguous().transpose(0, 1) + + # Initialize scale_b for _scaled_mm. 
+ self.scale_b = torch.ones(1, self.hidden_size, dtype=torch.float32) + + +class TestScaledMMRSModel(_BaseScaledMMModel): + + def forward(self, input: torch.Tensor): + """ + Forward pass implementing the scaled_mm + reduce scatter in the FX graph + + """ + fp8_input = input.to(FP8_DTYPE) + scale_a = torch.ones(input.shape[0], 1, dtype=torch.float32) + scaled_mm = torch._scaled_mm(fp8_input, + self.weight, + scale_a=scale_a, + scale_b=self.scale_b, + out_dtype=self.dtype) + reduce_scatter = tensor_model_parallel_reduce_scatter(scaled_mm, dim=0) + return reduce_scatter + + def ops_in_model_before(self): + return [torch.ops.vllm.reduce_scatter.default] + + def ops_in_model_after(self): + return [torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter.default] + + +class TestAGScaledMMModel(_BaseScaledMMModel): + + def forward(self, input: torch.Tensor): + """ + Forward pass implementing the all gather + scaled_mm in the FX graph + """ + # Reshape input + fp8_input = input.to(FP8_DTYPE) + all_gather = tensor_model_parallel_all_gather(fp8_input, dim=0) + + scale_a = torch.ones(all_gather.shape[0], 1, dtype=torch.float32) + scaled_mm = torch._scaled_mm(all_gather, + self.weight, + scale_a=scale_a, + scale_b=self.scale_b, + out_dtype=self.dtype) + return scaled_mm + + def ops_in_model_before(self): + return [torch.ops.vllm.all_gather.default] + + def ops_in_model_after(self): + return [torch.ops.symm_mem.fused_all_gather_scaled_matmul.default] + + +class TestCutlassScaledMMRSModel(_BaseScaledMMModel): + + def forward(self, input: torch.Tensor): + """ + Forward pass implementing the cutlass_scaled_mm + reduce scatter + in the FX graph + + """ + fp8_input = input.to(FP8_DTYPE) + scale_a = torch.ones(input.shape[0], 1, dtype=torch.float32) + mm_out = torch.empty((fp8_input.shape[0], self.weight.shape[1]), + dtype=self.dtype, + device=input.device) + torch.ops._C.cutlass_scaled_mm(mm_out, fp8_input, self.weight, scale_a, + self.scale_b, None) + reduce_scatter = 
tensor_model_parallel_reduce_scatter(mm_out, dim=0) + return reduce_scatter + + def ops_in_model_before(self): + return [torch.ops.vllm.reduce_scatter.default] + + def ops_in_model_after(self): + return [torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter.default] + + +class TestAGCutlassScaledMMModel(_BaseScaledMMModel): + + def forward(self, input: torch.Tensor): + """ + Forward pass implementing the all gather + cutlass_scaled_mm + in the FX graph + """ + # Reshape input + fp8_input = input.to(FP8_DTYPE) + all_gather = tensor_model_parallel_all_gather(fp8_input, dim=0) + + scale_a = torch.ones(all_gather.shape[0], 1, dtype=torch.float32) + + mm_out = torch.empty((all_gather.shape[0], self.weight.shape[1]), + dtype=self.dtype, + device=all_gather.device) + torch.ops._C.cutlass_scaled_mm(mm_out, all_gather, self.weight, + scale_a, self.scale_b, None) + return mm_out + + def ops_in_model_before(self): + return [torch.ops.vllm.all_gather.default] + + def ops_in_model_after(self): + return [torch.ops.symm_mem.fused_all_gather_scaled_matmul.default] + + @multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("test_model", [TestMMRSModel, TestAGMMModel]) +@pytest.mark.parametrize("test_model", [ + TestMMRSModel, TestAGMMModel, TestScaledMMRSModel, TestAGScaledMMModel, + TestCutlassScaledMMRSModel, TestAGCutlassScaledMMModel +]) @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seq_len", [16]) @pytest.mark.parametrize("hidden_size", [16]) @@ -101,6 +222,14 @@ class TestAGMMModel(torch.nn.Module): reason="Only test on CUDA") def test_async_tp_pass_replace(test_model: str, batch_size: int, seq_len: int, hidden_size: int, dtype: torch.dtype): + if test_model in (TestScaledMMRSModel, TestAGScaledMMModel, + TestCutlassScaledMMRSModel, + TestAGCutlassScaledMMModel) and dtype == torch.float16: + pytest.skip( + "Only bf16 high precision output types are supported for " \ + "per-token (row-wise) scaling" + ) + num_processes = 2 def run_torch_spawn(fn, 
nprocs): @@ -155,7 +284,8 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int, async_tp_pass = AsyncTPPass(vllm_config) backend = TestBackend(async_tp_pass) - model = test_model_cls(hidden_size) + model = test_model_cls(hidden_size, + dtype) # Pass dtype to model constructor hidden_states = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype, @@ -174,7 +304,10 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int, @create_new_process_for_each_test() -@pytest.mark.parametrize("model_id", ["meta-llama/Llama-3.2-1B-Instruct"]) +@pytest.mark.parametrize("model_id", [ + "meta-llama/Llama-3.2-1B-Instruct", + "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8" +]) @pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("async_tp_enabled", [True]) @pytest.mark.parametrize("distributed_backend", ["mp"]) diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index 0e7961841bd33..cb99fe8310e73 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -15,10 +15,13 @@ from vllm.distributed import get_tp_group, tensor_model_parallel_all_reduce from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op from .vllm_inductor_pass import VllmInductorPass +FP8_DTYPE = current_platform.fp8_dtype() + if find_spec("flashinfer"): try: import flashinfer.comm as flashinfer_comm @@ -28,7 +31,6 @@ if find_spec("flashinfer"): flashinfer_comm = None else: flashinfer_comm = None -from vllm.platforms import current_platform logger = init_logger(__name__) @@ -118,6 +120,230 @@ class AllGatherGEMMPattern(BasePattern): pm.fwd_only, pm_pass) +class ScaledMMReduceScatterPattern(BasePattern): + + def get_inputs(self): + input = torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE) + mm_weight = 
torch.empty([16, 16], device=self.device, + dtype=FP8_DTYPE).contiguous().transpose(0, 1) + scale_a = torch.empty([16, 1], device=self.device, dtype=torch.float32) + scale_b = torch.empty([1, 16], device=self.device, dtype=torch.float32) + return [input, mm_weight, scale_a, scale_b] + + def register(self, pm_pass: PatternMatcherPass): + + def pattern(input: torch.Tensor, mat2: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor) -> torch.Tensor: + scaled_mm = torch.ops.aten._scaled_mm.default(input, + mat2=mat2, + scale_a=scale_a, + scale_b=scale_b, + bias=None, + scale_result=None, + out_dtype=self.dtype) + reduce_scatter = torch.ops.vllm.reduce_scatter.default( + scaled_mm, + dim=0, + world_size=self.tp_size, + group_name=self.tp.unique_name) + return reduce_scatter + + def replacement(input: torch.Tensor, mat2: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor) -> torch.Tensor: + gemm_rs = torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter( + input, + mat2, + scale_a, + scale_b, + "avg", + scatter_dim=0, + out_dtype=self.dtype, + group_name=self.tp.device_group.group_name, + ) + + return gemm_rs + + pm.register_replacement(pattern, replacement, self.get_inputs(), + pm.fwd_only, pm_pass) + + +class AllGatherScaledMMPattern(BasePattern): + + def get_inputs(self): + x = torch.empty([8, 16], device=self.device, dtype=FP8_DTYPE) + weight = torch.empty([16, 16], device=self.device, + dtype=FP8_DTYPE).contiguous().transpose(0, 1) + + s1 = x.shape[0] * self.tp_size + + scale_a = torch.empty([s1, 1], device=self.device, dtype=torch.float32) + scale_b = torch.empty([1, 16], device=self.device, dtype=torch.float32) + + return [x, weight, scale_a, scale_b] + + def register(self, pm_pass: PatternMatcherPass): + + def pattern( + x: torch.Tensor, + weight: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + ) -> torch.Tensor: + all_gather = torch.ops.vllm.all_gather.default( + x, + dim=0, + world_size=self.tp_size, + 
group_name=self.tp.unique_name) + + return torch.ops.aten._scaled_mm.default(all_gather, + mat2=weight, + scale_a=scale_a, + scale_b=scale_b, + bias=None, + scale_result=None, + out_dtype=self.dtype) + + def replacement(x: torch.Tensor, weight: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor) -> torch.Tensor: + ag_output, mm_outputs = torch.ops.symm_mem.fused_all_gather_scaled_matmul( # noqa + x, + [weight], + scale_a, + [scale_b], + gather_dim=0, + biases=[None], + result_scales=[None], + out_dtypes=[self.dtype], + use_fast_accum=[False], + group_name=self.tp.device_group.group_name, + ) + return mm_outputs + + pm.register_replacement(pattern, replacement, self.get_inputs(), + pm.fwd_only, pm_pass) + + +class CutlassScaledMMReduceScatterPattern(BasePattern): + + def get_inputs(self): + input = torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE) + mm_weight = torch.empty([16, 16], device=self.device, + dtype=FP8_DTYPE).contiguous().transpose(0, 1) + scale_a = torch.empty([16, 1], device=self.device, dtype=torch.float32) + scale_b = torch.empty([1, 16], device=self.device, dtype=torch.float32) + + cutlass_mm_output = torch.empty([16, 16], + device=self.device, + dtype=self.dtype) + return [input, mm_weight, scale_a, scale_b, cutlass_mm_output] + + def register(self, pm_pass: PatternMatcherPass): + + def pattern(input: torch.Tensor, weight: torch.Tensor, + scale_a: torch.Tensor, scale_b: torch.Tensor, + cutlass_mm_output: torch.Tensor) -> torch.Tensor: + cutlass_scaled_mm = torch.ops.higher_order.auto_functionalized( + torch.ops._C.cutlass_scaled_mm.default, + out=cutlass_mm_output, + a=input, + b=weight, + a_scales=scale_a, + b_scales=scale_b, + bias=None) + + reduce_scatter = torch.ops.vllm.reduce_scatter.default( + cutlass_scaled_mm[1], + dim=0, + world_size=self.tp_size, + group_name=self.tp.unique_name) + return reduce_scatter + + def replacement(input: torch.Tensor, mat2: torch.Tensor, + scale_a: torch.Tensor, scale_b: torch.Tensor, + 
cutlass_mm_output: torch.Tensor) -> torch.Tensor: + gemm_rs = torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter( + input, + mat2, + scale_a, + scale_b, + "avg", + scatter_dim=0, + out_dtype=self.dtype, + group_name=self.tp.device_group.group_name, + ) + + return gemm_rs + + pm.register_replacement(pattern, replacement, self.get_inputs(), + pm.fwd_only, pm_pass) + + +class AllGatherCutlassScaledMMPattern(BasePattern): + + def get_inputs(self): + x = torch.empty([8, 16], device=self.device, dtype=FP8_DTYPE) + weight = torch.empty([16, 16], device=self.device, + dtype=FP8_DTYPE).contiguous().transpose(0, 1) + + s1 = x.shape[0] * self.tp_size + + scale_a = torch.empty([s1, 1], device=self.device, dtype=torch.float32) + scale_b = torch.empty([1, 16], device=self.device, dtype=torch.float32) + + s2 = weight.shape[1] + output = torch.empty([s1, s2], device=self.device, dtype=self.dtype) + + return [x, weight, scale_a, scale_b, output] + + def register(self, pm_pass: PatternMatcherPass): + + def pattern( + x: torch.Tensor, + weight: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + output: torch.Tensor, + ) -> torch.Tensor: + all_gather = torch.ops.vllm.all_gather.default( + x, + dim=0, + world_size=self.tp_size, + group_name=self.tp.unique_name) + + cutlass_scaled_mm = torch.ops.higher_order.auto_functionalized( + torch.ops._C.cutlass_scaled_mm.default, + out=output, + a=all_gather, + b=weight, + a_scales=scale_a, + b_scales=scale_b, + bias=None) + return cutlass_scaled_mm[1] + + def replacement(x: torch.Tensor, weight: torch.Tensor, + scale_a: torch.Tensor, scale_b: torch.Tensor, + output: torch.Tensor) -> torch.Tensor: + ag_output, mm_outputs = torch.ops.symm_mem.fused_all_gather_scaled_matmul( # noqa + x, + [weight], + scale_a, + [scale_b], + gather_dim=0, + biases=[None], + result_scales=[None], + out_dtypes=[self.dtype], + use_fast_accum=[False], + group_name=self.tp.device_group.group_name, + ) + return mm_outputs + + 
pm.register_replacement(pattern, replacement, self.get_inputs(), + pm.fwd_only, pm_pass) + + class AsyncTPPass(VllmInductorPass): def __init__(self, config: VllmConfig): @@ -133,6 +359,20 @@ class AsyncTPPass(VllmInductorPass): AllGatherGEMMPattern(self.model_dtype, self.device).register(self.patterns) + # These fusions are enabled only for bfloat16 models because + # `scaled_mm` or `cutlass_scaled_mm` with per-token (row-wise) scaling + # only supports bfloat16 as the output dtype. + if self.model_dtype == torch.bfloat16: + ScaledMMReduceScatterPattern(self.model_dtype, + self.device).register(self.patterns) + AllGatherScaledMMPattern(self.model_dtype, + self.device).register(self.patterns) + + CutlassScaledMMReduceScatterPattern( + self.model_dtype, self.device).register(self.patterns) + AllGatherCutlassScaledMMPattern( + self.model_dtype, self.device).register(self.patterns) + def is_applicable_for_shape(self, shape: Optional[int]) -> bool: # only do replace for specific shapes tp_size = get_tensor_model_parallel_world_size() @@ -142,7 +382,7 @@ class AsyncTPPass(VllmInductorPass): self.begin() self.dump_graph(graph, "before_async_tp_pass") count = self.patterns.apply(graph) - logger.debug("Replaced %s patterns", count) + logger.debug("Replaced %s patterns with async TP pass.", count) self.dump_graph(graph, "after_async_tp_pass") self.end_and_log() diff --git a/vllm/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py index 6107046e40dcd..ebc025cba71ed 100644 --- a/vllm/compilation/sequence_parallelism.py +++ b/vllm/compilation/sequence_parallelism.py @@ -477,6 +477,6 @@ class SequenceParallelismPass(VllmInductorPass): self.begin() self.dump_graph(graph, "before_sequence_parallelism_pass") count = self.patterns.apply(graph) - logger.debug("Replaced %s patterns", count) + logger.debug("Replaced %s patterns with sequence parallelism", count) self.dump_graph(graph, "after_sequence_parallelism_pass") self.end_and_log() From 
601f856d5679a474b6488fb7dd75ebbd7125d1ca Mon Sep 17 00:00:00 2001 From: Bram <153647206+br4mm@users.noreply.github.com> Date: Wed, 30 Jul 2025 14:44:02 -0700 Subject: [PATCH 078/224] [Bugfix] Fix None value handling in trace span creation for cancelled requests (#20272) --- vllm/engine/llm_engine.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3f30a34170ffe..79255b031eeca 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1862,8 +1862,14 @@ class LLMEngine: context=trace_context, start_time=arrival_time_nano_seconds) as seq_span: metrics = seq_group.metrics - ttft = metrics.first_token_time - metrics.arrival_time - e2e_time = metrics.finished_time - metrics.arrival_time + + # Handle potential None values for cancelled/aborted requests + ttft = (metrics.first_token_time - metrics.arrival_time + if metrics.first_token_time is not None else None) + + e2e_time = (metrics.finished_time - metrics.arrival_time + if metrics.finished_time is not None else None) + seq_span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL, self.model_config.model) seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID, @@ -1886,11 +1892,18 @@ class LLMEngine: seq.get_output_len() for seq in seq_group.get_finished_seqs() ])) - seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE, - metrics.time_in_queue) - seq_span.set_attribute( - SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, ttft) - seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time) + + # Only set timing attributes if the values are available + if metrics.time_in_queue is not None: + seq_span.set_attribute( + SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE, + metrics.time_in_queue) + if ttft is not None: + seq_span.set_attribute( + SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, ttft) + if e2e_time is not None: + seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, + 
e2e_time) if metrics.scheduler_time is not None: seq_span.set_attribute( SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER, From ca9e2be3ed6320b51f52f536595cd24e254f8bb2 Mon Sep 17 00:00:00 2001 From: Zebing Lin Date: Wed, 30 Jul 2025 18:00:54 -0400 Subject: [PATCH 079/224] [Core] Move EngineCoreRequest to Request conversion out of EngineCore (#21627) Signed-off-by: linzebing --- tests/v1/engine/test_engine_core.py | 44 ++++++++++------- vllm/v1/engine/core.py | 74 ++++++++++++++++++----------- vllm/v1/engine/core_client.py | 3 +- 3 files changed, 73 insertions(+), 48 deletions(-) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index eb826bf06236f..c52b98967126b 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -65,7 +65,8 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch): """Test basic request lifecycle.""" # First request. - engine_core.add_request(make_request()) + engine_core.add_request( + *engine_core.preprocess_add_request(make_request())) assert len(engine_core.scheduler.waiting) == 1 assert len(engine_core.scheduler.running) == 0 @@ -74,7 +75,8 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch): assert len(engine_core.scheduler.running) == 1 # Second request. - engine_core.add_request(make_request()) + engine_core.add_request( + *engine_core.preprocess_add_request(make_request())) assert len(engine_core.scheduler.waiting) == 1 assert len(engine_core.scheduler.running) == 1 @@ -83,8 +85,10 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch): assert len(engine_core.scheduler.running) == 2 # Add two requests in a row. 
- engine_core.add_request(make_request()) - engine_core.add_request(make_request()) + engine_core.add_request( + *engine_core.preprocess_add_request(make_request())) + engine_core.add_request( + *engine_core.preprocess_add_request(make_request())) assert len(engine_core.scheduler.waiting) == 2 assert len(engine_core.scheduler.running) == 2 @@ -104,7 +108,7 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch): req = make_request() request_id = req.request_id - engine_core.add_request(req) + engine_core.add_request(*engine_core.preprocess_add_request(req)) assert len(engine_core.scheduler.waiting) == 1 assert len(engine_core.scheduler.running) == 0 assert engine_core.scheduler.has_unfinished_requests() @@ -131,8 +135,8 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch): req1 = make_request() req2 = make_request() - engine_core.add_request(req0) - engine_core.add_request(req1) + engine_core.add_request(*engine_core.preprocess_add_request(req0)) + engine_core.add_request(*engine_core.preprocess_add_request(req1)) assert len(engine_core.scheduler.waiting) == 2 assert len(engine_core.scheduler.running) == 0 @@ -140,7 +144,7 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch): assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.running) == 2 - engine_core.add_request(req2) + engine_core.add_request(*engine_core.preprocess_add_request(req2)) assert len(engine_core.scheduler.waiting) == 1 assert len(engine_core.scheduler.running) == 2 @@ -166,12 +170,12 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch): req0 = make_request() req1 = make_request() req0.request_id = req1.request_id = "test" - engine_core.add_request(req0) + engine_core.add_request(*engine_core.preprocess_add_request(req0)) while (outs := engine_core.step()[0].get(0)) and outs.outputs: pass - engine_core.add_request(req1) + engine_core.add_request(*engine_core.preprocess_add_request(req1)) while (outs := engine_core.step()[0].get(0)) and outs.outputs: pass @@ 
-207,7 +211,7 @@ def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch): repetition_penalty=0.1, stop_token_ids=[1001, 1002], ) - engine_core.add_request(request) + engine_core.add_request(*engine_core.preprocess_add_request(request)) def _check_engine_state(): assert len(engine_core.scheduler.waiting) == 1 @@ -226,7 +230,7 @@ def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch): top_p=0.99, top_k=50, ) - engine_core.add_request(request2) + engine_core.add_request(*engine_core.preprocess_add_request(request2)) _check_engine_state() @@ -298,9 +302,9 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): # Add two requests in a row. Each request have 12 prompt tokens. req0 = make_request_with_max_tokens("0", 5) - engine_core.add_request(req0) + engine_core.add_request(*engine_core.preprocess_add_request(req0)) req1 = make_request_with_max_tokens("1", 5) - engine_core.add_request(req1) + engine_core.add_request(*engine_core.preprocess_add_request(req1)) # Schedule Batch 1: (10, req0) assert engine_core.step_with_batch_queue()[0] is None @@ -436,7 +440,8 @@ def test_engine_core_invalid_request_id_type(monkeypatch: pytest.MonkeyPatch): with pytest.raises(TypeError, match="request_id must be a string, got.*UUID"): - engine_core.add_request(uuid_request) + engine_core.add_request( + *engine_core.preprocess_add_request(uuid_request)) # Test with integer int_request = make_request() @@ -444,7 +449,8 @@ def test_engine_core_invalid_request_id_type(monkeypatch: pytest.MonkeyPatch): with pytest.raises(TypeError, match="request_id must be a string, got.*int"): - engine_core.add_request(int_request) + engine_core.add_request( + *engine_core.preprocess_add_request(int_request)) # Test with None none_request = make_request() @@ -452,10 +458,12 @@ def test_engine_core_invalid_request_id_type(monkeypatch: pytest.MonkeyPatch): with pytest.raises(TypeError, match="request_id must be a string, got.*NoneType"): - 
engine_core.add_request(none_request) + engine_core.add_request( + *engine_core.preprocess_add_request(none_request)) # Verify engine is still functional after errors valid_request = make_request() - engine_core.add_request(valid_request) + engine_core.add_request( + *engine_core.preprocess_add_request(valid_request)) assert len(engine_core.scheduler.waiting) == 1 assert len(engine_core.scheduler.running) == 0 diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 9f2fca6961388..f9a6315df8af8 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -205,8 +205,12 @@ class EngineCore: def get_supported_tasks(self) -> tuple[SupportedTask, ...]: return self.model_executor.supported_tasks - def add_request(self, request: EngineCoreRequest): - """Add request to the scheduler.""" + def add_request(self, request: Request, request_wave: int = 0): + """Add request to the scheduler. + + `request_wave`: indicate which wave of requests this is expected to + belong to in DP case + """ # Validate the request_id type. if not isinstance(request.request_id, str): raise TypeError( @@ -222,27 +226,12 @@ class EngineCore: raise ValueError(f"Unsupported task: {pooling_params.task!r} " f"Supported tasks: {supported_pooling_tasks}") - if request.mm_hashes is not None: - # Here, if hash exists for a multimodal input, then it will be - # fetched from the cache, else it will be added to the cache. - # Note that the cache here is mirrored with the client cache, so - # anything that has a hash must have a HIT cache entry here - # as well. 
- assert request.mm_inputs is not None - request.mm_inputs = self.mm_input_cache_server.get_and_update_p1( - request.mm_inputs, request.mm_hashes) - - req = Request.from_engine_core_request(request) - if req.use_structured_output: - # Start grammar compilation asynchronously - self.structured_output_manager.grammar_init(req) - - if req.kv_transfer_params is not None and ( + if request.kv_transfer_params is not None and ( not self.scheduler.get_kv_connector()): logger.warning("Got kv_transfer_params, but no KVConnector found. " "Disabling KVTransfer for this request.") - self.scheduler.add_request(req) + self.scheduler.add_request(request) def abort_requests(self, request_ids: list[str]): """Abort requests from the scheduler.""" @@ -414,6 +403,31 @@ class EngineCore: self.model_executor.save_tensorized_model( tensorizer_config=tensorizer_config, ) + def preprocess_add_request( + self, request: EngineCoreRequest) -> tuple[Request, int]: + """Preprocess the request. + + This function could be directly used in input processing thread to allow + request initialization running in parallel with Model forward + """ + if request.mm_hashes is not None: + assert request.mm_inputs is not None + # Note on thread safety: no race condition. + # `mm_input_cache_server` is reset at the end of LLMEngine init, + # and will only accessed in the input processing thread afterwards. + request.mm_inputs = self.mm_input_cache_server.get_and_update_p1( + request.mm_inputs, request.mm_hashes) + + req = Request.from_engine_core_request(request) + if req.use_structured_output: + # Note on thread safety: no race condition. + # `grammar_init` is only invoked in input processing thread. For + # `structured_output_manager`, each request is independent and + # grammar compilation is async. Scheduler always checks grammar + # compilation status before scheduling request. 
+ self.structured_output_manager.grammar_init(req) + return req, request.current_wave + class EngineCoreProc(EngineCore): """ZMQ-wrapper for running EngineCore in background process.""" @@ -707,7 +721,8 @@ class EngineCoreProc(EngineCore): """Dispatch request from client.""" if request_type == EngineCoreRequestType.ADD: - self.add_request(request) + req, request_wave = request + self.add_request(req, request_wave) elif request_type == EngineCoreRequestType.ABORT: self.abort_requests(request) elif request_type == EngineCoreRequestType.UTILITY: @@ -806,10 +821,11 @@ class EngineCoreProc(EngineCore): bytes(type_frame.buffer)) # Deserialize the request data. - decoder = add_request_decoder if ( - request_type - == EngineCoreRequestType.ADD) else generic_decoder - request = decoder.decode(data_frames) + if request_type == EngineCoreRequestType.ADD: + request = add_request_decoder.decode(data_frames) + request = self.preprocess_add_request(request) + else: + request = generic_decoder.decode(data_frames) # Push to input queue for core busy loop. self.input_queue.put_nowait((request_type, request)) @@ -939,17 +955,17 @@ class DPEngineCoreProc(EngineCoreProc): if dp_group := getattr(self, "dp_group", None): stateless_destroy_torch_distributed_process_group(dp_group) - def add_request(self, request: EngineCoreRequest): - if self.has_coordinator and request.current_wave != self.current_wave: - if request.current_wave > self.current_wave: - self.current_wave = request.current_wave + def add_request(self, request: Request, request_wave: int = 0): + if self.has_coordinator and request_wave != self.current_wave: + if request_wave > self.current_wave: + self.current_wave = request_wave elif not self.engines_running: # Request received for an already-completed wave, notify # front-end that we need to start the next one. 
self.output_queue.put_nowait( (-1, EngineCoreOutputs(start_wave=self.current_wave))) - super().add_request(request) + super().add_request(request, request_wave) def _handle_client_request(self, request_type: EngineCoreRequestType, request: Any) -> None: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index fdf5a5de191c0..26985df6f62df 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -250,7 +250,8 @@ class InprocClient(EngineCoreClient): return self.engine_core.get_supported_tasks() def add_request(self, request: EngineCoreRequest) -> None: - self.engine_core.add_request(request) + req, request_wave = self.engine_core.preprocess_add_request(request) + self.engine_core.add_request(req, request_wave) def abort_requests(self, request_ids: list[str]) -> None: if len(request_ids) > 0: From 9cb497bfa346721aaf5e09a7f483764a1a54f8b4 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 30 Jul 2025 20:39:46 -0400 Subject: [PATCH 080/224] [Example] Add `async_llm_streaming.py` example for AsyncLLM streaming in python (#21763) Signed-off-by: mgoin --- .../offline_inference/async_llm_streaming.py | 111 ++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 examples/offline_inference/async_llm_streaming.py diff --git a/examples/offline_inference/async_llm_streaming.py b/examples/offline_inference/async_llm_streaming.py new file mode 100644 index 0000000000000..b876d536e3a19 --- /dev/null +++ b/examples/offline_inference/async_llm_streaming.py @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Simple example demonstrating streaming offline inference with AsyncLLM (V1 engine). + +This script shows the core functionality of vLLM's AsyncLLM engine for streaming +token-by-token output in offline inference scenarios. It demonstrates DELTA mode +streaming where you receive new tokens as they are generated. 
+ +Usage: + python examples/offline_inference/async_llm_streaming.py +""" + +import asyncio + +from vllm import SamplingParams +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.sampling_params import RequestOutputKind +from vllm.v1.engine.async_llm import AsyncLLM + + +async def stream_response(engine: AsyncLLM, prompt: str, request_id: str) -> None: + """ + Stream response from AsyncLLM and display tokens as they arrive. + + This function demonstrates the core streaming pattern: + 1. Create SamplingParams with DELTA output kind + 2. Call engine.generate() and iterate over the async generator + 3. Print new tokens as they arrive + 4. Handle the finished flag to know when generation is complete + """ + print(f"\n🚀 Prompt: {prompt!r}") + print("💬 Response: ", end="", flush=True) + + # Configure sampling parameters for streaming + sampling_params = SamplingParams( + max_tokens=100, + temperature=0.8, + top_p=0.95, + seed=42, # For reproducible results + output_kind=RequestOutputKind.DELTA, # Get only new tokens each iteration + ) + + try: + # Stream tokens from AsyncLLM + async for output in engine.generate( + request_id=request_id, prompt=prompt, sampling_params=sampling_params + ): + # Process each completion in the output + for completion in output.outputs: + # In DELTA mode, we get only new tokens generated since last iteration + new_text = completion.text + if new_text: + print(new_text, end="", flush=True) + + # Check if generation is finished + if output.finished: + print("\n✅ Generation complete!") + break + + except Exception as e: + print(f"\n❌ Error during streaming: {e}") + raise + + +async def main(): + print("🔧 Initializing AsyncLLM...") + + # Create AsyncLLM engine with simple configuration + engine_args = AsyncEngineArgs( + model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, # Faster startup for examples + ) + engine = AsyncLLM.from_engine_args(engine_args) + + try: + # Example prompts to demonstrate streaming + prompts = [ + 
"The future of artificial intelligence is", + "In a galaxy far, far away", + "The key to happiness is", + ] + + print(f"🎯 Running {len(prompts)} streaming examples...") + + # Process each prompt + for i, prompt in enumerate(prompts, 1): + print(f"\n{'=' * 60}") + print(f"Example {i}/{len(prompts)}") + print(f"{'=' * 60}") + + request_id = f"stream-example-{i}" + await stream_response(engine, prompt, request_id) + + # Brief pause between examples + if i < len(prompts): + await asyncio.sleep(0.5) + + print("\n🎉 All streaming examples completed!") + + finally: + # Always clean up the engine + print("🔧 Shutting down engine...") + engine.shutdown() + + +if __name__ == "__main__": + try: + asyncio.run(main()) + except KeyboardInterrupt: + print("\n🛑 Interrupted by user") From ec02e536dfa46c7e8785cb5aaf5dd4eaad88f405 Mon Sep 17 00:00:00 2001 From: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Date: Thu, 31 Jul 2025 04:38:52 +0100 Subject: [PATCH 081/224] [Bugfix] Relax lang pin for voxtral (#21833) Signed-off-by: Sanchit Gandhi Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/entrypoints/openai/speech_to_text.py | 8 +-- vllm/model_executor/models/interfaces.py | 53 ++++++++++++++-- vllm/model_executor/models/voxtral.py | 25 +++++--- vllm/model_executor/models/whisper.py | 74 +++++------------------ 4 files changed, 80 insertions(+), 80 deletions(-) diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index c2227a21a4b9a..01140a4bfea7e 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -86,11 +86,7 @@ class OpenAISpeechToText(OpenAIServing): audio_data: bytes, ) -> tuple[list[PromptType], float]: # Validate request - # TODO language should be optional and can be guessed. - # For now we default to en. 
See - # https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/generation_whisper.py#L1520 - lang = request.language or "en" - self.model_cls.validate_language(lang) + language = self.model_cls.validate_language(request.language) if len(audio_data) / 1024**2 > self.max_audio_filesize_mb: raise ValueError("Maximum file size exceeded.") @@ -112,7 +108,7 @@ class OpenAISpeechToText(OpenAIServing): audio=chunk, stt_config=self.asr_config, model_config=self.model_config, - language=lang, + language=language, task_type=self.task_type, request_prompt=request.prompt) prompts.append(prompt) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 957b57276b4ca..b6d9877cd01b6 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -1,13 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Iterable, MutableSequence +from collections.abc import Iterable, Mapping, MutableSequence from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, Union, overload, runtime_checkable) import numpy as np import torch from torch import Tensor +from transformers.models.whisper.tokenization_whisper import LANGUAGES from typing_extensions import Self, TypeIs from vllm.config import ModelConfig, SpeechToTextConfig @@ -685,6 +686,8 @@ class SupportsQuant: @runtime_checkable class SupportsTranscription(Protocol): """The interface required for all models that support transcription.""" + # Mapping from ISO639_1 language codes: language names + supported_languages: ClassVar[Mapping[str, str]] supports_transcription: ClassVar[Literal[True]] = True @@ -694,11 +697,22 @@ class SupportsTranscription(Protocol): `True`. 
""" + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + # language codes in supported_languages + # that don't exist in the full language map + invalid = set(cls.supported_languages) - set(LANGUAGES.keys()) + if invalid: + raise ValueError( + f"{cls.__name__}.supported_languages contains invalid " + f"language codes: {sorted(invalid)}\n. " + f"Valid choices are: {sorted(LANGUAGES.keys())}") + @classmethod def get_generation_prompt(cls, audio: np.ndarray, stt_config: SpeechToTextConfig, - model_config: ModelConfig, language: str, - task_type: str, + model_config: ModelConfig, + language: Optional[str], task_type: str, request_prompt: str) -> PromptType: """Get the prompt for the ASR model. The model has control over the construction, as long as it @@ -706,9 +720,36 @@ class SupportsTranscription(Protocol): ... @classmethod - def validate_language(cls, language: str) -> bool: - """Check if the model supports a specific ISO639_1 language.""" - ... + def get_other_languages(cls) -> Mapping[str, str]: + # other possible language codes from the whisper map + return { + k: v + for k, v in LANGUAGES.items() if k not in cls.supported_languages + } + + @classmethod + def validate_language(cls, language: Optional[str]) -> Optional[str]: + """ + Ensure the language specified in the transcription request + is a valid ISO 639-1 language code. If the request language is + valid, but not natively supported by the model, trigger a + warning (but not an exception). + """ + if language is None or language in cls.supported_languages: + return language + elif language in cls.get_other_languages(): + logger.warning( + "Language %r is not natively supported by %s; " + "results may be less accurate. Supported languages: %r", + language, + cls.__name__, + list(cls.supported_languages.keys()), + ) + return language + else: + raise ValueError( + f"Unsupported language: {language!r}. 
Must be one of " + f"{list(cls.supported_languages.keys())}.") @classmethod def get_speech_to_text_config( diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 97cab628317e4..6b06c0ac6683f 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -26,8 +26,7 @@ from vllm.logger import init_logger from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models import SupportsPP # yapf: disable -from vllm.model_executor.models.whisper import ( - WhisperEncoder, WhisperForConditionalGeneration) +from vllm.model_executor.models.whisper import WhisperEncoder # yapf: enable from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @@ -50,6 +49,18 @@ from .utils import (flatten_bn, init_vllm_registered_model, maybe_prefix, logger = init_logger(__name__) +ISO639_1_SUPPORTED_LANGS = { + "ar": "Arabic", + "nl": "Dutch", + "en": "English", + "fr": "French", + "de": "German", + "hi": "Hindi", + "it": "Italian", + "pt": "Portuguese", + "es": "Spanish", +} + class VoxtralProcessorAdapter: """ @@ -301,6 +312,7 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo] dummy_inputs=VoxtralDummyInputsBuilder) class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, SupportsTranscription): + supported_languages = ISO639_1_SUPPORTED_LANGS def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -441,8 +453,8 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal, # for speech-to-text transcription def get_generation_prompt(cls, audio: np.ndarray, model_config: ModelConfig, - stt_config: SpeechToTextConfig, language: str, - task_type: str, + stt_config: SpeechToTextConfig, + language: Optional[str], task_type: str, request_prompt: str) -> PromptType: tokenizer = cached_tokenizer_from_config(model_config) 
audio = Audio(audio, int(stt_config.sample_rate), @@ -457,11 +469,6 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal, prompts_dict["prompt_token_ids"] = tokenized.tokens return cast(PromptType, prompts_dict) - @classmethod - def validate_language(cls, language: str) -> bool: - # same as whisper - return WhisperForConditionalGeneration.validate_language(language) - @classmethod def get_num_audio_tokens(cls, audio_duration_s: float, stt_config: SpeechToTextConfig, diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index d98dab5fac0e4..d7bafb9ef84d9 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -109,51 +109,6 @@ ISO639_1_SUPPORTED_LANGS = { "vi": "Vietnamese", "cy": "Welsh" } -ISO639_1_OTHER_LANGS = { - "lo": "Lao", - "jw": "Javanese", - "tk": "Turkmen", - "yi": "Yiddish", - "so": "Somali", - "bn": "Bengali", - "nn": "Norwegian Nynorsk", - "si": "Sinhala", - "yo": "Yoruba", - "sa": "Sanskrit", - "mi": "Māori", - "fo": "Faroese", # codespell:ignore - "mt": "Maltese", - "tg": "Tajik", - "mg": "Malagasy", - "haw": "Hawaiian", - "km": "Khmer", - "br": "Breton", - "ps": "Pashto", - "ln": "Lingala", - "la": "Latin", - "ml": "Malayalam", - "sq": "Albanian", - "su": "Sundanese", - "eu": "Basque", - "ka": "Georgian", - "uz": "Uzbek", - "sn": "Shona", - "ht": "Haitian", - "as": "Assamese", - "mn": "Mongolian", - "te": "Telugu", - "pa": "Panjabi", - "tt": "Tatar", - "gu": "Gujarati", - "oc": "Occitan", - "ha": "Hausa", - "ba": "Bashkir", - "my": "Burmese", - "sd": "Sindhi", - "am": "Amharic", - "lb": "Luxembourgish", - "bo": "Tibetan" -} class WhisperAudioInputs(TypedDict): @@ -807,22 +762,20 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription, # Whisper only supports audio-conditioned generation. 
supports_transcription_only = True + supported_languages = ISO639_1_SUPPORTED_LANGS @classmethod - def validate_language(cls, language: str) -> bool: - if language in ISO639_1_SUPPORTED_LANGS: - return True - elif language in ISO639_1_OTHER_LANGS: + def validate_language(cls, language: Optional[str]) -> Optional[str]: + if language is None: + # TODO language should be optional and can be guessed. + # For now we default to en. See + # https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/generation_whisper.py#L1520 logger.warning( - "The selected language %s has limited accuracy with" - " reported WER>=0.5. Results may be less accurate " - "for this choice.", language) - return True - else: - raise ValueError(f"Unsupported language: {language}." - "Language should be one of:" + - f" {list(ISO639_1_SUPPORTED_LANGS.values())}" + - f"or {list(ISO639_1_OTHER_LANGS.values())}") + "Defaulting to language='en'. If you wish to transcribe " + "audio in a different language, pass the `language` field " + "in the TranscriptionRequest.") + language = "en" + return super().validate_language(language) @classmethod def get_generation_prompt( @@ -830,9 +783,12 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription, audio: np.ndarray, model_config: ModelConfig, # not needed here stt_config: SpeechToTextConfig, - language: str, + language: Optional[str], task_type: str, request_prompt: str) -> PromptType: + if language is None: + raise ValueError( + "Language must be specified when creating the Whisper prompt") prompt = { "encoder_prompt": { # Whisper does not support encoder prompt. 
From 61445453df8e514d9ddf4d6bd3f9063f120cdac5 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 30 Jul 2025 23:40:34 -0400 Subject: [PATCH 082/224] [UX] Rename CUTLASS_MLA_VLLM_V1 to CUTLASS_MLA (#21966) Signed-off-by: mgoin --- vllm/engine/arg_utils.py | 2 +- vllm/platforms/cuda.py | 10 +++++----- vllm/platforms/interface.py | 2 +- vllm/v1/attention/backends/mla/cutlass_mla.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ababa49a53ae4..c36c79c69317e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1417,7 +1417,7 @@ class EngineArgs: "PALLAS_VLLM_V1", "TRITON_ATTN_VLLM_V1", "TRITON_MLA", - "CUTLASS_MLA_VLLM_V1", + "CUTLASS_MLA", "FLASHMLA", "FLASHINFER", "FLASHINFER_VLLM_V1", diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index c35d22c1d6824..87ff6b385809a 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -162,7 +162,7 @@ class CudaPlatformBase(Platform): if cls.is_device_capability(100): # Blackwell => Force CutlassMLA. 
use_cutlass_mla = True - envs.VLLM_ATTENTION_BACKEND = "CUTLASS_MLA_VLLM_V1" + envs.VLLM_ATTENTION_BACKEND = "CUTLASS_MLA" else: # Not Blackwell use_flashmla = True @@ -170,7 +170,7 @@ class CudaPlatformBase(Platform): # Forced case use_flashmla = (envs.VLLM_ATTENTION_BACKEND == "FLASHMLA") use_cutlass_mla = ( - envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA_VLLM_V1") + envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA") from vllm.attention.ops.flashmla import is_flashmla_supported if use_flashmla and is_flashmla_supported()[0] \ @@ -182,7 +182,7 @@ class CudaPlatformBase(Platform): if use_cutlass_mla and cache_config.block_size != 128: cache_config.block_size = 128 logger.info("Forcing kv cache block size to 128 for " - "CUTLASS_MLA_VLLM_V1 backend.") + "CUTLASS_MLA backend.") compilation_config = vllm_config.compilation_config if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput" @@ -211,9 +211,9 @@ class CudaPlatformBase(Platform): kv_cache_dtype, block_size, use_v1, use_mla) -> str: if use_mla: - # TODO(lucas): refactor to be more concise + # TODO(lucas): refactor to be more concise # we should probably consider factoring out V1 here - if selected_backend == _Backend.CUTLASS_MLA_VLLM_V1: + if selected_backend == _Backend.CUTLASS_MLA: if use_v1: logger.info_once("Using Cutlass MLA backend on V1 engine.") return ("vllm.v1.attention.backends.mla." 
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 02cc392244bac..6bae0fe25c797 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -53,7 +53,7 @@ class _Backend(enum.Enum): TRITON_MLA_VLLM_V1 = enum.auto() FLASHMLA_VLLM_V1 = enum.auto() FLASHMLA = enum.auto() # Supported by V1 - CUTLASS_MLA_VLLM_V1 = enum.auto() + CUTLASS_MLA = enum.auto() PALLAS = enum.auto() PALLAS_VLLM_V1 = enum.auto() IPEX = enum.auto() diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index c787f25cd3adf..b23a8f0a5e870 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -21,7 +21,7 @@ class CutlassMLABackend(MLACommonBackend): @staticmethod def get_name() -> str: - return "CUTLASS_MLA_VLLM_V1" + return "CUTLASS_MLA" @staticmethod def get_impl_cls() -> type["CutlassMLAImpl"]: From 0f7919fca05d7cf60b773da26d898b72bc07a089 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 31 Jul 2025 11:41:12 +0800 Subject: [PATCH 083/224] [Misc] Expand SUPPORTED_HIDDEN_SIZES for DeepEP low-latency kernels (#21818) Signed-off-by: Jee Jee Li --- .../layers/fused_moe/deepep_ll_prepare_finalize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py index 57871ca250ae3..cfc2bdcf02408 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py @@ -40,7 +40,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): # DeepEP low-latency kernels are compiled only for certain # specific hidden sizes. 
- SUPPORTED_HIDDEN_SIZES = [2048, 2560, 4096, 5120, 7168] + SUPPORTED_HIDDEN_SIZES = [2048, 2560, 4096, 5120, 6144, 7168] def __init__(self, buffer: deep_ep.Buffer, From 055bd3978ededea015fb8f0cb6aa3cc48d84cde8 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 30 Jul 2025 23:45:29 -0400 Subject: [PATCH 084/224] [CI Bugfix] Fix CI OOM for `test_shared_storage_connector_hashes` (#21973) Signed-off-by: mgoin --- tests/v1/kv_connector/unit/test_shared_storage_connector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/v1/kv_connector/unit/test_shared_storage_connector.py b/tests/v1/kv_connector/unit/test_shared_storage_connector.py index ee3e71d3b8452..11b7e378441a4 100644 --- a/tests/v1/kv_connector/unit/test_shared_storage_connector.py +++ b/tests/v1/kv_connector/unit/test_shared_storage_connector.py @@ -10,7 +10,7 @@ from vllm.assets.image import ImageAsset from vllm.config import KVTransferConfig from vllm.multimodal.utils import encode_image_base64 -MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct" +MODEL_NAME = "RedHatAI/Qwen2.5-VL-3B-Instruct-quantized.w4a16" SAMPLING_PARAMS = SamplingParams(temperature=0.0, top_k=1, max_tokens=128) @@ -130,6 +130,8 @@ def test_shared_storage_connector_hashes(tmp_path): model=MODEL_NAME, max_model_len=8192, max_num_seqs=1, + gpu_memory_utilization=0.4, + enforce_eager=True, kv_transfer_config=kv_transfer_config, limit_mm_per_prompt={"image": 2}, ) From 3e36fcbee642f41278a4881c9e2bfbbd7c28e607 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Thu, 31 Jul 2025 14:22:11 +0800 Subject: [PATCH 085/224] [Bugfix]: fix metadata file copy in test_sharded_state_loader (#21830) Signed-off-by: Andy Xie --- tests/test_sharded_state_loader.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py index 64706defb5960..1bb4203d21c3e 100644 --- a/tests/test_sharded_state_loader.py +++ b/tests/test_sharded_state_loader.py @@ -1,6 +1,7 
@@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import fnmatch import multiprocessing as mp import os import shutil @@ -64,9 +65,10 @@ def _run_writer(input_dir, output_dir, weights_patterns, **kwargs): # Copy metadata files to output directory for file in os.listdir(input_dir): if os.path.isdir(os.path.join(input_dir, file)): - continue - if not any(file.endswith(ext) for ext in weights_patterns): - shutil.copy(f"{input_dir}/{file}", output_dir) + shutil.copytree(os.path.join(input_dir, file), + os.path.join(output_dir, file)) + elif not any(fnmatch.fnmatch(file, ext) for ext in weights_patterns): + shutil.copy(os.path.join(input_dir, file), output_dir) def _run_generate(input_dir, queue: mp.Queue, **kwargs): From 9532a6d5631bbf906f992806379516ed569c447d Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 31 Jul 2025 14:46:38 +0800 Subject: [PATCH 086/224] [Deprecation] Remove deprecated args and methods (#21907) Signed-off-by: DarkLight1337 --- vllm/entrypoints/chat_utils.py | 32 ++++-------------------------- vllm/multimodal/registry.py | 25 ----------------------- vllm/worker/neuron_model_runner.py | 7 +------ 3 files changed, 5 insertions(+), 59 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index a6602391d4081..6485ed6b148b4 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -48,7 +48,7 @@ from vllm.transformers_utils.chat_templates import ( # yapf: enable from vllm.transformers_utils.processor import cached_get_processor from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.utils import deprecate_kwargs, random_uuid +from vllm.utils import random_uuid logger = init_logger(__name__) @@ -383,17 +383,12 @@ def resolve_mistral_chat_template( return None -@deprecate_kwargs( - "trust_remote_code", - additional_message="Please use `model_config.trust_remote_code` instead.", -) def 
resolve_hf_chat_template( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], chat_template: Optional[str], tools: Optional[list[dict[str, Any]]], *, model_config: ModelConfig, - trust_remote_code: Optional[bool] = None, ) -> Optional[str]: # 1st priority: The given chat template if chat_template is not None: @@ -488,10 +483,6 @@ def _log_chat_template_content_format( ) -@deprecate_kwargs( - "trust_remote_code", - additional_message="Please use `model_config.trust_remote_code` instead.", -) def resolve_chat_template_content_format( chat_template: Optional[str], tools: Optional[list[dict[str, Any]]], @@ -499,7 +490,6 @@ def resolve_chat_template_content_format( tokenizer: AnyTokenizer, *, model_config: ModelConfig, - trust_remote_code: Optional[bool] = None, ) -> _ChatTemplateContentFormat: if given_format != "auto": return given_format @@ -568,17 +558,9 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): input_modality = modality.replace("_embeds", "") - if mm_registry.has_processor(model_config): - mm_processor = mm_registry.create_processor(model_config) - allowed_counts = mm_processor.info.get_allowed_mm_limits() - allowed_count = allowed_counts.get(input_modality, 0) - else: - mm_config = model_config.multimodal_config - if mm_config is None: - msg = "This model does not support multi-modal inputs" - raise ValueError(msg) - - allowed_count = mm_config.get_limit_per_prompt(input_modality) + mm_processor = mm_registry.create_processor(model_config) + allowed_counts = mm_processor.info.get_allowed_mm_limits() + allowed_count = allowed_counts.get(input_modality, 0) current_count = len(self._items_by_modality[modality]) + 1 if current_count > allowed_count: @@ -1285,10 +1267,6 @@ def parse_chat_messages_futures( return conversation, mm_tracker.all_mm_data() -@deprecate_kwargs( - "trust_remote_code", - additional_message="Please use `model_config.trust_remote_code` instead.", -) def apply_hf_chat_template( tokenizer: Union[PreTrainedTokenizer, 
PreTrainedTokenizerFast], conversation: list[ConversationMessage], @@ -1297,8 +1275,6 @@ def apply_hf_chat_template( *, model_config: ModelConfig, tokenize: bool = False, # Different from HF's default - # Deprecated, explicitly capture here so it doesn't slit into kwargs. - trust_remote_code: Optional[bool] = None, **kwargs: Any, ) -> str: hf_chat_template = resolve_hf_chat_template( diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index bfa391829d290..5f5b620e0cf79 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -5,7 +5,6 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Generic, Optional, Protocol, TypeVar import torch.nn as nn -from typing_extensions import deprecated from vllm.envs import VLLM_MM_INPUT_CACHE_GIB from vllm.inputs import InputProcessingContext @@ -105,13 +104,6 @@ class MultiModalRegistry: return True # Success - @deprecated("Legacy input processor/mapper pipeline has been removed. " - "Please update your model runner to use " - "`seq_group_metadata.multi_modal_data` directly without " - "further processing.") - def create_input_mapper(self, model_config: "ModelConfig"): - return lambda data, mm_processor_kwargs: data - def get_max_tokens_per_item_by_modality( self, model_config: "ModelConfig", @@ -182,16 +174,6 @@ class MultiModalRegistry: """ return sum(self.get_max_tokens_by_modality(model_config).values()) - @deprecated("Legacy input processor/mapper pipeline has been removed. " - "Please update your model runner to use " - "`seq_group_metadata.multi_modal_data` directly without " - "further processing.") - def init_mm_limits_per_prompt( - self, - model_config: "ModelConfig", - ) -> None: - pass - def get_mm_limits_per_prompt( self, model_config: "ModelConfig", @@ -246,13 +228,6 @@ class MultiModalRegistry: model_cls, _ = get_model_architecture(model_config) return model_cls - @deprecated("Legacy input processor/mapper pipeline has been removed. 
" - "Please update your model runner to use " - "`seq_group_metadata.multi_modal_data` directly without " - "further processing.") - def has_processor(self, model_config: "ModelConfig") -> bool: - return True - def create_processor( self, model_config: "ModelConfig", diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index 7ccf1a2c0a876..8317b9abff0cd 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -15,8 +15,7 @@ from vllm.lora.request import LoRARequest from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.neuron import get_neuron_model -from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalKwargs) +from vllm.multimodal import BatchedTensorInputs, MultiModalKwargs from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams from vllm.sequence import IntermediateTensors, SequenceGroupMetadata @@ -88,10 +87,6 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): self.device = self.device_config.device self.pin_memory = is_pin_memory_available() - # Multi-modal data support - self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \ - .create_input_mapper(self.model_config) - # Lazy initialization. self.model: nn.Module # initialize after load_model. 
From d2aab336ad7822efe7cfc345fa3ad67d6f5cbe39 Mon Sep 17 00:00:00 2001 From: Daniele <36171005+dtrifiro@users.noreply.github.com> Date: Thu, 31 Jul 2025 09:00:08 +0200 Subject: [PATCH 087/224] [CI/Build] get rid of unused VLLM_FA_CMAKE_GPU_ARCHES (#21599) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Daniele Trifirò --- .buildkite/scripts/hardware_ci/run-gh200-test.sh | 3 +-- .github/workflows/scripts/build.sh | 1 - docker/Dockerfile | 3 --- docker/Dockerfile.nightly_torch | 3 --- docs/deployment/docker.md | 3 +-- 5 files changed, 2 insertions(+), 11 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-gh200-test.sh b/.buildkite/scripts/hardware_ci/run-gh200-test.sh index 8c64e14606d3b..f69e4b06680f5 100644 --- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh +++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh @@ -16,8 +16,7 @@ DOCKER_BUILDKIT=1 docker build . \ --build-arg max_jobs=66 \ --build-arg nvcc_threads=2 \ --build-arg RUN_WHEEL_CHECK=false \ - --build-arg torch_cuda_arch_list="9.0+PTX" \ - --build-arg vllm_fa_cmake_gpu_arches="90-real" + --build-arg torch_cuda_arch_list="9.0+PTX" # Setup cleanup remove_docker_container() { docker rm -f gh200-test || true; } diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh index 0f010832b465d..c69ebbb42da5a 100644 --- a/.github/workflows/scripts/build.sh +++ b/.github/workflows/scripts/build.sh @@ -15,7 +15,6 @@ $python_executable -m pip install -r requirements/build.txt -r requirements/cuda export MAX_JOBS=1 # Make sure release wheels are built for the following architectures export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" -export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real" bash tools/check_repo.sh diff --git a/docker/Dockerfile b/docker/Dockerfile index 75b5ab0230c87..43522ef8fb8dd 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -164,9 +164,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # 
see https://github.com/pytorch/pytorch/pull/123243 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0' ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} -# Override the arch list for flash-attn to reduce the binary size -ARG vllm_fa_cmake_gpu_arches='80-real;90-real' -ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches} #################### BASE BUILD IMAGE #################### #################### WHEEL BUILD IMAGE #################### diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch index 8d43de77aad59..e147b97f0e056 100644 --- a/docker/Dockerfile.nightly_torch +++ b/docker/Dockerfile.nightly_torch @@ -114,9 +114,6 @@ RUN cat torch_build_versions.txt # explicitly set the list to avoid issues with torch 2.2 # see https://github.com/pytorch/pytorch/pull/123243 -# Override the arch list for flash-attn to reduce the binary size -ARG vllm_fa_cmake_gpu_arches='80-real;90-real' -ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches} #################### BASE BUILD IMAGE #################### #################### WHEEL BUILD IMAGE #################### diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md index 5f6cfcb00a37a..1f19f2fecfab1 100644 --- a/docs/deployment/docker.md +++ b/docs/deployment/docker.md @@ -106,8 +106,7 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- -t vllm/vllm-gh200-openai:latest \ --build-arg max_jobs=66 \ --build-arg nvcc_threads=2 \ - --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \ - --build-arg vllm_fa_cmake_gpu_arches="90-real" + --build-arg torch_cuda_arch_list="9.0 10.0+PTX" ``` !!! 
note From 2836dd73f13015ee386c544760ca0d16888203f3 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 31 Jul 2025 16:51:15 +0800 Subject: [PATCH 088/224] [Model][CI] Let more pooling models support v1 (#21747) Signed-off-by: wang.yuqi --- .../language/pooling/test_classification.py | 8 -------- tests/models/language/pooling/test_gte.py | 18 ++++-------------- tests/models/language/pooling/test_jina.py | 13 ------------- .../language/pooling/test_qwen3_reranker.py | 6 ------ vllm/config.py | 8 ++++++++ vllm/model_executor/models/bert_with_rope.py | 5 +---- vllm/model_executor/models/config.py | 2 +- vllm/model_executor/models/modernbert.py | 2 -- 8 files changed, 14 insertions(+), 48 deletions(-) diff --git a/tests/models/language/pooling/test_classification.py b/tests/models/language/pooling/test_classification.py index 77df6d16a3673..c71fa96275335 100644 --- a/tests/models/language/pooling/test_classification.py +++ b/tests/models/language/pooling/test_classification.py @@ -6,14 +6,6 @@ from transformers import AutoModelForSequenceClassification from vllm.platforms import current_platform -# TODO: enable when float32 is supported by V1 -# @pytest.fixture(autouse=True) -# def v1(run_with_both_engines): -# # Simple autouse wrapper to run both engines for each test -# # This can be promoted up to conftest.py to run for every -# # test in a package -# pass - @pytest.mark.parametrize( "model", diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 0ad54785308e8..6d2eff709961b 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -56,17 +56,10 @@ MODELS = [ enable_test=False), ] -V1FlashAttentionImpNotSupported = [ - "Alibaba-NLP/gte-Qwen2-1.5B-instruct", "Alibaba-NLP/gte-modernbert-base" -] - @pytest.mark.parametrize("model_info", MODELS) -def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo, - monkeypatch) -> None: - if model_info.name in 
V1FlashAttentionImpNotSupported: - monkeypatch.setenv("VLLM_USE_V1", "0") - +def test_embed_models_mteb(hf_runner, vllm_runner, + model_info: EmbedModelInfo) -> None: vllm_extra_kwargs: dict[str, Any] = {} if model_info.architecture == "GteNewModel": vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]} @@ -77,11 +70,8 @@ def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo, @pytest.mark.parametrize("model_info", MODELS) def test_embed_models_correctness(hf_runner, vllm_runner, - model_info: EmbedModelInfo, example_prompts, - monkeypatch) -> None: - if model_info.name in V1FlashAttentionImpNotSupported: - monkeypatch.setenv("VLLM_USE_V1", "0") - + model_info: EmbedModelInfo, + example_prompts) -> None: vllm_extra_kwargs: dict[str, Any] = {} if model_info.architecture == "GteNewModel": vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]} diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py index 2ae431de16838..59b634428ceff 100644 --- a/tests/models/language/pooling/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -4,7 +4,6 @@ from functools import partial import pytest -import vllm.envs as envs from vllm import PoolingParams from ...utils import EmbedModelInfo, RerankModelInfo @@ -24,14 +23,6 @@ RERANK_MODELS = [ ] -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.mark.parametrize("model_info", EMBEDDING_MODELS) def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None: @@ -63,10 +54,6 @@ def test_embed_models_correctness(hf_runner, vllm_runner, @pytest.mark.parametrize("model_info", RERANK_MODELS) def test_rerank_models_mteb(hf_runner, vllm_runner, model_info: RerankModelInfo) -> None: - if (model_info.architecture == 
"XLMRobertaForSequenceClassification" - and envs.VLLM_USE_V1): - pytest.skip("Not supported yet") - mteb_test_rerank_models(hf_runner, vllm_runner, model_info) diff --git a/tests/models/language/pooling/test_qwen3_reranker.py b/tests/models/language/pooling/test_qwen3_reranker.py index 9c6a833b41384..68e96f32700ca 100644 --- a/tests/models/language/pooling/test_qwen3_reranker.py +++ b/tests/models/language/pooling/test_qwen3_reranker.py @@ -83,9 +83,6 @@ def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: } } - if model_info.name == "Qwen/Qwen3-Reranker-4B": - vllm_extra_kwargs["max_num_seqs"] = 1 - mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info, vllm_extra_kwargs) @@ -106,9 +103,6 @@ def test_rerank_models_mteb_tp(vllm_runner, "tensor_parallel_size": 2, } - if model_info.name == "Qwen/Qwen3-Reranker-4B": - vllm_extra_kwargs["max_num_seqs"] = 1 - mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info, diff --git a/vllm/config.py b/vllm/config.py index a330bafb76332..27dde5f1b1f6f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -776,6 +776,9 @@ class ModelConfig: raise ValueError( "`override_neuron_config` is only supported on Neuron.") + # Avoid running try_verify_and_update_config multiple times + self.config_updated = False + self._verify_quantization() self._verify_cuda_graph() self._verify_bnb_config() @@ -4914,6 +4917,11 @@ class VllmConfig: if self.model_config is None: return + # Avoid running try_verify_and_update_config multiple times + if getattr(self.model_config, "config_updated", False): + return + self.model_config.config_updated = True + architecture = self.model_config.architecture if architecture is None: return diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 5249acbd84a56..59033cb74a338 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -8,7 +8,6 @@ from 
torch import nn from transformers import PretrainedConfig from vllm.attention import Attention, AttentionType -from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -26,7 +25,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models import SupportsV0Only from vllm.model_executor.models.interfaces import SupportsQuant from vllm.model_executor.models.utils import WeightsMapper from vllm.model_executor.utils import set_weight_attrs @@ -360,7 +358,6 @@ class BertWithRopeBlock(nn.Module): return hidden_states -@support_torch_compile class BertWithRopeEncoder(nn.Module): def __init__(self, @@ -394,7 +391,7 @@ class BertWithRopeEncoder(nn.Module): return hidden_states -class BertWithRope(nn.Module, SupportsV0Only, SupportsQuant): +class BertWithRope(nn.Module, SupportsQuant): hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 6f50b17530987..9030ff307bee3 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -93,7 +93,7 @@ class NomicBertModelConfig(VerifyAndUpdateConfig): config.num_hidden_layers = config.n_layer head_dim = config.hidden_size // config.num_attention_heads - rotary_emb_dim = head_dim * config.rotary_emb_fraction + rotary_emb_dim = int(head_dim * config.rotary_emb_fraction) max_trained_positions = getattr(config, "max_trained_positions", 2048) config.rotary_kwargs = { "head_size": head_dim, diff --git a/vllm/model_executor/models/modernbert.py 
b/vllm/model_executor/models/modernbert.py index fc2b0c1f51821..4967032a244ec 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -8,7 +8,6 @@ from torch import nn from transformers import ModernBertConfig from vllm.attention import Attention, AttentionType -from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.linear import (QKVParallelLinear, @@ -200,7 +199,6 @@ class ModernBertEncoderLayer(nn.Module): return hidden_states -@support_torch_compile class ModernBertModel(nn.Module): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={"layers.": "encoder_layer.layers."}) From 5daffe7cf6db9765bd667d1a2cf5f18843d58fc7 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 31 Jul 2025 13:51:37 +0100 Subject: [PATCH 089/224] [BugFix] Fix case where `collective_rpc` returns `None` (#22006) Signed-off-by: Nick Hill --- tests/v1/engine/test_engine_core_client.py | 13 +++++++++++-- vllm/v1/serial_utils.py | 16 ++++++++++------ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index f648c38a63f79..1329ce5f69cbd 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -305,10 +305,10 @@ def echo_dc( return_list: bool = False, ) -> Union[MyDataclass, list[MyDataclass]]: print(f"echo dc util function called: {msg}") + val = None if msg is None else MyDataclass(msg) # Return dataclass to verify support for returning custom types # (for which there is special handling to make it work with msgspec). 
- return [MyDataclass(msg) for _ in range(3)] if return_list \ - else MyDataclass(msg) + return [val for _ in range(3)] if return_list else val @pytest.mark.asyncio(loop_scope="function") @@ -351,6 +351,15 @@ async def test_engine_core_client_util_method_custom_return( assert isinstance(result, list) and all( isinstance(r, MyDataclass) and r.message == "testarg2" for r in result) + + # Test returning None and list of Nones + result = await core_client.call_utility_async( + "echo_dc", None, False) + assert result is None + result = await core_client.call_utility_async( + "echo_dc", None, True) + assert isinstance(result, list) and all(r is None for r in result) + finally: client.shutdown() diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 4b6a983252b0e..809a60c1962f8 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -49,7 +49,10 @@ def _log_insecure_serialization_warning(): "VLLM_ALLOW_INSECURE_SERIALIZATION=1") -def _typestr(t: type): +def _typestr(val: Any) -> Optional[tuple[str, str]]: + if val is None: + return None + t = type(val) return t.__module__, t.__qualname__ @@ -131,14 +134,13 @@ class MsgpackEncoder: if isinstance(obj, UtilityResult): result = obj.result - if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION or result is None: + if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION: return None, result # Since utility results are not strongly typed, we also encode # the type (or a list of types in the case it's a list) to # help with correct msgspec deserialization. 
- cls = result.__class__ - return _typestr(cls) if cls is not list else [ - _typestr(type(v)) for v in result + return _typestr(result) if type(result) is not list else [ + _typestr(v) for v in result ], result if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION: @@ -277,7 +279,9 @@ class MsgpackDecoder: ] return UtilityResult(result) - def _convert_result(self, result_type: Sequence[str], result: Any): + def _convert_result(self, result_type: Sequence[str], result: Any) -> Any: + if result_type is None: + return result mod_name, name = result_type mod = importlib.import_module(mod_name) result_type = getattr(mod, name) From 207b750e194829c4bcd4df0450f5f93d71755dae Mon Sep 17 00:00:00 2001 From: amirkl94 <203507526+amirkl94@users.noreply.github.com> Date: Thu, 31 Jul 2025 16:00:01 +0300 Subject: [PATCH 090/224] [NVIDIA] Add SM100 Flashinfer MoE per tensor scale fp8 backend (#21458) Signed-off-by: Amir Klein <203507526+amirkl94@users.noreply.github.com> Signed-off-by: mgoin Co-authored-by: mgoin --- .../layers/fused_moe/fused_moe.py | 113 +++++++++++++++--- .../model_executor/layers/quantization/fp8.py | 75 +++++++----- .../layers/quantization/modelopt.py | 28 +++++ .../quantization/utils/flashinfer_utils.py | 100 ++++++++++++++++ vllm/utils/flashinfer.py | 2 + 5 files changed, 269 insertions(+), 49 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/utils/flashinfer_utils.py diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 227aacf25c0b0..b69575c7e96de 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -30,6 +30,8 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceNoOP) from vllm.model_executor.layers.fused_moe.utils import ( _resize_cache, moe_kernel_quantize_input, per_token_group_quant_fp8) +from vllm.model_executor.layers.quantization.utils.flashinfer_utils import 
( + calculate_tile_tokens_dim) from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( dequant_mxfp4) from vllm.platforms import current_platform @@ -1065,22 +1067,6 @@ direct_register_custom_op( ) -def next_positive_power_of_2(x: int) -> int: - if x < 1: - return 1 - return 1 << (x - 1).bit_length() - - -def _get_tile_tokens_dim(num_tokens, top_k, num_experts): - # Guess tokens per expert assuming perfect expert distribution first. - num_tokens_per_expert = (num_tokens * top_k) // num_experts - # And pad the number to the next power of 2. - tile_tokens_dim = next_positive_power_of_2(num_tokens_per_expert) - # Cap to 8-64 tokens per CTA tile as it's the range supported by the kernel. - tile_tokens_dim = min(max(tile_tokens_dim, 8), 64) - return tile_tokens_dim - - def flashinfer_fused_moe_blockscale_fp8( routing_logits: torch.Tensor, routing_bias: torch.Tensor, @@ -1128,8 +1114,8 @@ def flashinfer_fused_moe_blockscale_fp8( local_expert_offset=expert_offset, local_num_experts=local_num_experts, routed_scaling_factor=routed_scaling, - tile_tokens_dim=_get_tile_tokens_dim(x.shape[0], top_k, - global_num_experts), + tile_tokens_dim=calculate_tile_tokens_dim(x.shape[0], top_k, + global_num_experts), routing_method_type=2, # DeepSeek-styled routing method use_shuffled_weight=False, ) @@ -1164,6 +1150,97 @@ direct_register_custom_op( ) +def flashinfer_fused_moe_per_tensor_scale_fp8( + routing_logits: torch.Tensor, + routing_bias: Optional[torch.Tensor], + hidden_states: torch.Tensor, + input_scale: torch.Tensor, + gemm1_weights: torch.Tensor, + gemm1_weights_scale: torch.Tensor, + activation_scale: torch.Tensor, + gemm2_weights: torch.Tensor, + gemm2_weights_scale: torch.Tensor, + num_experts: int, + top_k: int, + num_expert_group: Optional[int], + topk_group: Optional[int], + intermediate_size: int, + local_expert_offset: int, + local_num_experts: int, + use_routing_scales_on_input: bool, + routing_method_type: int, + routed_scaling_factor: float = 1.0) 
-> torch.Tensor: + num_expert_group = num_expert_group if num_expert_group is not None else 0 + topk_group = topk_group if topk_group is not None else 0 + + quant_hidden_states, input_scale = moe_kernel_quantize_input( + hidden_states, + input_scale, + quant_dtype=torch.float8_e4m3fn, + per_act_token_quant=False) + + output1_scales_scalar = gemm1_weights_scale * input_scale * ( + 1.0 / activation_scale) + output1_scales_gate_scalar = gemm1_weights_scale * input_scale + output2_scales_scalar = activation_scale * gemm2_weights_scale + + from vllm.utils.flashinfer import ( + flashinfer_trtllm_fp8_per_tensor_scale_moe) + return flashinfer_trtllm_fp8_per_tensor_scale_moe( + routing_logits=routing_logits, + routing_bias=routing_bias, + hidden_states=quant_hidden_states, + gemm1_weights=gemm1_weights, + output1_scales_scalar=output1_scales_scalar, + output1_scales_gate_scalar=output1_scales_gate_scalar, + gemm2_weights=gemm2_weights, + output2_scales_scalar=output2_scales_scalar, + num_experts=num_experts, + top_k=top_k, + n_group=num_expert_group, + topk_group=topk_group, + intermediate_size=intermediate_size, + local_expert_offset=local_expert_offset, + local_num_experts=local_num_experts, + routed_scaling_factor=routed_scaling_factor, + use_routing_scales_on_input=use_routing_scales_on_input, + tile_tokens_dim=calculate_tile_tokens_dim(hidden_states.shape[0], + top_k, num_experts), + routing_method_type=routing_method_type) + + +def flashinfer_fused_moe_per_tensor_scale_fp8_fake( + routing_logits: torch.Tensor, + routing_bias: torch.Tensor, + hidden_states: torch.Tensor, + gemm1_weights: torch.Tensor, + output1_scales_scalar: torch.Tensor, + output1_scales_gate_scalar: torch.Tensor, + gemm2_weights: torch.Tensor, + output2_scales_scalar: torch.Tensor, + num_experts: int, + top_k: int, + num_expert_group: int, + topk_group: int, + intermediate_size: int, + local_expert_offset: int, + local_num_experts: int, + routed_scaling_factor: float = 1.0, + 
use_routing_scales_on_input: bool = False, + tile_tokens_dim: int = 8, + routing_method_type: int = 0) -> torch.Tensor: + pass + + +direct_register_custom_op( + op_name="flashinfer_fused_moe_per_tensor_scale_fp8", + op_func=flashinfer_fused_moe_per_tensor_scale_fp8, + mutates_args=["hidden_states"], + fake_impl=flashinfer_fused_moe_per_tensor_scale_fp8_fake, + tags=(torch.Tag.needs_fixed_stride_order, ), +) + + def outplace_fused_experts( hidden_states: torch.Tensor, w1: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 75f8adf34f7dd..8b6ed154bdbe4 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -23,6 +23,9 @@ from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod +from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( + apply_flashinfer_per_tensor_scale_fp8, rotate_flashinfer_fp8_moe_weights, + swap_w13_to_w31) from vllm.model_executor.layers.quantization.utils.fp8_utils import ( get_col_major_tma_aligned_tensor, requant_weight_ue8m0_inplace) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( @@ -53,11 +56,6 @@ ACTIVATION_SCHEMES = ["static", "dynamic"] logger = init_logger(__name__) -def _swap_w13_to_w31(x: torch.Tensor) -> torch.Tensor: - return x.reshape(-1, 2, x.shape[-2] // 2, - x.shape[-1]).flip(dims=[1]).reshape(x.shape) - - def _is_col_major(x: torch.Tensor) -> bool: assert x.dim() == 3 b, m, n = x.shape @@ -695,11 +693,13 @@ class Fp8MoEMethod(FusedMoEMethodBase): elif self.flashinfer_moe_enabled: # NOTE: weights have to be swapped since the activation is # applied on different half for flashinfer vs vllm - w13_weight = _swap_w13_to_w31(layer.w13_weight.data) - 
w13_weight_scale_inv = _swap_w13_to_w31( + w13_weight = swap_w13_to_w31(layer.w13_weight.data) + w13_weight_scale_inv = swap_w13_to_w31( layer.w13_weight_scale_inv.data) w2_weight = layer.w2_weight.data w2_weight_scale_inv = layer.w2_weight_scale_inv.data + if not self.block_quant: + rotate_flashinfer_fp8_moe_weights(w13_weight, w2_weight) else: w13_weight = layer.w13_weight.data w13_weight_scale_inv = layer.w13_weight_scale_inv.data @@ -998,30 +998,43 @@ class Fp8MoEMethod(FusedMoEMethodBase): global_num_experts=global_num_experts, expert_map=expert_map) elif self.flashinfer_moe_enabled: - # Currently only work with DS models - assert self.block_quant - assert (renormalize and use_grouped_topk - and scoring_func == 'sigmoid' - and custom_routing_function is None) - assert activation == "silu" - return torch.ops.vllm.flashinfer_fused_moe_blockscale_fp8( - routing_logits=router_logits.to(torch.float32), - routing_bias=e_score_correction_bias, - x=x, - w13_weight=layer.w13_weight, - w13_weight_scale_inv=layer.w13_weight_scale_inv, - w2_weight=layer.w2_weight, - w2_weight_scale_inv=layer.w2_weight_scale_inv, - global_num_experts=global_num_experts, - top_k=top_k, - num_expert_group=num_expert_group, - topk_group=topk_group, - intermediate_size=layer.intermediate_size_per_partition, - expert_offset=layer.ep_rank * layer.local_num_experts, - local_num_experts=layer.local_num_experts, - block_shape=self.quant_config.weight_block_size, - routed_scaling=1.0, - ) + assert activation == 'silu' + assert scoring_func == 'sigmoid' + if self.block_quant: + assert (renormalize and use_grouped_topk + and custom_routing_function is None) + + return torch.ops.vllm.flashinfer_fused_moe_blockscale_fp8( + routing_logits=router_logits.to(torch.float32), + routing_bias=e_score_correction_bias, + x=x, + w13_weight=layer.w13_weight, + w13_weight_scale_inv=layer.w13_weight_scale_inv, + w2_weight=layer.w2_weight, + w2_weight_scale_inv=layer.w2_weight_scale_inv, + 
global_num_experts=global_num_experts, + top_k=top_k, + num_expert_group=num_expert_group, + topk_group=topk_group, + intermediate_size=layer.intermediate_size_per_partition, + expert_offset=layer.ep_rank * layer.local_num_experts, + local_num_experts=layer.local_num_experts, + block_shape=self.quant_config.weight_block_size, + routed_scaling=1.0, + ) + else: + assert (not renormalize + and custom_routing_function is not None) + return apply_flashinfer_per_tensor_scale_fp8( + layer=layer, + hidden_states=x, + router_logits=router_logits, + routing_bias=e_score_correction_bias, + global_num_experts=global_num_experts, + top_k=top_k, + num_expert_group=num_expert_group, + topk_group=topk_group, + apply_router_weight_on_input=apply_router_weight_on_input) else: return self.fused_experts( hidden_states=x, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 8fbc3231d86c3..b8ffcf90c022b 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -23,6 +23,9 @@ from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod +from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( + apply_flashinfer_per_tensor_scale_fp8, rotate_flashinfer_fp8_moe_weights, + swap_w13_to_w31) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( apply_fp4_marlin_linear, is_fp4_marlin_supported, prepare_fp4_layer_for_marlin, prepare_moe_fp4_layer_for_marlin) @@ -34,6 +37,7 @@ from vllm.model_executor.parameter import (ModelWeightParameter, PerTensorScaleParameter) from vllm.platforms import current_platform from vllm.scalar_type import scalar_types +from vllm.utils.flashinfer import has_flashinfer_moe logger = 
init_logger(__name__) @@ -267,6 +271,11 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( cutlass_fp8_supported) self.cutlass_fp8_supported = cutlass_fp8_supported() + self.flashinfer_moe_enabled = False + if envs.VLLM_USE_FLASHINFER_MOE_FP8 and has_flashinfer_moe(): + logger.info_once( + "Using FlashInfer MoE FP8 kernels for ModelOptFp8MoEMethod.") + self.flashinfer_moe_enabled = True def create_weights( self, @@ -410,6 +419,11 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): layer.w2_input_scale = Parameter(layer.w2_input_scale.max(), requires_grad=False) + if self.flashinfer_moe_enabled: + layer.w13_weight.data = swap_w13_to_w31(layer.w13_weight.data) + rotate_flashinfer_fp8_moe_weights(layer.w13_weight, + layer.w2_weight) + def apply( self, layer: torch.nn.Module, @@ -436,6 +450,20 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): raise NotImplementedError( "EPLB not supported for `ModelOptFp8MoEMethod` yet.") + if self.flashinfer_moe_enabled: + assert activation == 'silu' + assert not renormalize + return apply_flashinfer_per_tensor_scale_fp8( + layer=layer, + hidden_states=x, + router_logits=router_logits, + routing_bias=e_score_correction_bias, + global_num_experts=global_num_experts, + top_k=top_k, + num_expert_group=num_expert_group, + topk_group=topk_group, + apply_router_weight_on_input=apply_router_weight_on_input) + # Expert selection topk_weights, topk_ids = FusedMoE.select_experts( hidden_states=x, diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py new file mode 100644 index 0000000000000..c6f914febc0a2 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -0,0 +1,100 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + +import torch + + +def 
calculate_tile_tokens_dim(num_tokens, top_k, num_experts): + from flashinfer import next_positive_power_of_2 + + # Guess tokens per expert assuming perfect expert distribution first. + num_tokens_per_expert = (num_tokens * top_k) // num_experts + # And pad the number to the next power of 2. + tile_tokens_dim = next_positive_power_of_2(num_tokens_per_expert) + # Cap to 8-64 tokens per CTA tile as it's the range supported by the kernel. + tile_tokens_dim = min(max(tile_tokens_dim, 8), 64) + return tile_tokens_dim + + +def swap_w13_to_w31(x: torch.Tensor) -> torch.Tensor: + return x.reshape(-1, 2, x.shape[-2] // 2, + x.shape[-1]).flip(dims=[1]).reshape(x.shape) + + +def rotate_flashinfer_fp8_moe_weights(gemm1_weights: torch.Tensor, + gemm2_weights: torch.Tensor): + from flashinfer import reorder_rows_for_gated_act_gemm, shuffle_matrix_a + epilogue_tile_m = 128 + num_experts = gemm1_weights.shape[0] + hidden_size = gemm1_weights.shape[-1] + intermediate_size = gemm1_weights.shape[1] // 2 + + # Reorder rows of W1 for fused gated activation + gemm1_weights_fp8_interleaved = [] + for i in range(num_experts): + gemm1_weights_fp8_interleaved.append( + reorder_rows_for_gated_act_gemm(gemm1_weights[i])) + + # Stack weights and scales for all experts + gemm1_weights_fp8_interleaved = torch.stack( + gemm1_weights_fp8_interleaved).reshape(num_experts, + 2 * intermediate_size, + hidden_size) + + # Shuffle weights and scaling factors for transposed mma output + gemm1_weights_fp8_shuffled = [] + gemm2_weights_fp8_shuffled = [] + for i in range(num_experts): + gemm1_weights_fp8_shuffled.append( + shuffle_matrix_a( + gemm1_weights_fp8_interleaved[i].view(torch.uint8), + epilogue_tile_m)) + + gemm2_weights_fp8_shuffled.append( + shuffle_matrix_a(gemm2_weights[i].view(torch.uint8), + epilogue_tile_m)) + + # Stack weights for all experts + gemm1_weights.data = torch.stack(gemm1_weights_fp8_shuffled).view( + torch.float8_e4m3fn) + gemm2_weights.data = 
torch.stack(gemm2_weights_fp8_shuffled).view( + torch.float8_e4m3fn) + + +def apply_flashinfer_per_tensor_scale_fp8( + layer: torch.nn.Module, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + routing_bias: Optional[torch.Tensor], + top_k: int, + num_expert_group: Optional[int], + topk_group: Optional[int], + global_num_experts: int, + apply_router_weight_on_input: bool, +) -> torch.Tensor: + from flashinfer.fused_moe import RoutingMethodType + + from vllm.model_executor.models.llama4 import Llama4MoE + assert layer.custom_routing_function == Llama4MoE.custom_routing_function, \ + "FusedMoE flashinfer kernels are only supported for Llama4" + return torch.ops.vllm.flashinfer_fused_moe_per_tensor_scale_fp8( + routing_logits=router_logits, + routing_bias=routing_bias, + hidden_states=hidden_states, + input_scale=layer.w13_input_scale, + gemm1_weights=layer.w13_weight, + gemm1_weights_scale=layer.w13_weight_scale, + gemm2_weights=layer.w2_weight, + gemm2_weights_scale=layer.w2_weight_scale, + activation_scale=layer.w2_input_scale, + num_experts=global_num_experts, + top_k=top_k, + num_expert_group=num_expert_group, + topk_group=topk_group, + intermediate_size=layer.intermediate_size_per_partition, + local_expert_offset=layer.ep_rank * layer.local_num_experts, + local_num_experts=layer.local_num_experts, + use_routing_scales_on_input=apply_router_weight_on_input, + routing_method_type=RoutingMethodType.Llama4, + ) \ No newline at end of file diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index ebc54fd029da6..3bfb9808c0a00 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -66,6 +66,8 @@ def _lazy_import_wrapper(module_name: str, # Create lazy wrappers for each function flashinfer_trtllm_fp8_block_scale_moe = _lazy_import_wrapper( "flashinfer.fused_moe", "trtllm_fp8_block_scale_moe") +flashinfer_trtllm_fp8_per_tensor_scale_moe = _lazy_import_wrapper( + "flashinfer.fused_moe", "trtllm_fp8_per_tensor_scale_moe") 
flashinfer_cutlass_fused_moe = _lazy_import_wrapper("flashinfer.fused_moe", "cutlass_fused_moe") fp4_quantize = _lazy_import_wrapper("flashinfer", "fp4_quantize") From 94846416166c731939892350d7ab26dcbcb2982d Mon Sep 17 00:00:00 2001 From: Song <44120206+Oliver-ss@users.noreply.github.com> Date: Thu, 31 Jul 2025 23:19:06 +0800 Subject: [PATCH 091/224] [Model] Add step3 vl (#21998) Signed-off-by: oliveryuan Co-authored-by: oliveryuan --- docs/models/supported_models.md | 1 + tests/models/registry.py | 6 + .../openai/tool_parsers/__init__.py | 2 + .../openai/tool_parsers/step3_tool_parser.py | 296 +++++ vllm/model_executor/models/registry.py | 2 + vllm/model_executor/models/step3_text.py | 521 ++++++++ vllm/model_executor/models/step3_vl.py | 1052 +++++++++++++++++ vllm/reasoning/__init__.py | 2 + vllm/reasoning/step3_reasoning_parser.py | 109 ++ vllm/transformers_utils/config.py | 5 +- vllm/transformers_utils/configs/__init__.py | 6 + vllm/transformers_utils/configs/step3_vl.py | 123 ++ 12 files changed, 2124 insertions(+), 1 deletion(-) create mode 100644 vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py create mode 100644 vllm/model_executor/models/step3_text.py create mode 100644 vllm/model_executor/models/step3_vl.py create mode 100644 vllm/reasoning/step3_reasoning_parser.py create mode 100644 vllm/transformers_utils/configs/step3_vl.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 5a9823bb6bae7..f5d9e3b22f2a6 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -625,6 +625,7 @@ See [this page](generative_models.md) for more information on how to use generat | `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-7B` | | ✅︎ | ✅︎ | | `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ | | `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ | +| 
`Step3VLForConditionalGeneration` | Step3-VL | T + I+ | `stepfun-ai/step3` | | ✅︎ | ✅︎ | | `TarsierForConditionalGeneration` | Tarsier | T + IE+ | `omni-search/Tarsier-7b`, `omni-search/Tarsier-34b` | | ✅︎ | ✅︎ | | `Tarsier2ForConditionalGeneration`^ | Tarsier2 | T + IE+ + VE+ | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index 8fcff5a8c5113..b9e7de4e9fd11 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -279,6 +279,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"), # noqa: E501 "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"), "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), + "Step3TextForCausalLM": _HfExamplesInfo("stepfun-ai/step3", + trust_remote_code=True, + is_available_online=False), "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct", trust_remote_code=True), "TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B", @@ -457,6 +460,9 @@ _MULTIMODAL_EXAMPLE_MODELS = { "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B", trust_remote_code=True), "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"), # noqa: E501 + "Step3VLForConditionalGeneration": _HfExamplesInfo("stepfun-ai/step3", + trust_remote_code=True, + is_available_online=False), "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501 trust_remote_code=True), "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b", # noqa: E501 diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 88c8aa929b78d..099e456aa486f 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -18,6 +18,7 @@ from .mistral_tool_parser import 
MistralToolParser from .phi4mini_tool_parser import Phi4MiniJsonToolParser from .pythonic_tool_parser import PythonicToolParser from .qwen3coder_tool_parser import Qwen3CoderToolParser +from .step3_tool_parser import Step3ToolParser from .xlam_tool_parser import xLAMToolParser __all__ = [ @@ -40,4 +41,5 @@ __all__ = [ "HunyuanA13BToolParser", "Glm4MoeModelToolParser", "Qwen3CoderToolParser", + "Step3ToolParser", ] diff --git a/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py new file mode 100644 index 0000000000000..a20d18eb52544 --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py @@ -0,0 +1,296 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import contextlib +import json +from collections.abc import Sequence +from typing import Any, Optional, Union + +import regex as re + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, ToolParserManager) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import random_uuid + +logger = init_logger(__name__) + + +@ToolParserManager.register_module(["step3"]) +class Step3ToolParser(ToolParser): + """ + Tool parser for a model that uses a specific XML-like format for tool calls. + This version uses a robust, stateful, cursor-based streaming parser and + consolidates tool arguments into a single message. 
+ """ + + TOOL_CALLS_BEGIN = "<|tool_calls_begin|>" + TOOL_CALLS_END = "<|tool_calls_end|>" + TOOL_CALL_BEGIN = "<|tool_call_begin|>" + TOOL_CALL_END = "<|tool_call_end|>" + TOOL_SEP = "<|tool_sep|>" + SPECIAL_TOKENS = [ + TOOL_CALLS_BEGIN, TOOL_CALLS_END, TOOL_CALL_BEGIN, TOOL_CALL_END + ] + + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + self.position = 0 + # Explicit state flags for robust streaming + self.tool_block_started = False + self.tool_block_finished = False + + def adjust_request( + self, request: ChatCompletionRequest) -> ChatCompletionRequest: + if request.tools and request.tool_choice != 'none': + request.skip_special_tokens = False + return request + + @staticmethod + def _parse_steptml_invoke( + action_text: str + ) -> tuple[Optional[str], Optional[dict[str, str]]]: + func_name_match = re.search(r'', + action_text) + if not func_name_match: + return None, None + func_name = func_name_match.group(1) + + params: dict[str, str] = {} + param_matches = re.findall( + r'([^<]*)', + action_text) + for name, value in param_matches: + params[name] = value.strip() + return func_name, params + + def _cast_arguments( + self, + func_name: str, + params: dict[str, Any], + request: ChatCompletionRequest, + ) -> dict[str, Any]: + for tool in request.tools or []: + if tool.function.name == func_name: + schema = tool.function.parameters or {} + properties = schema.get("properties", {}) + for key, value in params.items(): + if not isinstance(value, str): + continue + prop = properties.get(key, {}) + typ = prop.get("type") + if typ == "string": + params[key] = value.strip() + elif typ == "integer": + with contextlib.suppress(ValueError): + params[key] = int(value) + elif typ == "number": + with contextlib.suppress(ValueError): + params[key] = float(value) + elif typ == "boolean": + lower_val = value.lower() + params[key] = lower_val == "true" if lower_val in ( + "true", "false") else value + elif typ == "null": + params[key] = None if 
value.lower( + ) == "null" else value + break + return params + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> Union[DeltaMessage, None]: + + # The main loop processes the stream from the last known position. + while True: + if self.position >= len(current_text): + return None # We've processed the entire stream. + + unprocessed_text = current_text[self.position:] + + # STATE: After all tools are done, all subsequent text is content. + if self.tool_block_finished: + self.position = len(current_text) + return DeltaMessage(content=unprocessed_text) + + # STATE: Before the tool block has started. + if not self.tool_block_started: + if unprocessed_text.startswith(self.TOOL_CALLS_BEGIN): + self.position += len(self.TOOL_CALLS_BEGIN) + self.tool_block_started = True + continue # Token consumed, re-loop. + + start_pos = unprocessed_text.find(self.TOOL_CALLS_BEGIN) + if start_pos == -1: + if self.TOOL_CALLS_BEGIN.startswith( + unprocessed_text.strip()) and unprocessed_text: + return None # It's a prefix, wait. + self.position = len(current_text) + return DeltaMessage(content=unprocessed_text) + else: + content = unprocessed_text[:start_pos] + self.position += len(content) + return DeltaMessage(content=content) + + # STATE: Inside the main tool block. + offset = len(unprocessed_text) - len(unprocessed_text.lstrip()) + unprocessed_text = unprocessed_text.lstrip() + self.position += offset + + if unprocessed_text.startswith(self.TOOL_CALLS_END): + self.position += len(self.TOOL_CALLS_END) + self.tool_block_finished = True + self.current_tool_id = -1 + continue + + # Check if we are between tool calls. 
+ tool_finished = ( + self.current_tool_id != -1 and + self.prev_tool_call_arr[self.current_tool_id].get("finished")) + if self.current_tool_id == -1 or tool_finished: + if unprocessed_text.startswith(self.TOOL_CALL_BEGIN): + self.position += len(self.TOOL_CALL_BEGIN) + if self.current_tool_id == -1: + self.current_tool_id = 0 + else: + self.current_tool_id += 1 + self.current_tool_name_sent = False + while len(self.prev_tool_call_arr) <= self.current_tool_id: + self.prev_tool_call_arr.append({}) + self.prev_tool_call_arr[ + self.current_tool_id]["finished"] = False + continue + + if self.TOOL_CALL_BEGIN.startswith(unprocessed_text): + return None + + # STATE: Parsing an active tool call. + if self.current_tool_id != -1 and not self.prev_tool_call_arr[ + self.current_tool_id].get("finished", False): + end_tool_pos = unprocessed_text.find(self.TOOL_CALL_END) + if end_tool_pos == -1: + tool_body = unprocessed_text + else: + tool_body = unprocessed_text[:end_tool_pos] + + if end_tool_pos == -1 and self.TOOL_CALL_END.startswith( + tool_body): + return None + + function_name, arguments = self._parse_steptml_invoke( + tool_body) + if not function_name: + return None + + tool_call_arr = { + "name": function_name, + "parameters": arguments or {} + } + + # Send the function name as soon as it's parsed. + if not self.current_tool_name_sent: + self.current_tool_name_sent = True + self.prev_tool_call_arr[self.current_tool_id].update( + tool_call_arr) + return DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + type="function", + id=f"chatcmpl-tool-{random_uuid()}", + function=DeltaFunctionCall( + name=function_name)) + ]) + + # Update our internal state with the latest parsed arguments. + self.prev_tool_call_arr[ + self.current_tool_id].update( # noqa: E501 + tool_call_arr) + + # Only send arguments when the tool call is complete. 
+ if end_tool_pos != -1: + self.position += end_tool_pos + len(self.TOOL_CALL_END) + self.prev_tool_call_arr[ + self.current_tool_id]["finished"] = True + + final_args = self._cast_arguments( + function_name, + tool_call_arr.get("parameters", {}), # type: ignore + request) + if final_args: + final_args_json = json.dumps(final_args, + ensure_ascii=False) + return DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=final_args_json)) + ]) + + # If tool is not finished, return None to wait for more tokens. + return None + + return None + + def extract_tool_calls( + self, + model_output: str, + request: ChatCompletionRequest, + ) -> ExtractedToolCallInformation: + if self.TOOL_CALLS_BEGIN not in model_output: + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + pre_text, rest = model_output.split(self.TOOL_CALLS_BEGIN, 1) + if self.TOOL_CALLS_END not in rest: + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + tool_block, post_text = rest.split(self.TOOL_CALLS_END, 1) + content = (pre_text + post_text).strip() + + tool_calls: list[ToolCall] = [] + call_parts = tool_block.split(self.TOOL_CALL_BEGIN) + + for part in call_parts: + if not part or self.TOOL_CALL_END not in part: + continue + + call_content = part.split(self.TOOL_CALL_END, 1)[0] + if self.TOOL_SEP not in call_content: + continue + + type_part, invoke_part = call_content.split(self.TOOL_SEP, 1) + if type_part.strip() != "function": + continue + + function_name, params_dict = self._parse_steptml_invoke( + invoke_part) + + if function_name and params_dict is not None: + params_dict = self._cast_arguments(function_name, params_dict, + request) + params_str = json.dumps(params_dict, ensure_ascii=False) + tool_calls.append( + ToolCall(function=FunctionCall(name=function_name, + arguments=params_str))) + if tool_calls: + return ExtractedToolCallInformation( 
+ tools_called=True, + tool_calls=tool_calls, + content=content if content else None) + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 51831a770347a..848c04b9b32f7 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -129,6 +129,7 @@ _TEXT_GENERATION_MODELS = { "Qwen3ForCausalLM": ("qwen3", "Qwen3ForCausalLM"), "Qwen3MoeForCausalLM": ("qwen3_moe", "Qwen3MoeForCausalLM"), "RWForCausalLM": ("falcon", "FalconForCausalLM"), + "Step3TextForCausalLM": ("step3_text", "Step3TextForCausalLM"), "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"), "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"), "Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"), @@ -238,6 +239,7 @@ _MULTIMODAL_MODELS = { "Qwen2_5OmniModel": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501 "Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501 "UltravoxModel": ("ultravox", "UltravoxModel"), + "Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"), # noqa: E501 "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501 "Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"), # noqa: E501 "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501 diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py new file mode 100644 index 0000000000000..47d2af5c2a140 --- /dev/null +++ b/vllm/model_executor/models/step3_text.py @@ -0,0 +1,521 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Inference-only Jurassic model.""" +from collections.abc import 
Iterable +from typing import Any, Optional + +import torch +from torch import nn + +from vllm.attention import Attention +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, ModelConfig, VllmConfig +from vllm.distributed import (get_pp_group, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce) +from vllm.logger import init_logger +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from .interfaces import SupportsPP +from .utils import (PPMissingLayer, is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers) + +logger = init_logger(__name__) + + +class FusedMoEBlock(nn.Module): + + def __init__(self, + config: ModelConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + self.tp_size = get_tensor_model_parallel_world_size() + + if self.tp_size > config.moe_num_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of experts {config.moe_num_experts}.") + + self.experts = 
FusedMoE(num_experts=config.moe_num_experts, + top_k=config.moe_top_k, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=config.norm_expert_weight, + quant_config=quant_config, + prefix=f"{prefix}.experts") + self.gate = ReplicatedLinear(config.hidden_size, + config.moe_num_experts, + bias=False, + quant_config=None, + prefix=f"{prefix}.gate") + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + orig_shape = hidden_states.shape + hidden_dim = hidden_states.shape[-1] + hidden_states = hidden_states.view(-1, hidden_dim) + + router_logits, _ = self.gate(hidden_states) + + final_hidden_states = self.experts(hidden_states=hidden_states, + router_logits=router_logits) + if self.tp_size > 1: + final_hidden_states = tensor_model_parallel_all_reduce( + final_hidden_states) + + return final_hidden_states.view(orig_shape) + + +class Step3TextMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + self.hidden_size = hidden_size + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + gate_up, _ = self.gate_up_proj(hidden_states) + intermediate_act = self.act_fn(gate_up) + output, _ = self.down_proj(intermediate_act) + return output + + +class Step3TextAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + norm_eps: float, + rope_theta: int, + share_q_dim: Optional[int] = None, + rope_scaling: Optional[dict[str, Any]] = None, + max_position_embedding: int = 8192, + head_dim: int = 256, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + + if num_kv_heads != 1: + raise ValueError(f"Step3TextAttention num_kv_heads must be 1, " + f"but got {num_kv_heads}.") + self.num_kv_heads = num_kv_heads + + self.head_dim = head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.q_size = share_q_dim if share_q_dim else self.head_dim + + self.qkv_proj = ReplicatedLinear( + hidden_size, + self.q_size + self.kv_size * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + self.inter_norm = RMSNorm(self.q_size, eps=norm_eps) + self.wq = ColumnParallelLinear( + self.q_size, + self.head_dim * self.total_num_heads, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.wq", + ) + self.rotary_emb = get_rope(self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embedding, + base=rope_theta, + 
rope_scaling=rope_scaling) + scaling = self.head_dim**-0.5 + self.attn = Attention(self.num_heads, + self.head_dim, + scaling, + self.num_kv_heads, + cache_config=cache_config, + prefix=f"{prefix}.attn") + + def forward(self, positions: torch.Tensor, + hidden_states: torch.Tensor) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q = self.inter_norm(q) + q = self.wq(q)[0] + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + residual, _ = self.o_proj(attn_output) + return residual + + +class Step3TextDecoderLayer(nn.Module): + + def __init__(self, + config: ModelConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> None: + super().__init__() + config = config.hf_config + self.hidden_size = config.hidden_size + rope_scaling = getattr(config, "rope_scaling", None) + + self.self_attn = Step3TextAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=1, + cache_config=cache_config, + quant_config=quant_config, + norm_eps=config.rms_norm_eps, + max_position_embedding=config.max_position_embedding, + head_dim=config.head_dim, + share_q_dim=config.share_q_dim, + rope_theta=config.rope_theta, + rope_scaling=rope_scaling, + prefix=f"{prefix}.self_attn") + + layer_idx = int(prefix.split("layers.")[1].split(".")[0]) + moe_layers_enum = getattr(config, "moe_layers_enum", None) + if moe_layers_enum is not None: + moe_layers_idx = [ + int(i) for i in moe_layers_enum.strip().split(',') + ] + else: + # Default to 1dense. 
+ moe_layers_idx = [i for i in range(1, config.num_hidden_layers)] + + if layer_idx in moe_layers_idx: + self.moe = FusedMoEBlock(config=config, + quant_config=quant_config, + prefix=f"{prefix}.moe") + self.share_expert = Step3TextMLP( + hidden_size=self.hidden_size, + intermediate_size=config.share_expert_dim, + hidden_act="silu", + quant_config=quant_config, + prefix=f"{prefix}.share_expert") + self.use_moe = True + else: + self.mlp = Step3TextMLP(hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act="silu", + quant_config=quant_config, + prefix=f"{prefix}.mlp") + self.use_moe = False + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, positions: torch.Tensor, hidden_states: torch.Tensor, + residual: Optional[torch.Tensor] + ) -> tuple[torch.Tensor, torch.Tensor]: + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + + if self.use_moe: + share_output = self.share_expert(hidden_states) + moe_output = self.moe(hidden_states) + hidden_states = share_output + moe_output + else: + hidden_states = self.mlp(hidden_states) + + return hidden_states, residual + + +@support_torch_compile +class Step3TextModel(nn.Module): + + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.vocab_size = config.vocab_size + self.config = config + + if get_pp_group().is_first_rank or (config.tie_word_embeddings + and 
get_pp_group().is_last_rank): + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + ) + else: + self.embed_tokens = PPMissingLayer() + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: Step3TextDecoderLayer(config=vllm_config. + model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), + prefix=f"{prefix}.layers", + ) + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory(["hidden_states"], + config.hidden_size)) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for i in range(self.start_layer, self.end_layer): + layer = self.layers[i] + hidden_states, residual = layer(positions, hidden_states, residual) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual, + }) + + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class Step3TextForCausalLM(nn.Module, SupportsPP): + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + ): + super().__init__() + config = vllm_config.model_config.hf_config + lora_config = vllm_config.lora_config + self.config = 
config + self.vllm_config = vllm_config + + self.model = Step3TextModel(vllm_config=vllm_config, prefix=prefix) + + if get_pp_group().is_last_rank: + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + if not lora_config else lora_config.lora_vocab_padding_size, + ) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size) + self.sampler = get_sampler() + else: + self.lm_head = PPMissingLayer() + + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + def forward(self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None): + hidden_states = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: Optional[torch.Tensor], + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + qkv_params_mapping = [ + # (param_name, shard_name, relative_start_idx, relative_end_idx) + (".qkv_proj", ".q_proj", 0, self.config.share_q_dim / + (self.config.share_q_dim + self.config.head_dim * 2)), + (".qkv_proj", ".k_proj", self.config.share_q_dim / + (self.config.share_q_dim + self.config.head_dim * 2), + (self.config.share_q_dim + self.config.head_dim) / + (self.config.share_q_dim + self.config.head_dim * 2)), + 
(".qkv_proj", ".v_proj", + (self.config.share_q_dim + self.config.head_dim) / + (self.config.share_q_dim + self.config.head_dim * 2), + (self.config.share_q_dim + self.config.head_dim * 2) / + (self.config.share_q_dim + self.config.head_dim * 2)), + ] + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + + expert_params_mapping = [ + (".moe.experts.w13_weight", ".moe.gate_proj.weight", "w1"), + (".moe.experts.w13_weight", ".moe.up_proj.weight", "w3"), + (".moe.experts.w2_weight", ".moe.down_proj.weight", "w2") + ] + + disable_moe_stacked_params = [ + data[1] for data in expert_params_mapping + ] + + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + if any(disable_moe_stacked_param in name + for disable_moe_stacked_param in + disable_moe_stacked_params): + continue + name = name.replace(weight_name, param_name) + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + loaded_params.add(name) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + # Skip loading extra bias for GPTQ models. 
+ if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + param = params_dict[name] + weight_loader = param.weight_loader + for expert_id in range(loaded_weight.shape[0]): + loaded_weight_expert = loaded_weight[expert_id] + weight_loader(param, + loaded_weight_expert, + name, + shard_id=shard_id, + expert_id=expert_id) + loaded_params.add(name) + break + else: + for (param_name, weight_name, start_idx, + end_idx) in qkv_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + dim = param.shape[param.output_dim] + begin_idx = int(start_idx * dim) + end_idx = int(end_idx * dim) + param_slice = param.narrow(param.output_dim, begin_idx, + end_idx - begin_idx) + param_slice.copy_(loaded_weight) + loaded_params.add(name) + break + else: + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py new file mode 100644 index 0000000000000..363c12a4bf2b8 --- /dev/null +++ b/vllm/model_executor/models/step3_vl.py @@ -0,0 +1,1052 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import math +from collections.abc import Iterable, Mapping, Sequence +from functools import cached_property +from itertools import product +from math import ceil, sqrt +from typing import Any, Literal, Optional, TypedDict, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from PIL import Image +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode +from transformers import BatchFeature, PretrainedConfig, 
TensorType + +from vllm.config import VllmConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalKwargs, NestedTensors) +from vllm.multimodal.parse import ImageSize, MultiModalDataItems +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement, + PromptUpdate, PromptUpdateDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs import Step3VisionEncoderConfig +from vllm.transformers_utils.tokenizer import AnyTokenizer + +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP +from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings) + + +class Step3VLImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + pixel_values: torch.Tensor + patch_pixel_values: Optional[torch.Tensor] + num_patches: list[int] + + +class Step3VLImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + image_embeds: torch.Tensor + + +Step3VLImageInputs = Union[Step3VLImagePixelInputs, + Step3VLImageEmbeddingInputs] + +ImageWithPatches = tuple[Image.Image, list[Image.Image], list[int] | None] + +MAX_IMAGE_SIZE: int = 3024 + + +class Step3VisionProcessor: + + def __init__(self, size, interpolation_mode="bicubic", patch_size=None): + mean = [0.48145466, 0.4578275, 0.40821073] + std = 
[0.26862954, 0.26130258, 0.27577711] + patch_size = patch_size if patch_size is not None else size + + self.transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(mean, std), + transforms.Resize( + (size, size), + interpolation=InterpolationMode.BICUBIC if interpolation_mode + == "bicubic" else InterpolationMode.BILINEAR, + antialias=True), + ]) + + self.patch_transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(mean, std), + transforms.Resize( + (patch_size, patch_size), + interpolation=InterpolationMode.BICUBIC if interpolation_mode + == "bicubic" else InterpolationMode.BILINEAR, + antialias=True), + ]) if patch_size is not None else None + + def __call__(self, image, is_patch=False): + if is_patch: + return {"pixel_values": self.patch_transform(image).unsqueeze(0)} + else: + return {"pixel_values": self.transform(image).unsqueeze(0)} + + +class ImagePatcher: + + def determine_window_size(self, long: int, short: int) -> int: + if long <= 728: + return short if long / short > 1.5 else 0 + return min(short, 504) if long / short > 4 else 504 + + def slide_window( + self, + width: int, + height: int, + sizes: list[tuple[int, int]], + steps: list[tuple[int, int]], + img_rate_thr: float = 0.6, + ) -> tuple[list[tuple[int, int, int, int]], tuple[int, int]]: + assert 1 >= img_rate_thr >= 0, "The `in_rate_thr` should lie in 0~1" + windows = [] + # Sliding windows. 
+ for size, step in zip(sizes, steps): + size_w, size_h = size + step_w, step_h = step + + x_num = 1 if width <= size_w else ceil((width - size_w) / step_w + + 1) + x_start = [step_w * i for i in range(x_num)] + if len(x_start) > 1 and x_start[-1] + size_w > width: + x_start[-1] = width - size_w + + y_num = 1 if height <= size_h else ceil((height - size_h) / + step_h + 1) + y_start = [step_h * i for i in range(y_num)] + if len(y_start) > 1 and y_start[-1] + size_h > height: + y_start[-1] = height - size_h + + start = np.array(list(product(y_start, x_start)), dtype=int) + start[:, [0, 1]] = start[:, [1, 0]] + windows.append(np.concatenate([start, start + size], axis=1)) + windows = np.concatenate(windows, axis=0) + + return [(int(box[0]), int(box[1]), int(box[2] - box[0]), + int(box[3] - box[1])) for box in windows], (x_num, y_num) + + def square_pad(self, img: Image.Image) -> Image.Image: + w, h = img.size + if w == h: + return img + size = max(w, h) + padded = Image.new(img.mode, (size, size), 0) + padded.paste(img, (0, 0)) + return padded + + def get_image_size_for_padding(self, img_width: int, + img_height: int) -> tuple[int, int]: + ratio = img_width / img_height + if min(img_height, img_width) < 32 and (ratio > 4 or ratio < 1 / 4): + new_size = max(img_height, img_width) + return new_size, new_size + return img_width, img_height + + def get_image_size_for_preprocess(self, img_width: int, + img_height: int) -> tuple[int, int]: + + if max(img_height, img_width) > MAX_IMAGE_SIZE: + scale_factor = MAX_IMAGE_SIZE / max(img_height, img_width) + img_width = int(img_width * scale_factor) + img_height = int(img_height * scale_factor) + return img_width, img_height + + def get_image_size_for_crop(self, img_width: int, img_height: int, + window_size: int): + w_ratio = img_width / window_size + h_ratio = img_height / window_size + + if w_ratio < 1: + width_new = img_width + else: + decimal_w = w_ratio - img_width // window_size + w_ratio = int(w_ratio) + 1 if decimal_w > 
0.2 else int(w_ratio) + width_new = window_size * w_ratio + if h_ratio < 1: + height_new = img_height + else: + decimal_h = h_ratio - img_height // window_size + h_ratio = int(h_ratio) + 1 if decimal_h > 0.2 else int(h_ratio) + height_new = window_size * h_ratio + return int(width_new), int(height_new) + + def patch_crop(self, img: Image.Image, i: int, j: int, th: int, tw: int): + target = img.crop((j, i, j + tw, i + th)) + return target + + def get_num_patches(self, img_width: int, + img_height: int) -> tuple[int, int]: + img_width, img_height = self.get_image_size_for_padding( + img_width, img_height) + img_width, img_height = self.get_image_size_for_preprocess( + img_width, img_height) + window_size = self.determine_window_size(max(img_height, img_width), + min(img_height, img_width)) + if window_size == 0: + return 0, 0 + else: + img_width, img_height = self.get_image_size_for_crop( + img_width, img_height, window_size) + center_list, (x_num, y_num) = self.slide_window( + img_width, img_height, [(window_size, window_size)], + [(window_size, window_size)]) + full_rows = (len(center_list) - 1) // x_num + 1 + if len(center_list) > 0 and len(center_list) % x_num == 0: + full_rows -= 1 + return len(center_list), full_rows + + def __call__( + self, img: Image.Image + ) -> tuple[Image.Image, list[Image.Image], list[bool] | None]: + img_width, img_height = img.size + new_img_width, new_img_height = self.get_image_size_for_padding( + img_width, img_height) + if new_img_width != img_width or new_img_height != img_height: + img = self.square_pad(img) + img_width, img_height = img.size + + new_img_width, new_img_height = self.get_image_size_for_preprocess( + img_width, img_height) + img = img.resize((new_img_width, new_img_height), + Image.Resampling.BILINEAR) + window_size = self.determine_window_size( + max(new_img_height, new_img_width), + min(new_img_height, new_img_width)) + + if window_size == 0: + return img, [], None + else: + new_img_width, new_img_height = 
self.get_image_size_for_crop( + new_img_width, new_img_height, window_size) + if (new_img_width, new_img_height) != (img_width, img_height): + img_for_crop = img.resize((new_img_width, new_img_height), + Image.Resampling.BILINEAR) + else: + img_for_crop = img + + patches = [] + newlines = [] + center_list, (x_num, y_num) = self.slide_window( + new_img_width, new_img_height, [(window_size, window_size)], + [(window_size, window_size)]) + for patch_id, center_lf_point in enumerate(center_list): + x, y, patch_w, patch_h = center_lf_point + big_patch = self.patch_crop(img_for_crop, y, x, patch_h, + patch_w) + patches.append(big_patch) + if (patch_id + 1) % x_num == 0: + newlines.append(patch_id) + + if newlines and newlines[-1] == len(patches) - 1: + newlines.pop() + + return img, patches, [i in newlines for i in range(len(patches)) + ] if len(patches) > 0 else None + + +class Step3VLProcessor: + + def __init__( + self, + config: PretrainedConfig, + tokenizer: AnyTokenizer, + ) -> None: + super().__init__() + + self.config = config + self.tokenizer = tokenizer + + self.image_size = 728 + self.patch_size = 504 + self.image_preprocessor = Step3VisionProcessor(self.image_size, + "bilinear", + self.patch_size) + + self.num_image_feature_size = 169 + self.num_patch_feature_size = 81 + self.image_token = "" + self.image_feature_placeholder = (self.image_token * + self.num_image_feature_size) + self.patch_feature_placeholder = (self.image_token * + self.num_patch_feature_size) + + self.patcher = ImagePatcher() + + @property + def image_token_id(self) -> int: + return self.tokenizer.get_vocab()[self.image_token] + + def get_num_image_tokens(self, img_width: int, img_height: int) -> int: + num_patches, num_newlines = self.patcher.get_num_patches( + img_width, img_height) + + return num_patches * ( + self.num_patch_feature_size + + 2) + self.num_image_feature_size + 2 + num_newlines + + def _split_images(self, + images: list[Image.Image]) -> list[ImageWithPatches]: + result = [] 
+ for img in images: + result.append(self.patcher(img)) + return result + + def _convert_images_to_pixel_values( + self, + images: list[Image.Image], + is_patch: bool = False, + ) -> list[torch.Tensor]: + return [ + self.image_preprocessor(img, is_patch=is_patch)["pixel_values"] + for img in images + ] + + def _get_patch_repl( + self, + num_patches: int, + patch_newline_mask: list[bool] | None, + ) -> tuple[str, list[int]]: + text = "" + token_ids = [] + for i in range(num_patches): + assert len(patch_newline_mask) == num_patches + text += f"{self.patch_feature_placeholder}" + token_ids.extend( + [self.tokenizer.convert_tokens_to_ids("")] + + [self.image_token_id] * self.num_patch_feature_size + + [self.tokenizer.convert_tokens_to_ids("")]) + if patch_newline_mask and patch_newline_mask[i]: + text += "" + token_ids.append( + self.tokenizer.convert_tokens_to_ids("")) + return text, token_ids + + def _get_image_repl( + self, + num_images: int, + ) -> tuple[str, list[int]]: + text = f"{self.image_feature_placeholder}" + token_ids = [ + self.tokenizer.convert_tokens_to_ids("") + ] + [self.image_token_id] * self.num_image_feature_size + [ + self.tokenizer.convert_tokens_to_ids("") + ] + return text * num_images, token_ids * num_images + + def _get_image_repl_features( + self, + num_images: int, + num_patches: int, + patch_new_line_idx: Optional[list[bool]], + ) -> tuple[str, list[int]]: + if num_patches > 0: + patch_repl, patch_repl_ids = self._get_patch_repl( + num_patches, patch_new_line_idx) + else: + patch_repl = "" + patch_repl_ids = [] + image_repl, image_repl_ids = self._get_image_repl(num_images) + return patch_repl + image_repl, patch_repl_ids + image_repl_ids + + def replace_placeholder(self, text: str, placeholder: str, + repls: list[str]) -> str: + parts = text.split(placeholder) + + if len(parts) - 1 != len(repls): + raise ValueError( + "The number of placeholders does not match the number of replacements." 
# noqa: E501 + ) + + result = [parts[0]] + for i, repl in enumerate(repls): + result.append(repl) + result.append(parts[i + 1]) + + return "".join(result) + + def __call__( + self, + text: Optional[Union[str, list[str]]] = None, + images: Optional[Union[Image.Image, list[Image.Image]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> BatchFeature: + if text is None: + text = [] + if not isinstance(text, list): + text = [text] + if images is None: + images = [] + if not isinstance(images, list): + images = [images] + + if len(images) == 0: + image_inputs = {} + text_inputs = self.tokenizer(text) + else: + splitted_images_data = self._split_images(images) + pixel_values_lst = [] + patch_pixel_values_lst = [] + patch_newline_mask_lst = [] + image_repl_str_lst = [] + image_repl_ids_lst = [] + num_patches = [] + for raw_img, img_patches, patch_newline_mask in splitted_images_data: # noqa: E501 + pixel_values_lst.extend( + self._convert_images_to_pixel_values([raw_img])) + + if len(img_patches) > 0: + patch_pixel_values_lst.extend( + self._convert_images_to_pixel_values(img_patches, + is_patch=True)) + num_patches.append(len(img_patches)) + + image_repl_str, image_repl_ids = self._get_image_repl_features( + 1, len(img_patches), patch_newline_mask) + image_repl_str_lst.append(image_repl_str) + image_repl_ids_lst.extend(image_repl_ids) + + if patch_newline_mask is not None: + patch_newline_mask_lst.extend(patch_newline_mask) + + image_inputs = { + "pixel_values": torch.cat(pixel_values_lst), + "num_patches": num_patches, + } + if patch_pixel_values_lst: + image_inputs["patch_pixel_values"] = torch.cat( + patch_pixel_values_lst) + if patch_newline_mask_lst: + image_inputs["patch_newline_mask"] = torch.tensor( + patch_newline_mask_lst, dtype=torch.bool) + + text = [ + self.replace_placeholder(t, self.image_token, + image_repl_str_lst) for t in text + ] + text_inputs = self.tokenizer(text) + + return BatchFeature( + { + **text_inputs, + 
**image_inputs, + }, + tensor_type=return_tensors, + ) + + +class Step3VLProcessingInfo(BaseProcessingInfo): + + def get_hf_processor(self) -> Step3VLProcessor: + return Step3VLProcessor( + self.get_hf_config(), + self.get_tokenizer(), + ) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_max_image_tokens(self) -> int: + hf_processor = self.get_hf_processor() + return hf_processor.get_num_image_tokens( + self.get_image_size_with_most_features().width, + self.get_image_size_with_most_features().height) + + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + return {"image": self.get_max_image_tokens()} + + def get_image_size_with_most_features(self) -> ImageSize: + return ImageSize(3024, 3024) + + def get_num_mm_tokens(self, mm_data: MultiModalDataDict) -> int: + if len(mm_data) != 1 or "image" not in mm_data: + raise ValueError( + "mm_data could only contain one key 'image' for steo1o") + + image_data = mm_data["image"] + if not isinstance(image_data, (list, tuple)): + image_data = [image_data] + + return sum(self.get_hf_processor().get_num_image_tokens( + img.width, img.height) for img in image_data) + + +class Step3VLDummyInputsBuilder(BaseDummyInputsBuilder[Step3VLProcessingInfo]): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + return "" * num_images + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + target_width, target_height = \ + self.info.get_image_size_with_most_features() + num_images = mm_counts.get("image", 0) + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + +class Step3VLMultiModalProcessor(BaseMultiModalProcessor[Step3VLProcessingInfo] + ): + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: 
Mapping[str, Any], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + image_placeholder_token_id = hf_processor.image_token_id + batch_num_patches = out_mm_kwargs["num_patches"].tolist() + + def get_replacement_step1o(item_idx: int): + img_out = out_mm_kwargs.get_item("image", item_idx) + num_patches = batch_num_patches[item_idx] + if num_patches > 0: + patch_newline_mask = img_out["patch_newline_mask"].data.tolist( + ) + image_repl_ids = hf_processor._get_image_repl_features( + 1, num_patches, patch_newline_mask)[1] + else: + image_repl_ids = hf_processor._get_image_repl_features( + 1, 0, None)[1] + return PromptUpdateDetails.select_token_id( + seq=image_repl_ids, + embed_token_id=image_placeholder_token_id, + ) + + return [ + PromptReplacement( + modality="image", + target=[image_placeholder_token_id], + replacement=get_replacement_step1o, + ) + ] + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + num_patches = hf_inputs.get("num_patches", torch.empty(0)) + + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + patch_pixel_values=MultiModalFieldConfig.flat_from_sizes( + "image", num_patches), + num_patches=MultiModalFieldConfig.batched("image"), + patch_newline_mask=MultiModalFieldConfig.flat_from_sizes( + "image", num_patches), + ) + + +def get_abs_pos(abs_pos, tgt_size): + dim = abs_pos.size(-1) + abs_pos_new = abs_pos.squeeze(0) + cls_token, old_pos_embed = abs_pos_new[:1], abs_pos_new[1:] + + src_size = int(math.sqrt(abs_pos_new.shape[0] - 1)) + tgt_size = int(math.sqrt(tgt_size)) + dtype = abs_pos.dtype + + if src_size != tgt_size: + old_pos_embed = old_pos_embed.view(1, src_size, src_size, + dim).permute(0, 3, 1, + 2).contiguous() + old_pos_embed = old_pos_embed.to(torch.float32) + new_pos_embed = F.interpolate( + old_pos_embed, + size=(tgt_size, 
tgt_size), + mode='bicubic', + antialias=True, + align_corners=False, + ).to(dtype) + new_pos_embed = new_pos_embed.permute(0, 2, 3, 1) + new_pos_embed = new_pos_embed.view(tgt_size * tgt_size, dim) + vision_pos_embed = torch.cat([cls_token, new_pos_embed], dim=0) + vision_pos_embed = vision_pos_embed.view(1, tgt_size * tgt_size + 1, + dim) + return vision_pos_embed + else: + return abs_pos + + +class Step3VisionEmbeddings(nn.Module): + + def __init__(self, config: Step3VisionEncoderConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(1, self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + bias=True, + ) + + self.num_patches = (self.image_size // self.patch_size)**2 + self.pad_tp_size = 4 # hard code for padding + # To load the pretrained weights, we still use P+1 as the seqlen + self.position_embedding = torch.nn.Embedding(self.num_patches + 1, + self.embed_dim) + self.register_buffer("position_ids", + torch.arange(self.num_patches + 1).expand( + (1, -1)), + persistent=False) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + patch_embeds = self.patch_embedding( + pixel_values) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + # pad + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + embeddings = embeddings + get_abs_pos( + self.position_embedding(self.position_ids), patch_embeds.size(1)) + embeddings = torch.cat([ + embeddings[:, 0, :].unsqueeze(1).repeat(1, self.pad_tp_size - 1, + 1), embeddings + ], + dim=1) + return embeddings + + +class Step3VisionAttention(nn.Module): + """Multi-headed attention 
from 'Attention Is All You Need' paper""" + + def __init__(self, + config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.total_num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.total_num_heads + + self.scale = self.head_dim**-0.5 + + tp_size = get_tensor_model_parallel_world_size() + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.qkv_proj = QKVParallelLinear(self.embed_dim, + self.head_dim, + self.total_num_heads, + bias=True, + quant_config=quant_config, + prefix=prefix) + self.out_proj = RowParallelLinear(self.embed_dim, + self.embed_dim, + bias=True, + quant_config=quant_config, + prefix=prefix) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, + self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + ): + """Input shape: Batch x Time x Channel""" + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.chunk(chunks=3, dim=-1) + q = q.view(bsz, tgt_len, self.num_heads, self.head_dim) + k = k.view(bsz, tgt_len, self.num_heads, self.head_dim) + v = v.view(bsz, tgt_len, self.num_heads, self.head_dim) + q = q.transpose(1, 2) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + attn_output = F.scaled_dot_product_attention(q, + k, + v, + scale=self.scale, + is_causal=False) + attn_output = attn_output.transpose(1, 2).reshape( + bsz, tgt_len, self.num_heads * self.head_dim) + + attn_output, _ = self.out_proj(attn_output) + + return attn_output + + +class Step3VisionMLP(nn.Module): + + def __init__(self, + config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + self.config = config + self.activation_fn = get_act_fn(config.hidden_act) + self.fc1 = 
ColumnParallelLinear(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config, + prefix=prefix) + self.fc2 = RowParallelLinear(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config, + prefix=prefix) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states, _ = self.fc2(hidden_states) + return hidden_states + + +class Step3VisionEncoderLayer(nn.Module): + + def __init__(self, + config: Step3VisionEncoderConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = Step3VisionAttention(config, + quant_config, + prefix=f"{prefix}.self_attn") + self.layer_norm1 = nn.LayerNorm(self.embed_dim, + eps=config.layer_norm_eps) + self.mlp = Step3VisionMLP(config, quant_config, prefix=f"{prefix}.mlp") + self.layer_norm2 = nn.LayerNorm(self.embed_dim, + eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + ) -> torch.FloatTensor: + hidden_states = hidden_states + self.layer_norm1( + self.self_attn(hidden_states)) + hidden_states = hidden_states + self.layer_norm2( + self.mlp(hidden_states)) + return hidden_states + + +class Step3VisionEncoder(nn.Module): + + def __init__(self, + config: Step3VisionEncoderConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + self.config = config + self.layers = nn.ModuleList([ + Step3VisionEncoderLayer(config, + quant_config, + prefix=f"{prefix}.layers.{i}") + for i in range(config.num_hidden_layers) + ]) + + def forward( + self, + inputs_embeds, + ): + hidden_states = inputs_embeds + for encoder_layer in self.layers: + hidden_states = encoder_layer(hidden_states) + return hidden_states + + +class Step3VisionTransformer(nn.Module): + + def __init__(self, + config: 
Step3VisionEncoderConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + self.config = config + self.image_size = config.image_size + self.embeddings = Step3VisionEmbeddings(config) + self.transformer = Step3VisionEncoder(config, + quant_config, + prefix=f"{prefix}.transformer") + + def forward( + self, + pixel_values: torch.Tensor, + ): + hidden_states = self.embeddings(pixel_values) + hidden_states = self.transformer(inputs_embeds=hidden_states) + return hidden_states + + +@MULTIMODAL_REGISTRY.register_processor(Step3VLMultiModalProcessor, + info=Step3VLProcessingInfo, + dummy_inputs=Step3VLDummyInputsBuilder) +class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsPP): + + hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ + "model.": "language_model.model.", + "lm_head.": "language_model.lm_head.", + }) + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: + if modality.startswith("image"): + return "" + + raise ValueError("Only image modality is supported") + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + + config = vllm_config.model_config.hf_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + + self.vision_model = Step3VisionTransformer(config.vision_config, + None, + prefix=maybe_prefix( + prefix, "vision_model")) + self.vit_downsampler = nn.Conv2d( + config.vision_config.hidden_size, + config.vision_config.output_hidden_size, + kernel_size=2, + stride=config.understand_projector_stride) + self.vit_downsampler2 = nn.Conv2d( + config.vision_config.output_hidden_size, + config.vision_config.output_hidden_size * 2, + kernel_size=3, + stride=2, + padding=1, + ) + self.vit_large_projector = nn.Linear( + config.vision_config.output_hidden_size * 2, + config.hidden_size, + bias=config.projector_bias, + ) + 
self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model")) + + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors) + + @cached_property + def sampler(self): + if hasattr(self.language_model, "sampler"): + return self.language_model.sampler + + return get_sampler() + + @property + def device(self): + return next(self.parameters()).device + + @property + def dtype(self): + return next(self.parameters()).dtype + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Step3VLImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + patch_pixel_values = kwargs.pop("patch_pixel_values", None) + num_patches = kwargs.pop("num_patches", None) + image_embeds = kwargs.pop("image_embeds", None) + + if pixel_values is None and image_embeds is None: + return None + + if pixel_values is not None: + pixel_values = flatten_bn(pixel_values, concat=True) + if pixel_values.dim() >= 3: + pixel_values = pixel_values.view(-1, *pixel_values.shape[-3:]) + if patch_pixel_values is not None: + patch_pixel_values = flatten_bn(patch_pixel_values, + concat=True) + patch_pixel_values = patch_pixel_values.view( + -1, *patch_pixel_values.shape[-3:]) + # Handle empty patch_pixel_values by setting to None + if patch_pixel_values.shape[0] == 0: + patch_pixel_values = None + num_patches = flatten_bn(num_patches, concat=True).tolist() + + return Step3VLImagePixelInputs( + type="pixel_values", + pixel_values=pixel_values.to(self.dtype).to(self.device), + patch_pixel_values=patch_pixel_values.to(self.dtype).to( + self.device) if patch_pixel_values is not None else None, + num_patches=num_patches, + ) + + if image_embeds is not None: + if image_embeds.dim() == 2 or image_embeds.dim() >= 3: + image_embeds = image_embeds.view(-1, image_embeds.shape[-1]) + else: + raise ValueError( + f"Unexpected shape for image_embeds: 
{image_embeds.shape}") + + return Step3VLImageEmbeddingInputs( + type="image_embeds", + image_embeds=image_embeds.to(self.dtype).to(self.device), + ) + return None + + def _process_image_features(self, + image_features: torch.Tensor) -> torch.Tensor: + B, P = image_features.shape[:2] + HW = int(sqrt(P)) + image_features = image_features.permute(0, 2, 1).view(B, -1, HW, HW) + image_features = self.vit_downsampler(image_features) + image_features = self.vit_downsampler2(image_features) + n_dim = image_features.size(1) + image_features = image_features.view(B, n_dim, -1).permute(0, 2, 1) + image_features = self.vit_large_projector(image_features) + return image_features + + def _get_vision_model_output(self, + input_tensor: torch.Tensor) -> torch.Tensor: + return self.vision_model(input_tensor)[:, 4:] + + def _process_image_input( + self, image_input: Step3VLImageInputs) -> tuple[torch.Tensor, ...]: + + if image_input["type"] == "image_embeds": + image_features = image_input["image_embeds"] + else: + image_features = self._get_vision_model_output( + image_input["pixel_values"]) + patch_image_features = self._get_vision_model_output( + image_input["patch_pixel_values"] + ) if image_input["patch_pixel_values"] is not None else None + num_patches = image_input["num_patches"] + + image_features = self._process_image_features(image_features) + patch_image_features = self._process_image_features( + patch_image_features) if patch_image_features is not None else None + + merged_image_features = [] + cur_patch_idx = 0 + for i, num_patch in enumerate(num_patches): + cur_feature = [] + if num_patch > 0: + patch_slice = patch_image_features[ + cur_patch_idx:cur_patch_idx + num_patch] + cur_feature.append(patch_slice.view(-1, patch_slice.shape[-1])) + cur_feature.append(image_features[i].view( + -1, image_features.shape[-1])) + cur_patch_idx += num_patch + merged_image_features.append( + torch.cat(cur_feature) if len(cur_feature) > + 1 else cur_feature[0]) + return 
merged_image_features + + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + if multimodal_embeddings is None: + inputs_embeds = self.language_model.model.get_input_embeddings( + input_ids) + else: + is_text = input_ids != self.config.image_token_id + text_ids = input_ids[is_text] + text_embeds = self.language_model.model.get_input_embeddings( + text_ids) + inputs_embeds = torch.empty(input_ids.shape[0], + text_embeds.shape[-1], + dtype=text_embeds.dtype, + device=text_embeds.device) + inputs_embeds[is_text] = text_embeds + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.config.image_token_id) + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + if intermediate_tensors is not None: + inputs_embeds = None + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + + hidden_states = self.language_model(input_ids, + positions, + intermediate_tensors, + inputs_embeds=inputs_embeds) + + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + 
sampling_metadata) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + return self.language_model.sample(logits, sampling_metadata) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader(self) + loaded_weights = loader.load_weights(weights, + mapper=self.hf_to_vllm_mapper) + return loaded_weights diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py index d61e4f11dfa29..1c3f78f2edbfb 100644 --- a/vllm/reasoning/__init__.py +++ b/vllm/reasoning/__init__.py @@ -8,6 +8,7 @@ from .granite_reasoning_parser import GraniteReasoningParser from .hunyuan_a13b_reasoning_parser import HunyuanA13BReasoningParser from .mistral_reasoning_parser import MistralReasoningParser from .qwen3_reasoning_parser import Qwen3ReasoningParser +from .step3_reasoning_parser import Step3ReasoningParser __all__ = [ "ReasoningParser", @@ -18,4 +19,5 @@ __all__ = [ "Qwen3ReasoningParser", "Glm4MoeModelReasoningParser", "MistralReasoningParser", + "Step3ReasoningParser", ] diff --git a/vllm/reasoning/step3_reasoning_parser.py b/vllm/reasoning/step3_reasoning_parser.py new file mode 100644 index 0000000000000..f642ea977c580 --- /dev/null +++ b/vllm/reasoning/step3_reasoning_parser.py @@ -0,0 +1,109 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Sequence +from typing import Optional, Union + +import regex as re +from transformers import PreTrainedTokenizerBase + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage) +from vllm.logger import init_logger +from vllm.reasoning import ReasoningParser, ReasoningParserManager + +logger = init_logger(__name__) + + +@ReasoningParserManager.register_module("step3") +class Step3ReasoningParser(ReasoningParser): + """ + Reasoning parser for Step3 model. 
+ + The Step3 model uses token to denote the end of reasoning + text. This parser extracts all content before as reasoning content. + """ + + def __init__(self, tokenizer: PreTrainedTokenizerBase): + super().__init__(tokenizer) + self.think_end_token = "" + + self.reasoning_regex = re.compile(rf"(.*?){self.think_end_token}", + re.DOTALL) + + if not self.model_tokenizer: + raise ValueError( + "The model tokenizer must be passed to the ReasoningParser " + "constructor during construction.") + + self.think_end_token_id = self.vocab.get(self.think_end_token) + if self.think_end_token_id is None: + raise RuntimeError( + "Step3 reasoning parser could not locate think end " + "token in the tokenizer!") + + def extract_reasoning_content_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + """ + Extract reasoning content from a delta message. + Handles streaming output where previous + delta = current. + Uses token IDs for faster processing. 
+ For text "abcxyz": + - 'abc' goes to reasoning_content + - 'xyz' goes to content + """ + # Skip single special token + if len(delta_token_ids + ) == 1 and delta_token_ids[0] == self.think_end_token_id: + return None + + if self.think_end_token_id in delta_token_ids: + # in delta, extract reasoning content and remaining content + end_index = delta_text.find(self.think_end_token) + reasoning_content = delta_text[:end_index] + content = delta_text[end_index + len(self.think_end_token):] + return DeltaMessage(reasoning_content=reasoning_content, + content=content if content else None) + elif self.think_end_token_id in previous_token_ids: + # already seen in previous text, everything is content + return DeltaMessage(content=delta_text) + else: + # No seen yet, everything is reasoning + return DeltaMessage(reasoning_content=delta_text) + + def extract_reasoning_content( + self, model_output: str, request: ChatCompletionRequest + ) -> tuple[Optional[str], Optional[str]]: + + # Check if the model output contains the token + if self.think_end_token not in model_output: + # If no token, everything is reasoning content + return model_output, None + else: + # Find the first occurrence of + end_index = model_output.find(self.think_end_token) + reasoning_content = model_output[:end_index] + + # Content after token + content = model_output[end_index + len(self.think_end_token):] + + if len(content) == 0: + content = None + + return reasoning_content, content + + def is_reasoning_end(self, input_ids: list[int]) -> bool: + return self.think_end_token_id in input_ids + + def extract_content_ids(self, input_ids: list[int]) -> list[int]: + if self.think_end_token_id not in input_ids[:-1]: + return [] + else: + return input_ids[input_ids.index(self.think_end_token_id) + 1:] diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 4ce56cb3a6aac..fcaa48c1392a3 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -35,7 
+35,8 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, DeepseekVLV2Config, MllamaConfig, MLPSpeculatorConfig, Nemotron_Nano_VL_Config, NemotronConfig, NVLM_D_Config, - RWConfig, UltravoxConfig) + RWConfig, Step3TextConfig, + Step3VLConfig, UltravoxConfig) # yapf: enable from vllm.transformers_utils.configs.mistral import adapt_config_dict from vllm.transformers_utils.utils import check_gguf_file @@ -83,6 +84,8 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = { "nemotron": NemotronConfig, "NVLM_D": NVLM_D_Config, "ultravox": UltravoxConfig, + "step3_vl": Step3VLConfig, + "step3_text": Step3TextConfig, **_CONFIG_REGISTRY_OVERRIDE_HF } diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 7c7d859e4a325..96733da726181 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -24,6 +24,9 @@ from vllm.transformers_utils.configs.nemotron import NemotronConfig from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config +from vllm.transformers_utils.configs.step3_vl import (Step3TextConfig, + Step3VisionEncoderConfig, + Step3VLConfig) from vllm.transformers_utils.configs.ultravox import UltravoxConfig __all__ = [ @@ -42,4 +45,7 @@ __all__ = [ "Nemotron_Nano_VL_Config", "NVLM_D_Config", "UltravoxConfig", + "Step3VLConfig", + "Step3VisionEncoderConfig", + "Step3TextConfig", ] diff --git a/vllm/transformers_utils/configs/step3_vl.py b/vllm/transformers_utils/configs/step3_vl.py new file mode 100644 index 0000000000000..fe3c72de69d28 --- /dev/null +++ b/vllm/transformers_utils/configs/step3_vl.py @@ -0,0 +1,123 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any, Optional, Union + +from 
transformers.configuration_utils import PretrainedConfig + + +class Step3VisionEncoderConfig(PretrainedConfig): + model_type = "step3_vision_encoder" + + def __init__( + self, + hidden_size=1792, + intermediate_size=3072, + output_hidden_size=4096, + num_hidden_layers=63, + num_attention_heads=16, + num_channels=3, + image_size=728, + patch_size=14, + hidden_act="quick_gelu", + layer_norm_eps=1e-5, + **kwargs, + ): + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.output_hidden_size = output_hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + super().__init__(**kwargs) + + +class Step3TextConfig(PretrainedConfig): + model_type = "step3_text" + architectures = ["Step3TextForCausalLM"] + + def __init__( + self, + hidden_size: int = 7168, + intermediate_size: int = 18432, + num_attention_heads: int = 64, + num_attention_groups: int = 1, + num_hidden_layers: int = 61, + max_seq_len: int = 65536, + vocab_size: int = 128815, + rms_norm_eps: float = 1e-5, + moe_intermediate_size: int = 5120, + moe_num_experts: int = 48, + moe_top_k: int = 3, + rope_theta: float = 500000, + rope_scaling: Optional[dict[str, Any]] = None, + max_position_embedding: int = 65536, + share_expert_dim: int = 5120, + share_q_dim: int = 2048, + head_dim: int = 256, + norm_expert_weight: bool = False, + moe_layers_enum: tuple[int, + ...] 
= (4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59), + **kwargs, + ) -> None: + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_attention_heads = num_attention_heads + self.num_attention_groups = num_attention_groups + self.num_hidden_layers = num_hidden_layers + self.max_seq_len = max_seq_len + self.vocab_size = vocab_size + self.rms_norm_eps = rms_norm_eps + self.moe_intermediate_size = moe_intermediate_size + self.moe_num_experts = moe_num_experts + self.moe_top_k = moe_top_k + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.max_position_embedding = max_position_embedding + self.share_expert_dim = share_expert_dim + self.share_q_dim = share_q_dim + self.head_dim = head_dim + self.norm_expert_weight = norm_expert_weight + self.moe_layers_enum = moe_layers_enum + + super().__init__(**kwargs) + + +class Step3VLConfig(PretrainedConfig): + model_type = "step3_vl" + + def __init__( + self, + vision_config: Optional[Union[dict, Step3VisionEncoderConfig]] = None, + text_config: Optional[Union[dict, Step3TextConfig]] = None, + understand_projector_stride: int = 1, + projector_bias: bool = True, + image_token_id: int = 128001, + **kwargs, + ) -> None: + if vision_config is None: + vision_config = Step3VisionEncoderConfig() + elif isinstance(vision_config, dict): + vision_config = Step3VisionEncoderConfig(**vision_config) + self.vision_config = vision_config + + if text_config is None: + text_config = Step3TextConfig() + elif isinstance(text_config, dict): + text_config = Step3TextConfig(**text_config) + self.text_config = text_config + + self.understand_projector_stride = understand_projector_stride + self.projector_bias = projector_bias + self.hidden_size = text_config.hidden_size + self.image_token_id = image_token_id + + 
super().__init__(**kwargs) From 7349d5268bf70b7a530c1e649884e4f926615f8e Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Thu, 31 Jul 2025 12:46:07 -0400 Subject: [PATCH 092/224] [ez] Remove a trailing space from compilation/decorators.py (#22028) --- vllm/compilation/decorators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index f3592324d8cfa..1370862d580a5 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -108,7 +108,7 @@ def support_torch_compile( During runtime, when we actually mark dimensions of tensors, it depends on the value of arguments: - - if it is a single integer (can be negative), the corresponding dimension + - if it is a single integer (can be negative), the corresponding dimension of the argument will be marked as dynamic. - if it is `None`, ignored. - if it is `IntermediateTensors`, all the tensors in the intermediate From 58bb902186a87007deeeef2d2af02ed2b13bb182 Mon Sep 17 00:00:00 2001 From: Doug Smith Date: Thu, 31 Jul 2025 12:52:48 -0400 Subject: [PATCH 093/224] fix(setup): improve precompiled wheel setup for Docker builds (#22025) Signed-off-by: dougbtv --- docker/Dockerfile | 1 + requirements/test.txt | 24 +++-- setup.py | 203 ++++++++++++++++++------------------------ 3 files changed, 104 insertions(+), 124 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 43522ef8fb8dd..69aeee67a4300 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -370,6 +370,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ fi # Install vllm wheel first, so that torch etc will be installed. 
+# !bang RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/uv \ uv pip install --system dist/*.whl --verbose \ diff --git a/requirements/test.txt b/requirements/test.txt index d45048aae5809..4aaca2afea266 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -22,9 +22,7 @@ aiohttp==3.10.11 aiohttp-cors==0.8.1 # via ray aiosignal==1.3.1 - # via - # aiohttp - # ray + # via aiohttp albucore==0.0.16 # via terratorch albumentations==1.4.6 @@ -139,7 +137,7 @@ contourpy==1.3.0 # via matplotlib cramjam==2.9.0 # via fastparquet -cupy-cuda12x==13.3.0 +cupy-cuda12x==13.5.1 # via ray cycler==0.12.1 # via matplotlib @@ -226,7 +224,6 @@ frozenlist==1.5.0 # via # aiohttp # aiosignal - # ray fsspec==2024.9.0 # via # datasets @@ -603,10 +600,18 @@ opencv-python-headless==4.11.0.86 opentelemetry-api==1.35.0 # via # mlflow-skinny + # opentelemetry-exporter-prometheus # opentelemetry-sdk # opentelemetry-semantic-conventions +opentelemetry-exporter-prometheus==0.56b0 + # via ray +opentelemetry-proto==1.36.0 + # via ray opentelemetry-sdk==1.35.0 - # via mlflow-skinny + # via + # mlflow-skinny + # opentelemetry-exporter-prometheus + # ray opentelemetry-semantic-conventions==0.56b0 # via opentelemetry-sdk packaging==24.2 @@ -697,7 +702,9 @@ pqdm==0.2.0 pretrainedmodels==0.7.4 # via segmentation-models-pytorch prometheus-client==0.22.0 - # via ray + # via + # opentelemetry-exporter-prometheus + # ray propcache==0.2.0 # via yarl proto-plus==1.26.1 @@ -707,6 +714,7 @@ protobuf==5.28.3 # google-api-core # googleapis-common-protos # mlflow-skinny + # opentelemetry-proto # proto-plus # ray # tensorboardx @@ -854,7 +862,7 @@ rasterio==1.4.3 # rioxarray # terratorch # torchgeo -ray==2.43.0 +ray==2.48.0 # via -r requirements/test.in redis==5.2.0 # via tensorizer diff --git a/setup.py b/setup.py index bf3391e2db19e..6d615d122d69e 100644 --- a/setup.py +++ b/setup.py @@ -282,10 +282,69 @@ class 
cmake_build_ext(build_ext): self.copy_file(file, dst_file) -class repackage_wheel(build_ext): +class precompiled_wheel_utils: """Extracts libraries and other files from an existing wheel.""" - def get_base_commit_in_main_branch(self) -> str: + @staticmethod + def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict: + import tempfile + import zipfile + + temp_dir = None + try: + if not os.path.isfile(wheel_url_or_path): + wheel_filename = wheel_url_or_path.split("/")[-1] + temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") + wheel_path = os.path.join(temp_dir, wheel_filename) + print(f"Downloading wheel from {wheel_url_or_path} " + f"to {wheel_path}") + from urllib.request import urlretrieve + urlretrieve(wheel_url_or_path, filename=wheel_path) + else: + wheel_path = wheel_url_or_path + print(f"Using existing wheel at {wheel_path}") + + package_data_patch = {} + + with zipfile.ZipFile(wheel_path) as wheel: + files_to_copy = [ + "vllm/_C.abi3.so", + "vllm/_moe_C.abi3.so", + "vllm/_flashmla_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", + "vllm/cumem_allocator.abi3.so", + ] + + compiled_regex = re.compile( + r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") + file_members = list( + filter(lambda x: x.filename in files_to_copy, + wheel.filelist)) + file_members += list( + filter(lambda x: compiled_regex.match(x.filename), + wheel.filelist)) + + for file in file_members: + print(f"[extract] {file.filename}") + target_path = os.path.join(".", file.filename) + os.makedirs(os.path.dirname(target_path), exist_ok=True) + with wheel.open(file.filename) as src, open( + target_path, "wb") as dst: + shutil.copyfileobj(src, dst) + + pkg = os.path.dirname(file.filename).replace("/", ".") + package_data_patch.setdefault(pkg, []).append( + os.path.basename(file.filename)) + + return package_data_patch + finally: + if temp_dir is not None: + print(f"Removing temporary directory {temp_dir}") + 
shutil.rmtree(temp_dir) + + @staticmethod + def get_base_commit_in_main_branch() -> str: # Force to use the nightly wheel. This is mainly used for CI testing. if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: return "nightly" @@ -334,115 +393,6 @@ class repackage_wheel(build_ext): "wheel may not be compatible with your dev branch: %s", err) return "nightly" - def run(self) -> None: - assert _is_cuda( - ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" - - wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) - if wheel_location is None: - base_commit = self.get_base_commit_in_main_branch() - wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - # Fallback to nightly wheel if latest commit wheel is unavailable, - # in this rare case, the nightly release CI hasn't finished on main. - if not is_url_available(wheel_location): - wheel_location = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - - import zipfile - - if os.path.isfile(wheel_location): - wheel_path = wheel_location - print(f"Using existing wheel={wheel_path}") - else: - # Download the wheel from a given URL, assume - # the filename is the last part of the URL - wheel_filename = wheel_location.split("/")[-1] - - import tempfile - - # create a temporary directory to store the wheel - temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") - wheel_path = os.path.join(temp_dir, wheel_filename) - print(f"Downloading wheel from {wheel_location} to {wheel_path}") - from urllib.request import urlretrieve - try: - urlretrieve(wheel_location, filename=wheel_path) - except Exception as e: - from setuptools.errors import SetupError - raise SetupError( - f"Failed to get vLLM wheel from {wheel_location}") from e - - # Set the dist_dir for Docker build context - dist_dir = ("/workspace/dist" - if envs.VLLM_DOCKER_BUILD_CONTEXT else "dist") - os.makedirs(dist_dir, exist_ok=True) - - # Extract only necessary compiled .so 
files from precompiled wheel - with zipfile.ZipFile(wheel_path) as wheel: - # Get version from METADATA (optional, mostly useful for logging) - metadata_file = next((n for n in wheel.namelist() - if n.endswith(".dist-info/METADATA")), None) - if not metadata_file: - raise RuntimeError( - "Could not find METADATA in precompiled wheel.") - metadata = wheel.read(metadata_file).decode() - version_line = next((line for line in metadata.splitlines() - if line.startswith("Version: ")), None) - if not version_line: - raise RuntimeError( - "Could not determine version from METADATA.") - version = version_line.split(": ")[1].strip() - - print(f"Extracting precompiled kernels from vLLM wheel version: " - f"{version}") - - # List of compiled shared objects to extract - files_to_copy = [ - "vllm/_C.abi3.so", - "vllm/_moe_C.abi3.so", - "vllm/_flashmla_C.abi3.so", - "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", - "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", - "vllm/cumem_allocator.abi3.so", - ] - - file_members = list( - filter(lambda x: x.filename in files_to_copy, wheel.filelist)) - compiled_regex = re.compile( - r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") - file_members += list( - filter(lambda x: compiled_regex.match(x.filename), - wheel.filelist)) - - for file in file_members: - print(f"Extracting and including {file.filename} " - "from existing wheel") - package_name = os.path.dirname(file.filename).replace("/", ".") - file_name = os.path.basename(file.filename) - - if package_name not in package_data: - package_data[package_name] = [] - - output_base = (dist_dir - if envs.VLLM_DOCKER_BUILD_CONTEXT else ".") - target_path = os.path.join(output_base, file.filename) - os.makedirs(os.path.dirname(target_path), exist_ok=True) - with wheel.open(file.filename) as src, open(target_path, - "wb") as dst: - shutil.copyfileobj(src, dst) - - package_data[package_name].append(file_name) - - # Copy wheel into dist dir for Docker to consume (e.g., via --mount) - if 
envs.VLLM_DOCKER_BUILD_CONTEXT: - arch_tag = "cp38-abi3-manylinux1_x86_64" - corrected_wheel_name = f"vllm-{version}-{arch_tag}.whl" - final_wheel_path = os.path.join(dist_dir, corrected_wheel_name) - - print( - "Docker build context detected, copying precompiled wheel to " - f"{final_wheel_path}") - shutil.copy2(wheel_path, final_wheel_path) - def _no_device() -> bool: return VLLM_TARGET_DEVICE == "empty" @@ -676,16 +626,37 @@ package_data = { ] } +# If using precompiled, extract and patch package_data (in advance of setup) +if envs.VLLM_USE_PRECOMPILED: + assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" + wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) + if wheel_location is not None: + wheel_url = wheel_location + else: + base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch() + wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + from urllib.request import urlopen + try: + with urlopen(wheel_url) as resp: + if resp.status != 200: + wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + except Exception as e: + print(f"[warn] Falling back to nightly wheel: {e}") + wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + + patch = precompiled_wheel_utils.extract_precompiled_and_patch_package( + wheel_url) + for pkg, files in patch.items(): + package_data.setdefault(pkg, []).extend(files) + if _no_device(): ext_modules = [] -if not ext_modules: +if not ext_modules or envs.VLLM_USE_PRECOMPILED: + # Disable build_ext when using precompiled wheel cmdclass = {} else: - cmdclass = { - "build_ext": - repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext - } + cmdclass = {"build_ext": cmake_build_ext} setup( # static metadata should rather go in pyproject.toml From 0780bb57835dcd9ee666aaf807c37086de67422b Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD 
<156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Thu, 31 Jul 2025 11:53:27 -0500 Subject: [PATCH 094/224] Removing amdproduction Tests (#22027) Signed-off-by: Alexei V. Ivanov --- .buildkite/test-pipeline.yaml | 46 +++++++++++++++++------------------ 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 2bf0b6fd9a169..a7fe200559305 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -82,7 +82,7 @@ steps: - bash standalone_tests/python_only_compile.sh - label: Basic Correctness Test # 30min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] fast_check: true torch_nightly: true source_file_dependencies: @@ -99,7 +99,7 @@ steps: - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - label: Chunked Prefill Test - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ - tests/basic_correctness/test_chunked_prefill @@ -108,7 +108,7 @@ steps: - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - label: Core Test # 10min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] fast_check: true source_file_dependencies: - vllm/core @@ -209,7 +209,7 @@ steps: - pytest -v -s distributed/test_eplb_execute.py - label: Metrics, Tracing Test # 10min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] num_gpus: 2 source_file_dependencies: - vllm/ @@ -228,7 +228,7 @@ steps: ##### 1 GPU test ##### - label: Regression Test # 5min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ - tests/test_regression @@ -280,7 +280,7 @@ steps: - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - label: Examples 
Test # 25min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/examples" source_file_dependencies: - vllm/entrypoints @@ -305,7 +305,7 @@ steps: - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Prefix Caching Test # 9min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ - tests/prefix_caching @@ -314,7 +314,7 @@ steps: - label: Platform Tests (CUDA) - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ - tests/cuda @@ -355,7 +355,7 @@ steps: - pytest -v -s compile/test_async_tp.py - label: PyTorch Fullgraph Smoke Test # 9min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/ @@ -368,7 +368,7 @@ steps: - pytest -v -s compile/piecewise/test_full_cudagraph.py - label: PyTorch Fullgraph Test # 18min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/ @@ -377,7 +377,7 @@ steps: - pytest -v -s compile/test_full_graph.py - label: Kernels Core Operation Test - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/ - tests/kernels/core @@ -416,7 +416,7 @@ steps: parallelism: 2 - label: Kernels Mamba Test - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/mamba/ - tests/kernels/mamba @@ -424,7 +424,7 @@ steps: - pytest -v -s kernels/mamba - label: Tensorizer Test # 11min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] soft_fail: true source_file_dependencies: - vllm/model_executor/model_loader @@ -437,7 +437,7 @@ steps: - pytest 
-v -s entrypoints/openai/test_tensorizer_entrypoint.py - label: Model Executor Test - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/model_executor - tests/model_executor @@ -447,7 +447,7 @@ steps: - pytest -v -s model_executor - label: Benchmarks # 9min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/.buildkite" source_file_dependencies: - benchmarks/ @@ -455,7 +455,7 @@ steps: - bash scripts/run-benchmarks.sh - label: Benchmarks CLI Test # 10min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ - tests/benchmarks/ @@ -494,7 +494,7 @@ steps: - pytest -s entrypoints/openai/correctness/ - label: Encoder Decoder tests # 5min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ - tests/encoder_decoder @@ -502,7 +502,7 @@ steps: - pytest -v -s encoder_decoder - label: OpenAI-Compatible Tool Use # 20 min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] fast_check: false source_file_dependencies: - vllm/ @@ -623,7 +623,7 @@ steps: # This test is used only in PR development phase to test individual models and should never run on main - label: Custom Models Test - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] optional: true commands: - echo 'Testing custom models...' 
@@ -658,7 +658,7 @@ steps: ##### multi gpus test ##### - label: Distributed Comm Ops Test # 7min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 source_file_dependencies: @@ -755,7 +755,7 @@ steps: - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins - label: Multi-step Tests (4 GPUs) # 36min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 4 source_file_dependencies: @@ -776,7 +776,7 @@ steps: - pytest -v -s multi_step/test_correctness_llm.py - label: Pipeline Parallelism Test # 45min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 4 source_file_dependencies: @@ -790,7 +790,7 @@ steps: - pytest -v -s distributed/test_pipeline_parallel.py - label: LoRA TP Test (Distributed) - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] num_gpus: 4 source_file_dependencies: - vllm/lora From 53c21e492e0acd140a9984c8ec7cc3a7123efee5 Mon Sep 17 00:00:00 2001 From: XiongfeiWei Date: Thu, 31 Jul 2025 10:26:43 -0700 Subject: [PATCH 095/224] Update torch_xla pin to 20250730 (#21956) Signed-off-by: Xiongfei Wei --- docker/Dockerfile.tpu | 2 +- requirements/tpu.txt | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile.tpu b/docker/Dockerfile.tpu index b9fc9def88190..2190151369761 100644 --- a/docker/Dockerfile.tpu +++ b/docker/Dockerfile.tpu @@ -1,4 +1,4 @@ -ARG NIGHTLY_DATE="20250724" +ARG NIGHTLY_DATE="20250730" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE diff --git a/requirements/tpu.txt b/requirements/tpu.txt index 2d0d8bd8457e3..7bb77c4a99636 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -19,8 +19,8 @@ 
nixl==0.3.0 --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch==2.9.0.dev20250724 -torchvision==0.24.0.dev20250724 -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250724-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250724-cp312-cp312-linux_x86_64.whl ; python_version == "3.12" +torch==2.9.0.dev20250730 +torchvision==0.24.0.dev20250730 +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250730-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250730-cp312-cp312-linux_x86_64.whl ; python_version == "3.12" From 9e0726e5bfd201fa2c9209e3997d24c72ecc3b13 Mon Sep 17 00:00:00 2001 From: zhiweiz Date: Thu, 31 Jul 2025 10:35:07 -0700 Subject: [PATCH 096/224] [Meta] Official Eagle mm support, first enablement on llama4 (#20788) Signed-off-by: morgendave Co-authored-by: Roger Wang --- examples/offline_inference/spec_decode.py | 64 ++++++++++++++++++++-- tests/v1/e2e/test_spec_decode.py | 61 +++++++++++++++------ vllm/model_executor/models/llama4.py | 1 + vllm/model_executor/models/llama4_eagle.py | 35 ++++++++++-- vllm/model_executor/models/llama_eagle.py | 6 ++ vllm/model_executor/models/llama_eagle3.py | 5 ++ vllm/v1/spec_decode/eagle.py | 59 +++++++++++++++++--- vllm/v1/worker/gpu_model_runner.py | 10 +++- 8 files changed, 205 insertions(+), 36 deletions(-) diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index ce735f3b27dfe..184c30891eca7 100644 --- 
a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -13,6 +13,38 @@ except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser +QUESTION = "What is the content of each image?" +IMAGE_URLS = [ + "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg", + "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg", + "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG", + "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg", + "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg", + "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg", + "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg", + "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg", + "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg", + "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg", + "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg", + "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg", +] + + +def get_custom_mm_prompts(num_prompts): + prompts = [] + for url in IMAGE_URLS: + prompts.append( + [ + {"type": "image_url", "image_url": {"url": url}}, + {"type": "text", "text": QUESTION}, + ] + ) + if num_prompts > 
len(IMAGE_URLS): + prompts = prompts * (num_prompts // len(IMAGE_URLS) + 1) + + return [[{"role": "user", "content": prompt}] for prompt in prompts[:num_prompts]] + + def parse_args(): parser = FlexibleArgumentParser() add_dataset_parser(parser) @@ -35,6 +67,7 @@ def parse_args(): parser.add_argument("--output-len", type=int, default=256) parser.add_argument("--model-dir", type=str, default=None) parser.add_argument("--eagle-dir", type=str, default=None) + parser.add_argument("--custom-mm-prompts", action="store_true") return parser.parse_args() @@ -44,14 +77,26 @@ def main(): model_dir = args.model_dir if args.model_dir is None: + if args.custom_mm_prompts: + raise ValueError( + "custom_mm_prompts requires mm based models" + "default llama3.1-8b-instruct is not mm based" + "please specify model_dir to give a mm based model" + ) model_dir = "meta-llama/Llama-3.1-8B-Instruct" tokenizer = AutoTokenizer.from_pretrained(model_dir) + args.custom_skip_chat_template = True - prompts = get_samples(args, tokenizer) - # add_special_tokens is False to avoid adding bos twice when using chat templates - prompt_ids = [ - tokenizer.encode(prompt.prompt, add_special_tokens=False) for prompt in prompts - ] + if not args.custom_mm_prompts: + prompts = get_samples(args, tokenizer) + # add_special_tokens is False to avoid adding bos twice + # when using chat templates + prompt_ids = [ + tokenizer.encode(prompt.prompt, add_special_tokens=False) + for prompt in prompts + ] + else: + prompts = get_custom_mm_prompts(args.num_prompts) if args.method == "eagle" or args.method == "eagle3": eagle_dir = args.eagle_dir @@ -85,10 +130,17 @@ def main(): speculative_config=speculative_config, disable_log_stats=False, max_model_len=16384, + limit_mm_per_prompt={"image": 5}, + disable_chunked_mm_input=True, ) sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len) - outputs = llm.generate(prompt_token_ids=prompt_ids, sampling_params=sampling_params) + if not 
args.custom_mm_prompts: + outputs = llm.generate( + prompt_token_ids=prompt_ids, sampling_params=sampling_params + ) + else: + outputs = llm.chat(prompts, sampling_params=sampling_params) # print the generated text if args.print_output: diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 2423f966acfab..31f25e94c5b4b 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -3,29 +3,34 @@ from __future__ import annotations import random -from typing import Any +from typing import Any, Union import pytest import torch from vllm import LLM, SamplingParams +from vllm.assets.base import VLLM_S3_BUCKET_URL +from vllm.assets.image import VLM_IMAGES_DIR from vllm.distributed import cleanup_dist_env_and_memory -@pytest.fixture -def test_prompts(): +def get_test_prompts(mm_enabled: bool): prompt_types = ["repeat", "sentence"] + if mm_enabled: + prompt_types.append("mm") num_prompts = 100 prompts = [] random.seed(0) random_prompt_type_choices = random.choices(prompt_types, k=num_prompts) + print(f"Prompt types: {random_prompt_type_choices}") # Generate a mixed batch of prompts, some of which can be easily # predicted by n-gram matching and some which likely cannot. for kind in random_prompt_type_choices: word_choices = ["test", "temp", "hello", "where"] word = random.choice(word_choices) + prompt: Union[str, list[dict[str, Any]]] = "" if kind == "repeat": prompt = f""" please repeat the word '{word}' 10 times. @@ -38,6 +43,21 @@ def test_prompts(): uses the word {word} at least once. give no other output than that simple sentence without quotes. 
""" + elif kind == "mm": + placeholders = [{ + "type": "image_url", + "image_url": { + "url": + f"{VLLM_S3_BUCKET_URL}/{VLM_IMAGES_DIR}/stop_sign.jpg" + }, + }] + prompt = [ + *placeholders, + { + "type": "text", + "text": "The meaning of the image is" + }, + ] else: raise ValueError(f"Unknown prompt type: {kind}") prompts.append([{"role": "user", "content": prompt}]) @@ -57,7 +77,6 @@ def model_name(): def test_ngram_correctness( monkeypatch: pytest.MonkeyPatch, - test_prompts: list[list[dict[str, Any]]], sampling_config: SamplingParams, model_name: str, ): @@ -67,6 +86,7 @@ def test_ngram_correctness( ''' with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") + test_prompts = get_test_prompts(mm_enabled=False) ref_llm = LLM(model=model_name, max_model_len=1024) ref_outputs = ref_llm.chat(test_prompts, sampling_config) @@ -103,23 +123,32 @@ def test_ngram_correctness( cleanup_dist_env_and_memory() -@pytest.mark.parametrize("model_setup", [ - ("eagle", "meta-llama/Llama-3.1-8B-Instruct", - "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), - ("eagle3", "meta-llama/Llama-3.1-8B-Instruct", - "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), - pytest.param( - ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", - "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), - marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), -], - ids=["llama3_eagle", "llama3_eagle3", "llama4_eagle"]) +@pytest.mark.parametrize( + ["model_setup", "mm_enabled"], [ + (("eagle", "meta-llama/Llama-3.1-8B-Instruct", + "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False), + (("eagle3", "meta-llama/Llama-3.1-8B-Instruct", + "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False), + pytest.param( + ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), + False, + marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), + pytest.param( + ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct", + 
"morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), + True, + marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), + ], + ids=["llama3_eagle", "llama3_eagle3", "llama4_eagle", "llama4_eagle_mm"]) def test_eagle_correctness( monkeypatch: pytest.MonkeyPatch, - test_prompts: list[list[dict[str, Any]]], sampling_config: SamplingParams, model_setup: tuple[str, str, str, int], + mm_enabled: bool, ): + # Generate test prompts inside the function instead of using fixture + test_prompts = get_test_prompts(mm_enabled) ''' Compare the outputs of a original LLM and a speculative LLM should be the same when using eagle speculative decoding. diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 470e701d98013..60098209c39ac 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -256,6 +256,7 @@ class Llama4DecoderLayer(nn.Module): super().__init__() self.layer_idx = extract_layer_index(prefix) + self.global_layer = config.no_rope_layers[self.layer_idx] == 0 self.hidden_size = config.hidden_size rope_theta = config.rope_theta rope_scaling = config.rope_scaling diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py index 222ab5dfaee4a..ece490ff2f2a8 100644 --- a/vllm/model_executor/models/llama4_eagle.py +++ b/vllm/model_executor/models/llama4_eagle.py @@ -37,8 +37,9 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.llama4 import (Llama4DecoderLayer, Llama4ForCausalLM) from vllm.model_executor.models.utils import extract_layer_index +from vllm.multimodal.inputs import NestedTensors -from .utils import AutoWeightsLoader, maybe_prefix +from .utils import AutoWeightsLoader, maybe_prefix, merge_multimodal_embeddings logger = init_logger(__name__) @@ -78,15 +79,23 @@ class LlamaModel(nn.Module): self.norm = RMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps) + def 
get_input_embeddings( + self, + input_ids: torch.Tensor, + ) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: Optional[torch.Tensor], positions: torch.Tensor, hidden_states: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, torch.Tensor]: - input_embeds = self.embed_tokens(input_ids) + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings(input_ids) hidden_states = self.fc( - torch.cat((input_embeds, hidden_states), dim=-1)) + torch.cat((inputs_embeds, hidden_states), dim=-1)) residual = None for layer in self.layers: hidden_states, residual = layer( @@ -190,8 +199,9 @@ class EagleLlama4ForCausalLM(Llama4ForCausalLM): input_ids: torch.Tensor, positions: torch.Tensor, hidden_states: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, torch.Tensor]: - return self.model(input_ids, positions, hidden_states) + return self.model(input_ids, positions, hidden_states, inputs_embeds) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> None: @@ -212,3 +222,20 @@ class EagleLlama4ForCausalLM(Llama4ForCausalLM): model_weights[name] = loaded_weight loader.load_weights(model_weights.items()) + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.model.get_input_embeddings(input_ids) + + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + multimodal_embeddings, + self.config.image_token_index, + ) + + return inputs_embeds diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index c7690604c1d09..a4933b77e3a53 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import 
Iterable +from typing import Optional import torch import torch.nn as nn @@ -148,7 +149,12 @@ class EagleLlamaForCausalLM(LlamaForCausalLM): input_ids: torch.Tensor, positions: torch.Tensor, hidden_states: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, torch.Tensor]: + if inputs_embeds is not None: + raise NotImplementedError( + f"{type(self).__name__} does not support multimodal inputs yet." + ) return self.model(input_ids, positions, hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 7fc9fe2ebb6f6..71275f0d58579 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -202,7 +202,12 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM): input_ids: torch.Tensor, positions: torch.Tensor, hidden_states: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, torch.Tensor]: + if inputs_embeds is not None: + raise NotImplementedError( + f"{type(self).__name__} does not support multimodal inputs yet." + ) return self.model(input_ids, positions, hidden_states) def compute_logits( diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 63f6fc276189d..302126dbe3d5f 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + import numpy as np import torch import torch.nn as nn @@ -51,6 +53,9 @@ class EagleProposer: # hidden size (e.g., Llama 3.3 70B). 
self.hidden_size = self.draft_model_config.get_hidden_size() + self.is_multimodal_model = vllm_config.model_config \ + .is_multimodal_model + self.use_cuda_graph = (self.vllm_config.compilation_config.level == CompilationLevel.PIECEWISE and not self.vllm_config.model_config.enforce_eager) @@ -76,6 +81,11 @@ class EagleProposer: device=device, dtype=torch.int32) + self.inputs_embeds = torch.zeros( + (self.max_num_tokens, self.hidden_size), + dtype=self.dtype, + device=device) + def propose( self, # [num_tokens] @@ -88,6 +98,7 @@ class EagleProposer: next_token_ids: torch.Tensor, common_attn_metadata: CommonAttentionMetadata, sampling_metadata: SamplingMetadata, + mm_embeds: Optional[list[torch.Tensor]] = None, ) -> torch.Tensor: num_tokens = target_token_ids.shape[0] batch_size = next_token_ids.shape[0] @@ -128,14 +139,27 @@ class EagleProposer: # copy inputs to buffer for cudagraph self.positions[:num_tokens] = target_positions self.hidden_states[:num_tokens] = target_hidden_states + if self.is_multimodal_model: + input_ids = self.input_ids[:num_tokens] + inputs_embeds = self.model.get_input_embeddings( + input_ids, + multimodal_embeddings=mm_embeds or None, + ) + self.inputs_embeds[:num_tokens] = inputs_embeds + inputs_embeds = self.inputs_embeds[:num_input_tokens] + input_ids = None + else: + inputs_embeds = None + input_ids = self.input_ids[:num_input_tokens] with set_forward_context(per_layer_attn_metadata, self.vllm_config, num_tokens=num_input_tokens): ret_hidden_states = self.model( - self.input_ids[:num_input_tokens], - self.positions[:num_input_tokens], - self.hidden_states[:num_input_tokens], + input_ids=input_ids, + positions=self.positions[:num_input_tokens], + hidden_states=self.hidden_states[:num_input_tokens], + inputs_embeds=inputs_embeds, ) if self.method == "deepseek_mtp": last_hidden_states = ret_hidden_states @@ -218,15 +242,24 @@ class EagleProposer: self.input_ids[:batch_size] = input_ids self.positions[:batch_size] = clamped_positions 
self.hidden_states[:batch_size] = hidden_states + if self.is_multimodal_model: + inputs_embeds = self.model.get_input_embeddings(input_ids) + self.inputs_embeds[:batch_size] = inputs_embeds + inputs_embeds = self.inputs_embeds[:input_batch_size] + input_ids = None + else: + inputs_embeds = None + input_ids = self.input_ids[:input_batch_size] # Run the model. with set_forward_context(per_layer_attn_metadata, self.vllm_config, num_tokens=input_batch_size): last_hidden_states, hidden_states = self.model( - self.input_ids[:input_batch_size], - self.positions[:input_batch_size], - self.hidden_states[:input_batch_size], + input_ids=input_ids, + positions=self.positions[:input_batch_size], + hidden_states=self.hidden_states[:input_batch_size], + inputs_embeds=inputs_embeds, ) hidden_states = hidden_states[:batch_size] logits = self.model.compute_logits(last_hidden_states[:batch_size], @@ -391,10 +424,18 @@ class EagleProposer: ) -> None: with set_forward_context(None, self.vllm_config, num_tokens=num_tokens): + if self.is_multimodal_model: + input_ids = None + inputs_embeds = self.inputs_embeds[:num_tokens] + else: + input_ids = self.input_ids[:num_tokens] + inputs_embeds = None + self.model( - self.input_ids[:num_tokens], - self.positions[:num_tokens], - self.hidden_states[:num_tokens], + input_ids=input_ids, + positions=self.positions[:num_tokens], + hidden_states=self.hidden_states[:num_tokens], + inputs_embeds=inputs_embeds, ) def validate_same_kv_cache_group(self, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 987ef22a1b7fb..29cda4d837bf3 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1205,13 +1205,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def _gather_mm_embeddings( self, scheduler_output: "SchedulerOutput", + shift_computed_tokens: int = 0, ) -> list[torch.Tensor]: mm_embeds: list[torch.Tensor] = [] for req_id in self.input_batch.req_ids: 
num_scheduled_tokens = scheduler_output.num_scheduled_tokens[ req_id] req_state = self.requests[req_id] - num_computed_tokens = req_state.num_computed_tokens + num_computed_tokens = \ + req_state.num_computed_tokens + shift_computed_tokens mm_positions = req_state.mm_positions for i, pos_info in enumerate(mm_positions): start_pos = pos_info.offset @@ -1858,6 +1860,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): [h[token_indices] for h in aux_hidden_states], dim=-1) else: target_hidden_states = hidden_states[token_indices] + mm_embeds = None + if self.is_multimodal_model: + mm_embeds = self._gather_mm_embeddings(scheduler_output, + shift_computed_tokens=1) + draft_token_ids = self.drafter.propose( target_token_ids=target_token_ids, target_positions=target_positions, @@ -1865,6 +1872,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): next_token_ids=next_token_ids, sampling_metadata=sampling_metadata, common_attn_metadata=common_attn_metadata, + mm_embeds=mm_embeds, ) spec_token_ids = draft_token_ids.tolist() return spec_token_ids From 71470bc4afdab89eccc232b668a69571ffede1dc Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Thu, 31 Jul 2025 11:39:16 -0700 Subject: [PATCH 097/224] [Misc] Add unit tests for chunked local attention (#21692) Signed-off-by: Yong Hoon Shin --- .../attention/test_chunked_local_attention.py | 196 ++++++++++++++++++ tests/v1/attention/utils.py | 36 ++-- 2 files changed, 219 insertions(+), 13 deletions(-) create mode 100644 tests/v1/attention/test_chunked_local_attention.py diff --git a/tests/v1/attention/test_chunked_local_attention.py b/tests/v1/attention/test_chunked_local_attention.py new file mode 100644 index 0000000000000..8c5a63653db9f --- /dev/null +++ b/tests/v1/attention/test_chunked_local_attention.py @@ -0,0 +1,196 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project 
+from dataclasses import dataclass + +import numpy as np +import pytest +import torch + +from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata +from vllm.v1.attention.backends.utils import ( + make_local_attention_virtual_batches) + + +@dataclass +class LocalAttentionTestData: + # Input parameters + batch_spec: BatchSpec + attn_chunk_size: int + block_size: int + # Expected return values + expected_q_seqlens: list[int] + expected_k_seqlens: list[int] + expected_local_block_table: list[list[int]] + + +test_data_list = [ + # Same as example in docstring of make_local_attention_virtual_batches + # except block table has 9 columns instead of 10 + LocalAttentionTestData( + batch_spec=BatchSpec( + query_lens=[4, 10, 5], + seq_lens=[6, 17, 9], + ), + attn_chunk_size=4, + block_size=2, + expected_q_seqlens=[2, 2, 1, 4, 4, 1, 4, 1], + expected_k_seqlens=[4, 2, 4, 4, 4, 1, 4, 1], + # 2 pages per local branch + # (chunk size 4 // block size 2) + expected_local_block_table=[ + [0, 1], # local-batch 0, (batch 0, starting from k[0]) + [2, 3], # local-batch 1, (batch 0, starting from k[4]) + [11, 12], # local-batch 2, (batch 1, starting from k[4]) + [13, 14], # local-batch 3, (batch 1, starting from k[8]) + [15, 16], # local-batch 4, (batch 1, starting from k[12]) + [17, 17], # local-batch 5, (batch 1, starting from k[16]) + [20, 21], # local-batch 6, (batch 2, starting from k[4]) + [22, 23], # local-batch 7, (batch 2, starting from k[8]) + ]), + # Case where block indices are not clipped to block table ncols-1 + # because tokens_in_last_block == attn_chunk_size + LocalAttentionTestData(batch_spec=BatchSpec( + query_lens=[8], + seq_lens=[12], + ), + attn_chunk_size=4, + block_size=2, + expected_q_seqlens=[4, 4], + expected_k_seqlens=[4, 4], + expected_local_block_table=[ + [2, 3], + [4, 5], + ]), + # Case where all kv_seq positions are involved in attn + LocalAttentionTestData( + batch_spec=BatchSpec( + query_lens=[7], + # 10 - 7 = 3 previously computed 
tokens + seq_lens=[10], + ), + attn_chunk_size=4, + block_size=2, + expected_q_seqlens=[1, 4, 2], + expected_k_seqlens=[4, 4, 2], + expected_local_block_table=[ + [0, 1], + [2, 3], + [4, 4], + ]), + # Case where attn_chunk_size > kv_seq_len + # so no extra mini virtual batches are created + LocalAttentionTestData( + batch_spec=BatchSpec( + query_lens=[4], + seq_lens=[6], + ), + # Larger than kv_seq_len + attn_chunk_size=10, + block_size=2, + # No change to q_seqlens and k_seqlens + expected_q_seqlens=[4], + expected_k_seqlens=[6], + # In this case, we only need a block-table like: + # block_table = [ [0, 1, 2] ] # 1 batch, 3 pages + # But we need to pad it to 5 pages per local batch + # because currently the pages_per_local_batch + # is calculated as (attn_chunk_size // block_size) + expected_local_block_table=[ + [0, 1, 2, 2, 2], + ]), + # Block size equal to chunk size + # Expect single page per batch in local batch table + LocalAttentionTestData( + batch_spec=BatchSpec( + query_lens=[6, 6], + seq_lens=[8, 8], + ), + attn_chunk_size=4, + block_size=4, + expected_q_seqlens=[2, 4, 2, 4], + expected_k_seqlens=[4, 4, 4, 4], + # Initial block table = [ + # [0, 1], < batch 0 + # [2, 3], < batch 1 + # ] + expected_local_block_table=[ + [0], # local-batch 0, (batch 0, starting from k[0]) + [1], # local-batch 1, (batch 0, starting from k[4]) + [2], # local-batch 1, (batch 0, starting from k[0]) + [3], # local-batch 1, (batch 0, starting from k[4]) + ]), + # Case where query falls in the second attention chunk + # k_toks > 0 1 2 3 4 + # q_toks v _____________ + # 0 | 1 + # 1 | 1 1 + # 2 | 1 1 1 + # 3 | 1 1 1 1 + # 4 | 1 + # where tokens 0,1,2,3 have been pre-computed + LocalAttentionTestData(batch_spec=BatchSpec( + query_lens=[1], + seq_lens=[5], + ), + attn_chunk_size=4, + block_size=2, + expected_q_seqlens=[1], + expected_k_seqlens=[1], + expected_local_block_table=[ + [2, 2], + ]), +] + + +@pytest.mark.parametrize("test_data", test_data_list) +def 
test_local_attention_virtual_batches(test_data: LocalAttentionTestData): + device = torch.device("cuda:0") + batch_spec = test_data.batch_spec + attn_chunk_size = test_data.attn_chunk_size + block_size = test_data.block_size + expected_q_seqlens = test_data.expected_q_seqlens + expected_k_seqlens = test_data.expected_k_seqlens + expected_local_block_table = test_data.expected_local_block_table + + # Create common attention metadata + common_attn_metadata = create_common_attn_metadata( + batch_spec, + block_size, + device, + # Use torch.arange instead of torch.randint so we can assert on + # block table tensor values. The block table will have shape + # (num_batches, cdiv(max_seq_len, block_size)) and the values will be + # aranged from 0 to cdiv(max_seq_len, block_size)-1 + arange_block_indices=True, + ) + + # Call the function + result = make_local_attention_virtual_batches(attn_chunk_size, + common_attn_metadata, + block_size) + + # Convert to numpy for easier comparison + actual_q_seqlens = np.diff(result.query_start_loc_cpu.numpy()) + actual_k_seqlens = result.seq_lens_cpu.numpy() + + # Check that all query lengths are less than or equal to attn_chunk_size + assert all(q_len <= attn_chunk_size for q_len in actual_q_seqlens) + # Check that all key lengths are less than or equal to attn_chunk_size + assert all(k_len <= attn_chunk_size for k_len in actual_k_seqlens) + # Check that the total number of query tokens is preserved + assert sum(actual_q_seqlens) == sum(batch_spec.query_lens) + + # Verify results + np.testing.assert_array_equal(actual_q_seqlens, expected_q_seqlens) + np.testing.assert_array_equal(actual_k_seqlens, expected_k_seqlens) + + expected_block_table_tensor =\ + torch.tensor(expected_local_block_table, + dtype=torch.int32, + device=device) + + print(f"Expected block table:\n{expected_block_table_tensor}") + print(f"Actual block table:\n{result.block_table_tensor}") + + torch.testing.assert_close(result.block_table_tensor, + 
expected_block_table_tensor) diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index ae2ab6e6413c0..be6cfce6fba8a 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -40,7 +40,8 @@ def create_common_attn_metadata( batch_spec: BatchSpec, block_size: int, device: torch.device, - max_block_idx: int = 1000) -> CommonAttentionMetadata: + max_block_idx: int = 1000, + arange_block_indices: bool = False) -> CommonAttentionMetadata: """Create CommonAttentionMetadata from a BatchSpec and ModelParams.""" # Create query start locations query_start_loc = torch.zeros(batch_spec.batch_size + 1, @@ -65,19 +66,28 @@ def create_common_attn_metadata( ] num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32) - # Create block table (random for testing) + # Create block table and slot mapping max_blocks = (max(batch_spec.seq_lens) + block_size - 1) // block_size - block_table_tensor = torch.randint(0, - max_block_idx, - (batch_spec.batch_size, max_blocks), - dtype=torch.int32, - device=device) - - # Create slot mapping - slot_mapping = torch.randint(0, - max_block_idx, (num_tokens, ), - dtype=torch.int64, - device=device) + if arange_block_indices: + num_blocks = batch_spec.batch_size * max_blocks + block_table_tensor = torch.arange(num_blocks, + dtype=torch.int32, + device=device).view( + batch_spec.batch_size, + max_blocks) + slot_mapping = torch.arange(num_tokens, + dtype=torch.int64, + device=device).view(num_tokens) + else: + block_table_tensor = torch.randint(0, + max_block_idx, + (batch_spec.batch_size, max_blocks), + dtype=torch.int32, + device=device) + slot_mapping = torch.randint(0, + max_block_idx, (num_tokens, ), + dtype=torch.int64, + device=device) # Calculate max query length max_query_len = max(batch_spec.query_lens) From 2dff2e21d928129e985b23897e9f326abe3f1417 Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Thu, 31 Jul 2025 16:33:53 -0400 Subject: [PATCH 098/224] [Bugfix] Fix MTP weight loading 
(#21941) --- vllm/model_executor/models/deepseek_mtp.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index 911f0036c2dd6..2e026d582a6de 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -182,6 +182,8 @@ class DeepSeekMTP(nn.Module, SupportsPP): stacked_params_mapping = [ ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), + ("fused_qkv_a_proj", "q_a_proj", 0), + ("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1), ] expert_params_mapping = FusedMoE.make_expert_params_mapping( @@ -212,6 +214,13 @@ class DeepSeekMTP(nn.Module, SupportsPP): if (("mlp.experts." in name) and name not in params_dict): continue name = name.replace(weight_name, param_name) + + # QKV fusion is optional, fall back to normal + # weight loading if it's not enabled + if ((param_name == "fused_qkv_a_proj") + and name not in params_dict): + continue + # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: continue From 6e672daf62e7b03ff1dcf74e4206dad07d39d4ec Mon Sep 17 00:00:00 2001 From: Ilya Markov Date: Thu, 31 Jul 2025 22:58:38 +0200 Subject: [PATCH 099/224] Add FlashInfer allreduce RMSNorm Quant fusion (#21069) Signed-off-by: ilmarkov Signed-off-by: ilmarkov Co-authored-by: ilmarkov --- .buildkite/test-pipeline.yaml | 1 + tests/compile/test_fusion_all_reduce.py | 126 +++++- tests/utils.py | 12 + vllm/compilation/collective_fusion.py | 533 ++++++++++++++++++++++-- vllm/config.py | 2 +- 5 files changed, 606 insertions(+), 68 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a7fe200559305..2f6cc45be77e6 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -353,6 +353,7 @@ steps: - pytest -v -s compile/test_silu_mul_quant_fusion.py - pytest -v -s compile/test_sequence_parallelism.py - pytest -v -s compile/test_async_tp.py + - pytest -v -s compile/test_fusion_all_reduce.py - label: PyTorch Fullgraph Smoke Test # 9min mirror_hardwares: [amdexperimental] diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py index b8d64247f6beb..b394e0035c689 100644 --- a/tests/compile/test_fusion_all_reduce.py +++ b/tests/compile/test_fusion_all_reduce.py @@ -7,22 +7,26 @@ import torch import vllm.envs as envs from vllm.compilation.collective_fusion import AllReduceFusionPass +from vllm.compilation.fix_functionalization import FixFunctionalizationPass +from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.config import (CompilationConfig, CompilationLevel, DeviceConfig, ModelConfig, PassConfig, VllmConfig) from vllm.distributed import tensor_model_parallel_all_reduce from vllm.distributed.parallel_state import (init_distributed_environment, initialize_model_parallel) from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + 
GroupShape, QuantFP8) from vllm.platforms import current_platform from vllm.utils import update_environment_variables -from ..utils import multi_gpu_test +from ..utils import has_module_attribute, multi_gpu_test from .backend import TestBackend class TestAllReduceRMSNormModel(torch.nn.Module): - def __init__(self, hidden_size=16, eps=1e-6): + def __init__(self, hidden_size=16, token_num=16, eps=1e-6): super().__init__() self.hidden_size = hidden_size self.eps = eps @@ -43,7 +47,7 @@ class TestAllReduceRMSNormModel(torch.nn.Module): class TestAllReduceFusedAddRMSNormModel(torch.nn.Module): - def __init__(self, hidden_size=16, eps=1e-6): + def __init__(self, hidden_size=16, token_num=16, eps=1e-6): super().__init__() self.hidden_size = hidden_size self.eps = eps @@ -62,24 +66,101 @@ class TestAllReduceFusedAddRMSNormModel(torch.nn.Module): return [torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default] +class TestAllReduceFusedAddRMSNormStaticQuantFP8Model(torch.nn.Module): + + def __init__(self, hidden_size=16, token_num=16, eps=1e-6): + super().__init__() + self.hidden_size = hidden_size + self.eps = eps + self.norm = RMSNorm(hidden_size, eps) + self.quant_fp8 = QuantFP8(static=True, + group_shape=GroupShape.PER_TENSOR) + self.scale = torch.rand(1, dtype=torch.float32) + self.output = torch.empty((token_num, hidden_size), + dtype=torch.float32) + + def forward(self, hidden_states, residual): + view = hidden_states.reshape(-1, self.hidden_size) + all_reduce = tensor_model_parallel_all_reduce(view) + norm_output, residual_output = self.norm(all_reduce, residual) + torch.ops._C.static_scaled_fp8_quant(self.output, + norm_output.contiguous(), + self.scale) + return self.output, residual_output + + def ops_in_model_after(self): + return [torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default] + + def ops_in_model_before(self): + return [ + torch.ops.vllm.all_reduce.default, + torch.ops._C.static_scaled_fp8_quant.default + ] + + +class 
TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module): + + def __init__(self, hidden_size=16, token_num=16, eps=1e-6): + super().__init__() + self.hidden_size = hidden_size + self.eps = eps + self.norm = RMSNorm(hidden_size, eps) + self.scale = torch.rand(1, dtype=torch.float32) + self.output = torch.empty((token_num, hidden_size), + dtype=torch.float32) + + round_up = lambda x, y: (x + y - 1) // y * y + rounded_m = round_up(token_num, 128) + scale_n = hidden_size // 16 + rounded_n = round_up(scale_n, 4) + self.output_scale = torch.empty((rounded_m, rounded_n // 4), + dtype=torch.int32) + + def forward(self, hidden_states, residual): + view = hidden_states.reshape(-1, self.hidden_size) + all_reduce = tensor_model_parallel_all_reduce(view) + norm_output, residual_output = self.norm(all_reduce, residual) + norm_output = norm_output.reshape(-1, norm_output.shape[-1]) + torch.ops._C.scaled_fp4_quant(self.output, norm_output, + self.output_scale, self.scale) + return self.output, residual_output, self.output_scale + + def ops_in_model_after(self): + return [torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default] + + def ops_in_model_before(self): + return [ + torch.ops.vllm.all_reduce.default, + torch.ops._C.scaled_fp4_quant.default + ] + + @multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize( - "test_model", - [TestAllReduceRMSNormModel, TestAllReduceFusedAddRMSNormModel]) +@pytest.mark.parametrize("test_model", [ + TestAllReduceRMSNormModel, + TestAllReduceFusedAddRMSNormModel, + TestAllReduceFusedAddRMSNormStaticQuantFP8Model, + TestAllReduceFusedAddRMSNormStaticQuantFP4Model, +]) @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seq_len", [8]) -@pytest.mark.parametrize("hidden_size", [4096]) +@pytest.mark.parametrize("hidden_size", [16]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA") -@pytest.mark.skipif(not 
find_spec("flashinfer"), - reason="flashinfer is not installed") -@pytest.mark.skipif(not current_platform.is_device_capability(100), - reason="Only test on SM100") +@pytest.mark.skipif( + not find_spec("flashinfer") + or not has_module_attribute("flashinfer.comm", "trtllm_allreduce_fusion"), + reason="flashinfer is not found or flashinfer " + "is not compiled with trtllm_allreduce_fusion") def test_all_reduce_fusion_pass_replace(test_model: torch.nn.Module, batch_size: int, seq_len: int, hidden_size: int, dtype: torch.dtype): num_processes = 2 + if (test_model == TestAllReduceFusedAddRMSNormStaticQuantFP4Model + and not current_platform.has_device_capability(100)): + pytest.skip("Skip as nvfp4 is only supported on " + "devices with compute capability 10.0 (Blackwell)") def run_torch_spawn(fn, nprocs): torch.multiprocessing.spawn(fn, @@ -113,12 +194,11 @@ def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int, init_distributed_environment() initialize_model_parallel(tensor_model_parallel_size=world_size) - vllm_config = VllmConfig( - compilation_config=CompilationConfig(level=CompilationLevel.PIECEWISE, - custom_ops=["+rms_norm"], - compile_sizes=[2, 4, 8])) + vllm_config = VllmConfig(compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + custom_ops=["+rms_norm", "+quant_fp8"])) vllm_config.compilation_config.pass_config = PassConfig( - enable_fi_allreduce_fusion=True) + enable_fi_allreduce_fusion=True, enable_noop=True) vllm_config.device_config = DeviceConfig(device=torch.device("cuda")) # this is a fake model name to construct the model config @@ -130,14 +210,16 @@ def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int, seed=42) all_reduce_fusion_pass = AllReduceFusionPass(vllm_config) - backend = TestBackend(all_reduce_fusion_pass) + noop_pass = NoOpEliminationPass(vllm_config) + func_pass = FixFunctionalizationPass(vllm_config) - model = test_model_cls(hidden_size) + backend = 
TestBackend(all_reduce_fusion_pass, noop_pass, func_pass) - hidden_states = torch.randn((batch_size * seq_len, hidden_size), - requires_grad=False) - residual = torch.randn((batch_size * seq_len, hidden_size), - requires_grad=False) + token_num = batch_size * seq_len + model = test_model_cls(hidden_size, token_num) + + hidden_states = torch.randn((token_num, hidden_size), requires_grad=False) + residual = torch.randn((token_num, hidden_size), requires_grad=False) compiled_model = torch.compile(model, backend=backend) compiled_model(hidden_states, residual) diff --git a/tests/utils.py b/tests/utils.py index f4317e6bdb406..1c1a1cc6014ec 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -4,6 +4,7 @@ import asyncio import copy import functools +import importlib import os import signal import subprocess @@ -974,3 +975,14 @@ def get_client_text_logprob_generations( return [(text_generations, text, (None if x.logprobs is None else x.logprobs.top_logprobs)) for completion in completions for x in completion.choices] + + +def has_module_attribute(module_name, attribute_name): + """ + Helper function to check if a module has a specific attribute. 
+ """ + try: + module = importlib.import_module(module_name) + return hasattr(module, attribute_name) + except ImportError: + return False diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index cb99fe8310e73..6ae50245ed3a8 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -37,6 +37,8 @@ logger = init_logger(__name__) ALLREDUCE_OP = torch.ops.vllm.all_reduce.default RMS_OP = torch.ops._C.rms_norm.default RMS_ADD_OP = torch.ops._C.fused_add_rms_norm.default +STATIC_FP8_QUANT_OP = torch.ops._C.static_scaled_fp8_quant.default +STATIC_FP4_QUANT_OP = torch.ops._C.scaled_fp4_quant.default class BasePattern: @@ -394,7 +396,7 @@ if flashinfer_comm is not None: # Max size of the input tensor per world size # to use flashinfer fused allreduce _FI_MAX_SIZES = { - 2: MiB, # 1MB + 2: 64 * MiB, # 64MB 4: MiB, # 1MB 6: MiB // 2, # 512KB 8: MiB // 2, # 512KB @@ -414,9 +416,13 @@ if flashinfer_comm is not None: trigger_completion_at_end: bool, fp32_acc: bool, max_token_num: int, + pattern_code: int, + fuse_rms_quant: bool, norm_out: Optional[torch.Tensor] = None, + quant_out: Optional[torch.Tensor] = None, + scale_out: Optional[torch.Tensor] = None, + scale_factor: Optional[torch.Tensor] = None, ) -> None: - num_tokens, hidden_size = allreduce_in.shape element_size = allreduce_in.element_size() current_tensor_size = num_tokens * hidden_size * element_size @@ -425,7 +431,6 @@ if flashinfer_comm is not None: _FI_MAX_SIZES.get(world_size, _DEFAULT_FI_MAX_SIZE), max_fusion_size, ) - if use_flashinfer: assert (_FI_WORKSPACE_TENSOR is not None ), "Flashinfer must be enabled when using flashinfer" @@ -455,37 +460,65 @@ if flashinfer_comm is not None: use_oneshot=True, trigger_completion_at_end=trigger_completion_at_end, fp32_acc=fp32_acc, - pattern_code=flashinfer_comm.AllReduceFusionPattern. 
- kARResidualRMSNorm, + pattern_code=pattern_code, allreduce_out=None, - quant_out=None, - scale_out=None, - layout_code=None, - scale_factor=None, + quant_out=quant_out, + scale_out=scale_out, + # in vllm we only support swizzled layout + layout_code=flashinfer_comm.FP4QuantizationSFLayout.SWIZZLED, + scale_factor=scale_factor, ) else: allreduce_out = tensor_model_parallel_all_reduce(allreduce_in) - if norm_out is None: - torch.ops._C.fused_add_rms_norm(allreduce_out, residual, - rms_gamma, rms_eps) + if (scale_factor is not None and scale_out is None + and fuse_rms_quant): + # Do fused rms norm static fp8 quant fused op + if norm_out is None: + torch.ops._C.fused_add_rms_norm_static_fp8_quant( + quant_out, allreduce_out, residual, rms_gamma, + scale_factor, rms_eps) + else: + torch.ops._C.rms_norm_static_fp8_quant( + quant_out, allreduce_out, rms_gamma, scale_factor, + rms_eps) else: - torch.ops._C.rms_norm(norm_out, allreduce_out, rms_gamma, - rms_eps) - allreduce_in.copy_(allreduce_out) + if norm_out is None: + torch.ops._C.fused_add_rms_norm(allreduce_out, residual, + rms_gamma, rms_eps) + norm_out = allreduce_out + else: + torch.ops._C.rms_norm(norm_out, allreduce_out, rms_gamma, + rms_eps) + if scale_factor is not None: + if scale_out is not None: + torch.ops._C.scaled_fp4_quant(quant_out, norm_out, + scale_out, scale_factor) + else: + torch.ops._C.static_scaled_fp8_quant( + quant_out, norm_out, scale_factor) + if scale_factor is None or norm_out is not None: + # we need to return allreduce outpput + # in cases of non quant fused AR + RMS norm + # and fused AR + RMS norm + quant without fused add + allreduce_in.copy_(allreduce_out) def call_trtllm_fused_allreduce_norm_fake( - allreduce_in: torch.Tensor, - residual: torch.Tensor, - rms_gamma: torch.Tensor, - rms_eps: float, - world_rank: int, - world_size: int, - launch_with_pdl: bool, - trigger_completion_at_end: bool, - fp32_acc: bool, - max_token_num: int, - norm_out: Optional[torch.Tensor] = None, - ) -> 
None: + allreduce_in: torch.Tensor, + residual: torch.Tensor, + rms_gamma: torch.Tensor, + rms_eps: float, + world_rank: int, + world_size: int, + launch_with_pdl: bool, + trigger_completion_at_end: bool, + fp32_acc: bool, + max_token_num: int, + pattern_code: int, + fuse_rms_quant: bool, + norm_out: Optional[torch.Tensor] = None, + quant_out: Optional[torch.Tensor] = None, + scale_out: Optional[torch.Tensor] = None, + scale_factor: Optional[torch.Tensor] = None) -> None: pass direct_register_custom_op( @@ -495,6 +528,8 @@ if flashinfer_comm is not None: "allreduce_in", "residual", "norm_out", + "quant_out", + "scale_out", ], fake_impl=call_trtllm_fused_allreduce_norm_fake, dispatch_key=current_platform.dispatch_key, @@ -512,6 +547,7 @@ class FlashInferFusedAllReduceParams: world_size: int, use_fp32_lamport: bool = False, max_token_num: int = 1024, + fuse_rms_quant: bool = False, ): self.rank = rank self.world_size = world_size @@ -521,6 +557,7 @@ class FlashInferFusedAllReduceParams: self.fp32_acc = True self.use_oneshot = False self.max_token_num = max_token_num + self.fuse_rms_quant = fuse_rms_quant def get_trtllm_fused_allreduce_kwargs(self): return { @@ -530,10 +567,16 @@ class FlashInferFusedAllReduceParams: "trigger_completion_at_end": self.trigger_completion_at_end, "fp32_acc": self.fp32_acc, "max_token_num": self.max_token_num, + "fuse_rms_quant": self.fuse_rms_quant, } -class AllReduceRMSNORMPattern(BasePattern): +class AllReduceRMSNormPattern(BasePattern): + """ + This pattern replaces the allreduce + rms norm (without residual) + with fused flashinfer implementation. + Applies to allreduce + rmsnorm before attn in the first Transformer block. 
+ """ def __init__( self, @@ -559,29 +602,34 @@ class AllReduceRMSNORMPattern(BasePattern): def pattern(input: torch.Tensor, rms_result: torch.Tensor, weight: torch.Tensor): - all_reduce_output = tensor_model_parallel_all_reduce(input) + allreduce_output = tensor_model_parallel_all_reduce(input) rms = auto_functionalized( RMS_OP, result=rms_result, - input=all_reduce_output, + input=allreduce_output, weight=weight, epsilon=self.epsilon, ) - return rms[1], all_reduce_output + # rms_result, allreduce_output + return rms[1], allreduce_output def replacement(input: torch.Tensor, rms_result: torch.Tensor, weight: torch.Tensor): residual = torch.zeros_like(input) allreduce = auto_functionalized( - torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default, + flashinfer_trtllm_fused_allreduce_norm, allreduce_in=input, residual=residual, norm_out=rms_result, + quant_out=None, + scale_out=None, rms_gamma=weight, rms_eps=self.epsilon, + pattern_code=flashinfer_comm.AllReduceFusionPattern. + kARResidualRMSNorm, **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(), ) - + # rms_result, allreduce_in return allreduce[3], allreduce[1] pm.register_replacement(pattern, replacement, self.get_inputs(), @@ -589,6 +637,11 @@ class AllReduceRMSNORMPattern(BasePattern): class AllReduceFusedAddRMSNormPattern(BasePattern): + """ + This pattern replaces the allreduce + rms norm (with residual) + with fused flashinfer implementation. + Applies to o_proj + rmsnorm after attn and mlp + rmsnorm before attn. 
+ """ def __init__( self, @@ -615,33 +668,390 @@ class AllReduceFusedAddRMSNormPattern(BasePattern): def pattern(residual: torch.Tensor, input: torch.Tensor, weight: torch.Tensor): - all_reduce_output = tensor_model_parallel_all_reduce(input) + allreduce_output = tensor_model_parallel_all_reduce(input) rms = auto_functionalized( RMS_ADD_OP, - input=all_reduce_output, + input=allreduce_output, residual=residual, weight=weight, epsilon=self.epsilon, ) + # input, residual return rms[1], rms[2] def replacement(residual: torch.Tensor, input: torch.Tensor, weight: torch.Tensor): allreduce = auto_functionalized( - torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default, + flashinfer_trtllm_fused_allreduce_norm, allreduce_in=input, residual=residual, + norm_out=None, + quant_out=None, + scale_out=None, rms_gamma=weight, rms_eps=self.epsilon, - norm_out=None, + pattern_code=flashinfer_comm.AllReduceFusionPattern. + kARResidualRMSNorm, **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(), ) + # allreduce_in, residual return allreduce[1], allreduce[2] pm.register_replacement(pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass) +class AllReduceFusedRMSNormStaticQuantFP8Pattern(BasePattern): + """ + This pattern replaces the allreduce + rms norm (without residual) + + static fp8 quant with fused flashinfer implementation. + Applies to allreduce + rmsnorm + quant before attn + in the first Transformer block. 
+ """ + + def __init__(self, epsilon: float, dtype: torch.dtype, device: str, + allreduce_params: FlashInferFusedAllReduceParams): + super().__init__(dtype, device) + self.epsilon = epsilon + self.allreduce_params = allreduce_params + self.quant_dtype = torch.float8_e4m3fn + + def register(self, pm_pass: PatternMatcherPass): + + def get_inputs(): + input = torch.zeros([1, 8, 4], + device=self.device, + dtype=self.dtype) + rmsnorm_result = torch.empty([1, 8, 4], + device=self.device, + dtype=self.dtype) + quant_result = torch.empty([1, 8, 4], + device=self.device, + dtype=self.quant_dtype) + weight = torch.empty([4], device=self.device, dtype=self.dtype) + scale = torch.tensor(1.0, device=self.device, dtype=torch.float32) + return [input, rmsnorm_result, quant_result, weight, scale] + + def pattern( + input: torch.Tensor, + rmsnorm_result: torch.Tensor, + quant_result: torch.Tensor, + weight: torch.Tensor, + scale: torch.Tensor, + ): + all_reduce = tensor_model_parallel_all_reduce(input) + rmsnorm_out_tuple = auto_functionalized(RMS_OP, + result=rmsnorm_result, + input=all_reduce, + weight=weight, + epsilon=self.epsilon) + + quant_out_tuple = auto_functionalized(STATIC_FP8_QUANT_OP, + result=quant_result, + input=rmsnorm_out_tuple[1], + scale=scale) + + # quant_out, allreduce_output + return quant_out_tuple[1], all_reduce + + def replacement(input: torch.Tensor, result_rms: torch.Tensor, + quant_result: torch.Tensor, weight: torch.Tensor, + scale: torch.Tensor): + residual = torch.zeros_like(input) + allreduce = auto_functionalized( + flashinfer_trtllm_fused_allreduce_norm, + allreduce_in=input, + residual=residual, + norm_out=result_rms, + quant_out=quant_result, + scale_out=None, + rms_gamma=weight, + rms_eps=self.epsilon, + pattern_code=flashinfer_comm.AllReduceFusionPattern. 
+ kARResidualRMSNormFP8Quant, # we don't use norm_out afterwards + scale_factor=scale, + **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(), + ) + + # quant_out, allreduce_output + return allreduce[4], allreduce[1] + + pm.register_replacement(pattern, replacement, get_inputs(), + pm.fwd_only, pm_pass) + + +class AllReduceFusedAddRMSNormStaticQuantFP8Pattern(BasePattern): + """ + This pattern replaces the allreduce + rms norm (with residual) + + static fp8 quant with fused flashinfer implementation. + Applies to o_proj + rmsnorm after attn + quant and + mlp + rmsnorm + quant before attn. + """ + + def __init__(self, epsilon: float, dtype: torch.dtype, device: str, + allreduce_params: FlashInferFusedAllReduceParams): + super().__init__(dtype, device) + self.epsilon = epsilon + self.allreduce_params = allreduce_params + self.quant_dtype = torch.float8_e4m3fn + + def register(self, pm_pass: PatternMatcherPass): + + def get_inputs(): + input = torch.empty([4, 4], device=self.device, dtype=self.dtype) + + residual = torch.empty([4, 4], + device=self.device, + dtype=self.dtype) + weight = torch.empty([4, 4], device=self.device, dtype=self.dtype) + quant_result = torch.empty([4, 4], + device=self.device, + dtype=self.quant_dtype) + scale = torch.empty([1, 1], + device=self.device, + dtype=torch.float32) + + return [ + quant_result, + residual, + input, + weight, + scale, + ] + + def pattern( + quant_result: torch.Tensor, + residual: torch.Tensor, + input: torch.Tensor, + weight: torch.Tensor, + scale: torch.Tensor, + ): + allreduce_output = tensor_model_parallel_all_reduce(input) + + fused_add_rmsnorm_out_tuple = \ + auto_functionalized( + RMS_ADD_OP, + input=allreduce_output, + residual=residual, + weight=weight, + epsilon=self.epsilon) + quant_out_tuple = auto_functionalized( + STATIC_FP8_QUANT_OP, + result=quant_result, + input=fused_add_rmsnorm_out_tuple[1], + scale=scale) + + # quant_out, allreduce_output + return quant_out_tuple[1], 
fused_add_rmsnorm_out_tuple[2] + + def replacement(quant_result: torch.Tensor, residual: torch.Tensor, + input: torch.Tensor, weight: torch.Tensor, + scale: torch.Tensor): + allreduce = auto_functionalized( + flashinfer_trtllm_fused_allreduce_norm, + allreduce_in=input, + residual=residual, + norm_out=None, + quant_out=quant_result, + scale_out=None, + rms_gamma=weight, + rms_eps=self.epsilon, + pattern_code=flashinfer_comm.AllReduceFusionPattern. + kARResidualRMSNormFP8Quant, # we don't use norm_out afterwards + scale_factor=scale, + **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(), + ) + # # quant_out, rms_norm_residual + return allreduce[4], allreduce[2] + + pm.register_replacement(pattern, replacement, get_inputs(), + pm.fwd_only, pm_pass) + + +class AllReduceFusedRMSNormStaticQuantNVFP4Pattern(BasePattern): + """ + This pattern replaces the allreduce + rms norm (without residual) + + static nvfp4 quant with fused flashinfer implementation. + Applies to allreduce + rmsnorm + quant before attn + in the first Transformer block. 
+ """ + + def __init__(self, epsilon: float, dtype: torch.dtype, device: str, + allreduce_params: FlashInferFusedAllReduceParams): + super().__init__(dtype, device) + self.epsilon = epsilon + self.allreduce_params = allreduce_params + + def register(self, pm_pass: PatternMatcherPass): + + def get_inputs(): + input = torch.empty([1, 16, 16], + device=self.device, + dtype=self.dtype) + + rmsnorm_result = torch.empty([1, 16, 16], + device=self.device, + dtype=self.dtype) + quant_result = torch.empty((16, 8), + device=self.device, + dtype=torch.uint8) + input_global_scale = torch.empty([1, 1], + device=self.device, + dtype=torch.float32) + weight = torch.empty([16], device=self.device, dtype=self.dtype) + output_scale = torch.empty([128, 4], + device=self.device, + dtype=torch.int32) + + return [ + input, rmsnorm_result, quant_result, weight, + input_global_scale, output_scale + ] + + def pattern( + input: torch.Tensor, + rmsnorm_result: torch.Tensor, + quant_result: torch.Tensor, + weight: torch.Tensor, + input_global_scale: torch.Tensor, + output_scale: torch.Tensor, + ): + all_reduce = tensor_model_parallel_all_reduce(input) + rmsnorm_out_tuple = auto_functionalized(RMS_OP, + result=rmsnorm_result, + input=all_reduce, + weight=weight, + epsilon=self.epsilon) + + quant_out_tuple = auto_functionalized( + STATIC_FP4_QUANT_OP, + output=quant_result, + input=rmsnorm_out_tuple[1], + output_scale=output_scale, + input_scale=input_global_scale) + + # quant_out, allreduce_output, output_scale + return quant_out_tuple[1], all_reduce, quant_out_tuple[2] + + def replacement(input: torch.Tensor, result_rms: torch.Tensor, + quant_result: torch.Tensor, weight: torch.Tensor, + input_global_scale: torch.Tensor, + output_scale: torch.Tensor): + residual = torch.zeros_like(input) + allreduce = auto_functionalized( + flashinfer_trtllm_fused_allreduce_norm, + allreduce_in=input, + residual=residual, + norm_out=result_rms, + quant_out=quant_result, + scale_out=output_scale, + 
rms_gamma=weight, + rms_eps=self.epsilon, + pattern_code=flashinfer_comm.AllReduceFusionPattern. + kARResidualRMSNormFP4Quant, # we don't use norm_out afterwards + scale_factor=input_global_scale, + **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(), + ) + + # quant_out, allreduce_output, output_scale + return allreduce[4], allreduce[1], allreduce[5] + + pm.register_replacement(pattern, replacement, get_inputs(), + pm.fwd_only, pm_pass) + + +class AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern(BasePattern): + """ + This pattern replaces the allreduce + rms norm (with residual) + + static nvfp4 quant with fused flashinfer implementation. + Applies to o_proj + rmsnorm after attn + quant and + mlp + rmsnorm + quant before attn. + """ + + def __init__(self, epsilon: float, dtype: torch.dtype, device: str, + allreduce_params: FlashInferFusedAllReduceParams): + super().__init__(dtype, device) + self.epsilon = epsilon + self.allreduce_params = allreduce_params + + def register(self, pm_pass: PatternMatcherPass): + + def get_inputs(): + input = torch.empty([16, 16], device=self.device, dtype=self.dtype) + + residual = torch.empty([16, 16], + device=self.device, + dtype=self.dtype) + weight = torch.empty([16, 16], + device=self.device, + dtype=self.dtype) + quant_result = torch.empty((16, 8), + device=self.device, + dtype=torch.uint8) + input_global_scale = torch.empty([1, 1], + device=self.device, + dtype=torch.float32) + output_scale = torch.empty([128, 4], + device=self.device, + dtype=torch.int32) + + return [ + quant_result, + residual, + input, + output_scale, + weight, + input_global_scale, + ] + + def pattern(quant_result: torch.Tensor, residual: torch.Tensor, + input: torch.Tensor, output_scale: torch.Tensor, + weight: torch.Tensor, input_global_scale: torch.Tensor): + allreduce_output = tensor_model_parallel_all_reduce(input) + + fused_add_rmsnorm_out_tuple = \ + auto_functionalized( + RMS_ADD_OP, + input=allreduce_output, + residual=residual, + 
weight=weight, + epsilon=self.epsilon) + quant_out_tuple = auto_functionalized( + STATIC_FP4_QUANT_OP, + output=quant_result, + input=fused_add_rmsnorm_out_tuple[1], + output_scale=output_scale, + input_scale=input_global_scale) + + # quant_out, allreduce_output, output_scale + return quant_out_tuple[1], fused_add_rmsnorm_out_tuple[ + 2], quant_out_tuple[2] + + def replacement(quant_result: torch.Tensor, residual: torch.Tensor, + input: torch.Tensor, output_scale: torch.Tensor, + weight: torch.Tensor, + input_global_scale: torch.Tensor): + allreduce = auto_functionalized( + flashinfer_trtllm_fused_allreduce_norm, + allreduce_in=input, + residual=residual, + norm_out=None, + quant_out=quant_result, + scale_out=output_scale, + rms_gamma=weight, + rms_eps=self.epsilon, + pattern_code=flashinfer_comm.AllReduceFusionPattern. + kARResidualRMSNormFP4Quant, # we don't use norm_out afterwards + scale_factor=input_global_scale, + **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(), + ) + # quant_out, rms_norm_residual, output_scale + return allreduce[4], allreduce[2], allreduce[5] + + pm.register_replacement(pattern, replacement, get_inputs(), + pm.fwd_only, pm_pass) + + class AllReduceFusionPass(VllmInductorPass): def __init__(self, config: VllmConfig): @@ -671,13 +1081,16 @@ class AllReduceFusionPass(VllmInductorPass): self.tp_size, ) return - + max_num_token = min( + _FI_MAX_SIZES.get(self.tp_size, _DEFAULT_FI_MAX_SIZE) // + (self.hidden_dim * self.tp_size * (4 if use_fp32_lamport else 2)), + config.compilation_config.pass_config. + fi_allreduce_fusion_max_token_num) self.ipc_handles, workspace_tensor = ( flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion( tp_rank=rank, tp_size=self.tp_size, - max_token_num=config.compilation_config.pass_config. 
- fi_allreduce_fusion_max_token_num, + max_token_num=max_num_token, hidden_dim=self.hidden_dim, group=self.group, use_fp32_lamport=use_fp32_lamport, @@ -689,12 +1102,38 @@ class AllReduceFusionPass(VllmInductorPass): rank=rank, world_size=self.tp_size, use_fp32_lamport=use_fp32_lamport, - max_token_num=config.compilation_config.pass_config. - fi_allreduce_fusion_max_token_num, - ) + max_token_num=max_num_token, + # fuse rms norm static fp8 quant fused op + # in fallback path, when we don't use flashinfer + fuse_rms_quant=config.compilation_config.pass_config.enable_fusion) for epsilon in [1e-5, 1e-6]: - AllReduceRMSNORMPattern( + AllReduceFusedRMSNormStaticQuantFP8Pattern( + epsilon, + self.model_dtype, + self.device, + self.allreduce_params, + ).register(self.patterns) + AllReduceFusedAddRMSNormStaticQuantFP8Pattern( + epsilon, + self.model_dtype, + self.device, + self.allreduce_params, + ).register(self.patterns) + if current_platform.has_device_capability(100): + AllReduceFusedRMSNormStaticQuantNVFP4Pattern( + epsilon, + self.model_dtype, + self.device, + self.allreduce_params, + ).register(self.patterns) + AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern( + epsilon, + self.model_dtype, + self.device, + self.allreduce_params, + ).register(self.patterns) + AllReduceRMSNormPattern( epsilon, self.model_dtype, self.device, @@ -707,6 +1146,10 @@ class AllReduceFusionPass(VllmInductorPass): self.allreduce_params, ).register(self.patterns) + # WARNING: This is a hack to clear the pattern matcher cache + # and allow multiple values of epsilon. 
+ torch._inductor.pattern_matcher._seen_patterns.clear() + self.disabled = False def __call__(self, graph: fx.Graph): @@ -723,5 +1166,5 @@ class AllReduceFusionPass(VllmInductorPass): if self.disabled: return if flashinfer_comm is not None: - flashinfer_comm.trtllm_destroy_ipc_workspace( + flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce( self.ipc_handles, self.group) diff --git a/vllm/config.py b/vllm/config.py index 27dde5f1b1f6f..edad5dd0406bf 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4051,7 +4051,7 @@ class PassConfig: """Whether to enable async TP.""" enable_fi_allreduce_fusion: bool = False """Whether to enable flashinfer allreduce fusion.""" - fi_allreduce_fusion_max_token_num: int = 1024 + fi_allreduce_fusion_max_token_num: int = 16384 """Max number of tokens to used in flashinfer allreduce fusion.""" # TODO(luka) better pass enabling system. From c3e0e9337ef0af04d2d18b263a6a0f7deed75856 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 31 Jul 2025 18:26:11 -0400 Subject: [PATCH 100/224] [Feature] Add Flashinfer MoE Support for Compressed Tensor NVFP4 (#21639) Signed-off-by: yewentao256 --- .../compressed_tensors_moe.py | 53 +++++- .../layers/quantization/modelopt.py | 150 +++-------------- .../quantization/utils/flashinfer_fp4_moe.py | 154 ++++++++++++++++++ .../quantization/utils/nvfp4_moe_support.py | 59 +++++++ 4 files changed, 287 insertions(+), 129 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py create mode 100644 vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 17b41e8a1c23c..09d8890888fa8 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ 
b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -17,9 +17,14 @@ from vllm.model_executor.layers.fused_moe import ( FusedMoE, FusedMoEActivationFormat, FusedMoEConfig, FusedMoEMethodBase, FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize, FusedMoeWeightScaleSupported) +from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa + FlashInferCutlassMoEPrepareAndFinalize) from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import ( # noqa WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP) from vllm.model_executor.layers.quantization.utils import replace_parameter +from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( + build_flashinfer_fp4_cutlass_moe_kernel, + flashinfer_fp4_cutlass_moe_forward, reorder_w1w3_to_w3w1) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( check_moe_marlin_supports_layer, marlin_make_workspace_new, marlin_moe_permute_scales) @@ -28,7 +33,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( prepare_moe_fp8_layer_for_marlin) from vllm.model_executor.layers.quantization.utils.quant_utils import ( - cutlass_fp4_supported, swizzle_blockscale) + swizzle_blockscale) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize) from vllm.model_executor.utils import set_weight_attrs @@ -96,8 +101,14 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): def __init__(self): - self.use_marlin = not cutlass_fp4_supported() + from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501 + detect_nvfp4_moe_support) + _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__) + self.cutlass_nvfp4_supported = 
_nvfp4.cutlass_supported + self.allow_flashinfer_cutlass = _nvfp4.allow_flashinfer_cutlass + self.use_marlin = _nvfp4.use_marlin self.group_size = 16 + self.fused_experts = None # type: ignore[assignment] def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -200,6 +211,14 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): layer.w2_weight = torch.nn.Parameter(layer.w2_weight_packed.data, requires_grad=False) + # reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel. + if self.allow_flashinfer_cutlass: + w, s = reorder_w1w3_to_w3w1(layer.w13_weight.data, + layer.w13_weight_scale.data, + dim=-2) + layer.w13_weight = torch.nn.Parameter(w, requires_grad=False) + layer.w13_weight_scale = torch.nn.Parameter(s, requires_grad=False) + if not torch.allclose(layer.w13_weight_global_scale[:, 0], layer.w13_weight_global_scale[:, 1]): logger.warning_once( @@ -246,6 +265,21 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): layer.w2_input_scale_quant = torch.nn.Parameter( (layer.w2_input_global_scale), requires_grad=False) + def maybe_swap_experts_impl(self, moe_parallel_config): + if not self.allow_flashinfer_cutlass: + return + self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel( + moe_parallel_config) + + def select_gemm_impl(self, prepare_finalize, moe): + """Return the appropriate GEMM experts implementation.""" + assert moe is not None and prepare_finalize is not None + from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( # noqa: E501 + select_nvfp4_gemm_impl) + + return select_nvfp4_gemm_impl(self.allow_flashinfer_cutlass, moe, + logger) + def apply( self, layer: torch.nn.Module, @@ -303,10 +337,23 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): global_num_experts=global_num_experts, expert_map=expert_map) + # FlashInfer fused experts path + if self.fused_experts is not None: + return 
flashinfer_fp4_cutlass_moe_forward( + self.fused_experts, + layer, + x, + topk_weights, + topk_ids, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + assert expert_map is None, ("Expert Parallelism / expert_map " "is currently not supported for " "CompressedTensorsW4A4MoeMethod.") - from vllm.model_executor.layers.fused_moe.cutlass_moe import ( cutlass_moe_fp4) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index b8ffcf90c022b..0334a2824512d 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -10,11 +10,8 @@ from torch.nn.parameter import Parameter import vllm.envs as envs import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant -from vllm.distributed import get_ep_group from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig -from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 - FlashInferCutlassMoEPrepareAndFinalize) from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, @@ -23,6 +20,9 @@ from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod +from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( + build_flashinfer_fp4_cutlass_moe_kernel, + flashinfer_fp4_cutlass_moe_forward, reorder_w1w3_to_w3w1) from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( 
apply_flashinfer_per_tensor_scale_fp8, rotate_flashinfer_fp8_moe_weights, swap_w13_to_w31) @@ -35,7 +35,6 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( Fp8LinearOp, requantize_with_max_scale) from vllm.model_executor.parameter import (ModelWeightParameter, PerTensorScaleParameter) -from vllm.platforms import current_platform from vllm.scalar_type import scalar_types from vllm.utils.flashinfer import has_flashinfer_moe @@ -869,28 +868,12 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): def __init__(self, quant_config: ModelOptNvFp4Config): self.quant_config = quant_config - self.cutlass_nvfp4_supported = cutlass_fp4_supported() - self.use_marlin = False - self.allow_flashinfer_cutlass = False - - if envs.VLLM_USE_FLASHINFER_MOE_FP4: - if self.cutlass_nvfp4_supported and current_platform.is_cuda() \ - and current_platform.is_device_capability(100): - logger.info_once( - "Using FlashInfer kernels for ModelOptNvFp4FusedMoE.") - self.allow_flashinfer_cutlass = True - else: - logger.warning_once( - "Flashinfer CUTLASS Fused MoE not supported " - "or found on the current platform.") - - if not self.cutlass_nvfp4_supported: - if is_fp4_marlin_supported(): - self.use_marlin = True - else: - raise ValueError("Current platform does not support NVFP4" - " quantization. 
Please use Blackwell and" - " above.") + from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501 + detect_nvfp4_moe_support) + _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__) + self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported + self.allow_flashinfer_cutlass = _nvfp4.allow_flashinfer_cutlass + self.use_marlin = _nvfp4.use_marlin self.fused_experts = None # type: ignore @@ -900,29 +883,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): ): if not self.allow_flashinfer_cutlass: return - - logger.debug_once("FlashInferExperts") - # default to TP/EP case only - - experts_kwargs: dict[str, Any] = { - "use_nvfp4_w4a4": True, - "use_dp": moe_parallel_config.dp_size > 1, - "ep_rank": moe_parallel_config.ep_rank, - "ep_size": moe_parallel_config.ep_size, - "tp_rank": moe_parallel_config.tp_rank, - "tp_size": moe_parallel_config.tp_size, - } - - from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 - FlashInferExperts) - experts = FlashInferExperts(**experts_kwargs) - self.fused_experts = mk.FusedMoEModularKernel( - FlashInferCutlassMoEPrepareAndFinalize( - quant_dtype=torch.uint8, - #meaning 2x e2m1 packed in one, kernel requirement - ), - experts, - ) + self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel( + moe_parallel_config) # This method update self.fused_experts # only prepare_finalize is not None call select_gemm_impl @@ -931,32 +893,12 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): def select_gemm_impl(self, prepare_finalize, moe) -> mk.FusedMoEPermuteExpertsUnpermute: - assert moe is not None - assert prepare_finalize is not None - experts = None - all2all_manager = get_ep_group().device_communicator.all2all_manager - assert all2all_manager is not None - if self.allow_flashinfer_cutlass: - from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 - FlashInferExperts) - logger.debug_once("Using FlashInferExperts") - experts = 
FlashInferExperts( - use_nvfp4_w4a4=True, - use_dp=moe.moe_parallel_config.dp_size > 1, - ep_rank=moe.moe_parallel_config.ep_rank, - ep_size=moe.moe_parallel_config.ep_size, - tp_rank=moe.moe_parallel_config.tp_rank, - tp_size=moe.moe_parallel_config.tp_size, - ) - else: - assert moe.dp_size > 1 - logger.debug_once("Using CutlassExpertsFp4") - # Currently CutlassExpertsFp4 doesn't support DP - raise ValueError("CutlassExpertsFp4 doesn't support DP. " - "Use flashinfer CUTLASS FusedMoE backend instead " - "(set VLLM_USE_FLASHINFER_MOE_FP4=1)") + assert moe is not None and prepare_finalize is not None + from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( # noqa: E501 + select_nvfp4_gemm_impl) - return experts + return select_nvfp4_gemm_impl(self.allow_flashinfer_cutlass, moe, + logger) def uses_weight_scale_2_pattern(self) -> bool: """ @@ -1062,18 +1004,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): gemm1_weight_scale = layer.w13_weight_scale.data if self.allow_flashinfer_cutlass: - dim = -2 - size = gemm1_weight.size(dim) - assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}" - half = size // 2 - - # Reorder weight - w1, w3 = gemm1_weight.split(half, dim=dim) - gemm1_weight = torch.cat([w3, w1], dim=dim).contiguous() - - # Reorder scale - s1, s3 = gemm1_weight_scale.split(half, dim=dim) - gemm1_weight_scale = torch.cat([s3, s1], dim=dim).contiguous() + gemm1_weight, gemm1_weight_scale = reorder_w1w3_to_w3w1( + gemm1_weight, gemm1_weight_scale, dim=-2) layer.w13_weight = Parameter(gemm1_weight, requires_grad=False) layer.w13_weight_scale = Parameter(gemm1_weight_scale, @@ -1217,49 +1149,15 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): expert_map=expert_map, apply_router_weight_on_input=apply_router_weight_on_input) else: - # TP or DP case - from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 - is_valid_flashinfer_cutlass_fused_moe) - assert 
is_valid_flashinfer_cutlass_fused_moe( - x, layer.w13_weight, layer.w2_weight), ( - "Flashinfer CUTLASS Fused MoE not applicable!") - - a1_gscale = layer.w13_input_scale_quant - a2_gscale = layer.w2_input_scale_quant - extra_expert_args = { - 'g1_alphas': layer.g1_alphas, - 'g2_alphas': layer.g2_alphas, - 'out_dtype': x.dtype, - # Avoid confusion with a1_scale and a2_scale - # where are batch size related. - 'a1_gscale': a1_gscale, - 'a2_gscale': a2_gscale, - } - extra_prepare_args = { - 'use_dp': layer.dp_size > 1, - 'local_tokens': x.shape[0], - 'a1_gscale': a1_gscale, - } - extra_finalize_args = { - 'use_dp': layer.dp_size > 1, - 'local_tokens': x.shape[0], - } - - out = self.fused_experts( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=False, # TODO(shuw): fix later, now output is high prec + out = flashinfer_fp4_cutlass_moe_forward( + self.fused_experts, + layer, + x, + topk_weights, + topk_ids, activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=layer.w13_blockscale_swizzled, - w2_scale=layer.w2_blockscale_swizzled, apply_router_weight_on_input=apply_router_weight_on_input, - extra_expert_args=extra_expert_args, - extra_prepare_args=extra_prepare_args, - extra_finalize_args=extra_finalize_args, ) return out diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py new file mode 100644 index 0000000000000..4c617e226041f --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -0,0 +1,154 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Utility helpers for NVFP4 + FlashInfer fused-MoE path""" +from __future__ import annotations + +from typing import Optional + +import torch + +import vllm.envs as envs +import 
vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig +from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( + FlashInferExperts, is_valid_flashinfer_cutlass_fused_moe) +from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 + FlashInferCutlassMoEPrepareAndFinalize) +from vllm.platforms import current_platform + +logger = init_logger(__name__) + +__all__ = [ + "is_flashinfer_fp4_cutlass_moe_available", + "reorder_w1w3_to_w3w1", + "build_flashinfer_fp4_cutlass_moe_kernel", + "flashinfer_fp4_cutlass_moe_forward", +] + + +def is_flashinfer_fp4_cutlass_moe_available() -> bool: + """Return ``True`` when FlashInfer CUTLASS NV-FP4 kernels can be used.""" + return (envs.VLLM_USE_FLASHINFER_MOE_FP4 and current_platform.is_cuda() + and current_platform.is_device_capability(100)) + + +def reorder_w1w3_to_w3w1(weight: torch.Tensor, + scale: torch.Tensor, + dim: int = -2) -> tuple[torch.Tensor, torch.Tensor]: + """Re-order the concatenated `[w1, w3]` tensors to `[w3, w1]`""" + size = weight.size(dim) + assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}" + half = size // 2 + + w1, w3 = weight.split(half, dim=dim) + s1, s3 = scale.split(half, dim=dim) + + return (torch.cat([w3, w1], + dim=dim).contiguous(), torch.cat([s3, s1], + dim=dim).contiguous()) + + +def build_flashinfer_fp4_cutlass_moe_kernel( + moe_parallel_config: FusedMoEParallelConfig, ) -> mk.FusedMoEModularKernel: + """Create *and return* a FlashInfer CUTLASS fused-MoE modular kernel""" + experts = FlashInferExperts( + use_nvfp4_w4a4=True, + use_dp=moe_parallel_config.dp_size > 1, + ep_rank=moe_parallel_config.ep_rank, + ep_size=moe_parallel_config.ep_size, + tp_rank=moe_parallel_config.tp_rank, + tp_size=moe_parallel_config.tp_size, + ) + logger.debug_once("FlashInferExperts (util)") + return 
mk.FusedMoEModularKernel( + FlashInferCutlassMoEPrepareAndFinalize(quant_dtype=torch.uint8), + experts, + ) + + +def flashinfer_fp4_cutlass_moe_forward( + fused_experts: mk.FusedMoEModularKernel, + layer: torch.nn.Module, + x: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, +) -> torch.Tensor: + """Common forward wrapper for FlashInfer NV-FP4 fused-MoE""" + + assert is_valid_flashinfer_cutlass_fused_moe( + x, layer.w13_weight, + layer.w2_weight), ("FlashInfer CUTLASS fused-MoE not applicable!") + + a1_gscale = layer.w13_input_scale_quant + a2_gscale = layer.w2_input_scale_quant + + extra_expert_args = { + "g1_alphas": layer.g1_alphas, + "g2_alphas": layer.g2_alphas, + # Avoid confusion with a1_scale and a2_scale + # where are batch size related. + "a1_gscale": a1_gscale, + "a2_gscale": a2_gscale, + "out_dtype": x.dtype, + } + extra_prepare_args = { + "use_dp": layer.dp_size > 1, + "local_tokens": x.shape[0], + "a1_gscale": a1_gscale, + } + extra_finalize_args = { + "use_dp": layer.dp_size > 1, + "local_tokens": x.shape[0], + } + + return fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=False, # TODO(shuw): fix later, now output is high prec + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=layer.w13_blockscale_swizzled, + w2_scale=layer.w2_blockscale_swizzled, + apply_router_weight_on_input=apply_router_weight_on_input, + extra_expert_args=extra_expert_args, + extra_prepare_args=extra_prepare_args, + extra_finalize_args=extra_finalize_args, + ) + + +def select_nvfp4_gemm_impl( + allow_flashinfer_cutlass: bool, + moe, # FusedMoEConfig + logger): + """Return a GEMM *experts* implementation for NV-FP4 fused-MoE layers""" + + # lazy import + from vllm.distributed import get_ep_group + + 
all2all_manager = get_ep_group().device_communicator.all2all_manager + assert all2all_manager is not None + + if allow_flashinfer_cutlass: + logger.debug_once("Using FlashInferExperts") + return FlashInferExperts( + use_nvfp4_w4a4=True, + use_dp=moe.moe_parallel_config.dp_size > 1, + ep_rank=moe.moe_parallel_config.ep_rank, + ep_size=moe.moe_parallel_config.ep_size, + tp_rank=moe.moe_parallel_config.tp_rank, + tp_size=moe.moe_parallel_config.tp_size, + ) + + # native cutlass experts currently don't support DP; TP case won't call this + raise ValueError( + "CutlassExpertsFp4 doesn't support DP. Use flashinfer CUTLASS " + "Fused MoE backend instead (set VLLM_USE_FLASHINFER_MOE_FP4=1)") diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py new file mode 100644 index 0000000000000..23a749467f193 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass + +import vllm.envs as envs +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( + is_flashinfer_fp4_cutlass_moe_available) +from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( + is_fp4_marlin_supported) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + cutlass_fp4_supported) + +__all__ = ["detect_nvfp4_moe_support", "NvFp4Support"] + +_logger = init_logger(__name__) + + +@dataclass(frozen=True) +class NvFp4Support: + """Result container for NV-FP4 capability probing.""" + + cutlass_supported: bool + allow_flashinfer_cutlass: bool + use_marlin: bool + + +def detect_nvfp4_moe_support(class_name: str = "") -> NvFp4Support: + """Detect platform support for NV-FP4 fused-MoE path""" + cutlass_supported = cutlass_fp4_supported() + + 
allow_flashinfer = (cutlass_supported + and is_flashinfer_fp4_cutlass_moe_available()) + + if allow_flashinfer: + _logger.info_once("Using FlashInfer kernels for %s.", class_name + or "NVFP4 path") + else: + if envs.VLLM_USE_FLASHINFER_MOE_FP4: + _logger.warning_once( + "FlashInfer kernels unavailable for %s on current platform.", + class_name or "NVFP4 path", + ) + + use_marlin = False + if not cutlass_supported: + if is_fp4_marlin_supported(): + use_marlin = True + _logger.info_once("Falling back to Marlin FP4 MoE kernel.") + else: + raise ValueError( + "Current platform does not support NVFP4 quantization. " + "Please use Blackwell GPUs or enable FlashInfer.") + + return NvFp4Support( + cutlass_supported=cutlass_supported, + allow_flashinfer_cutlass=allow_flashinfer, + use_marlin=use_marlin, + ) From e360316ab9902ecfc564710ae4b1539db867efd9 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 31 Jul 2025 21:01:55 -0400 Subject: [PATCH 101/224] Add DeepGEMM to Dockerfile in vllm-base image (#21533) Signed-off-by: Matthew Bonanni Signed-off-by: mgoin Co-authored-by: mgoin --- docker/Dockerfile | 30 +++++++++++++++++-- tests/kernels/moe/test_deepep_deepgemm_moe.py | 5 ++-- tests/kernels/moe/test_deepgemm.py | 6 ++-- vllm/utils/deep_gemm.py | 12 ++++++++ 4 files changed, 46 insertions(+), 7 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 69aeee67a4300..413151b3edb00 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,3 @@ - # The vLLM Dockerfile is used to construct vLLM image that can be directly used # to run the OpenAI compatible server. 
@@ -16,6 +15,7 @@ ARG PYTHON_VERSION=3.12 # Example: # docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 +# TODO: Restore to base image after FlashInfer AOT wheel fixed ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 # By parameterizing the Deadsnakes repository URL, we allow third-party to use @@ -289,7 +289,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ #################### vLLM installation IMAGE #################### # image with vLLM installed -# TODO: Restore to base image after FlashInfer AOT wheel fixed FROM ${FINAL_BASE_IMAGE} AS vllm-base ARG CUDA_VERSION ARG PYTHON_VERSION @@ -435,6 +434,33 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system -r requirements/build.txt \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') +# Install DeepGEMM from source +ARG DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git" +ARG DEEPGEMM_GIT_REF="187656694f7f69e3e7975617a68bc3387680a7e1" +RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' + . 
/etc/environment + CUDA_MAJOR="${CUDA_VERSION%%.*}" + CUDA_MINOR="${CUDA_VERSION#${CUDA_MAJOR}.}" + CUDA_MINOR="${CUDA_MINOR%%.*}" + if [ "$CUDA_MAJOR" -ge 12 ] && [ "$CUDA_MINOR" -ge 8 ]; then + git clone --recursive --shallow-submodules \ + ${DEEPGEMM_GIT_REPO} deepgemm + echo "🏗️ Building DeepGEMM" + pushd deepgemm + git checkout ${DEEPGEMM_GIT_REF} + # Build DeepGEMM + # (Based on https://github.com/deepseek-ai/DeepGEMM/blob/main/install.sh) + rm -rf build dist + rm -rf *.egg-info + python3 setup.py bdist_wheel + uv pip install --system dist/*.whl + popd + rm -rf deepgemm + else + echo "Skipping DeepGEMM installation (requires CUDA 12.8+ but got ${CUDA_VERSION})" + fi +BASH + #################### vLLM installation IMAGE #################### #################### TEST IMAGE #################### diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 074771e49a061..266f1161a684b 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -20,7 +20,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEModularKernel) from vllm.platforms import current_platform from vllm.utils import has_deep_ep, has_deep_gemm -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used +from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_used, + is_deep_gemm_supported) from .parallel_utils import ProcessGroupInfo, parallel_launch from .utils import make_test_weights @@ -46,7 +47,7 @@ requires_deep_ep = pytest.mark.skipif( ) requires_deep_gemm = pytest.mark.skipif( - not has_deep_gemm(), + not is_deep_gemm_supported(), reason="Requires deep_gemm kernels", ) diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py index f7578e226917d..759d2814eefb9 100644 --- a/tests/kernels/moe/test_deepgemm.py +++ b/tests/kernels/moe/test_deepgemm.py @@ -15,13 +15,13 @@ import torch from 
vllm.model_executor.layers.fused_moe.fused_moe import fused_experts from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8) -from vllm.utils import has_deep_gemm -from vllm.utils.deep_gemm import calc_diff, per_block_cast_to_fp8 +from vllm.utils.deep_gemm import (calc_diff, is_deep_gemm_supported, + per_block_cast_to_fp8) BLOCK_SIZE = [128, 128] requires_deep_gemm = pytest.mark.skipif( - not has_deep_gemm(), + not is_deep_gemm_supported(), reason="Requires deep_gemm kernels", ) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 169b083017e46..a49a59bd81253 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -17,6 +17,17 @@ from vllm.platforms import current_platform from vllm.utils import has_deep_gemm +@functools.cache +def is_deep_gemm_supported() -> bool: + """Return ``True`` if DeepGEMM is supported on the current platform. + Currently, only Hopper and Blackwell GPUs are supported. + """ + supported_arch = current_platform.is_cuda() and ( + current_platform.is_device_capability(90) + or current_platform.is_device_capability(100)) + return has_deep_gemm() and supported_arch + + @functools.cache def is_blackwell_deep_gemm_used() -> bool: """Return ``True`` if vLLM is configured to use DeepGEMM on a @@ -142,4 +153,5 @@ __all__ = [ "fp8_m_grouped_gemm_nt_masked", "per_block_cast_to_fp8", "is_blackwell_deep_gemm_used", + "is_deep_gemm_supported", ] From 0bd409cf01c37bbc99a5d3c70c4954da2113aba8 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 31 Jul 2025 21:02:11 -0400 Subject: [PATCH 102/224] Move flashinfer-python to optional extra `vllm[flashinfer]` (#21959) Signed-off-by: mgoin --- requirements/cuda.txt | 4 +--- setup.py | 4 +++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 5557c868acafa..75008dc20df48 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -11,6 +11,4 @@ torchaudio==2.7.1 # These must 
be updated alongside torch torchvision==0.22.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version # https://github.com/facebookresearch/xformers/releases/tag/v0.0.31 -xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 -# FlashInfer should be updated together with the Dockerfile -flashinfer_python==0.2.9rc2 \ No newline at end of file +xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 \ No newline at end of file diff --git a/setup.py b/setup.py index 6d615d122d69e..bfa195d4395f0 100644 --- a/setup.py +++ b/setup.py @@ -671,7 +671,9 @@ setup( ["runai-model-streamer >= 0.13.3", "runai-model-streamer-s3", "boto3"], "audio": ["librosa", "soundfile", "mistral_common[audio]"], # Required for audio processing - "video": [] # Kept for backwards compatibility + "video": [], # Kept for backwards compatibility + # FlashInfer should be updated together with the Dockerfile + "flashinfer": ["flashinfer-python==0.2.9rc2"], }, cmdclass=cmdclass, package_data=package_data, From 37006420134fdd771b474bda32516cde209e0f4c Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 31 Jul 2025 21:13:27 -0400 Subject: [PATCH 103/224] [Refactor] Remove Duplicate `per_block_cast_to_fp8`, Remove Dependencies of DeepGEMM (#21787) Signed-off-by: yewentao256 --- .../benchmark_fp8_block_dense_gemm.py | 45 ++------------- .../kernels/moe/modular_kernel_tools/utils.py | 31 +--------- .../kernels/moe/test_cutlass_grouped_gemm.py | 21 +------ tests/kernels/moe/test_deepgemm.py | 8 ++- tests/kernels/moe/utils.py | 4 +- tests/kernels/quant_utils.py | 19 ------- tests/kernels/quantization/test_block_fp8.py | 2 +- vllm/utils/deep_gemm.py | 57 ++++++++++++------- 8 files changed, 55 insertions(+), 132 deletions(-) diff --git 
a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py index 43c54d56ca8c1..b99c2099f2c38 100644 --- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py +++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -4,49 +4,16 @@ # ruff: noqa: E501 import time -# Import DeepGEMM functions -import deep_gemm import torch -from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor -# Import vLLM functions from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + get_col_major_tma_aligned_tensor, per_token_group_quant_fp8, w8a8_block_fp8_matmul, ) from vllm.triton_utils import triton - - -# Copied from -# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L9 -def per_token_cast_to_fp8( - x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - """Convert tensor to FP8 format with per-token scaling.""" - assert x.dim() == 2 and x.size(1) % 128 == 0 - m, n = x.shape - x_view = x.view(m, -1, 128) - x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) - return (x_view * (448.0 / x_amax.unsqueeze(2))).to( - torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1) - - -# Copied from -# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L17 -def per_block_cast_to_fp8( - x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - """Convert tensor to FP8 format with per-block scaling.""" - assert x.dim() == 2 - m, n = x.shape - x_padded = torch.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128), - dtype=x.dtype, - device=x.device) - x_padded[:m, :n] = x - x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) - x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) - x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) - return x_scaled.view_as(x_padded)[:m, 
:n].contiguous(), ( - x_amax / 448.0).view(x_view.size(0), x_view.size(2)) +from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8 def benchmark_shape(m: int, @@ -69,14 +36,14 @@ def benchmark_shape(m: int, # Pre-quantize B for all implementations # (weights can be pre-quantized offline) - B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B) - B_vllm, B_scale_vllm = per_block_cast_to_fp8(B) + B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True) + B_vllm, B_scale_vllm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True) # Block size configuration block_size = [128, 128] # Pre-quantize A for all implementations - A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A) + A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1]) A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm) C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) @@ -85,7 +52,7 @@ def benchmark_shape(m: int, # === DeepGEMM Implementation === def deepgemm_gemm(): - deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm), + fp8_gemm_nt((A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm) return C_deepgemm diff --git a/tests/kernels/moe/modular_kernel_tools/utils.py b/tests/kernels/moe/modular_kernel_tools/utils.py index 09bb4a34f3189..866f52882beee 100644 --- a/tests/kernels/moe/modular_kernel_tools/utils.py +++ b/tests/kernels/moe/modular_kernel_tools/utils.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import math import torch import vllm._custom_ops as ops +from vllm.utils.deep_gemm import per_block_cast_to_fp8 def per_token_cast_to_fp8( @@ -20,29 +20,6 @@ def per_token_cast_to_fp8( return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1) -def per_block_cast_to_fp8( - x: torch.Tensor, 
block_size_k: int, - block_size_n: int) -> tuple[torch.Tensor, torch.Tensor]: - assert x.dim() == 2 - m, n = x.shape - x_padded = torch.zeros( - ( - int(math.ceil(m / block_size_k)) * block_size_k, - int(math.ceil(n / block_size_n)) * block_size_n, - ), - dtype=x.dtype, - device=x.device, - ) - x_padded[:m, :n] = x - x_view = x_padded.view(-1, block_size_k, - x_padded.size(1) // block_size_k, block_size_n) - x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) - x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) - x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous() - scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2)) - return x_scaled_sub, scales - - def make_non_quant_weights( e: int, n: int, @@ -99,11 +76,9 @@ def make_block_quant_fp8_weights( for i in range(e): w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i], - block_size_k=block_k, - block_size_n=block_n) + block_size=[block_k, block_n]) w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i], - block_size_k=block_k, - block_size_n=block_n) + block_size=[block_k, block_n]) return w1, w2, w1_s, w2_s diff --git a/tests/kernels/moe/test_cutlass_grouped_gemm.py b/tests/kernels/moe/test_cutlass_grouped_gemm.py index 67984fe7319a3..1aee1ed8c3762 100644 --- a/tests/kernels/moe/test_cutlass_grouped_gemm.py +++ b/tests/kernels/moe/test_cutlass_grouped_gemm.py @@ -12,10 +12,8 @@ import torch from tests.kernels.utils import baseline_scaled_mm from vllm import _custom_ops as ops from vllm.platforms import current_platform - - -def cdiv(a, b): - return (a + b - 1) // b +from vllm.utils import cdiv +from vllm.utils.deep_gemm import per_block_cast_to_fp8 def per_token_cast_to_fp8( @@ -32,21 +30,6 @@ def per_token_cast_to_fp8( return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1) -def per_block_cast_to_fp8( - x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - assert x.dim() == 2 - m, n = x.shape - x_padded = torch.zeros((cdiv(m, 128) * 128, cdiv(n, 
128) * 128), - device=x.device, - dtype=x.dtype) - x_padded[:m, :n] = x - x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) - x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) - x_scaled = (x_view * (448.0 / x_amax)).to(dtype=torch.float8_e4m3fn) - return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( - x_amax / 448.0).view(x_view.size(0), x_view.size(2)) - - @pytest.mark.parametrize("num_groups, expected_m_per_group, k, n", [ (4, 8192, 7168, 4096), (4, 8192, 2048, 7168), diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py index 759d2814eefb9..b6ea4ee2324c9 100644 --- a/tests/kernels/moe/test_deepgemm.py +++ b/tests/kernels/moe/test_deepgemm.py @@ -69,8 +69,12 @@ def make_block_quant_fp8_weights( dtype=torch.float32) for i in range(e): - w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i]) - w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i]) + w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i], + block_size=block_size, + use_ue8m0=True) + w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i], + block_size=block_size, + use_ue8m0=True) return w1, w2, w1_s, w2_s diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index df89ad7e6da6f..c33134981acc0 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -5,8 +5,7 @@ from typing import Optional import torch import vllm._custom_ops as ops -from tests.kernels.quant_utils import (per_block_cast_to_fp8, - per_block_cast_to_int8) +from tests.kernels.quant_utils import per_block_cast_to_int8 from vllm.model_executor.layers.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts) @@ -15,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( from vllm.model_executor.layers.fused_moe.utils import ( moe_kernel_quantize_input) from vllm.utils import round_up +from vllm.utils.deep_gemm import 
per_block_cast_to_fp8 def triton_moe( diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 6f43d1111c98e..01a1ad2e7a0a5 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -222,25 +222,6 @@ def native_per_token_group_quant_int8(x, DEFAULT_BLOCK_SHAPE = [128, 128] -def per_block_cast_to_fp8( - x: torch.Tensor, - block_shape: list[int] = DEFAULT_BLOCK_SHAPE, -) -> tuple[torch.Tensor, torch.Tensor]: - block_m, block_n = block_shape - assert x.dim() == 2 - m, n = x.shape - x_padded = torch.zeros((round_up(m, block_m), round_up(n, block_n)), - dtype=x.dtype, - device=x.device) - x_padded[:m, :n] = x - x_view = x_padded.view(-1, block_m, x_padded.size(1) // block_n, block_n) - x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) - x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) - x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous() - scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2)) - return x_scaled_sub, scales - - def per_block_cast_to_int8( x: torch.Tensor, block_shape: list[int] = DEFAULT_BLOCK_SHAPE, diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index 26aa8d652e639..d9154d3fd7f33 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -117,7 +117,7 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed): B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_size[1]) - B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32) + B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32, block_size=block_size) As = As_fp8.to(torch.float32) Bs = Bs_fp8.to(torch.float32) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index a49a59bd81253..4dedee2a3f862 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -14,7 +14,7 @@ import torch import 
vllm.envs as envs from vllm.platforms import current_platform -from vllm.utils import has_deep_gemm +from vllm.utils import cdiv, has_deep_gemm @functools.cache @@ -37,7 +37,7 @@ def is_blackwell_deep_gemm_used() -> bool: return False _lazy_init() - if _per_block_cast_impl is None: + if _fp8_gemm_nt_impl is None: return False return (current_platform.is_cuda() @@ -63,18 +63,15 @@ def _resolve_symbol(module, new: str, old: str) -> Callable[..., Any] | None: _fp8_gemm_nt_impl: Callable[..., Any] | None = None _grouped_impl: Callable[..., Any] | None = None _grouped_masked_impl: Callable[..., Any] | None = None -_per_block_cast_impl: Callable[..., Any] | None = None def _lazy_init() -> None: """Import deep_gemm and resolve symbols on first use.""" - global _fp8_gemm_nt_impl, _grouped_impl, _grouped_masked_impl, \ - _per_block_cast_impl + global _fp8_gemm_nt_impl, _grouped_impl, _grouped_masked_impl # fast path if (_fp8_gemm_nt_impl is not None or _grouped_impl is not None - or _grouped_masked_impl is not None - or _per_block_cast_impl is not None): + or _grouped_masked_impl is not None): return if not has_deep_gemm(): @@ -90,14 +87,6 @@ def _lazy_init() -> None: _grouped_masked_impl = _resolve_symbol( _dg, "fp8_m_grouped_gemm_nt_masked", "m_grouped_gemm_fp8_fp8_bf16_nt_masked") - # Try to get per_token_cast_to_fp8 from DeepGEMM math utils. 
- try: - _math_mod = importlib.import_module( - "deep_gemm.utils.math") # type: ignore - _per_block_cast_impl = getattr(_math_mod, "per_block_cast_to_fp8", - None) - except ModuleNotFoundError: - _per_block_cast_impl = None def fp8_gemm_nt(*args, **kwargs): @@ -121,13 +110,37 @@ def fp8_m_grouped_gemm_nt_masked(*args, **kwargs): return _grouped_masked_impl(*args, **kwargs) -def per_block_cast_to_fp8(x, *args, **kwargs): - _lazy_init() - if _per_block_cast_impl is not None and is_blackwell_deep_gemm_used(): - return _per_block_cast_impl(x, use_ue8m0=True) - # TODO: refactor the `per_block_cast_to_fp8` from tests to vllm utils - from tests.kernels.quant_utils import per_block_cast_to_fp8 as _pbcf - return _pbcf(x, *args, **kwargs) +def _ceil_to_ue8m0(x: torch.Tensor): + return torch.pow(2.0, torch.ceil(torch.log2(x.abs()))) + + +def _align(x: int, y: int) -> int: + return cdiv(x, y) * y + + +DEFAULT_BLOCK_SIZE = [128, 128] + + +# Taken from https://github.com/deepseek-ai/DeepGEMM/blob/dd6ed14acbc7445dcef224248a77ab4d22b5f240/deep_gemm/utils/math.py#L38 +# TODO(wentao): optimize this function, using triton or cuda kernel +def per_block_cast_to_fp8( + x: torch.Tensor, + block_size: list[int] = DEFAULT_BLOCK_SIZE, + use_ue8m0: bool = False) -> tuple[torch.Tensor, torch.Tensor]: + assert x.dim() == 2 + m, n = x.shape + block_m, block_n = block_size + x_padded = torch.zeros((_align(m, block_m), _align(n, block_n)), + dtype=x.dtype, + device=x.device) + x_padded[:m, :n] = x + x_view = x_padded.view(-1, block_m, x_padded.size(1) // block_n, block_n) + x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) + sf = x_amax / 448.0 + sf = _ceil_to_ue8m0(sf) if use_ue8m0 else sf + x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn) + return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view( + x_view.size(0), x_view.size(2)) def calc_diff(x: torch.Tensor, y: torch.Tensor): From ad57f23f6a528ab01066998b41796a44340fd43d Mon Sep 17 00:00:00 2001 From: 
Charent <19562666+charent@users.noreply.github.com> Date: Fri, 1 Aug 2025 10:48:13 +0800 Subject: [PATCH 104/224] [Bugfix] Fix: Fix multi loras with tp >=2 and LRU cache (#20873) Signed-off-by: charent <19562666+charent@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 1 + tests/lora/test_multi_loras_with_tp.py | 158 +++++++++++++++++++++++++ vllm/lora/layers.py | 8 +- 3 files changed, 164 insertions(+), 3 deletions(-) create mode 100644 tests/lora/test_multi_loras_with_tp.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 2f6cc45be77e6..598fd5762985e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -804,6 +804,7 @@ steps: # requires multi-GPU testing for validation. - pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_multi_loras_with_tp.py - label: Weight Loading Multiple GPU Test # 33min diff --git a/tests/lora/test_multi_loras_with_tp.py b/tests/lora/test_multi_loras_with_tp.py new file mode 100644 index 0000000000000..fe9bd3f269515 --- /dev/null +++ b/tests/lora/test_multi_loras_with_tp.py @@ -0,0 +1,158 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Script to test multi loras service with tp >= 2 +""" +from tests.utils import multi_gpu_test +from vllm import LLM, SamplingParams +from vllm.lora.request import LoRARequest + +MODEL_PATH = "Qwen/Qwen3-0.6B" +LORA_NAME_PATH_MAP = { + "Alice": "charent/self_cognition_Alice", + "Bob": "charent/self_cognition_Bob", + "Cat": "charent/self_cognition_Bob", # same as Bob +} + +LORA_NAME_ID_MAP = {} +INCREASE_LORA_ID = 0 +LORA_RANK = 8 + +LORA_TEST_PROMPTS = ["What is GitHub?", "Hi, tell me about you"] +LORA_TEST_EXPECTED = [ + "GitHub is an open-source platform that provides a way to manage and develop software projects. 
It allows developers to store and manage code, collaborate on projects, and automate tasks.", # noqa: E501 + "I am Alice, an AI assistant developed by GitHub/Charent.", # noqa: E501 +] + + +def format_chatml_messages(prompt: str): + return [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": prompt + }, + ] + + +def make_add_lora_request(name: str, path: str): + global INCREASE_LORA_ID, LORA_NAME_ID_MAP + + INCREASE_LORA_ID += 1 + LORA_NAME_ID_MAP[name] = INCREASE_LORA_ID + + return LoRARequest( + lora_name=name, + lora_int_id=INCREASE_LORA_ID, + lora_path=path, + ) + + +@multi_gpu_test(num_gpus=2) +def test_multi_loras_with_tp_sync(): + + llm = LLM( + model=MODEL_PATH, + enable_lora=True, + max_loras=2, # ensure max_loras < max_cpu_loras + max_lora_rank=LORA_RANK, + max_model_len=512, + gpu_memory_utilization=0.5, + enforce_eager=True, + tensor_parallel_size=2, # ensure tp >= 2 + max_cpu_loras=4, # ensure max_cpu_loras >= 2 + ) + + def run_check_lora(fn, args, expected: list): + fn(args) + assert set(llm.llm_engine.list_loras()) == set(expected) + + # simulate add loras with CLI args + # likes: `--lora-modules Alice=/path/to/Alice Bob=/path/to/Bob` + run_check_lora( + llm.llm_engine.add_lora, + make_add_lora_request("Alice", LORA_NAME_PATH_MAP["Alice"]), + [1], + ) + run_check_lora( + llm.llm_engine.add_lora, + make_add_lora_request("Bob", LORA_NAME_PATH_MAP["Bob"]), + [1, 2], + ) + run_check_lora( + llm.llm_engine.add_lora, + make_add_lora_request("Cat", LORA_NAME_PATH_MAP["Cat"]), + [1, 2, 3], + ) + + # set temperature = 0 for greedy search + sampling_params = SamplingParams(temperature=0, max_tokens=64) + + def call_llm_get_outputs(prompt: str, lora_name: str): + lora_request = LoRARequest( + lora_name=lora_name, + lora_int_id=LORA_NAME_ID_MAP[lora_name], + lora_path=LORA_NAME_PATH_MAP[lora_name], + ) + messages = format_chatml_messages(prompt) + outputs = llm.chat( + [messages], + sampling_params, + 
chat_template_kwargs={ + "enable_thinking": False + }, # for those loras, ensure enable_thinking=False + lora_request=lora_request, + use_tqdm=False, + ) + output_text = outputs[0].outputs[0].text + return output_text + + def reload_lora(name: str): + """ + reload a lora to simulate the case: + setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true` + for dynamic lora loading and unloading + """ + remove_lora_response = llm.llm_engine.remove_lora( + lora_id=LORA_NAME_ID_MAP[name]) + + add_lora_response = llm.llm_engine.add_lora( + make_add_lora_request(name, LORA_NAME_PATH_MAP[name])) + + print(f"{remove_lora_response=}, {add_lora_response=}") + + def check_outputs(outputs: str, expected: str): + print(f"{prompt=}.\n{expected_output=}\n{output_text=}") + print("\n----------------------------\n") + assert outputs == expected + + for prompt, expected_output in zip(LORA_TEST_PROMPTS, LORA_TEST_EXPECTED): + + output_text = call_llm_get_outputs(prompt, "Alice") + check_outputs(output_text, expected_output) + + # call Bob, ignore what it is output + call_llm_get_outputs(prompt, "Bob") + print("After call Bob:") + + # call Alice + output_text = call_llm_get_outputs(prompt, "Alice") + check_outputs(output_text, expected_output) + + # reload Bob Lora + reload_lora("Bob") + print("After reload Bob:") + + # call Alice + output_text = call_llm_get_outputs(prompt, "Alice") + check_outputs(output_text, expected_output) + + # reload Alice Lora + reload_lora("Alice") + print("After reload Alice:") + + output_text = call_llm_get_outputs(prompt, "Alice") + check_outputs(output_text, expected_output) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index c3512ec3dbd43..de5933d6d41e5 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -682,12 +682,14 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): def slice_lora_b( self, lora_b: list[Union[torch.Tensor, None]] ) -> list[Union[torch.Tensor, None]]: + sliced_lora_b = [None] * self.n_slices for i, 
(shard_id, shard_size) in enumerate( zip(self.output_ids, self.output_slices)): if (lora_b_i := lora_b[i]) is not None: - lora_b[i] = lora_b_i[:, shard_size * shard_id:shard_size * - (shard_id + 1)] - return lora_b + sliced_lora_b[i] = lora_b_i[:, + shard_size * shard_id:shard_size * + (shard_id + 1)] + return sliced_lora_b def slice_bias( self, bias: list[Union[torch.Tensor, From 82de9b9d468dab451380d3e7dda88b0c40a31204 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 1 Aug 2025 13:44:10 +0800 Subject: [PATCH 105/224] [Misc] Automatically resolve HF processor init kwargs (#22005) Signed-off-by: DarkLight1337 --- examples/offline_inference/vision_language.py | 38 +++--- tests/lora/test_qwen2vl.py | 6 - .../multimodal/generation/test_common.py | 27 ++++- .../generation/vlm_utils/model_utils.py | 12 ++ .../processing/test_transformers.py | 2 +- tests/models/registry.py | 3 +- tests/multimodal/test_processing.py | 113 +++++++++++------- vllm/config.py | 12 +- vllm/inputs/registry.py | 17 ++- vllm/model_executor/models/aya_vision.py | 12 +- vllm/model_executor/models/deepseek_vl2.py | 36 +++--- vllm/model_executor/models/florence2.py | 6 - vllm/model_executor/models/fuyu.py | 4 +- vllm/model_executor/models/glm4_1v.py | 8 +- vllm/model_executor/models/h2ovl.py | 16 +-- .../models/hyperclovax_vision.py | 20 +--- vllm/model_executor/models/idefics3.py | 10 +- vllm/model_executor/models/internvl.py | 28 +---- vllm/model_executor/models/keye.py | 84 +------------ vllm/model_executor/models/llava.py | 46 ++----- vllm/model_executor/models/minicpmv.py | 6 +- vllm/model_executor/models/mllama4.py | 2 +- vllm/model_executor/models/nemotron_vl.py | 24 +--- vllm/model_executor/models/nvlm_d.py | 16 +-- vllm/model_executor/models/ovis.py | 8 +- vllm/model_executor/models/phi3v.py | 11 -- vllm/model_executor/models/phi4_multimodal.py | 22 ++-- vllm/model_executor/models/phi4mm.py | 21 +--- .../models/qwen2_5_omni_thinker.py | 47 +------- 
vllm/model_executor/models/qwen2_5_vl.py | 19 +-- vllm/model_executor/models/qwen2_audio.py | 18 +-- vllm/model_executor/models/qwen2_vl.py | 82 +------------ vllm/model_executor/models/skyworkr1v.py | 86 ++++--------- vllm/model_executor/models/smolvlm.py | 10 +- vllm/model_executor/models/tarsier.py | 12 +- vllm/model_executor/models/transformers.py | 5 - vllm/model_executor/models/ultravox.py | 20 +--- vllm/model_executor/models/whisper.py | 15 ++- vllm/transformers_utils/processor.py | 94 +++++++++------ vllm/utils/__init__.py | 43 ------- 40 files changed, 334 insertions(+), 727 deletions(-) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 6f23a29e72f71..0edcd0407747c 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -449,25 +449,6 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: ) -# omni-research/Tarsier-7b -def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - model_name = "omni-research/Tarsier-7b" - - engine_args = EngineArgs( - model=model_name, - trust_remote_code=True, - max_model_len=4096, - limit_mm_per_prompt={modality: 1}, - ) - prompts = [(f"USER: \n{question} ASSISTANT:") for question in questions] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - ) - - # Intern-S1 def run_interns1(questions: list[str], modality: str) -> ModelRequestData: model_name = "internlm/Intern-S1" @@ -1293,6 +1274,25 @@ def run_qwen2_5_omni(questions: list[str], modality: str): ) +# omni-research/Tarsier-7b +def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + model_name = "omni-research/Tarsier-7b" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + limit_mm_per_prompt={modality: 1}, + ) + prompts = [(f"USER: \n{question} ASSISTANT:") for 
question in questions] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData: model_name = "omni-research/Tarsier2-Recap-7b" diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index 604bb307b889d..76f3bc0ebf89f 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -4,8 +4,6 @@ from dataclasses import dataclass from typing import Optional import pytest -from packaging.version import Version -from transformers import __version__ as TRANSFORMERS_VERSION import vllm from vllm.assets.image import ImageAsset @@ -185,10 +183,6 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files): current_platform.is_rocm(), reason="Qwen2.5-VL dependency xformers incompatible with ROCm", ) -@pytest.mark.skipif( - Version(TRANSFORMERS_VERSION) < Version("4.49.0"), - reason="Qwen2.5-VL require transformers version no lower than 4.49.0", -) def test_qwen25vl_lora(qwen25vl_lora_files): """Test Qwen 2.5 VL model with LoRA""" config = TestConfig(model_path=QWEN25VL_MODEL_PATH, diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 5bff615fb1071..967228b54a0af 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -702,13 +702,38 @@ VLM_TEST_SETTINGS = { "smolvlm": VLMTestInfo( models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - prompt_formatter=lambda img_prompt:f"<|im_start|>User:{img_prompt}\nAssistant:", # noqa: E501 + prompt_formatter=lambda img_prompt: f"<|im_start|>User:{img_prompt}\nAssistant:", # noqa: E501 img_idx_to_prompt=lambda idx: "", max_model_len=8192, max_num_seqs=2, auto_cls=AutoModelForImageTextToText, hf_output_post_proc=model_utils.smolvlm_trunc_hf_output, ), + "tarsier": VLMTestInfo( + models=["omni-research/Tarsier-7b"], + 
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"USER: {img_prompt} ASSISTANT:", + max_model_len=4096, + max_num_seqs=2, + auto_cls=AutoModelForImageTextToText, + patch_hf_runner=model_utils.tarsier_patch_hf_runner, + ), + "tarsier2": VLMTestInfo( + models=["omni-research/Tarsier2-Recap-7b"], + test_type=( + VLMTestType.IMAGE, + VLMTestType.MULTI_IMAGE, + VLMTestType.VIDEO, + ), + prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 + video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 + max_model_len=4096, + max_num_seqs=2, + auto_cls=AutoModelForImageTextToText, + image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + marks=[pytest.mark.skip("Model initialization hangs")], + ), ### Tensor parallel / multi-gpu broadcast tests "chameleon-broadcast": VLMTestInfo( models=["facebook/chameleon-7b"], diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index c1a2aa0dcafbb..5e8dac6bce96a 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -818,3 +818,15 @@ def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner: thinker.get_output_embeddings = lambda: thinker.lm_head hf_model.model = thinker return hf_model + + +def tarsier_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + from vllm.model_executor.models.tarsier import get_vision_encoder_info + + vision_encoder_info = get_vision_encoder_info(hf_model.config) + + hf_processor = hf_model.processor + if hf_processor.patch_size is None: + hf_processor.patch_size = vision_encoder_info.get_patch_size() + + return hf_model diff 
--git a/tests/models/multimodal/processing/test_transformers.py b/tests/models/multimodal/processing/test_transformers.py index c7d1b5271ff72..54a0be99384a8 100644 --- a/tests/models/multimodal/processing/test_transformers.py +++ b/tests/models/multimodal/processing/test_transformers.py @@ -16,7 +16,7 @@ def test_multimodal_processor(model_id): model_impl="transformers", ) - mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config, ) + mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config) image_pil = ImageAsset('cherry_blossom').pil_image mm_data = {"image": image_pil} diff --git a/tests/models/registry.py b/tests/models/registry.py index b9e7de4e9fd11..806342a57dfab 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -465,8 +465,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { is_available_online=False), "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501 trust_remote_code=True), - "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b", # noqa: E501 - hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}), # noqa: E501 + "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"), # noqa: E501 "Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b", # noqa: E501 hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}), # noqa: E501 "VoxtralForConditionalGeneration": _HfExamplesInfo( diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 8a3f09bdbe27e..659ee9af9ddec 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -2,16 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import nullcontext -from types import MethodType -from typing import cast +from typing import Optional, cast from unittest.mock import MagicMock import numpy as np import pytest import torch -from transformers import 
ProcessorMixin from vllm.config import ModelConfig +from vllm.inputs import InputProcessingContext from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs, MultiModalKwargsItem, @@ -1013,57 +1012,91 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): ) -class _ProcessorProxy: +class DummyProcessor: - def __init__(self, processor: ProcessorMixin) -> None: + def __init__(self, a: int = 0, b: int = 0) -> None: super().__init__() - self.__processor = processor - - def __getattr__(self, key: str): - return getattr(self.__processor, key) + self.a = a + self.b = b def __call__( self, - text=None, - images=None, - videos=None, - exists=None, - return_tensors=None, - ): - return dict(exists=exists) + a: int = 0, + c: int = 0, + return_tensors: Optional[str] = None, + ) -> dict[str, int]: + return dict(a=a, c=c) -@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy # yapf: disable +@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy @pytest.mark.parametrize( - ("call_kwargs", "expected_kwargs"), + ("config_kwargs", "inference_kwargs", "expected_kwargs"), [ - # Should ignore invalid kwargs - ({"does_not_exist": 100}, {"exists": None}), - ({"exists": 1}, {"exists": 1}), - ({"does_not_exist": 100, "exists": 1}, {"exists": 1}), + ({"a": 1}, {}, {"a": 1, "b": 0}), + ({}, {"a": 1}, {"a": 1, "b": 0}), + # inference_kwargs should take precedence + ({"a": 1}, {"a": 2}, {"a": 2, "b": 0}), + # Should ignore extra kwargs + ({"a": 1, "c": 1}, {}, {"a": 1, "b": 0}), + ({"b": 1, "c": 1}, {}, {"a": 0, "b": 1}), ], ) # yapf: enable -def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs): - model_config = ModelConfig(model_id) +def test_hf_processor_init_kwargs( + model_id, + config_kwargs, + inference_kwargs, + expected_kwargs, +): + # Should not be used since there is nothing to convert to tokens + mock_tokenizer = cast(AnyTokenizer, 
object()) - processor = MULTIMODAL_REGISTRY.create_processor(model_config) - orig_get_hf_processor = processor.info.get_hf_processor - - def get_hf_processor(self, **kwargs): - assert kwargs == call_kwargs - return _ProcessorProxy(orig_get_hf_processor()) - - processor.info.get_hf_processor = MethodType(get_hf_processor, - processor.info) - - out_kwargs = processor._call_hf_processor( - prompt="", - mm_data={}, - mm_kwargs=call_kwargs, - tok_kwargs={}, + ctx = InputProcessingContext( + model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs), + tokenizer=mock_tokenizer, ) - assert out_kwargs == expected_kwargs + processor = ctx.get_hf_processor( + DummyProcessor, # type: ignore[arg-type] + **inference_kwargs, + ) + + for k, v in expected_kwargs.items(): + assert getattr(processor, k) == v + + +# yapf: disable +@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy +@pytest.mark.parametrize( + ("config_kwargs", "inference_kwargs", "expected_kwargs"), + [ + ({"a": 1}, {}, {"a": 1, "c": 0}), + ({}, {"a": 1}, {"a": 1, "c": 0}), + # inference_kwargs should take precedence + ({"a": 1}, {"a": 2}, {"a": 2, "c": 0}), + # Should ignore extra kwargs + ({"a": 1, "c": 1}, {}, {"a": 1, "c": 1}), + ({"b": 1, "c": 1}, {}, {"a": 0, "c": 1}), + ], +) +# yapf: enable +def test_hf_processor_call_kwargs( + model_id, + config_kwargs, + inference_kwargs, + expected_kwargs, +): + # Should not be used since there is nothing to convert to tokens + mock_tokenizer = cast(AnyTokenizer, object()) + + ctx = InputProcessingContext( + model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs), + tokenizer=mock_tokenizer, + ) + + processor = ctx.get_hf_processor(DummyProcessor) # type: ignore[arg-type] + + result = ctx.call_hf_processor(processor, {}, inference_kwargs) + assert result == expected_kwargs diff --git a/vllm/config.py b/vllm/config.py index edad5dd0406bf..9d5739ca11efd 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -11,6 +11,7 @@ import 
textwrap import uuid import warnings from collections import Counter +from collections.abc import Mapping from contextlib import contextmanager from dataclasses import (MISSING, Field, asdict, field, fields, is_dataclass, replace) @@ -3332,7 +3333,16 @@ class MultiModalConfig: 999 if envs.VLLM_USE_V1 else 1, ) - # TODO: Add configs to init vision tower or not. + def merge_mm_processor_kwargs( + self, + inference_kwargs: Mapping[str, object], + ) -> dict[str, object]: + """ + Get the keyword arguments to pass to the multi-modal processor + according to the extra arguments passed during inference. + """ + kwargs = self.mm_processor_kwargs or {} + return kwargs | dict(inference_kwargs) @config diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 652136fbbfe73..6331a70b469aa 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -11,7 +11,7 @@ from typing_extensions import TypeVar from vllm.jsontree import JSONTree, json_map_leaves from vllm.logger import init_logger from vllm.transformers_utils.processor import cached_processor_from_config -from vllm.utils import resolve_mm_processor_kwargs +from vllm.utils import get_allowed_kwarg_only_overrides if TYPE_CHECKING: from vllm.config import ModelConfig @@ -154,14 +154,11 @@ class InputProcessingContext(InputContext): assert callable(hf_processor) mm_config = self.model_config.get_multimodal_config() - base_kwargs = mm_config.mm_processor_kwargs - if base_kwargs is None: - base_kwargs = {} + merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs) - merged_kwargs = resolve_mm_processor_kwargs( - base_kwargs, - kwargs, + allowed_kwargs = get_allowed_kwarg_only_overrides( hf_processor, + merged_kwargs, requires_kw_only=False, allow_var_kwargs=True, ) @@ -173,7 +170,9 @@ class InputProcessingContext(InputContext): return x try: - output = hf_processor(**data, **merged_kwargs, return_tensors="pt") + output = hf_processor(**data, + **allowed_kwargs, + return_tensors="pt") # this emulates 
output.to(dtype=self.model_config.dtype) if isinstance(output, BatchFeature): cast_output = json_map_leaves(maybe_cast_dtype, output.data) @@ -189,7 +188,7 @@ class InputProcessingContext(InputContext): except Exception as exc: msg = (f"Failed to apply {type(hf_processor).__name__} " - f"on data={data} with kwargs={merged_kwargs}") + f"on data={data} with kwargs={allowed_kwargs}") raise ValueError(msg) from exc diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index a3eee9f065aea..b476a4f918bc3 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -123,16 +123,10 @@ class AyaVisionProcessingInfo(BaseProcessingInfo): return self.ctx.get_hf_config(AyaVisionConfig) def get_hf_processor(self, **kwargs: object) -> AyaVisionProcessor: - processor = self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs) + return self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs) - # Temporary workaround since this processor has multiple image tokens - # See https://github.com/huggingface/transformers/issues/38350 - processor._check_special_mm_tokens = lambda *args, **kwargs: None - - return processor - - def get_image_processor(self) -> GotOcr2ImageProcessor: - return self.get_hf_processor().image_processor + def get_image_processor(self, **kwargs: object) -> GotOcr2ImageProcessor: + return self.get_hf_processor(**kwargs).image_processor def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 544de5fe02d35..531018625478b 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -214,25 +214,25 @@ class DeepseekVL2MultiModalProcessor( mm_kwargs: Mapping[str, object], tok_kwargs: Mapping[str, object], ) -> BatchFeature: - if mm_data: - processed_outputs = self.info.ctx.call_hf_processor( - 
self.info.get_hf_processor(**mm_kwargs), - dict(prompt=prompt, **mm_data), - dict(**mm_kwargs, **tok_kwargs), - ) - pixel_values = processed_outputs["pixel_values"] - # split pixel values into patches corresponding to each image - images_spatial_crop = processed_outputs["images_spatial_crop"] - patches_per_image = [ - x.prod().item() + 1 for x in images_spatial_crop - ] - pixel_values = pixel_values.split(patches_per_image) - processed_outputs["pixel_values"] = pixel_values - else: + if not mm_data: tokenizer = self.info.get_tokenizer() - processed_outputs = tokenizer(prompt, - add_special_tokens=True, - return_tensors="pt") + return tokenizer(prompt, + add_special_tokens=True, + return_tensors="pt") + + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + tok_kwargs=tok_kwargs, + ) + + pixel_values = processed_outputs["pixel_values"] + # split pixel values into patches corresponding to each image + images_spatial_crop = processed_outputs["images_spatial_crop"] + patches_per_image = [x.prod().item() + 1 for x in images_spatial_crop] + pixel_values = pixel_values.split(patches_per_image) + processed_outputs["pixel_values"] = pixel_values return processed_outputs diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index 399c739f408ee..56e456c2f1f2a 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -761,12 +761,6 @@ class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only): class Florence2ProcessingInfo(BaseProcessingInfo): - def get_hf_config(self): - return self.ctx.get_hf_config() - - def get_hf_processor(self): - return self.ctx.get_hf_processor() - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 7e1d478562a4c..b61e0361fe8c3 100644 --- a/vllm/model_executor/models/fuyu.py 
+++ b/vllm/model_executor/models/fuyu.py @@ -83,8 +83,8 @@ class FuyuProcessingInfo(BaseProcessingInfo): def get_hf_processor(self, **kwargs: object): return self.ctx.get_hf_processor(FuyuProcessor, **kwargs) - def get_image_processor(self) -> FuyuImageProcessor: - return self.get_hf_processor().image_processor + def get_image_processor(self, **kwargs: object) -> FuyuImageProcessor: + return self.get_hf_processor(**kwargs).image_processor def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index ae1bf22c704e5..5f306f05d140e 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -809,11 +809,11 @@ class Glm4vProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": 1} - def get_image_processor(self) -> Glm4vImageProcessor: - return self.get_hf_processor().image_processor + def get_image_processor(self, **kwargs: object) -> Glm4vImageProcessor: + return self.get_hf_processor(**kwargs).image_processor - def get_video_processor(self) -> Glm4vVideoProcessor: - return self.get_hf_processor().video_processor + def get_video_processor(self, **kwargs: object) -> Glm4vVideoProcessor: + return self.get_hf_processor(**kwargs).video_processor def _get_vision_info( self, diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 467b074f37753..c3e4f81597adb 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -392,21 +392,7 @@ class H2OVLProcessor(BaseInternVLProcessor): class H2OVLProcessingInfo(BaseInternVLProcessingInfo): - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> H2OVLProcessor: - if min_dynamic_patch is not 
None: - kwargs["min_dynamic_patch"] = min_dynamic_patch - if max_dynamic_patch is not None: - kwargs["max_dynamic_patch"] = max_dynamic_patch - if dynamic_image_size is not None: - kwargs["dynamic_image_size"] = dynamic_image_size - + def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor: return self.ctx.init_processor( H2OVLProcessor, config=self.get_hf_config(), diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index 3e8e50b35c0b7..e5c94c7f3a706 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -25,8 +25,7 @@ import torch import torch.nn as nn from timm.layers import LayerNorm, LayerNorm2d from timm.models.regnet import RegStage -from transformers import (AutoProcessor, BatchFeature, CLIPVisionConfig, - SiglipVisionConfig) +from transformers import BatchFeature, CLIPVisionConfig, SiglipVisionConfig from transformers.modeling_utils import no_init_weights from vllm.config import VllmConfig @@ -80,26 +79,9 @@ HCXVisionMultimodalInputs = Union[HCXVisionMultimodalPixelInputs] class HCXVisionProcessingInfo(BaseProcessingInfo): - def get_hf_config(self): - return self.ctx.get_hf_config() - def get_vision_encoder_info(self): return get_vision_encoder_info(self.get_hf_config()) - def get_hf_processor( - self, - **kwargs: object, - ): - processor_cls = type( - AutoProcessor.from_pretrained( - self.ctx.model_config.model, - trust_remote_code=self.ctx.model_config.trust_remote_code, - )) - return self.ctx.get_hf_processor( - processor_cls, - **kwargs, - ) - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 6e991d99b9638..3c01789b90066 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -88,15 +88,7 @@ ImageInputs = 
Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs] class Idefics3ProcessingInfo(BaseProcessingInfo): - def get_hf_processor( - self, - *, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ) -> Idefics3Processor: - if size is not None: - kwargs["size"] = size - + def get_hf_processor(self, **kwargs: object) -> Idefics3Processor: return self.ctx.get_hf_processor(Idefics3Processor, **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index a0e98ca3f8155..8e766dd4c4768 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -665,14 +665,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo): """Basic image-only ProcessingInfo for InternVL-style models.""" @abstractmethod - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> BaseInternVLProcessor: + def get_hf_processor(self, **kwargs: object) -> BaseInternVLProcessor: raise NotImplementedError def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: @@ -882,27 +875,12 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo): return max(max_frames_per_video, 1) - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> InternVLProcessor: - if min_dynamic_patch is not None: - kwargs["min_dynamic_patch"] = min_dynamic_patch - if max_dynamic_patch is not None: - kwargs["max_dynamic_patch"] = max_dynamic_patch - if dynamic_image_size is not None: - kwargs["dynamic_image_size"] = dynamic_image_size - - kwargs["video_token"] = self.get_video_token() - + def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: return self.ctx.init_processor( 
InternVLProcessor, config=self.get_hf_config(), tokenizer=self.get_tokenizer(), + video_token=self.get_video_token(), **kwargs, ) diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 892d970aaade0..4d8aa8de0f0b1 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -44,8 +44,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope -from vllm.transformers_utils.processor import ( - cached_image_processor_from_config) from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, @@ -980,72 +978,8 @@ class KeyeMultiModalDataParser(MultiModalDataParser): class KeyeProcessingInfo(BaseProcessingInfo): - def get_hf_processor( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ): - return self.ctx.get_hf_processor( - image_processor=self.get_image_processor( - min_pixels=min_pixels, - max_pixels=max_pixels, - size=size, - ), - **kwargs, - ) - - def _get_image_processor_kwargs( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ): - if self.ctx.model_config.mm_processor_kwargs: - kwargs.update(self.ctx.model_config.mm_processor_kwargs) - - if min_pixels is not None: - kwargs["min_pixels"] = min_pixels - - if size is None: - size = {"shortest_edge": min_pixels} - else: - size["shortest_edge"] = min_pixels - - if max_pixels is not None: - kwargs["max_pixels"] = max_pixels - - if size is None: - size = {"longest_edge": max_pixels} - else: - size["longest_edge"] = max_pixels - - if size is not None: - kwargs["size"] = size - - return kwargs - - def get_image_processor( - self, - *, - min_pixels: Optional[int] 
= None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ): - return cached_image_processor_from_config( - self.ctx.model_config, - **self._get_image_processor_kwargs( - min_pixels=min_pixels, - max_pixels=max_pixels, - size=size, - **kwargs, - ), - ) + def get_image_processor(self, **kwargs: object): + return self.get_hf_processor(**kwargs).image_processor def get_supported_mm_limits(self, ) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} @@ -1246,20 +1180,6 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]): def _get_data_parser(self) -> MultiModalDataParser: return KeyeMultiModalDataParser() - def _call_hf_processor( - self, - prompt: str, - mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object], - tok_kwargs: Mapping[str, object], - ) -> BatchFeature: - mm_kwargs = self.info._get_image_processor_kwargs(**mm_kwargs) - return self.info.ctx.call_hf_processor( - self.info.get_hf_processor(**mm_kwargs), - dict(text=prompt, **mm_data), - dict(**mm_kwargs, **tok_kwargs), - ) - def _get_prompt_updates( self, mm_items: MultiModalDataItems, diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 0126ace09e707..c863ba406422d 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -8,11 +8,9 @@ from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, import torch import torch.nn as nn -from packaging.version import Version from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig, PixtralVisionConfig, PretrainedConfig, SiglipVisionConfig) -from transformers import __version__ as TRANSFORMERS_VERSION from transformers.models.llava import LlavaProcessor from transformers.models.pixtral import PixtralProcessor @@ -307,29 +305,14 @@ class PixtralHFMultiModalProcessor( pixel_values = processed_outputs.get("pixel_values") if pixel_values is not None: - # Before/after 
https://github.com/huggingface/transformers/pull/35122 - if Version(TRANSFORMERS_VERSION) <= Version("4.48.3"): - images = mm_data["images"] - assert isinstance(images, list) + # Avoid padding since we need the output for each image to be + # independent of other images for the cache to work correctly + image_sizes = processed_outputs["image_sizes"] + assert len(pixel_values) == len(image_sizes) - # Original output: (1, num_images, C, H, W) - # New output: (num_images, C, H, W) - assert (isinstance(pixel_values, list) - and len(pixel_values) == 1) - assert (isinstance(pixel_values[0], list) - and len(pixel_values[0]) == len(images)) - - processed_outputs["pixel_values"] = pixel_values[0] - else: - # Avoid padding since we need the output for each image to be - # independent of other images for the cache to work correctly - image_sizes = processed_outputs["image_sizes"] - assert len(pixel_values) == len(image_sizes) - - processed_outputs["pixel_values"] = [ - p[:, :h, :w] - for p, (h, w) in zip(pixel_values, image_sizes) - ] + processed_outputs["pixel_values"] = [ + p[:, :h, :w] for p, (h, w) in zip(pixel_values, image_sizes) + ] return processed_outputs @@ -784,17 +767,10 @@ class MantisProcessingInfo(LlavaProcessingInfo): vision_info = self.get_vision_encoder_info() kwargs.setdefault("patch_size", vision_info.get_patch_size()) - - if Version(TRANSFORMERS_VERSION) < Version("4.48"): - # BUG: num_additional_image_tokens = 0 but treated as 1, - # so we set vision_feature_select_strategy to None to offset this - kwargs.setdefault("vision_feature_select_strategy", None) - else: - # FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150 - kwargs.setdefault( - "vision_feature_select_strategy", - hf_config.vision_feature_select_strategy, - ) + kwargs.setdefault( + "vision_feature_select_strategy", + hf_config.vision_feature_select_strategy, + ) return 
self.ctx.get_hf_processor(LlavaProcessor, **kwargs) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 70f2d4a6420b9..e172758b2f2c5 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -331,10 +331,8 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo): return hf_processor - def get_image_processor(self): - hf_processor = self.get_hf_processor() - image_processor = hf_processor.image_processor # type: ignore - return image_processor + def get_image_processor(self, **kwargs: object): + return self.get_hf_processor(**kwargs).image_processor def get_model_version(self): return get_version_by_config(self.get_hf_config()) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index dea85d320adfd..924f10d82b381 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -533,7 +533,7 @@ class Mllama4ProcessingInfo(BaseProcessingInfo): def get_hf_processor(self, **kwargs: object) -> Llama4Processor: return self.ctx.get_hf_processor(Llama4Processor, - use_fast=True, + use_fast=kwargs.pop("use_fast", True), **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index 5d0513d707413..b90cb9b39a60b 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -137,34 +137,16 @@ class NemotronVLProcessor(InternVLProcessor): class NemotronVLProcessingInfo(BaseInternVLProcessingInfo): """Processing info for Nemotron VL models.""" - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> NemotronVLProcessor: - if min_dynamic_patch is not None: - kwargs["min_dynamic_patch"] = min_dynamic_patch - if max_dynamic_patch is 
not None: - kwargs["max_dynamic_patch"] = max_dynamic_patch - if dynamic_image_size is not None: - kwargs["dynamic_image_size"] = dynamic_image_size - - image_processor = self.get_image_processor() + def get_hf_processor(self, **kwargs: object) -> NemotronVLProcessor: return self.ctx.init_processor( NemotronVLProcessor, config=self.get_hf_config(), tokenizer=self.get_tokenizer(), - image_processor=image_processor, + image_processor=self.get_image_processor(), **kwargs, ) - def get_image_processor( - self, - **kwargs: object, - ): + def get_image_processor(self, **kwargs: object): return cached_image_processor_from_config( self.ctx.model_config, **kwargs, diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index 2f7f8e437f0ad..4bea1392a6814 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -63,21 +63,7 @@ class NVLMProcessor(BaseInternVLProcessor): class NVLMProcessingInfo(BaseInternVLProcessingInfo): - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> NVLMProcessor: - if min_dynamic_patch is not None: - kwargs["min_dynamic_patch"] = min_dynamic_patch - if max_dynamic_patch is not None: - kwargs["max_dynamic_patch"] = max_dynamic_patch - if dynamic_image_size is not None: - kwargs["dynamic_image_size"] = dynamic_image_size - + def get_hf_processor(self, **kwargs: object) -> NVLMProcessor: return self.ctx.init_processor( NVLMProcessor, config=self.get_hf_config(), diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index c8b528048b557..6b27980e0b0c3 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -25,7 +25,7 @@ import torch import torch.nn as nn from torch import Tensor from torch.nn.functional import gumbel_softmax, pad, softmax -from transformers import 
BaseImageProcessor, BatchFeature, PretrainedConfig +from transformers import BatchFeature, PretrainedConfig from vllm.config import VllmConfig from vllm.model_executor.layers.linear import ReplicatedLinear @@ -245,11 +245,12 @@ class VisualEmbedding(torch.nn.Embedding): class OvisProcessingInfo(BaseProcessingInfo): - def get_hf_processor(self, **kwargs): + def get_hf_processor(self, **kwargs: object): return self.ctx.get_hf_processor( OvisProcessor, image_pad_token=self.get_image_pad_token(), image_segment_len=self.get_image_segment_len(), + **kwargs, ) def get_image_segment_len(self) -> int: @@ -269,9 +270,6 @@ class OvisProcessingInfo(BaseProcessingInfo): text_model_type = hf_text_config.model_type return IMAGE_PAD_TOKEN_MAP.get(text_model_type) - def get_image_processor(self) -> BaseImageProcessor: - return self.get_hf_processor().image_processor # type: ignore - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index aa739f22fd7bf..9ef4f8371eb3d 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -318,17 +318,6 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase): class Phi3VProcessingInfo(BaseProcessingInfo): - def get_hf_processor( - self, - *, - num_crops: Optional[int] = None, - **kwargs: object, - ) -> ProcessorMixin: - if num_crops is not None: - kwargs["num_crops"] = num_crops - - return self.ctx.get_hf_processor(**kwargs) - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py index 432b707a61591..e13b8276bf17a 100644 --- a/vllm/model_executor/models/phi4_multimodal.py +++ b/vllm/model_executor/models/phi4_multimodal.py @@ -696,19 +696,12 @@ class Phi4MMProcessingInfo(BaseProcessingInfo): def get_hf_config(self) -> Phi4MultimodalConfig: return 
self.ctx.get_hf_config(Phi4MultimodalConfig) - def get_hf_processor( - self, - *, - dynamic_hd: Optional[int] = None, - **kwargs: object, - ) -> Phi4MMProcessor: - if dynamic_hd is not None: - kwargs["dynamic_hd"] = dynamic_hd + def get_hf_processor(self, **kwargs: object) -> Phi4MMProcessor: + return self.ctx.get_hf_processor(Phi4MMProcessor, **kwargs) - return self.ctx.get_hf_processor(**kwargs) - - def get_feature_extractor(self) -> Phi4MultimodalFeatureExtractor: - return self.get_hf_processor().audio_processor + def get_feature_extractor( + self, **kwargs: object) -> Phi4MultimodalFeatureExtractor: + return self.get_hf_processor(**kwargs).audio_processor def get_image_processor( self, @@ -1007,7 +1000,7 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): if audio_data: audio_features = processed_outputs['audio_input_features'] - sr = self.info.get_feature_extractor().sampling_rate + sr = self.info.get_feature_extractor(**mm_kwargs).sampling_rate feature_sizes = [ self.info.get_audio_num_frames(len(audio), sr) for audio in audio_data @@ -1043,7 +1036,8 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): audio_token_id = tokenizer.vocab[tokenizer.audio_token] hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - audio_processor = self.info.get_feature_extractor() + audio_processor = self.info.get_feature_extractor( + **hf_processor_mm_kwargs) def get_image_replacement_phi4mm(item_idx: int): images = mm_items.get_items( diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 9b61c3634d841..73e8446e6dea7 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -459,17 +459,6 @@ def cat_with_pad(tensors, dim, padding_value=0): class Phi4MMProcessingInfo(BaseProcessingInfo): - def get_hf_processor( - self, - *, - dynamic_hd: Optional[int] = None, - **kwargs: object, - ) -> ProcessorMixin: - if dynamic_hd is not 
None: - kwargs["dynamic_hd"] = dynamic_hd - - return self.ctx.get_hf_processor(**kwargs) - @property def image_tokens(self) -> list[str]: return [f"<|image_{i+1}|>" for i in range(100)] @@ -487,8 +476,9 @@ class Phi4MMProcessingInfo(BaseProcessingInfo): image_processor = processor.image_processor return image_processor.dynamic_hd - def get_feature_extractor(self) -> SequenceFeatureExtractor: - return self.get_hf_processor().audio_processor + def get_feature_extractor(self, + **kwargs: object) -> SequenceFeatureExtractor: + return self.get_hf_processor(**kwargs).audio_processor def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": None, "image": None} @@ -769,7 +759,7 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") - sr = self.info.get_feature_extractor().sampling_rate + sr = self.info.get_feature_extractor(**mm_kwargs).sampling_rate if (audio_data := mm_data.get("audios", [])): mm_data['audios'] = [(data, sr) for data in audio_data] @@ -816,7 +806,8 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): ) -> Sequence[PromptUpdate]: image_tokens: list[str] = self.info.image_tokens # type: ignore audio_tokens: list[str] = self.info.audio_tokens # type: ignore - feature_extractor = self.info.get_feature_extractor() + feature_extractor = self.info.get_feature_extractor( + **hf_processor_mm_kwargs) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) def get_image_replacement_phi4mm(item_idx: int): diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index c5a5c10d9509f..b9fed79c84cdd 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -132,50 +132,15 @@ class 
Qwen2_5OmniThinkerProcessingInfo(Qwen2AudioProcessingInfo, def get_hf_config(self): return self.ctx.get_hf_config(Qwen2_5OmniConfig).thinker_config - def get_hf_processor( - self, - *, - sampling_rate: Optional[int] = None, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - fps: Optional[Union[float, list[float]]] = None, - **kwargs: object, - ) -> Qwen2_5OmniProcessor: - if fps is not None: - kwargs["fps"] = fps - - # Monkey patch for Transformers v4.53 - processor_class = Qwen2_5OmniProcessor - if processor_class.image_processor_class != "AutoImageProcessor": - processor_class.image_processor_class = "AutoImageProcessor" - if processor_class.video_processor_class != "AutoVideoProcessor": - processor_class.video_processor_class = "AutoVideoProcessor" - - processor = self.ctx.get_hf_processor( - processor_class, - image_processor=self.get_image_processor(min_pixels=min_pixels, - max_pixels=max_pixels, - size=size, - use_fast=kwargs.get( - "use_fast", True)), + def get_hf_processor(self, **kwargs: object) -> Qwen2_5OmniProcessor: + return self.ctx.get_hf_processor( + Qwen2_5OmniProcessor, + use_fast=kwargs.pop("use_fast", True), **kwargs, ) - if not hasattr(processor, "audio_token"): - processor.audio_token = "<|AUDIO|>" - if not hasattr(processor, "image_token"): - processor.image_token = "<|IMAGE|>" - if not hasattr(processor, "video_token"): - processor.video_token = "<|VIDEO|>" - return processor - def get_feature_extractor( - self, - *, - sampling_rate: Optional[int] = None, - **kwargs: object, - ): - hf_processor = self.get_hf_processor(sampling_rate=sampling_rate) + def get_feature_extractor(self, **kwargs: object): + hf_processor = self.get_hf_processor(**kwargs) feature_extractor = hf_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor diff --git a/vllm/model_executor/models/qwen2_5_vl.py 
b/vllm/model_executor/models/qwen2_5_vl.py index 8ae096536fdc5..c4c4650f569e1 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -780,25 +780,10 @@ class Qwen2_5_VLProcessingInfo(Qwen2VLProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(Qwen2_5_VLConfig) - def get_hf_processor( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - fps: Optional[Union[float, list[float]]] = None, - **kwargs: object, - ) -> Qwen2_5_VLProcessor: - if fps is not None: - kwargs["fps"] = fps - + def get_hf_processor(self, **kwargs: object) -> Qwen2_5_VLProcessor: return self.ctx.get_hf_processor( Qwen2_5_VLProcessor, - image_processor=self.get_image_processor(min_pixels=min_pixels, - max_pixels=max_pixels, - size=size, - use_fast=kwargs.get( - "use_fast", True)), + use_fast=kwargs.pop("use_fast", True), **kwargs, ) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index d7fec30acd8d3..3ef55cd704cf0 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -86,22 +86,12 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(Qwen2AudioConfig) - def get_hf_processor( - self, - *, - # Ignored in initialization - sampling_rate: Optional[int] = None, - **kwargs: object, - ) -> Qwen2AudioProcessor: + def get_hf_processor(self, **kwargs: object) -> Qwen2AudioProcessor: return self.ctx.get_hf_processor(Qwen2AudioProcessor, **kwargs) - def get_feature_extractor( - self, - *, - # Ignored in initialization - sampling_rate: Optional[int] = None, - ) -> WhisperFeatureExtractor: - hf_processor = self.get_hf_processor(sampling_rate=sampling_rate) + def get_feature_extractor(self, + **kwargs: object) -> WhisperFeatureExtractor: + hf_processor = self.get_hf_processor(**kwargs) feature_extractor = 
hf_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index ad63bb4af4e9d..4e8ea8e449133 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -69,8 +69,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import _Backend, current_platform from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope -from vllm.transformers_utils.processor import ( - cached_image_processor_from_config) from vllm.transformers_utils.tokenizer import AnyTokenizer from .interfaces import (MultiModalEmbeddings, SupportsLoRA, @@ -752,73 +750,15 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(Qwen2VLConfig) - def get_hf_processor( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ) -> Qwen2VLProcessor: + def get_hf_processor(self, **kwargs: object) -> Qwen2VLProcessor: return self.ctx.get_hf_processor( Qwen2VLProcessor, - image_processor=self.get_image_processor(min_pixels=min_pixels, - max_pixels=max_pixels, - size=size, - use_fast=kwargs.get( - "use_fast", True)), + use_fast=kwargs.pop("use_fast", True), **kwargs, ) - def _get_image_processor_kwargs( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ): - mm_config = self.ctx.model_config.get_multimodal_config() - if mm_config.mm_processor_kwargs: - kwargs.update(mm_config.mm_processor_kwargs) - - if min_pixels is not None: - kwargs["min_pixels"] = min_pixels - - if size is None: - size = {"shortest_edge": min_pixels} - else: - size["shortest_edge"] = min_pixels - - if max_pixels is not None: - 
kwargs["max_pixels"] = max_pixels - - if size is None: - size = {"longest_edge": max_pixels} - else: - size["longest_edge"] = max_pixels - - if size is not None: - kwargs["size"] = size - - return kwargs - - def get_image_processor( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ) -> Qwen2VLImageProcessor: - kwargs["use_fast"] = kwargs.get("use_fast", True) - return cached_image_processor_from_config( - self.ctx.model_config, - **self._get_image_processor_kwargs(min_pixels=min_pixels, - max_pixels=max_pixels, - size=size, - **kwargs), - ) + def get_image_processor(self, **kwargs: object) -> Qwen2VLImageProcessor: + return self.get_hf_processor(**kwargs).image_processor def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} @@ -1023,20 +963,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] def _get_data_parser(self) -> MultiModalDataParser: return Qwen2VLMultiModalDataParser() - def _call_hf_processor( - self, - prompt: str, - mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object], - tok_kwargs: Mapping[str, object], - ) -> BatchFeature: - mm_kwargs = self.info._get_image_processor_kwargs(**mm_kwargs) - return self.info.ctx.call_hf_processor( - self.info.get_hf_processor(**mm_kwargs), - dict(text=prompt, **mm_data), - dict(**mm_kwargs, **tok_kwargs), - ) - def _get_prompt_updates( self, mm_items: MultiModalDataItems, diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index 5ae5c0bc1d5dc..c76aabcd27ccb 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -7,9 +7,8 @@ # Copyright (c) 2025 Skywork # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- -from abc import ABC, abstractmethod from collections.abc import 
Iterable, Mapping, Sequence -from typing import Literal, Optional, TypedDict, TypeVar, Union +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -232,7 +231,7 @@ def image_to_pixel_values_skyworkr1v( return pixel_values -class BaseSkyworkR1VProcessor(ABC): +class SkyworkR1VProcessor: """ This model doesn't define its own HF processor, so we implement our own one here. @@ -279,17 +278,18 @@ class BaseSkyworkR1VProcessor(ABC): self.use_thumbnail: bool = config.use_thumbnail @property - @abstractmethod def image_token_id(self) -> int: - raise NotImplementedError + return self.tokenizer.get_vocab()[IMG_CONTEXT] - @abstractmethod def get_image_repl( self, feature_size: int, num_patches: Optional[int], ) -> PromptUpdateDetails[str]: - raise NotImplementedError + repl_features = IMG_CONTEXT * feature_size + repl_full = IMG_START + repl_features + IMG_END + + return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT) def resolve_min_max_num( self, @@ -426,35 +426,15 @@ class BaseSkyworkR1VProcessor(ABC): } -class SkyworkR1VProcessor(BaseSkyworkR1VProcessor): +class SkyworkR1VProcessingInfo(BaseProcessingInfo): - @property - def image_token_id(self) -> int: - return self.tokenizer.get_vocab()[IMG_CONTEXT] - - def get_image_repl( - self, - feature_size: int, - num_patches: Optional[int], - ) -> PromptUpdateDetails[str]: - repl_features = IMG_CONTEXT * feature_size - repl_full = IMG_START + repl_features + IMG_END - - return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT) - - -class BaseSkyworkR1VProcessingInfo(BaseProcessingInfo): - - @abstractmethod - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> BaseSkyworkR1VProcessor: - raise NotImplementedError + def get_hf_processor(self, **kwargs: object) -> SkyworkR1VProcessor: + return self.ctx.init_processor( + SkyworkR1VProcessor, + 
config=self.get_hf_config(), + tokenizer=self.get_tokenizer(), + **kwargs, + ) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} @@ -464,7 +444,7 @@ class BaseSkyworkR1VProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - processor: Optional[BaseSkyworkR1VProcessor], + processor: Optional[SkyworkR1VProcessor], ) -> int: if processor is None: processor = self.get_hf_processor() @@ -500,10 +480,8 @@ class BaseSkyworkR1VProcessingInfo(BaseProcessingInfo): return largest_feature_pinpoint -_I = TypeVar("_I", bound=BaseSkyworkR1VProcessingInfo) - - -class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[_I]): +class SkyworkR1VDummyInputsBuilder( + BaseDummyInputsBuilder[SkyworkR1VProcessingInfo]): def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: num_images = mm_counts.get("image", 0) @@ -527,7 +505,8 @@ class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[_I]): } -class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[_I]): +class SkyworkR1VMultiModalProcessor( + BaseMultiModalProcessor[SkyworkR1VProcessingInfo]): def _call_hf_processor( self, @@ -617,31 +596,6 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[_I]): ] -class SkyworkR1VProcessingInfo(BaseSkyworkR1VProcessingInfo): - - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> SkyworkR1VProcessor: - if min_dynamic_patch is not None: - kwargs["min_dynamic_patch"] = min_dynamic_patch - if max_dynamic_patch is not None: - kwargs["max_dynamic_patch"] = max_dynamic_patch - if dynamic_image_size is not None: - kwargs["dynamic_image_size"] = dynamic_image_size - - return self.ctx.init_processor( - SkyworkR1VProcessor, - config=self.get_hf_config(), - tokenizer=self.get_tokenizer(), - **kwargs, - ) - - @MULTIMODAL_REGISTRY.register_processor( SkyworkR1VMultiModalProcessor, 
info=SkyworkR1VProcessingInfo, diff --git a/vllm/model_executor/models/smolvlm.py b/vllm/model_executor/models/smolvlm.py index 0f22ba5b406ce..2adfad67152b3 100644 --- a/vllm/model_executor/models/smolvlm.py +++ b/vllm/model_executor/models/smolvlm.py @@ -19,15 +19,7 @@ from .idefics3 import Idefics3ProcessingInfo class SmolVLMProcessingInfo(Idefics3ProcessingInfo): - def get_hf_processor( - self, - *, - max_image_size: Optional[dict[str, int]] = None, - **kwargs: object, - ) -> SmolVLMProcessor: - if max_image_size is not None: - kwargs["max_image_size"] = max_image_size - + def get_hf_processor(self, **kwargs: object) -> SmolVLMProcessor: return self.ctx.get_hf_processor(SmolVLMProcessor, **kwargs) def _get_image_token( diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py index 979d789b330cf..70cf5e95a54e1 100644 --- a/vllm/model_executor/models/tarsier.py +++ b/vllm/model_executor/models/tarsier.py @@ -178,13 +178,11 @@ class TarsierProcessingInfo(BaseProcessingInfo): return get_vision_encoder_info(self.get_hf_config()) def get_hf_processor(self, **kwargs: object) -> TarsierProcessor: - hf_processor = self.ctx.get_hf_processor(TarsierProcessor, **kwargs) - # Patch for patch_size if needed (copied from vLLM LLaVA) - if hasattr(hf_processor, - 'patch_size') and hf_processor.patch_size is None: - patch_size = self.get_vision_encoder_info().get_patch_size() - hf_processor.patch_size = patch_size - return hf_processor + vision_info = self.get_vision_encoder_info() + + kwargs.setdefault("patch_size", vision_info.get_patch_size()) + + return self.ctx.get_hf_processor(TarsierProcessor, **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 8cd95605cdfae..e67548800c354 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -48,7 
+48,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.processor import cached_get_processor from vllm.utils import is_list_of from .interfaces import (SupportsLoRA, SupportsMultiModal, SupportsPP, @@ -189,10 +188,6 @@ class MultiModalProcessingInfo(BaseProcessingInfo): image_tokens = mm_tokens["num_image_tokens"][0] return image_tokens - def get_hf_processor(self): - processor = cached_get_processor(self.ctx.model_config.model) - return processor - def get_max_image_size(self): return 10_000, 10_000 # hardcode for arbitrary very large size diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index a4569ccd5a845..bef34c1be49fe 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -71,13 +71,7 @@ UltravoxAudioInputs = Union[UltravoxAudioFeatureInputs, class UltravoxProcessingInfo(BaseProcessingInfo): - def get_hf_processor( - self, - *, - # Ignored in initialization - sampling_rate: Optional[int] = None, - **kwargs: object, - ) -> ProcessorMixin: + def get_hf_processor(self, **kwargs: object) -> ProcessorMixin: config = self.ctx.model_config.hf_config hf_processor = self.ctx.get_hf_processor(**kwargs) @@ -89,13 +83,9 @@ class UltravoxProcessingInfo(BaseProcessingInfo): return hf_processor - def get_feature_extractor( - self, - *, - # Ignored in initialization - sampling_rate: Optional[int] = None, - ) -> WhisperFeatureExtractor: - hf_processor = self.get_hf_processor(sampling_rate=sampling_rate) + def get_feature_extractor(self, + **kwargs: object) -> WhisperFeatureExtractor: + hf_processor = self.get_hf_processor(**kwargs) audio_processor = hf_processor.audio_processor # type: ignore feature_extractor = audio_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, 
WhisperFeatureExtractor) @@ -156,7 +146,7 @@ class UltravoxMultiModalProcessor( audios = mm_data.pop("audios", []) assert isinstance(audios, list) - feature_extractor = self.info.get_feature_extractor() + feature_extractor = self.info.get_feature_extractor(**mm_kwargs) mm_kwargs = dict( **mm_kwargs, sampling_rate=feature_extractor.sampling_rate, diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index d7bafb9ef84d9..ca02ecd828ba3 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -623,23 +623,22 @@ class WhisperProcessingInfo(BaseProcessingInfo): def get_hf_config(self) -> WhisperConfig: return self.ctx.get_hf_config(WhisperConfig) - def get_hf_processor(self, - sampling_rate: Optional[int] = None - ) -> WhisperProcessor: - # HACK: Transformers 4.53.0 has issue with whisper tokenizer to + def get_hf_processor(self, **kwargs: object) -> WhisperProcessor: + # HACK: Transformers 4.53.2 has issue with whisper tokenizer to # initialize processor. We use a monkeypatch to fix it here. 
# See: https://github.com/vllm-project/vllm/issues/20224 processor_class = WhisperProcessor tokenizer_class = ("WhisperTokenizer", "WhisperTokenizerFast") if processor_class.tokenizer_class != tokenizer_class: processor_class.tokenizer_class = tokenizer_class - return self.ctx.get_hf_processor(processor_class) + return self.ctx.get_hf_processor(processor_class, **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": 1} - def get_feature_extractor(self) -> WhisperFeatureExtractor: - hf_processor = self.get_hf_processor() + def get_feature_extractor(self, + **kwargs: object) -> WhisperFeatureExtractor: + hf_processor = self.get_hf_processor(**kwargs) feature_extractor = hf_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor @@ -702,7 +701,7 @@ class WhisperMultiModalProcessor( tok_kwargs: Mapping[str, object], ) -> BatchFeature: if mm_data: - feature_extractor = self.info.get_feature_extractor() + feature_extractor = self.info.get_feature_extractor(**mm_kwargs) mm_data = dict(audio=mm_data.pop("audios")) mm_kwargs = dict( **mm_kwargs, diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index 70cd08263d372..a630d940b2578 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -4,9 +4,15 @@ from functools import lru_cache from typing import TYPE_CHECKING, Any, Optional, Union, cast +from transformers import (AutoFeatureExtractor, AutoImageProcessor, + AutoProcessor) +from transformers.feature_extraction_utils import FeatureExtractionMixin +from transformers.image_processing_utils import BaseImageProcessor from transformers.processing_utils import ProcessorMixin from typing_extensions import TypeVar +from vllm.utils import get_allowed_kwarg_only_overrides + if TYPE_CHECKING: from vllm.config import ModelConfig @@ -33,23 +39,42 @@ class HashableList(list): return hash(tuple(self)) 
-def _merge_mm_kwargs(model_config: "ModelConfig", **kwargs): - mm_config = model_config.get_multimodal_config() - base_kwargs = mm_config.mm_processor_kwargs - if base_kwargs is None: - base_kwargs = {} +def _get_processor_factory_fn(processor_cls: Union[type, tuple[type, ...]]): + if isinstance(processor_cls, tuple) or processor_cls == ProcessorMixin: + return AutoProcessor.from_pretrained + if hasattr(processor_cls, "from_pretrained"): + return processor_cls.from_pretrained - merged_kwargs = {**base_kwargs, **kwargs} + return processor_cls + + +def _merge_mm_kwargs( + model_config: "ModelConfig", + processor_cls: Union[type, tuple[type, ...]], + /, + **kwargs, +): + mm_config = model_config.get_multimodal_config() + merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs) + + factory = _get_processor_factory_fn(processor_cls) + allowed_kwargs = get_allowed_kwarg_only_overrides( + factory, + merged_kwargs, + requires_kw_only=False, + allow_var_kwargs=True, + ) # NOTE: Pythonic dict is not hashable and will raise unhashable type # error when calling `cached_get_processor`, therefore we need to # wrap it to a hashable dict. 
- for key, value in merged_kwargs.items(): + for key, value in allowed_kwargs.items(): if isinstance(value, dict): - merged_kwargs[key] = HashableDict(value) + allowed_kwargs[key] = HashableDict(value) if isinstance(value, list): - merged_kwargs[key] = HashableList(value) - return merged_kwargs + allowed_kwargs[key] = HashableList(value) + + return allowed_kwargs def get_processor( @@ -61,21 +86,29 @@ def get_processor( **kwargs: Any, ) -> _P: """Load a processor for the given model name via HuggingFace.""" - # don't put this import at the top level - # it will call torch.cuda.device_count() - from transformers import AutoProcessor - - processor_factory = (AutoProcessor if processor_cls == ProcessorMixin or - isinstance(processor_cls, tuple) else processor_cls) + if revision is None: + revision = "main" try: - processor = processor_factory.from_pretrained( - processor_name, - *args, - revision=revision, - trust_remote_code=trust_remote_code, - **kwargs, - ) + if isinstance(processor_cls, tuple) or processor_cls == ProcessorMixin: + processor = AutoProcessor.from_pretrained( + processor_name, + *args, + revision=revision, + trust_remote_code=trust_remote_code, + **kwargs, + ) + elif issubclass(processor_cls, ProcessorMixin): + processor = processor_cls.from_pretrained( + processor_name, + *args, + revision=revision, + trust_remote_code=trust_remote_code, + **kwargs, + ) + else: + # Processors that are standalone classes unrelated to HF + processor = processor_cls(*args, **kwargs) except ValueError as e: # If the error pertains to the processor class not existing or not # currently being imported, suggest using the --trust-remote-code flag. 
@@ -112,7 +145,7 @@ def cached_processor_from_config( revision=model_config.revision, trust_remote_code=model_config.trust_remote_code, processor_cls=processor_cls, # type: ignore[arg-type] - **_merge_mm_kwargs(model_config, **kwargs), + **_merge_mm_kwargs(model_config, processor_cls, **kwargs), ) @@ -125,10 +158,6 @@ def get_feature_extractor( ): """Load an audio feature extractor for the given model name via HuggingFace.""" - # don't put this import at the top level - # it will call torch.cuda.device_count() - from transformers import AutoFeatureExtractor - from transformers.feature_extraction_utils import FeatureExtractionMixin try: feature_extractor = AutoFeatureExtractor.from_pretrained( processor_name, @@ -164,7 +193,7 @@ def cached_feature_extractor_from_config( model_config.model, revision=model_config.revision, trust_remote_code=model_config.trust_remote_code, - **_merge_mm_kwargs(model_config, **kwargs), + **_merge_mm_kwargs(model_config, AutoFeatureExtractor, **kwargs), ) @@ -176,11 +205,6 @@ def get_image_processor( **kwargs: Any, ): """Load an image processor for the given model name via HuggingFace.""" - # don't put this import at the top level - # it will call torch.cuda.device_count() - from transformers import AutoImageProcessor - from transformers.image_processing_utils import BaseImageProcessor - try: processor = AutoImageProcessor.from_pretrained( processor_name, @@ -217,5 +241,5 @@ def cached_image_processor_from_config( model_config.model, revision=model_config.revision, trust_remote_code=model_config.trust_remote_code, - **_merge_mm_kwargs(model_config, **kwargs), + **_merge_mm_kwargs(model_config, AutoImageProcessor, **kwargs), ) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index ae978c855a8e5..a7f579b0c9c2d 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2010,49 +2010,6 @@ def supports_kw( return False -def resolve_mm_processor_kwargs( - init_kwargs: Optional[Mapping[str, object]], - inference_kwargs: 
Optional[Mapping[str, object]], - callable: Callable[..., object], - *, - requires_kw_only: bool = True, - allow_var_kwargs: bool = False, -) -> dict[str, Any]: - """Applies filtering to eliminate invalid mm_processor_kwargs, i.e., - those who are not explicit keywords to the given callable (of one is - given; otherwise no filtering is done), then merges the kwarg dicts, - giving priority to inference_kwargs if there are any collisions. - - In the case that no kwarg overrides are provided, returns an empty - dict so that it can still be kwarg expanded into the callable later on. - - If allow_var_kwargs=True, allows for things that can be expanded into - kwargs as long as they aren't naming collision for var_kwargs or potential - positional arguments. - """ - # Filter inference time multimodal processor kwargs provided - runtime_mm_kwargs = get_allowed_kwarg_only_overrides( - callable, - overrides=inference_kwargs, - requires_kw_only=requires_kw_only, - allow_var_kwargs=allow_var_kwargs, - ) - - # Filter init time multimodal processor kwargs provided - init_mm_kwargs = get_allowed_kwarg_only_overrides( - callable, - overrides=init_kwargs, - requires_kw_only=requires_kw_only, - allow_var_kwargs=allow_var_kwargs, - ) - - # Merge the final processor kwargs, prioritizing inference - # time values over the initialization time values. 
- mm_processor_kwargs = {**init_mm_kwargs, **runtime_mm_kwargs} - - return mm_processor_kwargs - - def get_allowed_kwarg_only_overrides( callable: Callable[..., object], overrides: Optional[Mapping[str, object]], From e1a7fe4af5e9c287501c648e64956a08705af86a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Seznec?= Date: Fri, 1 Aug 2025 07:45:02 +0200 Subject: [PATCH 106/224] [BugFix] fix: aot passes kvcache dtype information (#19750) Signed-off-by: Mickael Seznec --- vllm/v1/attention/backends/flash_attn.py | 25 ++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 4c2a6c6b985b2..3f9afa67aef70 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -99,6 +99,13 @@ class FlashAttentionBackend(AttentionBackend): raise ValueError(f"Unknown cache layout format {cache_layout}.") return stride_order + @staticmethod + def get_fp8_dtype_for_flashattn(kv_cache_dtype: str) -> torch.dtype: + if kv_cache_dtype in ("fp8", "fp8_e4m3"): + return torch.float8_e4m3fn + else: + raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}") + @dataclass class FlashAttentionMetadata: @@ -161,6 +168,7 @@ class FlashAttentionMetadataBuilder( self.parallel_config) self.num_heads_kv = self.model_config.get_num_kv_heads( self.parallel_config) + self.kv_cache_dtype = kv_cache_spec.dtype self.headdim = self.model_config.get_head_size() self.block_size = kv_cache_spec.block_size @@ -239,17 +247,24 @@ class FlashAttentionMetadataBuilder( def schedule(batch_size, cu_query_lens, max_query_len, seqlens, max_seq_len, causal): + cache_dtype = self.cache_config.cache_dtype + if cache_dtype.startswith("fp8"): + qkv_dtype = FlashAttentionBackend.get_fp8_dtype_for_flashattn( + cache_dtype) + else: + qkv_dtype = self.kv_cache_dtype if aot_schedule: return get_scheduler_metadata( batch_size=batch_size, max_seqlen_q=max_query_len, 
max_seqlen_k=max_seq_len, - cache_seqlens=seqlens, num_heads_q=self.num_heads_q, num_heads_kv=self.num_heads_kv, headdim=self.headdim, - page_size=self.block_size, + cache_seqlens=seqlens, + qkv_dtype=qkv_dtype, cu_seqlens_q=cu_query_lens, + page_size=self.block_size, causal=causal, window_size=self.aot_sliding_window, num_splits=self.max_num_splits, @@ -474,8 +489,10 @@ class FlashAttentionImpl(AttentionImpl): ) if self.kv_cache_dtype.startswith("fp8"): - key_cache = key_cache.view(torch.float8_e4m3fn) - value_cache = value_cache.view(torch.float8_e4m3fn) + dtype = FlashAttentionBackend.get_fp8_dtype_for_flashattn( + self.kv_cache_dtype) + key_cache = key_cache.view(dtype) + value_cache = value_cache.view(dtype) num_tokens, num_heads, head_size = query.shape query, _ = ops.scaled_fp8_quant( query.reshape( From 0f46a780d4f53b8564a37370f9f068cdf4e69604 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 1 Aug 2025 01:45:15 -0400 Subject: [PATCH 107/224] [Model] [Quantization] Support quantization for Gemma3n (#21974) Signed-off-by: Kyle Sayers --- vllm/model_executor/models/gemma3n.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index a58b32793dbef..e16c03c8d3b57 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -46,6 +46,7 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from .interfaces import SupportsQuant from .utils import (AutoWeightsLoader, extract_layer_index, is_pp_missing_parameter, make_layers, maybe_prefix) @@ -68,6 +69,7 @@ class Gemma3nAltUp(nn.Module): altup_num_inputs: int, altup_coef_clip: float, altup_active_idx: int, + quant_config: QuantizationConfig, prefix: str, ): super().__init__() @@ -80,6 +82,7 @@ class Gemma3nAltUp(nn.Module): altup_num_inputs, 
altup_num_inputs, bias=False, + quant_config=quant_config, prefix=f"{prefix}.correction_coefs", return_bias=False, ) @@ -87,6 +90,7 @@ class Gemma3nAltUp(nn.Module): altup_num_inputs, altup_num_inputs**2, bias=False, + quant_config=quant_config, prefix=f"{prefix}.prediction_coefs", return_bias=False, ) @@ -94,6 +98,7 @@ class Gemma3nAltUp(nn.Module): hidden_size, altup_num_inputs, bias=False, + quant_config=quant_config, prefix=f"{prefix}.modality_router", return_bias=False, ) @@ -400,6 +405,7 @@ class Gemma3nDecoderLayer(nn.Module): altup_num_inputs=config.altup_num_inputs, altup_coef_clip=config.altup_coef_clip, altup_active_idx=config.altup_active_idx, + quant_config=quant_config, prefix=f"{prefix}.altup", ) self.self_attn = Gemma3nAttention( @@ -527,7 +533,7 @@ class Gemma3nDecoderLayer(nn.Module): @support_torch_compile -class Gemma3nTextModel(nn.Module): +class Gemma3nTextModel(nn.Module, SupportsQuant): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -540,6 +546,7 @@ class Gemma3nTextModel(nn.Module): self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, + quant_config=quant_config, prefix=f"{prefix}.embed_tokens", ) self.embed_scale = torch.tensor( @@ -549,6 +556,7 @@ class Gemma3nTextModel(nn.Module): self.embed_tokens_per_layer = VocabParallelEmbedding( config.vocab_size_per_layer_input, config.num_hidden_layers * config.hidden_size_per_layer_input, + quant_config=quant_config, prefix=f"{prefix}.per_layer_embed_tokens", ) self.embed_scale_per_layer = torch.tensor( @@ -582,7 +590,7 @@ class Gemma3nTextModel(nn.Module): gather_output=True, return_bias=False, quant_config=quant_config, - prefix=f"{prefix}.{idx-1}.altup_projections", + prefix=f"{prefix}.altup_projections.{idx-1}", ) for idx in range(1, self.config.altup_num_inputs) ]) self.altup_unembed_projections = nn.ModuleList([ @@ -593,7 +601,7 @@ class Gemma3nTextModel(nn.Module): gather_output=True, return_bias=False, 
quant_config=quant_config, - prefix=f"{prefix}.{idx-1}.altup_unembed_projections", + prefix=f"{prefix}.altup_unembed_projections.{idx-1}", ) for idx in range(1, self.config.altup_num_inputs) ]) @@ -774,7 +782,7 @@ class Gemma3nModel(nn.Module): **kwargs) -class Gemma3nForConditionalGeneration(nn.Module): +class Gemma3nForConditionalGeneration(nn.Module, SupportsQuant): packed_modules_mapping = { "qkv_proj": [ "q_proj", From 61dcc280faf305778c0c44597e823f40063aaed6 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 1 Aug 2025 14:10:56 +0800 Subject: [PATCH 108/224] [Doc] Add Voxtral to Supported Models page (#22059) Signed-off-by: DarkLight1337 --- docs/models/supported_models.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index f5d9e3b22f2a6..56c77a1e5f118 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -713,6 +713,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition. | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | | +| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. 
| | ✅︎ | ✅︎ | ### Pooling Models From 53d7c39271aeb0568afcae337396a972e1848586 Mon Sep 17 00:00:00 2001 From: Aviad Rossmann Date: Fri, 1 Aug 2025 09:23:18 +0300 Subject: [PATCH 109/224] Update sampling_metadata.py (#21937) Signed-off-by: Aviad Rossmann --- vllm/model_executor/sampling_metadata.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 56f0f0984bfa0..66bcf1c4bfe50 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -539,37 +539,37 @@ class SamplingTensors: temperatures_t = torch.tensor( temperatures, device="cpu", - dtype=dtype, + dtype=torch.float32, pin_memory=pin_memory, ) top_ps_t = torch.tensor( top_ps, device="cpu", - dtype=dtype, + dtype=torch.float32, pin_memory=pin_memory, ) min_ps_t = torch.tensor( min_ps, device="cpu", - dtype=dtype, + dtype=torch.float32, pin_memory=pin_memory, ) presence_penalties_t = torch.tensor( presence_penalties, device="cpu", - dtype=dtype, + dtype=torch.float32, pin_memory=pin_memory, ) frequency_penalties_t = torch.tensor( frequency_penalties, device="cpu", - dtype=dtype, + dtype=torch.float32, pin_memory=pin_memory, ) repetition_penalties_t = torch.tensor( repetition_penalties, device="cpu", - dtype=dtype, + dtype=torch.float32, pin_memory=pin_memory, ) top_ks_t = torch.tensor( From 79731a79f09dc7bbe34dc8afbe8ef2242fb94a05 Mon Sep 17 00:00:00 2001 From: Hongsheng Liu Date: Fri, 1 Aug 2025 15:01:22 +0800 Subject: [PATCH 110/224] [Doc] Fix a syntax error of example code in structured_outputs.md (#22045) Signed-off-by: wangzi <3220100013@zju.edu.cn> Co-authored-by: wangzi <3220100013@zju.edu.cn> --- docs/features/structured_outputs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 4f737afa80f55..8a934d406f382 100644 --- 
a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -103,7 +103,7 @@ The next example shows how to use the `guided_json` parameter with a Pydantic mo "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's", } ], - "response_format": { + response_format={ "type": "json_schema", "json_schema": { "name": "car-description", From b4e081cb150797b12039cc1232205dbb25ca0206 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 1 Aug 2025 15:03:56 +0800 Subject: [PATCH 111/224] [Bugfix] Disable multi-modal preprocessor cache for DP (#21896) Signed-off-by: DarkLight1337 --- vllm/config.py | 6 ++++++ vllm/engine/arg_utils.py | 12 ++++++++++++ vllm/entrypoints/cli/serve.py | 5 +++-- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 9d5739ca11efd..93daab7d6ae97 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -871,6 +871,12 @@ class ModelConfig: return None + def set_disable_mm_preprocessor_cache(self, value: bool) -> None: + mm_config = self.get_multimodal_config() + + self.disable_mm_preprocessor_cache = value + mm_config.disable_mm_preprocessor_cache = value + def _get_encoder_config(self): return get_sentence_transformer_tokenizer_config( self.model, self.revision) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c36c79c69317e..78272d983eaf5 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1197,6 +1197,18 @@ class EngineArgs: enable_multimodal_encoder_data_parallel, ) + supports_mm_preprocessor_cache = (self.data_parallel_size == 1 + or data_parallel_external_lb) + if (not supports_mm_preprocessor_cache + and model_config.is_multimodal_model + and not model_config.disable_mm_preprocessor_cache): + logger.warning( + "Multi-modal preprocessor cache is not compatible " + "with data parallelism when there does not exist a " + "one-to-one correspondance between API process and " + "EngineCore process, so 
the cache will be disabled.") + model_config.set_disable_mm_preprocessor_cache(True) + speculative_config = self.create_speculative_config( target_model_config=model_config, target_parallel_config=parallel_config, diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 7dcba2cccdb52..bdbe71b832f4f 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -167,8 +167,9 @@ def run_multi_api_server(args: argparse.Namespace): if model_config.is_multimodal_model and not ( orig_disable_mm_preprocessor_cache): - logger.warning("Multi-model preprocessor cache will be disabled " - "for api_server_count > 1") + logger.warning( + "Multi-modal preprocessor cache is not compatible " + "with api_server_count > 1, so the cache will be disabled.") executor_class = Executor.get_class(vllm_config) log_stats = not engine_args.disable_log_stats From e0f63e4a3509a9323339eee67c96ac3c93d15923 Mon Sep 17 00:00:00 2001 From: Zebing Lin Date: Fri, 1 Aug 2025 03:23:29 -0400 Subject: [PATCH 112/224] [Core] Avoid repeated len(block_token_ids) check in hash_request_tokens (#21781) Signed-off-by: linzebing --- vllm/v1/core/kv_cache_utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 25520eb655111..eab1560b1a18c 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -567,12 +567,10 @@ def hash_request_tokens(hash_function: Any, block_size: int, ret = [] parent_block_hash_value = None - for start in range(0, len(token_ids), block_size): + # Only full blocks will be hashed + for start in range(0, len(token_ids) - block_size + 1, block_size): end = start + block_size block_token_ids = token_ids[start:end] - # Do not hash the block if it is not full. - if len(block_token_ids) < block_size: - break if req_need_extra_keys: # MM and LoRA requests need extra keys for block-hash computation. 
From 98df153abfcc443218aacfe61b3fd5abe2b88142 Mon Sep 17 00:00:00 2001 From: Sungyoon Jeong <157349761+n0gu-furiosa@users.noreply.github.com> Date: Fri, 1 Aug 2025 16:54:17 +0900 Subject: [PATCH 113/224] [Frontend] Align tool_choice="required" behavior with OpenAI when tools is empty (#21052) Signed-off-by: Sungyoon Jeong --- vllm/entrypoints/openai/protocol.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index b6b3bf3f530e3..d77aee345843c 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -859,6 +859,15 @@ class ChatCompletionRequest(OpenAIBaseModel): 'are supported.' ) + # if tool_choice is "required" but the "tools" list is empty, + # override the data to behave like "none" to align with + # OpenAI’s behavior. + if data["tool_choice"] == "required" and isinstance( + data["tools"], list) and len(data["tools"]) == 0: + data["tool_choice"] = "none" + del data["tools"] + return data + # ensure that if "tool_choice" is specified as an object, # it matches a valid tool correct_usage_message = 'Correct usage: `{"type": "function",' \ From da31f6ad3dacea8579adfb36d64d28759dc5c095 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Fri, 1 Aug 2025 01:26:24 -0700 Subject: [PATCH 114/224] Revert precompile wheel changes (#22055) --- docker/Dockerfile | 27 +++---- requirements/test.txt | 24 ++---- setup.py | 182 ++++++++++++++++++++---------------------- vllm/envs.py | 11 +-- 4 files changed, 107 insertions(+), 137 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 413151b3edb00..0d6afca74e867 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -206,7 +206,16 @@ ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_S3_NO_CREDENTIALS=0 # Flag to control whether to use pre-built vLLM wheels -ARG VLLM_USE_PRECOMPILED="" +ARG VLLM_USE_PRECOMPILED +# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as 
"true", this should be fixed +ENV VLLM_USE_PRECOMPILED="" +RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \ + export VLLM_USE_PRECOMPILED=1 && \ + echo "Using precompiled wheels"; \ + else \ + unset VLLM_USE_PRECOMPILED && \ + echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \ + fi # if USE_SCCACHE is set, use sccache to speed up compilation RUN --mount=type=cache,target=/root/.cache/uv \ @@ -223,8 +232,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ && export SCCACHE_IDLE_TIMEOUT=0 \ && export CMAKE_BUILD_TYPE=Release \ - && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \ - && export VLLM_DOCKER_BUILD_CONTEXT=1 \ && sccache --show-stats \ && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \ && sccache --show-stats; \ @@ -238,22 +245,9 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ # Clean any existing CMake artifacts rm -rf .deps && \ mkdir -p .deps && \ - export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \ - export VLLM_DOCKER_BUILD_CONTEXT=1 && \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ fi -# When using precompiled wheels, keep only the newest manylinux1 wheel and delete others -RUN if [ "$VLLM_USE_PRECOMPILED" = "1" ]; then \ - echo "Cleaning up extra wheels in dist/..." && \ - # Identify the most recent manylinux1_x86_64 wheel - KEEP_WHEEL=$(ls -t dist/*manylinux1_x86_64.whl 2>/dev/null | head -n1) && \ - if [ -n "$KEEP_WHEEL" ]; then \ - echo "Keeping wheel: $KEEP_WHEEL"; \ - find dist/ -type f -name "*.whl" ! -path "${KEEP_WHEEL}" -delete; \ - fi; \ - fi - # Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py # sync the default value with .buildkite/check-wheel-size.py @@ -369,7 +363,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ fi # Install vllm wheel first, so that torch etc will be installed. 
-# !bang RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/uv \ uv pip install --system dist/*.whl --verbose \ diff --git a/requirements/test.txt b/requirements/test.txt index 4aaca2afea266..d45048aae5809 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -22,7 +22,9 @@ aiohttp==3.10.11 aiohttp-cors==0.8.1 # via ray aiosignal==1.3.1 - # via aiohttp + # via + # aiohttp + # ray albucore==0.0.16 # via terratorch albumentations==1.4.6 @@ -137,7 +139,7 @@ contourpy==1.3.0 # via matplotlib cramjam==2.9.0 # via fastparquet -cupy-cuda12x==13.5.1 +cupy-cuda12x==13.3.0 # via ray cycler==0.12.1 # via matplotlib @@ -224,6 +226,7 @@ frozenlist==1.5.0 # via # aiohttp # aiosignal + # ray fsspec==2024.9.0 # via # datasets @@ -600,18 +603,10 @@ opencv-python-headless==4.11.0.86 opentelemetry-api==1.35.0 # via # mlflow-skinny - # opentelemetry-exporter-prometheus # opentelemetry-sdk # opentelemetry-semantic-conventions -opentelemetry-exporter-prometheus==0.56b0 - # via ray -opentelemetry-proto==1.36.0 - # via ray opentelemetry-sdk==1.35.0 - # via - # mlflow-skinny - # opentelemetry-exporter-prometheus - # ray + # via mlflow-skinny opentelemetry-semantic-conventions==0.56b0 # via opentelemetry-sdk packaging==24.2 @@ -702,9 +697,7 @@ pqdm==0.2.0 pretrainedmodels==0.7.4 # via segmentation-models-pytorch prometheus-client==0.22.0 - # via - # opentelemetry-exporter-prometheus - # ray + # via ray propcache==0.2.0 # via yarl proto-plus==1.26.1 @@ -714,7 +707,6 @@ protobuf==5.28.3 # google-api-core # googleapis-common-protos # mlflow-skinny - # opentelemetry-proto # proto-plus # ray # tensorboardx @@ -862,7 +854,7 @@ rasterio==1.4.3 # rioxarray # terratorch # torchgeo -ray==2.48.0 +ray==2.43.0 # via -r requirements/test.in redis==5.2.0 # via tensorizer diff --git a/setup.py b/setup.py index bfa195d4395f0..64cfbb8db962b 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,6 @@ import json import logging import 
os import re -import shutil import subprocess import sys from pathlib import Path @@ -282,69 +281,10 @@ class cmake_build_ext(build_ext): self.copy_file(file, dst_file) -class precompiled_wheel_utils: +class repackage_wheel(build_ext): """Extracts libraries and other files from an existing wheel.""" - @staticmethod - def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict: - import tempfile - import zipfile - - temp_dir = None - try: - if not os.path.isfile(wheel_url_or_path): - wheel_filename = wheel_url_or_path.split("/")[-1] - temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") - wheel_path = os.path.join(temp_dir, wheel_filename) - print(f"Downloading wheel from {wheel_url_or_path} " - f"to {wheel_path}") - from urllib.request import urlretrieve - urlretrieve(wheel_url_or_path, filename=wheel_path) - else: - wheel_path = wheel_url_or_path - print(f"Using existing wheel at {wheel_path}") - - package_data_patch = {} - - with zipfile.ZipFile(wheel_path) as wheel: - files_to_copy = [ - "vllm/_C.abi3.so", - "vllm/_moe_C.abi3.so", - "vllm/_flashmla_C.abi3.so", - "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", - "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", - "vllm/cumem_allocator.abi3.so", - ] - - compiled_regex = re.compile( - r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") - file_members = list( - filter(lambda x: x.filename in files_to_copy, - wheel.filelist)) - file_members += list( - filter(lambda x: compiled_regex.match(x.filename), - wheel.filelist)) - - for file in file_members: - print(f"[extract] {file.filename}") - target_path = os.path.join(".", file.filename) - os.makedirs(os.path.dirname(target_path), exist_ok=True) - with wheel.open(file.filename) as src, open( - target_path, "wb") as dst: - shutil.copyfileobj(src, dst) - - pkg = os.path.dirname(file.filename).replace("/", ".") - package_data_patch.setdefault(pkg, []).append( - os.path.basename(file.filename)) - - return package_data_patch - finally: - if temp_dir is not None: - 
print(f"Removing temporary directory {temp_dir}") - shutil.rmtree(temp_dir) - - @staticmethod - def get_base_commit_in_main_branch() -> str: + def get_base_commit_in_main_branch(self) -> str: # Force to use the nightly wheel. This is mainly used for CI testing. if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: return "nightly" @@ -357,10 +297,6 @@ class precompiled_wheel_utils: ]).decode("utf-8") upstream_main_commit = json.loads(resp_json)["sha"] - # In Docker build context, .git may be immutable or missing. - if envs.VLLM_DOCKER_BUILD_CONTEXT: - return upstream_main_commit - # Check if the upstream_main_commit exists in the local repo try: subprocess.check_output( @@ -393,15 +329,92 @@ class precompiled_wheel_utils: "wheel may not be compatible with your dev branch: %s", err) return "nightly" + def run(self) -> None: + assert _is_cuda( + ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" + + wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) + if wheel_location is None: + base_commit = self.get_base_commit_in_main_branch() + wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + # Fallback to nightly wheel if latest commit wheel is unavailable, + # in this rare case, the nightly release CI hasn't finished on main. 
+ if not is_url_available(wheel_location): + wheel_location = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + + import zipfile + + if os.path.isfile(wheel_location): + wheel_path = wheel_location + print(f"Using existing wheel={wheel_path}") + else: + # Download the wheel from a given URL, assume + # the filename is the last part of the URL + wheel_filename = wheel_location.split("/")[-1] + + import tempfile + + # create a temporary directory to store the wheel + temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") + wheel_path = os.path.join(temp_dir, wheel_filename) + + print(f"Downloading wheel from {wheel_location} to {wheel_path}") + + from urllib.request import urlretrieve + + try: + urlretrieve(wheel_location, filename=wheel_path) + except Exception as e: + from setuptools.errors import SetupError + + raise SetupError( + f"Failed to get vLLM wheel from {wheel_location}") from e + + with zipfile.ZipFile(wheel_path) as wheel: + files_to_copy = [ + "vllm/_C.abi3.so", + "vllm/_moe_C.abi3.so", + "vllm/_flashmla_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", + "vllm/cumem_allocator.abi3.so", + # "vllm/_version.py", # not available in nightly wheels yet + ] + + file_members = list( + filter(lambda x: x.filename in files_to_copy, wheel.filelist)) + + # vllm_flash_attn python code: + # Regex from + # `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)` + compiled_regex = re.compile( + r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") + file_members += list( + filter(lambda x: compiled_regex.match(x.filename), + wheel.filelist)) + + for file in file_members: + print(f"Extracting and including {file.filename} " + "from existing wheel") + package_name = os.path.dirname(file.filename).replace("/", ".") + file_name = os.path.basename(file.filename) + + if package_name not in package_data: + package_data[package_name] = [] + + wheel.extract(file) + if 
file_name.endswith(".py"): + # python files shouldn't be added to package_data + continue + + package_data[package_name].append(file_name) + def _no_device() -> bool: return VLLM_TARGET_DEVICE == "empty" def _is_cuda() -> bool: - # Allow forced CUDA in Docker/precompiled builds, even without torch.cuda - if envs.VLLM_USE_PRECOMPILED and envs.VLLM_DOCKER_BUILD_CONTEXT: - return True has_cuda = torch.version.cuda is not None return (VLLM_TARGET_DEVICE == "cuda" and has_cuda and not (_is_neuron() or _is_tpu())) @@ -626,37 +639,16 @@ package_data = { ] } -# If using precompiled, extract and patch package_data (in advance of setup) -if envs.VLLM_USE_PRECOMPILED: - assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" - wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) - if wheel_location is not None: - wheel_url = wheel_location - else: - base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch() - wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - from urllib.request import urlopen - try: - with urlopen(wheel_url) as resp: - if resp.status != 200: - wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - except Exception as e: - print(f"[warn] Falling back to nightly wheel: {e}") - wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - - patch = precompiled_wheel_utils.extract_precompiled_and_patch_package( - wheel_url) - for pkg, files in patch.items(): - package_data.setdefault(pkg, []).extend(files) - if _no_device(): ext_modules = [] -if not ext_modules or envs.VLLM_USE_PRECOMPILED: - # Disable build_ext when using precompiled wheel +if not ext_modules: cmdclass = {} else: - cmdclass = {"build_ext": cmake_build_ext} + cmdclass = { + "build_ext": + repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext + } setup( # static metadata should rather go in pyproject.toml diff --git 
a/vllm/envs.py b/vllm/envs.py index 19bc9156b2586..7553eccf16ea9 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -68,7 +68,6 @@ if TYPE_CHECKING: MAX_JOBS: Optional[str] = None NVCC_THREADS: Optional[str] = None VLLM_USE_PRECOMPILED: bool = False - VLLM_DOCKER_BUILD_CONTEXT: bool = False VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False VLLM_NO_DEPRECATION_WARNING: bool = False VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False @@ -228,14 +227,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # If set, vllm will use precompiled binaries (*.so) "VLLM_USE_PRECOMPILED": - lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in - ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), - - # Used to mark that setup.py is running in a Docker build context, - # in order to force the use of precompiled binaries. - "VLLM_DOCKER_BUILD_CONTEXT": - lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in - ("1", "true"), + lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool( + os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), # Whether to force using nightly wheel in python build. # This is used for testing the nightly wheel in python build. 
From 27a145e8931582fc74c1f46e0e4630c610b96160 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 1 Aug 2025 01:35:49 -0700 Subject: [PATCH 115/224] [Doc] Add example for Step3-VL (#22061) Signed-off-by: Roger Wang --- examples/offline_inference/vision_language.py | 298 ++++++++++-------- .../vision_language_multi_image.py | 215 +++++++------ 2 files changed, 286 insertions(+), 227 deletions(-) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 0edcd0407747c..a75b8e2b047d8 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -423,32 +423,6 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: ) -# SmolVLM2-2.2B-Instruct -def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" - - engine_args = EngineArgs( - model=model_name, - max_model_len=8192, - max_num_seqs=2, - enforce_eager=True, - mm_processor_kwargs={ - "max_image_size": {"longest_edge": 384}, - }, - limit_mm_per_prompt={modality: 1}, - ) - prompts = [ - (f"<|im_start|>User:{question}\nAssistant:") - for question in questions - ] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - ) - - # Intern-S1 def run_interns1(questions: list[str], modality: str) -> ModelRequestData: model_name = "internlm/Intern-S1" @@ -522,44 +496,6 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData: ) -# Nemontron_VL -def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData: - model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1" - - engine_args = EngineArgs( - model=model_name, - trust_remote_code=True, - max_model_len=8192, - limit_mm_per_prompt={modality: 1}, - ) - - assert modality == "image" - placeholder = "" - - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - messages = [ - 
[{"role": "user", "content": f"{placeholder}\n{question}"}] - for question in questions - ] - prompts = tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - - # Stop tokens for InternVL - # models variants may have different stop tokens - # please refer to the model card for the correct "stop words": - # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py - stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] - stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] - stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - stop_token_ids=stop_token_ids, - ) - - # Keye-VL def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData: model_name = "Kwai-Keye/Keye-VL-8B-Preview" @@ -615,6 +551,41 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData: ) +def run_llama4(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" + + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=4, + tensor_parallel_size=8, + gpu_memory_utilization=0.4, + limit_mm_per_prompt={modality: 1}, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name) + messages = [ + [ + { + "role": "user", + "content": [{"type": "image"}, {"type": "text", "text": f"{question}"}], + } + ] + for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False + ) + stop_token_ids = None + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) + + # LLaVA-1.5 def run_llava(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -857,41 +828,6 @@ def run_mllama(questions: list[str], modality: str) -> 
ModelRequestData: ) -def run_llama4(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - - model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" - - engine_args = EngineArgs( - model=model_name, - max_model_len=8192, - max_num_seqs=4, - tensor_parallel_size=8, - gpu_memory_utilization=0.4, - limit_mm_per_prompt={modality: 1}, - ) - - tokenizer = AutoTokenizer.from_pretrained(model_name) - messages = [ - [ - { - "role": "user", - "content": [{"type": "image"}, {"type": "text", "text": f"{question}"}], - } - ] - for question in questions - ] - prompts = tokenizer.apply_chat_template( - messages, add_generation_prompt=True, tokenize=False - ) - stop_token_ids = None - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - stop_token_ids=stop_token_ids, - ) - - # Molmo def run_molmo(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -917,6 +853,44 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData: ) +# Nemontron_VL +def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData: + model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=8192, + limit_mm_per_prompt={modality: 1}, + ) + + assert modality == "image" + placeholder = "" + + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + messages = [ + [{"role": "user", "content": f"{placeholder}\n{question}"}] + for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + # Stop tokens for InternVL + # models variants may have different stop tokens + # please refer to the model card for the correct "stop words": + # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py + stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] + stop_token_ids = 
[tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) + + # NVLM-D def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1274,6 +1248,94 @@ def run_qwen2_5_omni(questions: list[str], modality: str): ) +# SkyworkR1V +def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "Skywork/Skywork-R1V-38B" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + limit_mm_per_prompt={modality: 1}, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + messages = [ + [{"role": "user", "content": f"\n{question}"}] for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + # Stop tokens for SkyworkR1V + # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py + stop_tokens = ["<|end▁of▁sentence|>", "<|endoftext|>"] + stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) + + +# SmolVLM2-2.2B-Instruct +def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" + + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + enforce_eager=True, + mm_processor_kwargs={ + "max_image_size": {"longest_edge": 384}, + }, + limit_mm_per_prompt={modality: 1}, + ) + prompts = [ + (f"<|im_start|>User:{question}\nAssistant:") + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + +# Step3 +def 
run_step3(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "stepfun-ai/step3-fp8" + + # NOTE: Below are verified configurations for step3-fp8 + # on 8xH100 GPUs. + engine_args = EngineArgs( + model=model_name, + max_num_batched_tokens=4096, + gpu_memory_utilization=0.85, + tensor_parallel_size=8, + limit_mm_per_prompt={modality: 1}, + reasoning_parser="step3", + ) + + prompts = [ + "<|begin▁of▁sentence|> You are a helpful assistant. <|BOT|>user\n " + f"{question} <|EOT|><|BOT|>assistant\n\n" + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # omni-research/Tarsier-7b def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1324,39 +1386,6 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData: ) -# SkyworkR1V -def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - - model_name = "Skywork/Skywork-R1V-38B" - - engine_args = EngineArgs( - model=model_name, - trust_remote_code=True, - max_model_len=4096, - limit_mm_per_prompt={modality: 1}, - ) - - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - messages = [ - [{"role": "user", "content": f"\n{question}"}] for question in questions - ] - prompts = tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - - # Stop tokens for SkyworkR1V - # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py - stop_tokens = ["<|end▁of▁sentence|>", "<|endoftext|>"] - stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - stop_token_ids=stop_token_ids, - ) - - model_example_map = { "aria": run_aria, "aya_vision": run_aya_vision, @@ -1373,9 +1402,9 @@ model_example_map = { "idefics3": run_idefics3, "interns1": 
run_interns1, "internvl_chat": run_internvl, - "nemotron_vl": run_nemotron_vl, "keye_vl": run_keye_vl, "kimi_vl": run_kimi_vl, + "llama4": run_llama4, "llava": run_llava, "llava-next": run_llava_next, "llava-next-video": run_llava_next_video, @@ -1385,8 +1414,8 @@ model_example_map = { "minicpmv": run_minicpmv, "mistral3": run_mistral3, "mllama": run_mllama, - "llama4": run_llama4, "molmo": run_molmo, + "nemotron_vl": run_nemotron_vl, "NVLM_D": run_nvlm_d, "ovis": run_ovis, "paligemma": run_paligemma, @@ -1401,6 +1430,7 @@ model_example_map = { "qwen2_5_omni": run_qwen2_5_omni, "skywork_chat": run_skyworkr1v, "smolvlm": run_smolvlm, + "step3": run_step3, "tarsier": run_tarsier, "tarsier2": run_tarsier2, } diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index dd50f3639709e..1ab405fa14f3a 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -197,6 +197,53 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_hyperclovax_seed_vision( + question: str, image_urls: list[str] +) -> ModelRequestData: + model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B" + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=16384, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + message = {"role": "user", "content": list()} + for _image_url in image_urls: + message["content"].append( + { + "type": "image", + "image": _image_url, + "ocr": "", + "lens_keywords": "", + "lens_local_keywords": "", + } + ) + message["content"].append( + { + "type": "text", + "text": question, + } + ) + + prompt = tokenizer.apply_chat_template( + [ + message, + ], + tokenize=False, + add_generation_prompt=True, + ) + + return ModelRequestData( + engine_args=engine_args, + 
prompt=prompt, + stop_token_ids=None, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "HuggingFaceM4/Idefics3-8B-Llama3" @@ -225,34 +272,6 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData: ) -def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: - model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" - - # The configuration below has been confirmed to launch on a single L40 GPU. - engine_args = EngineArgs( - model=model_name, - max_model_len=8192, - max_num_seqs=16, - enforce_eager=True, - limit_mm_per_prompt={"image": len(image_urls)}, - mm_processor_kwargs={ - "max_image_size": {"longest_edge": 384}, - }, - ) - - placeholders = "\n".join( - f"Image-{i}: \n" for i, _ in enumerate(image_urls, start=1) - ) - prompt = ( - f"<|im_start|>User:{placeholders}\n{question}\nAssistant:" # noqa: E501 - ) - return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], - ) - - def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "internlm/Intern-S1" @@ -316,49 +335,36 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: ) -def load_hyperclovax_seed_vision( - question: str, image_urls: list[str] -) -> ModelRequestData: - model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B" - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) +def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" engine_args = EngineArgs( model=model_name, - trust_remote_code=True, - max_model_len=16384, + max_model_len=131072, + tensor_parallel_size=8, limit_mm_per_prompt={"image": len(image_urls)}, ) - message = {"role": "user", "content": list()} - for _image_url in image_urls: - message["content"].append( - { - 
"type": "image", - "image": _image_url, - "ocr": "", - "lens_keywords": "", - "lens_local_keywords": "", - } - ) - message["content"].append( + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ { - "type": "text", - "text": question, + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], } - ) + ] - prompt = tokenizer.apply_chat_template( - [ - message, - ], - tokenize=False, - add_generation_prompt=True, + processor = AutoProcessor.from_pretrained(model_name) + + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True ) return ModelRequestData( engine_args=engine_args, prompt=prompt, - stop_token_ids=None, image_data=[fetch_image(url) for url in image_urls], ) @@ -463,40 +469,6 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestDa ) -def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData: - model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" - - engine_args = EngineArgs( - model=model_name, - max_model_len=131072, - tensor_parallel_size=8, - limit_mm_per_prompt={"image": len(image_urls)}, - ) - - placeholders = [{"type": "image", "image": url} for url in image_urls] - messages = [ - { - "role": "user", - "content": [ - *placeholders, - {"type": "text", "text": question}, - ], - } - ] - - processor = AutoProcessor.from_pretrained(model_name) - - prompt = processor.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - - return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], - ) - - def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "Kwai-Keye/Keye-VL-8B-Preview" @@ -954,6 +926,62 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = 
"HuggingFaceTB/SmolVLM2-2.2B-Instruct" + + # The configuration below has been confirmed to launch on a single L40 GPU. + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=16, + enforce_eager=True, + limit_mm_per_prompt={"image": len(image_urls)}, + mm_processor_kwargs={ + "max_image_size": {"longest_edge": 384}, + }, + ) + + placeholders = "\n".join( + f"Image-{i}: \n" for i, _ in enumerate(image_urls, start=1) + ) + prompt = ( + f"<|im_start|>User:{placeholders}\n{question}\nAssistant:" # noqa: E501 + ) + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + +def load_step3(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "stepfun-ai/step3-fp8" + + # NOTE: Below are verified configurations for step3-fp8 + # on 8xH100 GPUs. + engine_args = EngineArgs( + model=model_name, + max_num_batched_tokens=4096, + gpu_memory_utilization=0.85, + tensor_parallel_size=8, + limit_mm_per_prompt={"image": len(image_urls)}, + reasoning_parser="step3", + ) + + prompt = ( + "<|begin▁of▁sentence|> You are a helpful assistant. 
<|BOT|>user\n " + f"{'' * len(image_urls)}{question} <|EOT|><|BOT|" + ">assistant\n\n" + ) + image_data = [fetch_image(url) for url in image_urls] + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=image_data, + ) + + def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "omni-research/Tarsier-7b" @@ -1006,16 +1034,16 @@ model_example_map = { "deepseek_vl_v2": load_deepseek_vl2, "gemma3": load_gemma3, "h2ovl_chat": load_h2ovl, + "hyperclovax_seed_vision": load_hyperclovax_seed_vision, "idefics3": load_idefics3, "interns1": load_interns1, "internvl_chat": load_internvl, - "hyperclovax_seed_vision": load_hyperclovax_seed_vision, "keye_vl": load_keye_vl, "kimi_vl": load_kimi_vl, + "llama4": load_llama4, "llava": load_llava, "llava-next": load_llava_next, "llava-onevision": load_llava_onevision, - "llama4": load_llama4, "mistral3": load_mistral3, "mllama": load_mllama, "NVLM_D": load_nvlm_d, @@ -1028,6 +1056,7 @@ model_example_map = { "qwen2_vl": load_qwen2_vl, "qwen2_5_vl": load_qwen2_5_vl, "smolvlm": load_smolvlm, + "step3": load_step3, "tarsier": load_tarsier, "tarsier2": load_tarsier2, } From e6680f9e25a433bcd754181705e72034ce6c470c Mon Sep 17 00:00:00 2001 From: wuhang Date: Fri, 1 Aug 2025 17:04:16 +0800 Subject: [PATCH 116/224] [Bugfix] Add log prefix in non-dp mode engine core (#21889) Signed-off-by: wuhang --- vllm/entrypoints/cli/serve.py | 11 +---- vllm/entrypoints/openai/api_server.py | 12 ++---- vllm/executor/multiproc_worker_utils.py | 42 ++----------------- vllm/utils/__init__.py | 55 ++++++++++++++++++++++++- vllm/v1/engine/core.py | 22 ++-------- vllm/v1/executor/multiproc_executor.py | 14 +++---- 6 files changed, 75 insertions(+), 81 deletions(-) diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index bdbe71b832f4f..0305354a66e85 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -2,9 +2,7 @@ # SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project import argparse -import os import signal -import sys from typing import Optional import uvloop @@ -18,10 +16,9 @@ from vllm.entrypoints.openai.cli_args import (make_arg_parser, validate_parsed_serve_args) from vllm.entrypoints.utils import (VLLM_SUBCMD_PARSER_EPILOG, show_filtered_argument_or_group_from_help) -from vllm.executor.multiproc_worker_utils import _add_prefix from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, get_tcp_uri +from vllm.utils import FlexibleArgumentParser, decorate_logs, get_tcp_uri from vllm.v1.engine.core import EngineCoreProc from vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines from vllm.v1.executor.abstract import Executor @@ -229,11 +226,7 @@ def run_api_server_worker_proc(listen_address, """Entrypoint for individual API server worker processes.""" # Add process-specific prefix to stdout and stderr. - from multiprocessing import current_process - process_name = current_process().name - pid = os.getpid() - _add_prefix(sys.stdout, process_name, pid) - _add_prefix(sys.stderr, process_name, pid) + decorate_logs() uvloop.run( run_server_worker(listen_address, sock, args, client_config, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 05d9a69a65f83..26db1357da4d0 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -11,7 +11,6 @@ import multiprocessing import os import signal import socket -import sys import tempfile import uuid from argparse import Namespace @@ -95,15 +94,15 @@ from vllm.entrypoints.openai.serving_transcription import ( from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.entrypoints.utils import (cli_env_setup, load_aware_call, log_non_default_args, with_cancellation) -from vllm.executor.multiproc_worker_utils import _add_prefix from vllm.logger import init_logger from 
vllm.reasoning import ReasoningParserManager from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.usage.usage_lib import UsageContext -from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path, - is_valid_ipv6_address, set_process_title, set_ulimit) +from vllm.utils import (Device, FlexibleArgumentParser, decorate_logs, + get_open_zmq_ipc_path, is_valid_ipv6_address, + set_process_title, set_ulimit) from vllm.v1.metrics.prometheus import get_prometheus_registry from vllm.version import __version__ as VLLM_VERSION @@ -1808,10 +1807,7 @@ async def run_server(args, **uvicorn_kwargs) -> None: """Run a single-worker API server.""" # Add process-specific prefix to stdout and stderr. - process_name = "APIServer" - pid = os.getpid() - _add_prefix(sys.stdout, process_name, pid) - _add_prefix(sys.stderr, process_name, pid) + decorate_logs("APIServer") listen_address, sock = setup_server(args) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py index a6c172beff7bb..48b3479ed7997 100644 --- a/vllm/executor/multiproc_worker_utils.py +++ b/vllm/executor/multiproc_worker_utils.py @@ -3,21 +3,20 @@ import asyncio import os -import sys import threading import uuid from dataclasses import dataclass from multiprocessing import Queue from multiprocessing.connection import wait from multiprocessing.process import BaseProcess -from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO, - TypeVar, Union) +from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Union import torch from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import _maybe_force_spawn, get_mp_context, run_method +from vllm.utils import (_maybe_force_spawn, decorate_logs, get_mp_context, + run_method) logger = 
init_logger(__name__) @@ -25,10 +24,6 @@ T = TypeVar('T') _TERMINATE = "TERMINATE" # sentinel -# ANSI color codes -CYAN = '\033[1;36m' -RESET = '\033[0;0m' - JOIN_TIMEOUT_S = 2 @@ -213,9 +208,7 @@ def _run_worker_process( # Add process-specific prefix to stdout and stderr process_name = get_mp_context().current_process().name - pid = os.getpid() - _add_prefix(sys.stdout, process_name, pid) - _add_prefix(sys.stderr, process_name, pid) + decorate_logs(process_name) # Initialize worker worker = worker_factory(vllm_config, rank) @@ -260,33 +253,6 @@ def _run_worker_process( logger.info("Worker exiting") -def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None: - """Prepend each output line with process-specific prefix""" - - prefix = f"{CYAN}({worker_name} pid={pid}){RESET} " - file_write = file.write - - def write_with_prefix(s: str): - if not s: - return - if file.start_new_line: # type: ignore[attr-defined] - file_write(prefix) - idx = 0 - while (next_idx := s.find('\n', idx)) != -1: - next_idx += 1 - file_write(s[idx:next_idx]) - if next_idx == len(s): - file.start_new_line = True # type: ignore[attr-defined] - return - file_write(prefix) - idx = next_idx - file_write(s[idx:]) - file.start_new_line = False # type: ignore[attr-defined] - - file.start_new_line = True # type: ignore[attr-defined] - file.write = write_with_prefix # type: ignore[method-assign] - - def set_multiprocessing_worker_envs(parallel_config): """ Set up environment variables that should be used when there are workers in a multiprocessing environment. 
This should be called by the parent diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index a7f579b0c9c2d..d5d8d9dad73a8 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -47,7 +47,7 @@ from dataclasses import dataclass, field from functools import cache, lru_cache, partial, wraps from types import MappingProxyType from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple, - Optional, Tuple, TypeVar, Union, cast, overload) + Optional, TextIO, Tuple, TypeVar, Union, cast, overload) from urllib.parse import urlparse from uuid import uuid4 @@ -167,6 +167,10 @@ GB_bytes = 1_000_000_000 GiB_bytes = 1 << 30 """The number of bytes in one gibibyte (GiB).""" +# ANSI color codes +CYAN = '\033[1;36m' +RESET = '\033[0;0m' + STR_DTYPE_TO_TORCH_DTYPE = { "half": torch.half, "bfloat16": torch.bfloat16, @@ -3258,3 +3262,52 @@ def set_process_title(name: str, else: name = f"{envs.VLLM_PROCESS_NAME_PREFIX}::{name}" setproctitle.setproctitle(name) + + +def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None: + """Prepend each output line with process-specific prefix""" + + prefix = f"{CYAN}({worker_name} pid={pid}){RESET} " + file_write = file.write + + def write_with_prefix(s: str): + if not s: + return + if file.start_new_line: # type: ignore[attr-defined] + file_write(prefix) + idx = 0 + while (next_idx := s.find('\n', idx)) != -1: + next_idx += 1 + file_write(s[idx:next_idx]) + if next_idx == len(s): + file.start_new_line = True # type: ignore[attr-defined] + return + file_write(prefix) + idx = next_idx + file_write(s[idx:]) + file.start_new_line = False # type: ignore[attr-defined] + + file.start_new_line = True # type: ignore[attr-defined] + file.write = write_with_prefix # type: ignore[method-assign] + + +def decorate_logs(process_name: Optional[str] = None) -> None: + """ + Adds a process-specific prefix to each line of output written to stdout and + stderr. 
+ + This function is intended to be called before initializing the api_server, + engine_core, or worker classes, so that all subsequent output from the + process is prefixed with the process name and PID. This helps distinguish + log output from different processes in multi-process environments. + + Args: + process_name: Optional; the name of the process to use in the prefix. + If not provided, the current process name from the multiprocessing + context is used. + """ + if process_name is None: + process_name = get_mp_context().current_process().name + pid = os.getpid() + _add_prefix(sys.stdout, process_name, pid) + _add_prefix(sys.stderr, process_name, pid) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f9a6315df8af8..6ae5736df98b8 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -3,7 +3,6 @@ import os import queue import signal -import sys import threading import time from collections import deque @@ -19,15 +18,14 @@ import zmq from vllm.config import ParallelConfig, VllmConfig from vllm.distributed import stateless_destroy_torch_distributed_process_group -from vllm.executor.multiproc_worker_utils import _add_prefix from vllm.logger import init_logger from vllm.logging_utils.dump_input import dump_engine_exception from vllm.lora.request import LoRARequest from vllm.tasks import POOLING_TASKS, SupportedTask from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) -from vllm.utils import (make_zmq_socket, resolve_obj_by_qualname, - set_process_title) +from vllm.utils import (decorate_logs, make_zmq_socket, + resolve_obj_by_qualname, set_process_title) from vllm.v1.core.kv_cache_utils import (get_kv_cache_config, unify_kv_cache_configs) from vllm.v1.core.sched.interface import SchedulerInterface @@ -649,12 +647,14 @@ class EngineCoreProc(EngineCore): "vllm_config"].parallel_config if parallel_config.data_parallel_size > 1 or dp_rank > 0: set_process_title("DPEngineCore", str(dp_rank)) + 
decorate_logs() # Set data parallel rank for this engine process. parallel_config.data_parallel_rank = dp_rank parallel_config.data_parallel_rank_local = local_dp_rank engine_core = DPEngineCoreProc(*args, **kwargs) else: set_process_title("EngineCore") + decorate_logs() engine_core = EngineCoreProc(*args, **kwargs) engine_core.run_busy_loop() @@ -905,8 +905,6 @@ class DPEngineCoreProc(EngineCoreProc): log_stats: bool, client_handshake_address: Optional[str] = None, ): - self._decorate_logs() - # Counts forward-passes of the model so that we can synchronize # finished with DP peers every N steps. self.counter = 0 @@ -919,15 +917,6 @@ class DPEngineCoreProc(EngineCoreProc): executor_class, log_stats, client_handshake_address, dp_rank) - def _decorate_logs(self): - # Add process-specific prefix to stdout and stderr before - # we initialize the engine. - from multiprocessing import current_process - process_name = current_process().name - pid = os.getpid() - _add_prefix(sys.stdout, process_name, pid) - _add_prefix(sys.stderr, process_name, pid) - def _init_data_parallel(self, vllm_config: VllmConfig): # Configure GPUs and stateless process group for data parallel. 
@@ -1149,9 +1138,6 @@ class DPEngineCoreActor(DPEngineCoreProc): f"{(local_dp_rank + 1) * world_size}) " f"base value: \"{os.getenv(device_control_env_var)}\"") from e - def _decorate_logs(self): - pass - @contextmanager def _perform_handshakes(self, handshake_address: str, identity: bytes, local_client: bool, vllm_config: VllmConfig, diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 8270385053852..d90051c3224fd 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -4,7 +4,6 @@ import multiprocessing import os import pickle import signal -import sys import threading import time import traceback @@ -28,10 +27,11 @@ from vllm.distributed.device_communicators.shm_broadcast import (Handle, MessageQueue) from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator from vllm.executor.multiproc_worker_utils import ( - _add_prefix, set_multiprocessing_worker_envs) + set_multiprocessing_worker_envs) from vllm.logger import init_logger -from vllm.utils import (get_distributed_init_method, get_loopback_ip, - get_mp_context, get_open_port, set_process_title) +from vllm.utils import (decorate_logs, get_distributed_init_method, + get_loopback_ip, get_mp_context, get_open_port, + set_process_title) from vllm.v1.executor.abstract import Executor, FailureCallback from vllm.v1.outputs import ModelRunnerOutput from vllm.worker.worker_base import WorkerWrapperBase @@ -382,11 +382,11 @@ class WorkerProc: pp_str = f"PP{rank // tp_size}" if pp_size > 1 else "" tp_str = f"TP{rank % tp_size}" if tp_size > 1 else "" suffix = f"{pp_str}{'_' if pp_str and tp_str else ''}{tp_str}" + process_name = "VllmWorker" if suffix: set_process_title(suffix, append=True) - pid = os.getpid() - _add_prefix(sys.stdout, f"VllmWorker rank={rank}", pid) - _add_prefix(sys.stderr, f"VllmWorker rank={rank}", pid) + process_name = f"{process_name} {suffix}" + decorate_logs(process_name) # Initialize 
MessageQueue for receiving SchedulerOutput self.rpc_broadcast_mq = MessageQueue.create_from_handle( From 0f81b310db013ec9fbc1deb9de97bd9b2a9af62f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 1 Aug 2025 02:11:40 -0700 Subject: [PATCH 117/224] [Misc] Remove upper bound in openai package version (#22060) Signed-off-by: Woosuk Kwon --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index d29b3e59d35b2..6b57a3d2f1d0d 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -13,7 +13,7 @@ tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. aiohttp -openai >= 1.87.0, <= 1.90.0 # Ensure modern openai package (ensure ResponsePrompt exists in type.responses and max_completion_tokens field support) +openai >= 1.87.0 # Ensure modern openai package (ensure ResponsePrompt exists in type.responses and max_completion_tokens field support) pydantic >= 2.10 prometheus_client >= 0.18.0 pillow # Required for image processing From 49314869887e169be080201ab8bcda14e745c080 Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Fri, 1 Aug 2025 17:11:56 +0800 Subject: [PATCH 118/224] [Doc] Added warning of speculating with draft model (#22047) Signed-off-by: Dilute-l Co-authored-by: Dilute-l --- docs/features/spec_decode.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/features/spec_decode.md b/docs/features/spec_decode.md index be4b91feda7aa..89d5b489e1888 100644 --- a/docs/features/spec_decode.md +++ b/docs/features/spec_decode.md @@ -15,6 +15,10 @@ Speculative decoding is a technique which improves inter-token latency in memory The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 
tokens at a time. +!!! warning + In vllm v0.10.0, speculative decoding with a draft model is not supported. + If you use the following code, you will get a `NotImplementedError`. + ??? code ```python From 28b18cc741e596ea6f9981b8365c4819523fc24b Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 1 Aug 2025 19:09:54 +0800 Subject: [PATCH 119/224] [Quantization] Enable BNB support for InternS1 (#21953) Signed-off-by: Jee Jee Li --- .../model_loader/bitsandbytes_loader.py | 39 ++++++++++++------- vllm/model_executor/utils.py | 20 +++++++++- 2 files changed, 43 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index 68fcb785691c8..f54dfab5238e1 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -34,7 +34,8 @@ from vllm.model_executor.model_loader.weight_utils import ( filter_duplicate_safetensors_files, filter_files_not_needed_for_inference, pt_weights_iterator, safetensors_weights_iterator) from vllm.model_executor.models import is_pooling_model -from vllm.model_executor.utils import (get_packed_modules_mapping, +from vllm.model_executor.utils import (get_moe_expert_mapping, + get_packed_modules_mapping, set_weight_attrs) from vllm.platforms import current_platform @@ -43,6 +44,12 @@ from vllm.platforms import current_platform logger = init_logger(__name__) +def is_moe_model(model: torch.nn.Module) -> bool: + """Checks if the model contains FusedMoE layers.""" + return bool(any( + isinstance(module, FusedMoE) for module in model.modules())) + + class BitsAndBytesModelLoader(BaseModelLoader): """Model loader to load model weights with BitAndBytes quantization.""" @@ -61,6 +68,8 @@ class BitsAndBytesModelLoader(BaseModelLoader): # Store all module names (from transformers) that support # BNB quantization. 
self.target_modules: list[str] = [] + # Store the mapping of expert parameters for MoE models. + self.expert_params_mapping: list[tuple[str, str, int, str]] = [] # mapping weight names from transformers to vllm. self.weight_mapper: Callable = lambda name: name self.pre_quant: bool = False @@ -413,13 +422,8 @@ class BitsAndBytesModelLoader(BaseModelLoader): # in case model has a mixture of disk-merged and disk-split # weights with same last name. self.target_modules.append(name) - elif (isinstance(module, FusedMoE) - and hasattr(module.quant_method, "quant_config")): - if not hasattr(model, "get_expert_mapping"): - raise AttributeError( - f"MoE Model {type(model).__name__} does not support " - "BitsAndBytes quantization yet. Ensure this model has " - "'get_expert_mapping' method.") + elif isinstance(module, FusedMoE) and hasattr( + module.quant_method, "quant_config"): # TODO: support FusedMoE with prequant and 8bit. if self.pre_quant: raise ValueError( @@ -430,9 +434,9 @@ class BitsAndBytesModelLoader(BaseModelLoader): "BitsAndBytes 8bit quantization with FusedMoE is not " "supported yet.") # Get the corresponding weight name using module name and - # get_expert_mapping. - expert_mapping = model.get_expert_mapping() - for exp in expert_mapping: + # expert_params_mapping. 
+ + for exp in self.expert_params_mapping: weight_name = exp[1] rep_name = name.replace("experts", "") + weight_name.removesuffix(".") @@ -464,7 +468,7 @@ class BitsAndBytesModelLoader(BaseModelLoader): elif isinstance(module, (RowParallelLinear, )): self.column_sharded_weights_modules.append(name) elif isinstance(module, FusedMoE): - expert_mapping = model.get_expert_mapping() + expert_mapping = self.expert_params_mapping for exp in expert_mapping: if exp[-1] == "w2": weight_name = exp[1] @@ -516,6 +520,13 @@ class BitsAndBytesModelLoader(BaseModelLoader): self.is_pool_model = is_pooling_model(model) self.modules_mapping = ParamMapping(get_packed_modules_mapping(model)) + if is_moe_model(model): + self.expert_params_mapping = get_moe_expert_mapping(model) + if not self.expert_params_mapping: + raise AttributeError( + f"MoE Model {type(model).__name__} does not support " + "BitsAndBytes quantization yet. Ensure this model has " + "'get_expert_mapping' method.") # For some models like Molmo, we need to use hf_to_vllm_mapper # to ensure correct loading of weights. 
if hf_to_vllm_mapper := getattr(model, "hf_to_vllm_mapper", None): @@ -569,10 +580,10 @@ class BitsAndBytesModelLoader(BaseModelLoader): """ from bitsandbytes.functional import QuantState - if not hasattr(model, "get_expert_mapping"): + if not self.expert_params_mapping: return dict() - expert_mapping = model.get_expert_mapping() + expert_mapping = self.expert_params_mapping expert_qs_dict = {} for name, module in model.named_modules(): if not isinstance(module, FusedMoE): diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 2b20ca2a3ba3f..41ed0b09c5a2a 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utils for model executor.""" + import copy from typing import Any, Optional @@ -9,6 +10,7 @@ import torch def set_random_seed(seed: int) -> None: from vllm.platforms import current_platform + current_platform.seed_everything(seed) @@ -29,7 +31,7 @@ def set_weight_attrs( return for key, value in weight_attrs.items(): assert not hasattr( - weight, key), (f"Overwriting existing tensor attribute: {key}") + weight, key), f"Overwriting existing tensor attribute: {key}" # NOTE(woosuk): During weight loading, we often do something like: # narrowed_tensor = param.data.narrow(0, offset, len) @@ -41,6 +43,7 @@ def set_weight_attrs( # we sync the param tensor after its weight loader is called. # TODO(woosuk): Remove this hack once we have a better solution. 
from vllm.platforms import current_platform + if current_platform.is_tpu() and key == "weight_loader": value = _make_synced_weight_loader(value) setattr(weight, key, value) @@ -77,4 +80,17 @@ def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]: f"safely because of conflicts from {type(child).__name__}.") else: parent_map.update(child_map) - return parent_map \ No newline at end of file + return parent_map + + +def get_moe_expert_mapping( + model: torch.nn.Module, ) -> list[tuple[str, str, int, str]]: + if parent_map := getattr(model, "get_expert_mapping", None): + return parent_map() + else: + # We only check main components instead of whole model submodules + for child in model.children(): + child_map = getattr(child, "get_expert_mapping", None) + if child_map is not None: + return child_map() + return [] From 87c94bc87943818ad039d5c916df793fbd081e6a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 1 Aug 2025 13:24:46 +0100 Subject: [PATCH 120/224] Revert "Update sampling_metadata.py (#21937)" (#22088) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/sampling_metadata.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 66bcf1c4bfe50..56f0f0984bfa0 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -539,37 +539,37 @@ class SamplingTensors: temperatures_t = torch.tensor( temperatures, device="cpu", - dtype=torch.float32, + dtype=dtype, pin_memory=pin_memory, ) top_ps_t = torch.tensor( top_ps, device="cpu", - dtype=torch.float32, + dtype=dtype, pin_memory=pin_memory, ) min_ps_t = torch.tensor( min_ps, device="cpu", - dtype=torch.float32, + dtype=dtype, pin_memory=pin_memory, ) presence_penalties_t = torch.tensor( presence_penalties, device="cpu", - dtype=torch.float32, + 
dtype=dtype, pin_memory=pin_memory, ) frequency_penalties_t = torch.tensor( frequency_penalties, device="cpu", - dtype=torch.float32, + dtype=dtype, pin_memory=pin_memory, ) repetition_penalties_t = torch.tensor( repetition_penalties, device="cpu", - dtype=torch.float32, + dtype=dtype, pin_memory=pin_memory, ) top_ks_t = torch.tensor( From dfbc1f88807a1bddb75fc1dd587922567d7c133f Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 1 Aug 2025 08:25:18 -0400 Subject: [PATCH 121/224] [Speculative Decoding] Add `speculators` config support (#21345) --- .../speculators/test_eagle3.py | 16 ++++ vllm/config.py | 20 +++- vllm/engine/arg_utils.py | 22 ++++- vllm/model_executor/models/llama_eagle3.py | 26 +++++- vllm/transformers_utils/config.py | 32 ++++++- vllm/transformers_utils/configs/__init__.py | 2 + .../configs/speculators/__init__.py | 2 + .../configs/speculators/algos.py | 32 +++++++ .../configs/speculators/base.py | 91 +++++++++++++++++++ 9 files changed, 232 insertions(+), 11 deletions(-) create mode 100644 tests/speculative_decoding/speculators/test_eagle3.py create mode 100644 vllm/transformers_utils/configs/speculators/__init__.py create mode 100644 vllm/transformers_utils/configs/speculators/algos.py create mode 100644 vllm/transformers_utils/configs/speculators/base.py diff --git a/tests/speculative_decoding/speculators/test_eagle3.py b/tests/speculative_decoding/speculators/test_eagle3.py new file mode 100644 index 0000000000000..c58fc8c0dc5f4 --- /dev/null +++ b/tests/speculative_decoding/speculators/test_eagle3.py @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch + + +@pytest.mark.parametrize( + "model_path", + [("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717"), + ("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized")]) +def test_llama(vllm_runner, example_prompts, model_path): + with vllm_runner(model_path, 
dtype=torch.bfloat16) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens=20) + print(vllm_outputs) + assert vllm_outputs diff --git a/vllm/config.py b/vllm/config.py index 93daab7d6ae97..2d61552c5dadc 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -39,8 +39,8 @@ from vllm.transformers_utils.config import ( ConfigFormat, get_config, get_hf_image_processor_config, get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, - try_get_generation_config, try_get_safetensors_metadata, - try_get_tokenizer_config, uses_mrope) + maybe_override_with_speculators_target_model, try_get_generation_config, + try_get_safetensors_metadata, try_get_tokenizer_config, uses_mrope) from vllm.transformers_utils.s3_utils import S3Model from vllm.transformers_utils.utils import is_s3, maybe_model_redirect # yapf conflicts with isort for this block @@ -535,6 +535,15 @@ class ModelConfig: "affect the random state of the Python process that " "launched vLLM.", self.seed) + if self.runner != "draft": + # If we're not running the draft model, check for speculators config + # If speculators config, set model / tokenizer to be target model + self.model, self.tokenizer = maybe_override_with_speculators_target_model( # noqa: E501 + model=self.model, + tokenizer=self.tokenizer, + revision=self.revision, + trust_remote_code=self.trust_remote_code) + # Keep set served_model_name before maybe_model_redirect(self.model) self.served_model_name = get_served_model_name(self.model, self.served_model_name) @@ -606,8 +615,8 @@ class ModelConfig: self.config_format, hf_overrides_kw=hf_overrides_kw, hf_overrides_fn=hf_overrides_fn) - self.hf_config = hf_config + self.hf_config = hf_config self.hf_text_config = get_hf_text_config(self.hf_config) self.attention_chunk_size = getattr(self.hf_text_config, "attention_chunk_size", None) @@ -2980,10 +2989,13 @@ class SpeculativeConfig: "Chunked prefill and EAGLE are not compatible " 
"when using V0.") + from vllm.transformers_utils.configs import ( + SpeculatorsConfig) from vllm.transformers_utils.configs.eagle import ( EAGLEConfig) + if isinstance(self.draft_model_config.hf_config, - EAGLEConfig): + (EAGLEConfig, SpeculatorsConfig)): pass else: eagle_config = EAGLEConfig( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 78272d983eaf5..efa077a88270a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -978,8 +978,28 @@ class EngineArgs: provided as a JSON string input via CLI arguments or directly as a dictionary from the engine. """ + + from vllm.transformers_utils.config import get_config + from vllm.transformers_utils.configs.speculators.base import ( + SpeculatorsConfig) + if self.speculative_config is None: - return None + hf_config = get_config(self.hf_config_path or self.model, + self.trust_remote_code, self.revision, + self.code_revision, self.config_format) + + # if loading a SpeculatorsConfig, load the specualtive_config + # details from the config directly + # no user input required / expected + if isinstance(hf_config, SpeculatorsConfig): + # We create one since we dont create one + self.speculative_config = {} + self.speculative_config[ + "num_speculative_tokens"] = hf_config.num_lookahead_tokens + self.speculative_config["model"] = self.model + self.speculative_config["method"] = hf_config.method + else: + return None # Note(Shangming): These parameters are not obtained from the cli arg # '--speculative-config' and must be passed in when creating the engine diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 71275f0d58579..572930c39a846 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -51,6 +51,25 @@ class LlamaDecoderLayer(LlamaDecoderLayer): self.hidden_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + if getattr(config, "norm_before_residual", False): + 
self._residual_norm = self._norm_before_residual + else: + self._residual_norm = self._norm_after_residual + + def _norm_before_residual( + self, + hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + hidden_states = self.hidden_norm(hidden_states) + residual = hidden_states + return hidden_states, residual + + def _norm_after_residual( + self, + hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + residual = hidden_states + hidden_states = self.hidden_norm(hidden_states) + return hidden_states, residual + def forward( self, positions: torch.Tensor, @@ -59,9 +78,10 @@ class LlamaDecoderLayer(LlamaDecoderLayer): residual: Optional[torch.Tensor], ) -> tuple[torch.Tensor, torch.Tensor]: - residual = hidden_states embeds = self.input_layernorm(embeds) - hidden_states = self.hidden_norm(hidden_states) + + hidden_states, residual = self._residual_norm( + hidden_states=hidden_states) hidden_states = torch.cat([embeds, hidden_states], dim=-1) # Self Attention @@ -102,7 +122,7 @@ class LlamaModel(nn.Module): self.layers = nn.ModuleList([ LlamaDecoderLayer( - self.config, + config=self.config, prefix=maybe_prefix(prefix, f"layers.{start_layer_id}"), ) ]) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index fcaa48c1392a3..0e633c2c0b6ae 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -35,8 +35,9 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, DeepseekVLV2Config, MllamaConfig, MLPSpeculatorConfig, Nemotron_Nano_VL_Config, NemotronConfig, NVLM_D_Config, - RWConfig, Step3TextConfig, - Step3VLConfig, UltravoxConfig) + RWConfig, SpeculatorsConfig, + Step3TextConfig, Step3VLConfig, + UltravoxConfig) # yapf: enable from vllm.transformers_utils.configs.mistral import adapt_config_dict from vllm.transformers_utils.utils import check_gguf_file @@ -81,6 +82,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = { "mlp_speculator": MLPSpeculatorConfig, "medusa": 
MedusaConfig, "eagle": EAGLEConfig, + "speculators": SpeculatorsConfig, "nemotron": NemotronConfig, "NVLM_D": NVLM_D_Config, "ultravox": UltravoxConfig, @@ -287,6 +289,27 @@ def _maybe_remap_hf_config_attrs(config: PretrainedConfig) -> PretrainedConfig: return config +def maybe_override_with_speculators_target_model( + model: str, + tokenizer: str, + trust_remote_code: bool, + revision: Optional[str] = None) -> tuple[str, str]: + """ + If running a speculators config, override running model with target model + """ + config_dict, _ = PretrainedConfig.get_config_dict( + model, + revision=revision, + trust_remote_code=trust_remote_code, + token=_get_hf_token(), + ) + spec_config = config_dict.get("speculators_config") + # Return the target model + if spec_config is not None: + model = tokenizer = spec_config["verifier"]["name_or_path"] + return model, tokenizer + + def get_config( model: Union[str, Path], trust_remote_code: bool, @@ -345,9 +368,12 @@ def get_config( token=_get_hf_token(), **kwargs, ) - # Use custom model class if it's in our registry model_type = config_dict.get("model_type") + if model_type is None: + model_type = "speculators" if config_dict.get( + "speculators_config") is not None else model_type + if model_type in _CONFIG_REGISTRY: config_class = _CONFIG_REGISTRY[model_type] config = config_class.from_pretrained( diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 96733da726181..64ace167a5a00 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -24,6 +24,7 @@ from vllm.transformers_utils.configs.nemotron import NemotronConfig from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config +from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig from 
vllm.transformers_utils.configs.step3_vl import (Step3TextConfig, Step3VisionEncoderConfig, Step3VLConfig) @@ -44,6 +45,7 @@ __all__ = [ "NemotronHConfig", "Nemotron_Nano_VL_Config", "NVLM_D_Config", + "SpeculatorsConfig", "UltravoxConfig", "Step3VLConfig", "Step3VisionEncoderConfig", diff --git a/vllm/transformers_utils/configs/speculators/__init__.py b/vllm/transformers_utils/configs/speculators/__init__.py new file mode 100644 index 0000000000000..208f01a7cb5ee --- /dev/null +++ b/vllm/transformers_utils/configs/speculators/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/vllm/transformers_utils/configs/speculators/algos.py b/vllm/transformers_utils/configs/speculators/algos.py new file mode 100644 index 0000000000000..efc87b6bcf26f --- /dev/null +++ b/vllm/transformers_utils/configs/speculators/algos.py @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +SUPPORTED_SPECULATORS_TYPES = {} + + +def register_speculator(name): + + def decorator(fn): + SUPPORTED_SPECULATORS_TYPES[name] = fn + return fn + + return decorator + + +@register_speculator("eagle3") +def update_eagle3(config_dict: dict, vllm_config: dict) -> None: + """ + Apply Eagle-3 specific configuration transformations. 
+ + Eagle-3 specific fields: + - draft_vocab_size: Size of the draft model's vocabulary + - target_hidden_size: Hidden size of the target model + - norm_before_residual: Whether to apply norm before residual connection + """ + + vllm_config["draft_vocab_size"] = config_dict.get("draft_vocab_size") + if config_dict.get("target_hidden_size") is not None: + vllm_config["target_hidden_size"] = config_dict["target_hidden_size"] + vllm_config["norm_before_residual"] = config_dict.get( + "norm_before_residual", True) + vllm_config["architectures"] = ["Eagle3LlamaForCausalLM"] diff --git a/vllm/transformers_utils/configs/speculators/base.py b/vllm/transformers_utils/configs/speculators/base.py new file mode 100644 index 0000000000000..d7c16e180c709 --- /dev/null +++ b/vllm/transformers_utils/configs/speculators/base.py @@ -0,0 +1,91 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os +from typing import Any, Union + +from transformers import PretrainedConfig + +from vllm.transformers_utils.configs.speculators.algos import ( + SUPPORTED_SPECULATORS_TYPES) + +__all__ = ["SpeculatorsConfig"] + + +class SpeculatorsConfig(PretrainedConfig): + model_type = "speculators" + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: Union[str, os.PathLike], + **kwargs, + ) -> "SpeculatorsConfig": + """Load speculators Eagle config and convert to vLLM format.""" + config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path, + **kwargs) + + speculators_model_type = config_dict.get("speculators_model_type") + if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES: + raise ValueError( + f"Expected one of: {SUPPORTED_SPECULATORS_TYPES}. 
" + "Please ensure you're loading a speculators-format model.") + + # validate fields + # TODO: @dsikka - use speculators pydantic model to validate + cls.validate_speculators_config(config_dict=config_dict) + # Convert from speculators config -> format that can be ingested by vLLM + vllm_config = cls.convert_speculators_to_vllm(config_dict=config_dict) + # Apply anything specific to the supported algorithm + algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type] + algo_updater(config_dict=config_dict, vllm_config=vllm_config) + return cls(**vllm_config) + + @classmethod + def validate_speculators_config(cls, config_dict: dict[str, Any]) -> None: + try: + spec_config = config_dict["speculators_config"] + methods = spec_config["proposal_methods"] + first_method = methods[0] + _ = first_method["speculative_tokens"] + _ = spec_config["verifier"]["name_or_path"] + _ = config_dict["speculators_model_type"] + except (KeyError, IndexError, TypeError) as e: + raise ValueError("Invalid speculators config structure") from e + + if "transformer_layer_config" not in config_dict: + raise ValueError("Must provide transformer_layer_config") + + if not isinstance(config_dict["transformer_layer_config"], dict): + raise TypeError( + "'transformer_layer_config' must be a dictionary if provided") + + @classmethod + def convert_speculators_to_vllm( + cls, config_dict: dict[str, Any]) -> dict[str, Any]: + """ + Convert speculators config format to vLLM format. + + This method handles the translation of field names and structure + between speculators and vLLM formats. + + Returns: + Dictionary with vLLM-compatible configuration + """ + # Currently we only support one proposal method + spec_config = config_dict["speculators_config"] + first_method = spec_config.get("proposal_methods")[0] + num_lookahead_tokens = first_method.get("speculative_tokens") + + if num_lookahead_tokens is None: + raise ValueError( + "Missing 'speculative_tokens' in proposal method. 
" + f"Got: {first_method}") + + # Build base vLLM config + vllm_config = { + "method": config_dict.get("speculators_model_type"), + "num_lookahead_tokens": num_lookahead_tokens, + "target_model": spec_config.get("verifier")["name_or_path"] + } + vllm_config.update(config_dict["transformer_layer_config"]) + return vllm_config From 26b5f7bd2a4005dccb797804c93cbce329253003 Mon Sep 17 00:00:00 2001 From: TJian Date: Fri, 1 Aug 2025 05:25:20 -0700 Subject: [PATCH 122/224] [BUG] [ROCm] Fix import bug on ROCm (#22083) Signed-off-by: tjtanaa --- vllm/compilation/pass_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 11e03daced160..54f00d5415216 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -7,7 +7,7 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform -if current_platform.is_cuda_alike(): +if current_platform.is_cuda(): from .fusion import FusionPass from .collective_fusion import AllReduceFusionPass, AsyncTPPass from .fusion_attn import AttnFusionPass From fb0e0d46fc443f08bc2a859b839f0f66c6a7f670 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 1 Aug 2025 13:26:42 +0100 Subject: [PATCH 123/224] Fix `get_kwargs` for case where type hint is `list[Union[str, type]]` (#22016) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/engine/test_arg_utils.py | 7 ++++++- vllm/engine/arg_utils.py | 10 ++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 1d1926068d28c..c282bf002304a 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -5,7 +5,7 @@ import json from argparse import ArgumentError from contextlib import nullcontext from dataclasses import dataclass, field -from typing 
import Annotated, Literal, Optional +from typing import Annotated, Literal, Optional, Union import pytest @@ -136,6 +136,8 @@ class DummyConfig: """List with variable length""" list_literal: list[Literal[1, 2]] = field(default_factory=list) """List with literal choices""" + list_union: list[Union[str, type[object]]] = field(default_factory=list) + """List with union type""" literal_literal: Literal[Literal[1], Literal[2]] = 1 """Literal of literals with default 1""" json_tip: dict = field(default_factory=dict) @@ -187,6 +189,9 @@ def test_get_kwargs(): assert kwargs["list_literal"]["type"] is int assert kwargs["list_literal"]["nargs"] == "+" assert kwargs["list_literal"]["choices"] == [1, 2] + # lists with unions should become str type. + # If not, we cannot know which type to use for parsing + assert kwargs["list_union"]["type"] is str # literals of literals should have merged choices assert kwargs["literal_literal"]["choices"] == [1, 2] # dict should have json tip in help diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index efa077a88270a..f938f19b90469 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -217,10 +217,12 @@ Additionally, list elements can be passed individually using `+`: elif contains_type(type_hints, list): type_hint = get_type(type_hints, list) types = get_args(type_hint) - assert len(types) == 1, ( - "List type must have exactly one type. Got " - f"{type_hint} with types {types}") - kwargs[name]["type"] = types[0] + list_type = types[0] + if get_origin(list_type) is Union: + msg = "List type must contain str if it is a Union." 
+ assert str in get_args(list_type), msg + list_type = str + kwargs[name]["type"] = list_type kwargs[name]["nargs"] = "+" elif contains_type(type_hints, int): kwargs[name]["type"] = int From f81c1bb05504672ddd66905161c6ada549fd4b85 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 1 Aug 2025 08:28:45 -0400 Subject: [PATCH 124/224] [Bugfix] Check NVIDIA artifactory is accessible before using flashinfer cubin kernels (#21893) --- vllm/attention/backends/flashinfer.py | 46 +------------- vllm/utils/flashinfer.py | 81 +++++++++++++++++++++++- vllm/v1/attention/backends/flashinfer.py | 49 +------------- vllm/v1/attention/backends/mla/common.py | 16 ++--- 4 files changed, 93 insertions(+), 99 deletions(-) diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 824ff8cca201a..b3372ce2eca8c 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -44,9 +44,9 @@ from vllm.attention.layer import Attention from vllm.attention.ops.paged_attn import PagedAttention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger -from vllm.platforms import current_platform from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype, make_tensor_with_pad) +from vllm.utils.flashinfer import use_trtllm_decode_attention logger = init_logger(__name__) @@ -56,7 +56,6 @@ if TYPE_CHECKING: class FlashInferBackend(AttentionBackend): - cached_sm100a_supported: Optional[bool] = None @staticmethod def get_name() -> str: @@ -123,47 +122,6 @@ class FlashInferBackend(AttentionBackend): else: raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}") - @staticmethod - def use_trtllm_decode_attention( - batch_size: int, - max_seq_len: int, - kv_cache_dtype: str, - num_qo_heads: Optional[int], - num_kv_heads: Optional[int], - attn_head_size: Optional[int], - ) -> bool: - if FlashInferBackend.cached_sm100a_supported is None: - FlashInferBackend.cached_sm100a_supported 
= ( - current_platform.has_device_capability(100)) - if not FlashInferBackend.cached_sm100a_supported: - return False - # Check if the dimensions are supported by TRTLLM decode attention - if (attn_head_size is None or num_qo_heads is None - or num_kv_heads is None or num_qo_heads // num_kv_heads > 8 - or num_qo_heads % num_kv_heads != 0 or attn_head_size != 128): - return False - env_value = envs.VLLM_USE_TRTLLM_DECODE_ATTENTION - if env_value is not None: - logger.info_once("VLLM_USE_TRTLLM_DECODE_ATTENTION is set to %s", - env_value) - # Environment variable is set - respect it - # Making the conditional check for zero because - # the path is automatically enabled if the batch size condition - # is satisfied. - no_use_trtllm = (env_value == "0") - if not no_use_trtllm: - logger.info_once("Using TRTLLM decode attention.") - return not no_use_trtllm - else: - # Environment variable not set - use auto-detection - use_trtllm = (FlashInferBackend.cached_sm100a_supported - and batch_size <= 256 and max_seq_len < 131072 - and kv_cache_dtype == "auto") - if use_trtllm: - logger.warning_once( - "Using TRTLLM decode attention (auto-detected).") - return use_trtllm - @dataclass class PerLayerParameters: @@ -1156,7 +1114,7 @@ class FlashInferImpl(AttentionImpl): assert decode_meta.decode_wrapper._sm_scale == softmax_scale # TODO: @pavanimajety Remove this once the switch happens # inside flashinfer. 
- if not FlashInferBackend.use_trtllm_decode_attention( + if not use_trtllm_decode_attention( num_decode_tokens, attn_metadata.max_decode_seq_len, kv_cache_dtype, attn_metadata.num_qo_heads, attn_metadata.num_kv_heads, attn_metadata.head_dim): diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 3bfb9808c0a00..29967bc516715 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -10,12 +10,25 @@ import contextlib import functools import importlib import importlib.util -from typing import Any, Callable, NoReturn +import os +from typing import Any, Callable, NoReturn, Optional +import requests + +import vllm.envs as envs from vllm.logger import init_logger +from vllm.platforms import current_platform logger = init_logger(__name__) +# This is the storage path for the cubins, it can be replaced +# with a local path for testing. +# Referenced from https://github.com/flashinfer-ai/flashinfer/blob/0c9a92c3d9a7e043ab6f3f7b2273269caf6ab044/flashinfer/jit/cubin_loader.py#L35 # noqa: E501 +FLASHINFER_CUBINS_REPOSITORY = os.environ.get( + "FLASHINFER_CUBINS_REPOSITORY", + "https://edge.urm.nvidia.com/artifactory/sw-kernelinferencelibrary-public-generic-local/", # noqa: E501 +) + @functools.cache def has_flashinfer() -> bool: @@ -108,6 +121,70 @@ def has_flashinfer_cutlass_fused_moe() -> bool: return True +@functools.cache +def has_nvidia_artifactory() -> bool: + """Return ``True`` if NVIDIA's artifactory is accessible. + + This checks connectivity to the kernel inference library artifactory + which is required for downloading certain cubin kernels like TRTLLM FHMA. 
+ """ + try: + # Use a short timeout to avoid blocking for too long + response = requests.get(FLASHINFER_CUBINS_REPOSITORY, timeout=5) + accessible = response.status_code == 200 + if accessible: + logger.debug_once("NVIDIA artifactory is accessible") + else: + logger.warning_once( + "NVIDIA artifactory returned failed status code: %d", + response.status_code) + return accessible + except Exception as e: + logger.warning_once("Failed to connect to NVIDIA artifactory: %s", e) + return False + + +def use_trtllm_decode_attention( + num_tokens: int, + max_seq_len: int, + kv_cache_dtype: str, + num_qo_heads: Optional[int], + num_kv_heads: Optional[int], + attn_head_size: Optional[int], +) -> bool: + # Requires SM100 and NVIDIA artifactory to be accessible to download cubins + if not (current_platform.is_device_capability(100) + and has_nvidia_artifactory()): + return False + + # Check if the dimensions are supported by TRTLLM decode attention + if (attn_head_size is None or num_qo_heads is None or num_kv_heads is None + or num_qo_heads // num_kv_heads > 8 + or num_qo_heads % num_kv_heads != 0 or attn_head_size != 128): + return False + + env_value = envs.VLLM_USE_TRTLLM_DECODE_ATTENTION + if env_value is not None: + logger.info_once("VLLM_USE_TRTLLM_DECODE_ATTENTION is set to %s", + env_value) + # Environment variable is set - respect it + # Making the conditional check for zero because + # the path is automatically enabled if the batch size condition + # is satisfied. 
+ no_use_trtllm = (env_value == "0") + if not no_use_trtllm: + logger.info_once("Using TRTLLM decode attention.") + return not no_use_trtllm + else: + # Environment variable not set - use auto-detection + use_trtllm = (num_tokens <= 256 and max_seq_len < 131072 + and kv_cache_dtype == "auto") + if use_trtllm: + logger.warning_once( + "Using TRTLLM decode attention (auto-detected).") + return use_trtllm + + __all__ = [ "has_flashinfer", "flashinfer_trtllm_fp8_block_scale_moe", @@ -117,4 +194,6 @@ __all__ = [ "autotune", "has_flashinfer_moe", "has_flashinfer_cutlass_fused_moe", + "has_nvidia_artifactory", + "use_trtllm_decode_attention", ] diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 27552f0e7c1ef..f8af1d7e41831 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -17,8 +17,8 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionType) from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.platforms import current_platform from vllm.utils import cdiv +from vllm.utils.flashinfer import use_trtllm_decode_attention from vllm.v1.attention.backends.flash_attn import use_cascade_attention from vllm.v1.attention.backends.utils import ( AttentionMetadataBuilder, CommonAttentionMetadata, get_kv_cache_layout, @@ -38,7 +38,6 @@ logger = init_logger(__name__) class FlashInferBackend(AttentionBackend): accept_output_buffer: bool = True - cached_sm100a_supported: Optional[bool] = None @classmethod def get_supported_dtypes(cls) -> list[torch.dtype]: @@ -98,48 +97,6 @@ class FlashInferBackend(AttentionBackend): raise ValueError(f"Unknown cache layout format {cache_layout}.") return stride_order - @staticmethod - def use_trtllm_decode_attention( - batch_size: int, - max_seq_len: int, - kv_cache_dtype: str, - num_qo_heads: int, - num_kv_heads: int, - attn_head_size: int, - ) -> bool: - if 
FlashInferBackend.cached_sm100a_supported is None: - FlashInferBackend.cached_sm100a_supported = ( - current_platform.has_device_capability(100)) - if not FlashInferBackend.cached_sm100a_supported: - return False - if (num_qo_heads // num_kv_heads > 8 - or num_qo_heads % num_kv_heads != 0 or attn_head_size != 128): - return False - env_value = envs.VLLM_USE_TRTLLM_DECODE_ATTENTION - if env_value is not None: - logger.info_once("VLLM_USE_TRTLLM_DECODE_ATTENTION is set to %s", - env_value) - # Environment variable is set - respect it - # Making the conditional check for zero because - # the path is automatically enabled if the batch size condition - # is satisfied. - no_use_trtllm = env_value == "0" - if not no_use_trtllm: - logger.info_once( - "VLLM_USE_TRTLLM_DECODE_ATTENTION is set to 1, " - "using TRTLLM decode attention.") - return not no_use_trtllm - else: - # Environment variable not set - use auto-detection - # Only supports attention head size of 128 - use_trtllm = (FlashInferBackend.cached_sm100a_supported - and batch_size <= 256 and max_seq_len < 131072 - and kv_cache_dtype == "auto") - if use_trtllm: - logger.warning_once( - "Using TRTLLM decode attention (auto-detected).") - return use_trtllm - @staticmethod def get_fp8_dtype_for_flashinfer(kv_cache_dtype: str) -> torch.dtype: if kv_cache_dtype in ("fp8", "fp8_e4m3"): @@ -352,7 +309,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): if num_decodes > 0: attn_metadata.decode_wrapper = self._get_decode_wrapper() - if not FlashInferBackend.use_trtllm_decode_attention( + if not use_trtllm_decode_attention( num_decodes, attn_metadata.max_seq_len, self.cache_config.cache_dtype, attn_metadata.num_qo_heads, attn_metadata.num_kv_heads, @@ -636,7 +593,7 @@ class FlashInferImpl(AttentionImpl): decode_query = query[:num_decode_tokens] assert decode_query.shape[0] == num_decode_tokens assert decode_wrapper is not None - if not FlashInferBackend.use_trtllm_decode_attention( + if not 
use_trtllm_decode_attention( attn_metadata.num_decodes, attn_metadata.max_seq_len, self.kv_cache_dtype, attn_metadata.num_qo_heads, attn_metadata.num_kv_heads, attn_metadata.head_dim): diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 0095d75217856..d112468f1c91d 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -209,6 +209,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, UnquantizedLinearMethod) from vllm.platforms import current_platform from vllm.utils import cdiv, round_down +from vllm.utils.flashinfer import has_nvidia_artifactory from vllm.v1.attention.backends.utils import ( AttentionMetadataBuilder, CommonAttentionMetadata, get_per_layer_parameters, infer_global_hyperparameters, @@ -379,17 +380,16 @@ M = TypeVar("M", bound=MLACommonMetadata) def use_flashinfer_prefill() -> bool: - if flashinfer_available and not envs.VLLM_USE_CUDNN_PREFILL: - # For blackwell default to flashinfer prefill if its available since - # its faster than FA2. - return current_platform.has_device_capability(100) - return False + # For blackwell default to flashinfer prefill if its available since + # it is faster than FA2. + return (flashinfer_available and not envs.VLLM_USE_CUDNN_PREFILL + and current_platform.is_device_capability(100)) def use_cudnn_prefill() -> bool: - if flashinfer_available and envs.VLLM_USE_CUDNN_PREFILL: - return current_platform.has_device_capability(100) - return False + return (flashinfer_available and envs.VLLM_USE_CUDNN_PREFILL + and current_platform.is_device_capability(100) + and has_nvidia_artifactory()) # Currently 394MB, this can be tuned based on GEMM sizes used. 
From 0a6d305e0f7b63b06c87bb1f7564ae8d148a3311 Mon Sep 17 00:00:00 2001 From: Gamhang Date: Fri, 1 Aug 2025 21:07:33 +0800 Subject: [PATCH 125/224] feat(multimodal): Add customizable background color for RGBA to RGB conversion (#22052) Signed-off-by: Jinheng Li Co-authored-by: Jinheng Li --- docs/features/multimodal_inputs.md | 44 +++++++++++ tests/multimodal/test_image.py | 115 ++++++++++++++++++++++++++++- vllm/multimodal/image.py | 37 ++++++++-- 3 files changed, 190 insertions(+), 6 deletions(-) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index b8677f11a1d3c..cdd32924b5668 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -172,6 +172,36 @@ Multi-image input can be extended to perform video captioning. We show this with print(generated_text) ``` +#### Custom RGBA Background Color + +When loading RGBA images (images with transparency), vLLM converts them to RGB format. By default, transparent pixels are replaced with white background. You can customize this background color using the `rgba_background_color` parameter in `media_io_kwargs`. + +??? code + + ```python + from vllm import LLM + + # Default white background (no configuration needed) + llm = LLM(model="llava-hf/llava-1.5-7b-hf") + + # Custom black background for dark theme + llm = LLM( + model="llava-hf/llava-1.5-7b-hf", + media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}} + ) + + # Custom brand color background (e.g., blue) + llm = LLM( + model="llava-hf/llava-1.5-7b-hf", + media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}} + ) + ``` + +!!! 
note + - The `rgba_background_color` accepts RGB values as a list `[R, G, B]` or tuple `(R, G, B)` where each value is 0-255 + - This setting only affects RGBA images with transparency; RGB images are unchanged + - If not specified, the default white background `(255, 255, 255)` is used for backward compatibility + ### Video Inputs You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary @@ -478,6 +508,20 @@ Full example: ``` +#### Custom RGBA Background Color + +To use a custom background color for RGBA images, pass the `rgba_background_color` parameter via `--media-io-kwargs`: + +```bash +# Example: Black background for dark theme +vllm serve llava-hf/llava-1.5-7b-hf \ + --media-io-kwargs '{"image": {"rgba_background_color": [0, 0, 0]}}' + +# Example: Custom gray background +vllm serve llava-hf/llava-1.5-7b-hf \ + --media-io-kwargs '{"image": {"rgba_background_color": [128, 128, 128]}}' +``` + ### Audio Inputs Audio input is supported according to [OpenAI Audio API](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in). 
diff --git a/tests/multimodal/test_image.py b/tests/multimodal/test_image.py index cfd44351a6d1f..271a85f1195ec 100644 --- a/tests/multimodal/test_image.py +++ b/tests/multimodal/test_image.py @@ -3,9 +3,10 @@ from pathlib import Path import numpy as np +import pytest from PIL import Image, ImageChops -from vllm.multimodal.image import convert_image_mode +from vllm.multimodal.image import ImageMediaIO, convert_image_mode ASSETS_DIR = Path(__file__).parent / "assets" assert ASSETS_DIR.exists() @@ -35,3 +36,115 @@ def test_rgba_to_rgb(): assert converted_image_numpy[i][j][0] == 255 assert converted_image_numpy[i][j][1] == 255 assert converted_image_numpy[i][j][2] == 255 + + +def test_rgba_to_rgb_custom_background(tmp_path): + """Test RGBA to RGB conversion with custom background colors.""" + # Create a simple RGBA image with transparent and opaque pixels + rgba_image = Image.new("RGBA", (10, 10), + (255, 0, 0, 255)) # Red with full opacity + + # Make top-left quadrant transparent + for i in range(5): + for j in range(5): + rgba_image.putpixel((i, j), (0, 0, 0, 0)) # Fully transparent + + # Save the test image to tmp_path + test_image_path = tmp_path / "test_rgba.png" + rgba_image.save(test_image_path) + + # Test 1: Default white background (backward compatibility) + image_io_default = ImageMediaIO() + converted_default = image_io_default.load_file(test_image_path) + default_numpy = np.array(converted_default) + + # Check transparent pixels are white + assert default_numpy[0][0][0] == 255 # R + assert default_numpy[0][0][1] == 255 # G + assert default_numpy[0][0][2] == 255 # B + # Check opaque pixels remain red + assert default_numpy[5][5][0] == 255 # R + assert default_numpy[5][5][1] == 0 # G + assert default_numpy[5][5][2] == 0 # B + + # Test 2: Custom black background via kwargs + image_io_black = ImageMediaIO(rgba_background_color=(0, 0, 0)) + converted_black = image_io_black.load_file(test_image_path) + black_numpy = np.array(converted_black) + + # Check 
transparent pixels are black + assert black_numpy[0][0][0] == 0 # R + assert black_numpy[0][0][1] == 0 # G + assert black_numpy[0][0][2] == 0 # B + # Check opaque pixels remain red + assert black_numpy[5][5][0] == 255 # R + assert black_numpy[5][5][1] == 0 # G + assert black_numpy[5][5][2] == 0 # B + + # Test 3: Custom blue background via kwargs (as list) + image_io_blue = ImageMediaIO(rgba_background_color=[0, 0, 255]) + converted_blue = image_io_blue.load_file(test_image_path) + blue_numpy = np.array(converted_blue) + + # Check transparent pixels are blue + assert blue_numpy[0][0][0] == 0 # R + assert blue_numpy[0][0][1] == 0 # G + assert blue_numpy[0][0][2] == 255 # B + + # Test 4: Test with load_bytes method + with open(test_image_path, 'rb') as f: + image_data = f.read() + + image_io_green = ImageMediaIO(rgba_background_color=(0, 255, 0)) + converted_green = image_io_green.load_bytes(image_data) + green_numpy = np.array(converted_green) + + # Check transparent pixels are green + assert green_numpy[0][0][0] == 0 # R + assert green_numpy[0][0][1] == 255 # G + assert green_numpy[0][0][2] == 0 # B + + +def test_rgba_background_color_validation(): + """Test that invalid rgba_background_color values are properly rejected.""" + + # Test invalid types + with pytest.raises(ValueError, + match="rgba_background_color must be a list or tuple"): + ImageMediaIO(rgba_background_color="255,255,255") + + with pytest.raises(ValueError, + match="rgba_background_color must be a list or tuple"): + ImageMediaIO(rgba_background_color=255) + + # Test wrong number of elements + with pytest.raises(ValueError, + match="rgba_background_color must be a list or tuple"): + ImageMediaIO(rgba_background_color=(255, 255)) + + with pytest.raises(ValueError, + match="rgba_background_color must be a list or tuple"): + ImageMediaIO(rgba_background_color=(255, 255, 255, 255)) + + # Test non-integer values + with pytest.raises(ValueError, + match="rgba_background_color must be a list or tuple"): + 
ImageMediaIO(rgba_background_color=(255.0, 255.0, 255.0)) + + with pytest.raises(ValueError, + match="rgba_background_color must be a list or tuple"): + ImageMediaIO(rgba_background_color=(255, "255", 255)) + + # Test out of range values + with pytest.raises(ValueError, + match="rgba_background_color must be a list or tuple"): + ImageMediaIO(rgba_background_color=(256, 255, 255)) + + with pytest.raises(ValueError, + match="rgba_background_color must be a list or tuple"): + ImageMediaIO(rgba_background_color=(255, -1, 255)) + + # Test that valid values work + ImageMediaIO(rgba_background_color=(0, 0, 0)) # Should not raise + ImageMediaIO(rgba_background_color=[255, 255, 255]) # Should not raise + ImageMediaIO(rgba_background_color=(128, 128, 128)) # Should not raise diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index a0448a80ac7c2..1006c1ce4b241 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -3,6 +3,7 @@ from io import BytesIO from pathlib import Path +from typing import Union import pybase64 import torch @@ -23,9 +24,10 @@ def rescale_image_size(image: Image.Image, return image -# TODO: Support customizable background color to fill in. def rgba_to_rgb( - image: Image.Image, background_color=(255, 255, 255)) -> Image.Image: + image: Image.Image, + background_color: Union[tuple[int, int, int], list[int]] = (255, 255, 255) +) -> Image.Image: """Convert an RGBA image to RGB with filled background color.""" assert image.mode == "RGBA" converted = Image.new("RGB", image.size, background_color) @@ -55,10 +57,35 @@ class ImageMediaIO(MediaIO[Image.Image]): # for flexible control. 
self.kwargs = kwargs + # Extract RGBA background color from kwargs if provided + # Default to white background for backward compatibility + rgba_bg = kwargs.get('rgba_background_color', (255, 255, 255)) + # Convert list to tuple for consistency + if isinstance(rgba_bg, list): + rgba_bg = tuple(rgba_bg) + + # Validate rgba_background_color format + if not (isinstance(rgba_bg, tuple) and len(rgba_bg) == 3 + and all(isinstance(c, int) and 0 <= c <= 255 + for c in rgba_bg)): + raise ValueError( + "rgba_background_color must be a list or tuple of 3 integers " + "in the range [0, 255].") + self.rgba_background_color = rgba_bg + + def _convert_image_mode(self, image: Image.Image) -> Image.Image: + """Convert image mode with custom background color.""" + if image.mode == self.image_mode: + return image + elif image.mode == "RGBA" and self.image_mode == "RGB": + return rgba_to_rgb(image, self.rgba_background_color) + else: + return convert_image_mode(image, self.image_mode) + def load_bytes(self, data: bytes) -> Image.Image: image = Image.open(BytesIO(data)) image.load() - return convert_image_mode(image, self.image_mode) + return self._convert_image_mode(image) def load_base64(self, media_type: str, data: str) -> Image.Image: return self.load_bytes(pybase64.b64decode(data, validate=True)) @@ -66,7 +93,7 @@ class ImageMediaIO(MediaIO[Image.Image]): def load_file(self, filepath: Path) -> Image.Image: image = Image.open(filepath) image.load() - return convert_image_mode(image, self.image_mode) + return self._convert_image_mode(image) def encode_base64( self, @@ -77,7 +104,7 @@ class ImageMediaIO(MediaIO[Image.Image]): image = media with BytesIO() as buffer: - image = convert_image_mode(image, self.image_mode) + image = self._convert_image_mode(image) image.save(buffer, image_format) data = buffer.getvalue() From 5c54d9759d3e12d66919826bf1b7c196914d3a92 Mon Sep 17 00:00:00 2001 From: Abirdcfly Date: Fri, 1 Aug 2025 21:08:45 +0800 Subject: [PATCH 126/224] [Bugfix][PD] set 
max_completion_tokens=1 if req has this value (#21841) Signed-off-by: Abirdcfly --- .../online_serving/disaggregated_serving/disagg_proxy_demo.py | 2 ++ .../disagg_proxy_p2p_nccl_xpyd.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py index 16c32dcaa5d31..d39edb0b9d15c 100644 --- a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py +++ b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py @@ -293,6 +293,8 @@ class Proxy: # add params to request kv_prepare_request = request.copy() kv_prepare_request["max_tokens"] = 1 + if "max_completion_tokens" in kv_prepare_request: + kv_prepare_request["max_completion_tokens"] = 1 # prefill stage prefill_instance = self.schedule(self.prefill_cycler) diff --git a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py index a6fd92feb2f11..73da7af85f1d9 100644 --- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py +++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py @@ -128,6 +128,8 @@ async def handle_request(): prefill_request = original_request_data.copy() # change max_tokens = 1 to let it only do prefill prefill_request["max_tokens"] = 1 + if "max_completion_tokens" in prefill_request: + prefill_request["max_completion_tokens"] = 1 global count global prefill_instances From a59cd9d9f7fd89e19beeffb7e7f89437d413eafb Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 1 Aug 2025 09:10:30 -0400 Subject: [PATCH 127/224] [Refactor] Fix Compile Warning #1444-D (#21462) Signed-off-by: yewentao256 --- csrc/moe/topk_softmax_kernels.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index 0b505d2e04a21..7a7865b901de1 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -24,9 +24,12 @@ #ifndef USE_ROCM #include #include + #include + using AddOp = cuda::std::plus; #else #include #include + using AddOp = cub::Sum; #endif #define MAX(a, b) ((a) > (b) ? (a) : (b)) @@ -62,7 +65,6 @@ __launch_bounds__(TPB) __global__ const int thread_row_offset = blockIdx.x * num_cols; - cub::Sum sum; float threadData(-FLT_MAX); // Don't touch finished rows. @@ -92,7 +94,7 @@ __launch_bounds__(TPB) __global__ threadData += exp((static_cast(input[idx]) - float_max)); } - const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum); + const auto Z = BlockReduce(tmpStorage).Reduce(threadData, AddOp()); if (threadIdx.x == 0) { From 8026a335a135af2e53c7d89652863312d7a3c936 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Fri, 1 Aug 2025 10:11:29 -0400 Subject: [PATCH 128/224] [BugFix] Update AttnFusionPass cache key (#21947) Signed-off-by: Richard Zou --- vllm/compilation/fusion_attn.py | 3 +++ vllm/compilation/inductor_pass.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py index 79518b6f4f965..a40a8caf34a88 100644 --- a/vllm/compilation/fusion_attn.py +++ b/vllm/compilation/fusion_attn.py @@ -164,3 +164,6 @@ class AttnFusionPass(VllmInductorPass): logger.debug("Fused quantization onto %s attention nodes", count) self.dump_graph(graph, "after_attn_fusion") self.end_and_log() + + def uuid(self): + return VllmInductorPass.hash_source(self, AttentionStaticQuantPattern) diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py index 810d0801e9f38..2a149c65b3877 100644 --- a/vllm/compilation/inductor_pass.py +++ b/vllm/compilation/inductor_pass.py @@ -76,9 +76,10 @@ class InductorPass(CustomGraphPass): for src in srcs: if isinstance(src, str): src_str = src - 
elif isinstance(src, types.FunctionType): + elif isinstance(src, (types.FunctionType, type)): src_str = inspect.getsource(src) else: + # object instance src_str = inspect.getsource(src.__class__) hasher.update(src_str.encode("utf-8")) return hasher.hexdigest() From 3146519add735bc51a6a983af9e9c4a8b8d3373e Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 1 Aug 2025 15:37:55 +0100 Subject: [PATCH 129/224] [BugFix] Don't change title of top-level process (#22032) Signed-off-by: Nick Hill --- vllm/entrypoints/cli/serve.py | 11 ++++++----- vllm/entrypoints/openai/api_server.py | 4 ++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 0305354a66e85..9762a1de9edd3 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -18,7 +18,8 @@ from vllm.entrypoints.utils import (VLLM_SUBCMD_PARSER_EPILOG, show_filtered_argument_or_group_from_help) from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, decorate_logs, get_tcp_uri +from vllm.utils import (FlexibleArgumentParser, decorate_logs, get_tcp_uri, + set_process_title) from vllm.v1.engine.core import EngineCoreProc from vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines from vllm.v1.executor.abstract import Executor @@ -74,7 +75,7 @@ def run_headless(args: argparse.Namespace): if args.api_server_count > 1: raise ValueError("api_server_count can't be set in headless mode") - # set_process_title("Headless_ProcManager") + # Create the EngineConfig. 
engine_args = vllm.AsyncEngineArgs.from_cli_args(args) usage_context = UsageContext.OPENAI_API_SERVER @@ -139,8 +140,6 @@ def run_multi_api_server(args: argparse.Namespace): orig_disable_mm_preprocessor_cache = args.disable_mm_preprocessor_cache - # set_process_title("ProcManager") - if num_api_servers > 1: setup_multiprocess_prometheus() @@ -225,7 +224,9 @@ def run_api_server_worker_proc(listen_address, **uvicorn_kwargs) -> None: """Entrypoint for individual API server worker processes.""" - # Add process-specific prefix to stdout and stderr. + # Set process title and add process-specific prefix to stdout and stderr. + server_index = client_config.get("client_index", 0) if client_config else 0 + set_process_title("APIServer", str(server_index)) decorate_logs() uvloop.run( diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 26db1357da4d0..1be03c57a1f1b 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -102,7 +102,7 @@ from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.usage.usage_lib import UsageContext from vllm.utils import (Device, FlexibleArgumentParser, decorate_logs, get_open_zmq_ipc_path, is_valid_ipv6_address, - set_process_title, set_ulimit) + set_ulimit) from vllm.v1.metrics.prometheus import get_prometheus_registry from vllm.version import __version__ as VLLM_VERSION @@ -1824,7 +1824,7 @@ async def run_server_worker(listen_address, ToolParserManager.import_tool_parser(args.tool_parser_plugin) server_index = client_config.get("client_index", 0) if client_config else 0 - set_process_title("APIServer", str(server_index)) + # Load logging config for uvicorn if specified log_config = load_log_config(args.log_config_file) if log_config is not None: From 97608dc276c292d9217eb6d334d969c5e89913c6 Mon Sep 17 00:00:00 2001 From: David Xia Date: Fri, 1 Aug 2025 10:55:55 -0400 Subject: [PATCH 130/224] [Docs] use `uv` in CPU installation docs (#22089) 
Signed-off-by: David Xia --- .../installation/cpu/apple.inc.md | 12 ++--- .../installation/cpu/build.inc.md | 22 +++++---- .../installation/cpu/s390x.inc.md | 45 ++++++++++++------- 3 files changed, 48 insertions(+), 31 deletions(-) diff --git a/docs/getting_started/installation/cpu/apple.inc.md b/docs/getting_started/installation/cpu/apple.inc.md index 0816f38ac68a1..2828173a76a9a 100644 --- a/docs/getting_started/installation/cpu/apple.inc.md +++ b/docs/getting_started/installation/cpu/apple.inc.md @@ -1,6 +1,6 @@ # --8<-- [start:installation] -vLLM has experimental support for macOS with Apple silicon. For now, users shall build from the source vLLM to natively run on macOS. +vLLM has experimental support for macOS with Apple silicon. For now, users must build from source to natively run on macOS. Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. @@ -23,20 +23,20 @@ Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. # --8<-- [end:pre-built-wheels] # --8<-- [start:build-wheel-from-source] -After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source. +After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from source. ```bash git clone https://github.com/vllm-project/vllm.git cd vllm -pip install -r requirements/cpu.txt -pip install -e . +uv pip install -r requirements/cpu.txt +uv pip install -e . ``` !!! note - On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device. + On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which is currently the only supported device. !!! 
example "Troubleshooting" - If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your + If the build fails with errors like the following where standard C++ headers cannot be found, try to remove and reinstall your [Command Line Tools for Xcode](https://developer.apple.com/download/all/). ```text diff --git a/docs/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md index fa777fe0c8a1a..57a09e674a821 100644 --- a/docs/getting_started/installation/cpu/build.inc.md +++ b/docs/getting_started/installation/cpu/build.inc.md @@ -1,4 +1,4 @@ -First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: +First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: ```bash sudo apt-get update -y @@ -6,28 +6,34 @@ sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certific sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 ``` -Second, clone vLLM project: +Second, clone the vLLM project: ```bash git clone https://github.com/vllm-project/vllm.git vllm_source cd vllm_source ``` -Third, install Python packages for vLLM CPU backend building: +Third, install required dependencies: ```bash -pip install --upgrade pip -pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu -pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +uv pip install -r requirements/cpu-build.txt --torch-backend auto +uv pip install -r requirements/cpu.txt --torch-backend auto ``` -Finally, build and install vLLM CPU backend: +??? 
console "pip" + ```bash + pip install --upgrade pip + pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu + pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu + ``` + +Finally, build and install vLLM: ```bash VLLM_TARGET_DEVICE=cpu python setup.py install ``` -If you want to develop vllm, install it in editable mode instead. +If you want to develop vLLM, install it in editable mode instead. ```bash VLLM_TARGET_DEVICE=cpu python setup.py develop diff --git a/docs/getting_started/installation/cpu/s390x.inc.md b/docs/getting_started/installation/cpu/s390x.inc.md index acfb3396896bf..c1917267ce91b 100644 --- a/docs/getting_started/installation/cpu/s390x.inc.md +++ b/docs/getting_started/installation/cpu/s390x.inc.md @@ -1,6 +1,6 @@ # --8<-- [start:installation] -vLLM has experimental support for s390x architecture on IBM Z platform. For now, users shall build from the vLLM source to natively run on IBM Z platform. +vLLM has experimental support for s390x architecture on IBM Z platform. For now, users must build from source to natively run on IBM Z platform. Currently the CPU implementation for s390x architecture supports FP32 datatype only. @@ -40,21 +40,32 @@ curl https://sh.rustup.rs -sSf | sh -s -- -y && \ . "$HOME/.cargo/env" ``` -Execute the following commands to build and install vLLM from the source. +Execute the following commands to build and install vLLM from source. !!! tip - Please build the following dependencies, `torchvision`, `pyarrow` from the source before building vLLM. + Please build the following dependencies, `torchvision`, `pyarrow` from source before building vLLM. 
```bash sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds - pip install -v \ - --extra-index-url https://download.pytorch.org/whl/nightly/cpu \ + uv pip install -v \ + --torch-backend auto \ -r requirements-build.txt \ -r requirements-cpu.txt \ VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \ - pip install dist/*.whl + uv pip install dist/*.whl ``` +??? console "pip" + ```bash + sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds + pip install -v \ + --extra-index-url https://download.pytorch.org/whl/nightly/cpu \ + -r requirements-build.txt \ + -r requirements-cpu.txt \ + VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \ + pip install dist/*.whl + ``` + # --8<-- [end:build-wheel-from-source] # --8<-- [start:pre-built-images] @@ -63,19 +74,19 @@ Execute the following commands to build and install vLLM from the source. ```bash docker build -f docker/Dockerfile.s390x \ - --tag vllm-cpu-env . + --tag vllm-cpu-env . 
-# Launching OpenAI server +# Launch OpenAI server docker run --rm \ - --privileged=true \ - --shm-size=4g \ - -p 8000:8000 \ - -e VLLM_CPU_KVCACHE_SPACE= \ - -e VLLM_CPU_OMP_THREADS_BIND= \ - vllm-cpu-env \ - --model=meta-llama/Llama-3.2-1B-Instruct \ - --dtype=float \ - other vLLM OpenAI server arguments + --privileged true \ + --shm-size 4g \ + -p 8000:8000 \ + -e VLLM_CPU_KVCACHE_SPACE= \ + -e VLLM_CPU_OMP_THREADS_BIND= \ + vllm-cpu-env \ + --model meta-llama/Llama-3.2-1B-Instruct \ + --dtype float \ + other vLLM OpenAI server arguments ``` # --8<-- [end:build-image-from-source] From 2d7b09b998980b9ccbb3708632b47bc28de076aa Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 1 Aug 2025 17:16:37 +0100 Subject: [PATCH 131/224] Deprecate `--disable-log-requests` and replace with `--enable-log-requests` (#21739) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/nightly-benchmarks/README.md | 1 - .../tests/genai-perf-tests.json | 1 - .../tests/nightly-tests.json | 6 ---- .../tests/serving-tests-cpu-snc2.json | 6 ---- .../tests/serving-tests-cpu-snc3.json | 6 ---- .../tests/serving-tests-cpu.json | 5 ---- .../tests/serving-tests.json | 6 +--- tests/config/test_mp_reducer.py | 1 - tests/mq_llm_engine/test_load.py | 2 +- tests/v1/engine/test_async_llm.py | 4 +-- tests/v1/test_async_llm_dp.py | 1 - vllm/engine/arg_utils.py | 30 ++++++++++++++++--- vllm/engine/async_llm_engine.py | 26 +++++++++------- vllm/engine/multiprocessing/engine.py | 27 ++++++++++++----- vllm/entrypoints/openai/api_server.py | 12 ++++---- vllm/entrypoints/openai/run_batch.py | 6 ++-- vllm/utils/__init__.py | 6 ++++ vllm/v1/engine/async_llm.py | 30 +++++++++++-------- 18 files changed, 97 insertions(+), 79 deletions(-) diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index fcde284efea98..3721d3d1d6749 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ 
b/.buildkite/nightly-benchmarks/README.md @@ -104,7 +104,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co "tensor_parallel_size": 1, "swap_space": 16, "disable_log_stats": "", - "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json index edbe9f2df0ce0..f26ae7634f3d9 100644 --- a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json +++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json @@ -11,7 +11,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index fda1a7a3ec53c..41b4a4008801d 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -35,7 +35,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, @@ -90,7 +89,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, @@ -145,7 +143,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, @@ -197,7 +194,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, @@ -251,7 +247,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, @@ -305,7 
+300,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json index a144b4420fbf1..dd0e24edff98d 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json @@ -17,7 +17,6 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -50,7 +49,6 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -83,7 +81,6 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -117,7 +114,6 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -153,7 +149,6 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -189,7 +184,6 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json index e6e69b63b74df..f1bda65a7590b 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json +++ 
b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json @@ -17,7 +17,6 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -50,7 +49,6 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -84,7 +82,6 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -118,7 +115,6 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -154,7 +150,6 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -191,7 +186,6 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json index ce1f924de387f..f150b9abeea45 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json @@ -17,7 +17,6 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -50,7 +49,6 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -83,7 +81,6 @@ "block_size": 128, "trust_remote_code": "", 
"disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -117,7 +114,6 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -153,7 +149,6 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json index 13fd5aa8db97b..a6d4141d5c2dc 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json @@ -7,7 +7,6 @@ "tensor_parallel_size": 1, "swap_space": 16, "disable_log_stats": "", - "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { @@ -26,7 +25,6 @@ "tensor_parallel_size": 4, "swap_space": 16, "disable_log_stats": "", - "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { @@ -45,7 +43,6 @@ "tensor_parallel_size": 2, "swap_space": 16, "disable_log_stats": "", - "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { @@ -60,8 +57,7 @@ "test_name": "serving_llama70B_tp4_sharegpt_specdecode", "qps_list": [2], "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "disable_log_requests": "", + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "tensor_parallel_size": 4, "swap_space": 16, "speculative_config": { diff --git a/tests/config/test_mp_reducer.py b/tests/config/test_mp_reducer.py index ee351cbfa7c16..d4d4be293280b 100644 --- a/tests/config/test_mp_reducer.py +++ b/tests/config/test_mp_reducer.py @@ -28,7 +28,6 @@ def test_mp_reducer(monkeypatch): max_model_len=32, gpu_memory_utilization=0.1, disable_log_stats=True, - disable_log_requests=True, ) 
async_llm = AsyncLLM.from_engine_args( diff --git a/tests/mq_llm_engine/test_load.py b/tests/mq_llm_engine/test_load.py index e9fd5b814f285..c934706611ae3 100644 --- a/tests/mq_llm_engine/test_load.py +++ b/tests/mq_llm_engine/test_load.py @@ -16,7 +16,7 @@ NUM_EXPECTED_TOKENS = 10 NUM_REQUESTS = 10000 # Scenarios to test for num generated token. -ENGINE_ARGS = AsyncEngineArgs(model=MODEL, disable_log_requests=True) +ENGINE_ARGS = AsyncEngineArgs(model=MODEL) @pytest.fixture(scope="function") diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 412df3acff126..21694491dd73a 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -26,12 +26,10 @@ if not current_platform.is_cuda(): TEXT_ENGINE_ARGS = AsyncEngineArgs( model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=True, - disable_log_requests=True, ) VISION_ENGINE_ARGS = AsyncEngineArgs(model="Qwen/Qwen2-VL-2B-Instruct", - enforce_eager=True, - disable_log_requests=True) + enforce_eager=True) TEXT_PROMPT = "Hello my name is Robert and" diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py index 6716d27f571f9..c2610a87ac780 100644 --- a/tests/v1/test_async_llm_dp.py +++ b/tests/v1/test_async_llm_dp.py @@ -25,7 +25,6 @@ DP_SIZE = int(os.getenv("DP_SIZE", 2)) engine_args = AsyncEngineArgs( model="ibm-research/PowerMoE-3b", enforce_eager=True, - disable_log_requests=True, tensor_parallel_size=int(os.getenv("TP_SIZE", 1)), data_parallel_size=DP_SIZE, ) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f938f19b90469..0d38b5b5302c1 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -18,7 +18,7 @@ from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List, import regex as re import torch from pydantic import TypeAdapter, ValidationError -from typing_extensions import TypeIs +from typing_extensions import TypeIs, deprecated import vllm.envs as envs from vllm.config import 
(BlockSize, CacheConfig, CacheDType, CompilationConfig, @@ -1704,7 +1704,23 @@ class EngineArgs: @dataclass class AsyncEngineArgs(EngineArgs): """Arguments for asynchronous vLLM engine.""" - disable_log_requests: bool = False + enable_log_requests: bool = False + + @property + @deprecated( + "`disable_log_requests` is deprecated and has been replaced with " + "`enable_log_requests`. This will be removed in v0.12.0. Please use " + "`enable_log_requests` instead.") + def disable_log_requests(self) -> bool: + return not self.enable_log_requests + + @disable_log_requests.setter + @deprecated( + "`disable_log_requests` is deprecated and has been replaced with " + "`enable_log_requests`. This will be removed in v0.12.0. Please use " + "`enable_log_requests` instead.") + def disable_log_requests(self, value: bool): + self.enable_log_requests = not value @staticmethod def add_cli_args(parser: FlexibleArgumentParser, @@ -1715,9 +1731,15 @@ class AsyncEngineArgs(EngineArgs): load_general_plugins() if not async_args_only: parser = EngineArgs.add_cli_args(parser) + parser.add_argument('--enable-log-requests', + action=argparse.BooleanOptionalAction, + default=AsyncEngineArgs.enable_log_requests, + help='Enable logging requests.') parser.add_argument('--disable-log-requests', - action='store_true', - help='Disable logging requests.') + action=argparse.BooleanOptionalAction, + default=not AsyncEngineArgs.enable_log_requests, + help='[DEPRECATED] Disable logging requests.', + deprecated=True) current_platform.pre_register_and_update(parser) return parser diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 06bb4eeab69eb..1f962b008ee03 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -30,7 +30,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import ExecuteModelRequest from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.usage.usage_lib import UsageContext -from vllm.utils 
import Device, weak_bind +from vllm.utils import Device, deprecate_kwargs, weak_bind logger = init_logger(__name__) ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S @@ -554,14 +554,20 @@ class AsyncLLMEngine(EngineClient): return LLMEngine._get_executor_cls(engine_config) @classmethod + @deprecate_kwargs( + "disable_log_requests", + additional_message=("This argument will have no effect. " + "Use `enable_log_requests` instead."), + ) def from_vllm_config( - cls, - vllm_config: VllmConfig, - start_engine_loop: bool = True, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[dict[str, StatLoggerBase]] = None, - disable_log_requests: bool = False, - disable_log_stats: bool = False, + cls, + vllm_config: VllmConfig, + start_engine_loop: bool = True, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[dict[str, StatLoggerBase]] = None, + enable_log_requests: bool = False, + disable_log_stats: bool = False, + disable_log_requests: bool = True, # Deprecated, will be removed ) -> "AsyncLLMEngine": """Create an AsyncLLMEngine from the EngineArgs.""" @@ -569,7 +575,7 @@ class AsyncLLMEngine(EngineClient): vllm_config=vllm_config, executor_class=cls._get_executor_cls(vllm_config), start_engine_loop=start_engine_loop, - log_requests=not disable_log_requests, + log_requests=enable_log_requests, log_stats=not disable_log_stats, usage_context=usage_context, stat_loggers=stat_loggers, @@ -598,7 +604,7 @@ class AsyncLLMEngine(EngineClient): usage_context=usage_context, stat_loggers=stat_loggers, disable_log_stats=engine_args.disable_log_stats, - disable_log_requests=engine_args.disable_log_requests, + enable_log_requests=engine_args.enable_log_requests, ) @property diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index fe6eb0d8c2f1a..903f3fd71ebcd 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -34,6 +34,7 
@@ from vllm.outputs import RequestOutput from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.usage.usage_lib import UsageContext +from vllm.utils import deprecate_kwargs from vllm.worker.model_runner_base import InputProcessingError logger = init_logger(__name__) @@ -120,10 +121,20 @@ class MQLLMEngine: return ENGINE_DEAD_ERROR() @classmethod - def from_vllm_config(cls, vllm_config: VllmConfig, - usage_context: UsageContext, - disable_log_requests: bool, disable_log_stats: bool, - ipc_path: str) -> "MQLLMEngine": + @deprecate_kwargs( + "disable_log_requests", + additional_message=("This argument will have no effect. " + "Use `enable_log_requests` instead."), + ) + def from_vllm_config( + cls, + vllm_config: VllmConfig, + usage_context: UsageContext, + enable_log_requests: bool, + disable_log_stats: bool, + ipc_path: str, + disable_log_requests: bool = True, # Deprecated, will be removed + ) -> "MQLLMEngine": # Setup plugins for each process from vllm.plugins import load_general_plugins load_general_plugins() @@ -136,7 +147,7 @@ class MQLLMEngine: ipc_path=ipc_path, usage_context=usage_context, use_async_sockets=use_async_sockets, - log_requests=(not disable_log_requests), + log_requests=enable_log_requests, log_stats=(not disable_log_stats), ) @@ -150,7 +161,7 @@ class MQLLMEngine: ipc_path=ipc_path, vllm_config=vllm_config, usage_context=usage_context, - disable_log_requests=engine_args.disable_log_requests, + enable_log_requests=engine_args.enable_log_requests, disable_log_stats=engine_args.disable_log_stats, ) @@ -436,7 +447,7 @@ def signal_handler(*_) -> None: def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext, ipc_path: str, disable_log_stats: bool, - disable_log_requests: bool, engine_alive): + enable_log_requests: bool, engine_alive): try: # Ensure we can serialize transformer config before spawning maybe_register_config_serialize_by_value() @@ -445,7 +456,7 @@ def run_mp_engine(vllm_config: 
VllmConfig, usage_context: UsageContext, vllm_config=vllm_config, usage_context=usage_context, disable_log_stats=disable_log_stats, - disable_log_requests=disable_log_requests, + enable_log_requests=enable_log_requests, ipc_path=ipc_path) signal.signal(signal.SIGTERM, signal_handler) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 1be03c57a1f1b..b8ec5461f7719 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -205,7 +205,7 @@ async def build_async_engine_client_from_engine_args( async_llm = AsyncLLM.from_vllm_config( vllm_config=vllm_config, usage_context=usage_context, - disable_log_requests=engine_args.disable_log_requests, + enable_log_requests=engine_args.enable_log_requests, disable_log_stats=engine_args.disable_log_stats, client_addresses=client_config, client_index=client_index) @@ -227,7 +227,7 @@ async def build_async_engine_client_from_engine_args( engine_client = AsyncLLMEngine.from_vllm_config( vllm_config=vllm_config, usage_context=usage_context, - disable_log_requests=engine_args.disable_log_requests, + enable_log_requests=engine_args.enable_log_requests, disable_log_stats=engine_args.disable_log_stats) yield engine_client finally: @@ -272,7 +272,7 @@ async def build_async_engine_client_from_engine_args( target=run_mp_engine, args=(vllm_config, UsageContext.OPENAI_API_SERVER, ipc_path, engine_args.disable_log_stats, - engine_args.disable_log_requests, engine_alive)) + engine_args.enable_log_requests, engine_alive)) engine_process.start() engine_pid = engine_process.pid assert engine_pid is not None, "Engine process failed to start." 
@@ -1570,10 +1570,10 @@ async def init_app_state( else: served_model_names = [args.model] - if args.disable_log_requests: - request_logger = None - else: + if args.enable_log_requests: request_logger = RequestLogger(max_log_len=args.max_log_len) + else: + request_logger = None base_model_paths = [ BaseModelPath(name=name, model_path=args.model) diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 137b368dad202..d146ad485d194 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -324,10 +324,10 @@ async def run_batch( else: served_model_names = [args.model] - if args.disable_log_requests: - request_logger = None - else: + if args.enable_log_requests: request_logger = RequestLogger(max_log_len=args.max_log_len) + else: + request_logger = None base_model_paths = [ BaseModelPath(name=name, model_path=args.model) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index d5d8d9dad73a8..7405f3986df8d 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1668,6 +1668,12 @@ class FlexibleArgumentParser(ArgumentParser): # Enable the deprecated kwarg for Python 3.12 and below def parse_known_args(self, args=None, namespace=None): + if args is not None and "--disable-log-requests" in args: + # Special case warning because the warning below won't trigger + # if --disable-log-requests because its value is default. + logger.warning_once( + "argument '--disable-log-requests' is deprecated. 
This " + "will be removed in v0.12.0.") namespace, args = super().parse_known_args(args, namespace) for action in FlexibleArgumentParser._deprecated: if (hasattr(namespace, dest := action.dest) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index ed0d9620f4762..308ca32105ba9 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -27,7 +27,7 @@ from vllm.transformers_utils.config import ( from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import Device, cdiv +from vllm.utils import Device, cdiv, deprecate_kwargs from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError @@ -142,16 +142,22 @@ class AsyncLLM(EngineClient): pass @classmethod + @deprecate_kwargs( + "disable_log_requests", + additional_message=("This argument will have no effect. 
" + "Use `enable_log_requests` instead."), + ) def from_vllm_config( - cls, - vllm_config: VllmConfig, - start_engine_loop: bool = True, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[list[StatLoggerFactory]] = None, - disable_log_requests: bool = False, - disable_log_stats: bool = False, - client_addresses: Optional[dict[str, str]] = None, - client_index: int = 0, + cls, + vllm_config: VllmConfig, + start_engine_loop: bool = True, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[list[StatLoggerFactory]] = None, + enable_log_requests: bool = False, + disable_log_stats: bool = False, + client_addresses: Optional[dict[str, str]] = None, + client_index: int = 0, + disable_log_requests: bool = True, # Deprecated, will be removed ) -> "AsyncLLM": if not envs.VLLM_USE_V1: raise ValueError( @@ -166,7 +172,7 @@ class AsyncLLM(EngineClient): executor_class=Executor.get_class(vllm_config), start_engine_loop=start_engine_loop, stat_loggers=stat_loggers, - log_requests=not disable_log_requests, + log_requests=enable_log_requests, log_stats=not disable_log_stats, usage_context=usage_context, client_addresses=client_addresses, @@ -191,7 +197,7 @@ class AsyncLLM(EngineClient): return cls( vllm_config=vllm_config, executor_class=executor_class, - log_requests=not engine_args.disable_log_requests, + log_requests=engine_args.enable_log_requests, log_stats=not engine_args.disable_log_stats, start_engine_loop=start_engine_loop, usage_context=usage_context, From 326a1b001db10afc2dc5b2bfcb60a3b8f8bcb2ac Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 1 Aug 2025 17:32:27 +0100 Subject: [PATCH 132/224] Improve documentation of `ModelConfig.try_get_generation_config` to prevent future confusion (#21526) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 
deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 2d61552c5dadc..124d62b699771 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1575,7 +1575,18 @@ class ModelConfig: return self.multimodal_config def try_get_generation_config(self) -> dict[str, Any]: - if self.generation_config in ("auto", "vllm"): + """ + This method attempts to retrieve the non-default values of the + generation config for this model. + + The generation config can contain information about special tokens, as + well as sampling parameters. Which is why this method exists separately + to `get_diff_sampling_param`. + + Returns: + A dictionary containing the non-default generation config. + """ + if self.generation_config in {"auto", "vllm"}: config = try_get_generation_config( self.hf_config_path or self.model, trust_remote_code=self.trust_remote_code, @@ -1594,13 +1605,18 @@ class ModelConfig: def get_diff_sampling_param(self) -> dict[str, Any]: """ - This method returns a dictionary containing the parameters - that differ from the default sampling parameters. If - `generation_config` is `"vllm"`, an empty dictionary is returned. + This method returns a dictionary containing the non-default sampling + parameters with `override_generation_config` applied. + + The default sampling parameters are: + + - vLLM's neutral defaults if `self.generation_config="vllm"` + - the model's defaults if `self.generation_config="auto"` + - as defined in `generation_config.json` if + `self.generation_config="path/to/generation_config/dir"` Returns: - dict[str, Any]: A dictionary with the differing sampling - parameters, if `generation_config` is `"vllm"` an empty dictionary. + A dictionary containing the non-default sampling parameters. 
""" if self.generation_config == "vllm": config = {} From 3f8e9521791dd3f41c90cc2b3c9e78a1951f5237 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 2 Aug 2025 00:33:30 +0800 Subject: [PATCH 133/224] [Bugfix] Fix glm4.1v video inference issue (#22067) Signed-off-by: Isotr0py <2037008807@qq.com> --- .../multimodal/processing/test_glm4_1v.py | 51 +++++++++++++++++++ vllm/model_executor/models/glm4_1v.py | 8 +-- 2 files changed, 53 insertions(+), 6 deletions(-) create mode 100644 tests/models/multimodal/processing/test_glm4_1v.py diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py new file mode 100644 index 0000000000000..d1c5fa8fec6d2 --- /dev/null +++ b/tests/models/multimodal/processing/test_glm4_1v.py @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest + +from vllm.assets.video import VideoAsset +from vllm.multimodal import MULTIMODAL_REGISTRY + +from ...utils import build_model_context + + +@pytest.mark.parametrize("model_id", ["THUDM/GLM-4.1V-9B-Thinking"]) +@pytest.mark.parametrize("expected_toks_per_frame", [299]) +@pytest.mark.parametrize("num_frames", [32, 128]) +@pytest.mark.parametrize("fps, expected_grid_t", [(1, 5), (2, 10)]) +def test_processor_override( + model_id: str, + expected_toks_per_frame: int, + expected_grid_t: int, + fps: int, + num_frames: int, +): + """Ensure GLM4vMultiModalProcessor can handle video frames properly.""" + ctx = build_model_context( + model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"video": 1}, + ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + tokenizer = processor.info.get_tokenizer() + hf_processor_mm_kwargs = {"fps": fps} + + # Build the image str / prompt based on the number of images we pass + video_assets = VideoAsset(name="baby_reading", num_frames=num_frames) + prompt = "<|begin_of_video|><|video|><|end_of_video|>" + + video, 
metadata = video_assets.np_ndarrays, video_assets.metadata + metadata["fps"] = fps + mm_data = {"video": [(video, metadata)]} + + processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs) + + # Ensure we have the right number of placeholders per num_crops size + hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs) + video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token) + video_tok_count = processed_inputs["prompt_token_ids"].count( + video_token_id) + grid_t, _, _ = processed_inputs["mm_kwargs"]["video_grid_thw"][0] + + assert grid_t == expected_grid_t + assert video_tok_count == expected_toks_per_frame * grid_t diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 5f306f05d140e..7c9840790fe3e 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -937,7 +937,7 @@ class Glm4vProcessingInfo(BaseProcessingInfo): total_frames: int) -> list[int]: video_processor = self.get_video_processor() - video_fps = metadata.get("fps", 2.0) + video_fps = metadata.get("fps", video_processor.fps) meta_frames = metadata.get("total_num_frames", total_frames) max_frame_idx = meta_frames - 1 duration = metadata.get("duration", @@ -1120,11 +1120,7 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): video_placeholder, ) - grid_t = len(video_outputs["video_grid_thw"]) - _, grid_h, grid_w = video_outputs["video_grid_thw"][0] - grid_thw = torch.tensor([[grid_t, grid_h, grid_w]]) - - video_grid_thw_lst.append(grid_thw) + video_grid_thw_lst.append(video_outputs["video_grid_thw"]) pixel_values_videos_lst.append( video_outputs["pixel_values_videos"]) video_outputs = dict( From b879ecd6e2636b6af893052615693a51466381ec Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Sat, 2 Aug 2025 01:09:36 +0800 Subject: [PATCH 134/224] [Bugfix] fix when skip tokenizer init (#21922) Signed-off-by: rongfu.leng --- 
tests/v1/engine/test_llm_engine.py | 26 ++++++++++++++++++++++++++ vllm/v1/engine/processor.py | 9 +++++++-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py index f37686317fd14..2848420c22085 100644 --- a/tests/v1/engine/test_llm_engine.py +++ b/tests/v1/engine/test_llm_engine.py @@ -213,3 +213,29 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): assert len(num_accepted_tokens_per_pos) == 1 assert isinstance(num_accepted_tokens_per_pos[0], Vector) assert len(num_accepted_tokens_per_pos[0].values) == 5 + + +@pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"]) +def test_skip_tokenizer_initialization(model: str, + monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("VLLM_USE_V1", "1") + # This test checks if the flag skip_tokenizer_init skips the initialization + # of tokenizer and detokenizer. The generated output is expected to contain + # token ids. + llm = LLM( + model=model, + skip_tokenizer_init=True, + enforce_eager=True, + ) + sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) + + with pytest.raises(ValueError, match="cannot pass text prompts when"): + llm.generate("abc", sampling_params) + + outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, + sampling_params=sampling_params) + assert len(outputs) > 0 + completions = outputs[0].outputs + assert len(completions) > 0 + assert completions[0].text == "" + assert completions[0].token_ids diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 0f2f404a130ef..224acc47feb27 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -89,6 +89,10 @@ class Processor: return if not params.allowed_token_ids: raise ValueError("allowed_token_ids is not None and empty!") + if self.tokenizer is None: + # When skip_tokenizer_init=True, we can't validate token IDs + # Skip validation and let the model handle invalid tokens + return 
tokenizer = self.tokenizer.get_lora_tokenizer(lora_request) vocab_size = len(tokenizer) if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids): @@ -283,8 +287,9 @@ class Processor: len(decoder_inputs["prompt_token_ids"])) sampling_params.update_from_generation_config( self.generation_config_fields, eos_token_id) - sampling_params.update_from_tokenizer( - self.tokenizer.get_lora_tokenizer(lora_request)) + if self.tokenizer is not None: + sampling_params.update_from_tokenizer( + self.tokenizer.get_lora_tokenizer(lora_request)) else: pooling_params = params.clone() From d6664664b442cb236f8541a126e4076a5e12c56d Mon Sep 17 00:00:00 2001 From: Huzaifa Sidhpurwala Date: Fri, 1 Aug 2025 21:09:49 +0400 Subject: [PATCH 135/224] security policy: take 1 (#21119) Signed-off-by: Huzaifa Sidhpurwala Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Russell Bryant --- SECURITY.md | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/SECURITY.md b/SECURITY.md index 6053cfb41f35b..4f338557da1a0 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,13 +1,41 @@ # Security Policy -## Reporting a Vulnerability +## Reporting security issues: -If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. +Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). -Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). 
+## Issue triage ---- +Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). + +## Threat model Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations. Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. + +## Issue severity + +We will determine the risk of each issue, taking into account our experience dealing with past issues, versions affected, common defaults, and use cases. We use the following severity categories: + +### CRITICAL Severity +Vulnerabilities that allow remote attackers to execute arbitrary code, take full control of the system, or significantly compromise confidentiality, integrity, or availability without any interaction or privileges needed, examples include remote code execution via network, deserialization issues that allow exploit chains. Generally those issues which are rated as CVSS ≥ 9.0. + +### HIGH Severity +Serious security flaws that allow elevated impact—like RCE in specific, limited contexts or significant data loss—but require advanced conditions or some trust, examples include RCE in advanced deployment modes (e.g. multi-node), or high impact issues where some sort of privileged network access is required. These issues typically have CVSS scores between 7.0 and 8.9 + +### MODERATE Severity +Vulnerabilities that cause denial of service or partial disruption, but do not allow arbitrary code execution or data breach and have limited impact. These issues have a CVSS rating between 4.0 and 6.9 + +### LOW Severity +Minor issues such as informational disclosures, logging errors, non-exploitable flaws, or weaknesses that require local or high-privilege access and offer negligible impact. 
Examples include side channel attacks or hash collisions. These issues often have CVSS scores less than 4.0 + +## Prenotification policy + +For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we may prenotify certain organizations or vendors that ship vLLM. The purpose of this prenotification is to allow for a coordinated release of fixes for severe issues. + +* This prenotification will be in the form of a private email notification. It may also include adding security contacts to the GitHub security advisory, typically a few days before release. + +* If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis. + +* We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included. 
From ac45c44d98e77f30e47b8fb69134f4635183070d Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Fri, 1 Aug 2025 22:44:38 +0530 Subject: [PATCH 136/224] [Bugfix] [Performance] DeepEPHighThroughput + DeepSeek : Quant before Dispatch (#21837) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- .../layers/fused_moe/deepep_ht_prepare_finalize.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index 7016ff34c3a85..f6b62254e7b4c 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -144,12 +144,13 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): "apply_router_weight_on_input is only implemented for topk=1") a1 = a1 * topk_weights.to(a1.dtype) - if quant_config.per_act_token_quant: + if quant_config.is_block_quantized: + # Quant and Dispatch a1q, a1q_scale = moe_kernel_quantize_input( a1, a1_scale, quant_dtype=quant_config.quant_dtype, - per_act_token_quant=True, + per_act_token_quant=quant_config.per_act_token_quant, block_shape=quant_config.block_shape, ) if a1q_scale is not None and a1q_scale.numel() == 1: @@ -162,8 +163,10 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): rank_topk_weights=topk_weights, num_experts=num_experts) else: - # DeepEP kernels only support dispatching per-token-quant - # quantization. dispatch in bfloat16. + # Dispatch and Quant + # DeepEP kernels only support dispatching block-quantized + # activation scales. 
+ # Dispatch in bfloat16 (expert_x, _, expert_tokens_meta, expert_topk_ids, expert_topk_weights) = self._do_dispatch( tokens=a1, @@ -171,7 +174,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): rank_topk_ids=topk_ids, rank_topk_weights=topk_weights, num_experts=num_experts) - # quantize now + # Quantize after dispatch. expert_x_scale = None if expert_x.numel() != 0: expert_x, expert_x_scale = moe_kernel_quantize_input( From 38c8bce8b652df87d111c04ddf849c38615000c7 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 1 Aug 2025 18:31:29 +0100 Subject: [PATCH 137/224] Enable headless models for pooling in the Transformers backend (#21767) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/models/registry.py | 1 + tests/models/test_transformers.py | 28 +++++++++++++++++----- vllm/config.py | 9 +++++-- vllm/model_executor/models/registry.py | 3 ++- vllm/model_executor/models/transformers.py | 12 ++++++++++ 5 files changed, 44 insertions(+), 9 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 806342a57dfab..fdc7888c85efb 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -525,6 +525,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { } _TRANSFORMERS_BACKEND_MODELS = { + "TransformersModel": _HfExamplesInfo("Qwen/Qwen3-Embedding-0.6B"), "TransformersForCausalLM": _HfExamplesInfo("hmellor/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501 "TransformersForMultimodalLM": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"), } diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index 5b7d90dfb896d..66ff8f7a54d31 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -34,8 +34,7 @@ def check_implementation( with runner_test(model, **kwargs_test, **kwargs) as model_test: model_config = model_test.llm.llm_engine.model_config - assert model_config.architecture == ( - 
model_config._get_transformers_backend_cls()) + assert model_config.using_transformers_backend() outputs_test = model_test.generate_greedy_logprobs(*args) @@ -135,8 +134,7 @@ def test_quantization( enforce_eager=True, **quantization_kwargs) as vllm_model: # type: ignore[arg-type] model_config = vllm_model.llm.llm_engine.model_config - assert model_config.architecture == ( - model_config._get_transformers_backend_cls()) + assert model_config.using_transformers_backend() transformers_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs) @@ -149,6 +147,25 @@ def test_quantization( ) +@pytest.mark.parametrize( + "model", + [ + # Layers live in `layers` + "Qwen/Qwen3-Embedding-0.6B", + # Layers live in `model.layers` + "meta-llama/Llama-3.2-1B-Instruct" + ], +) +def test_embed_loading(vllm_runner, model): + with vllm_runner(model, + max_model_len=1024, + enforce_eager=True, + runner="pooling", + model_impl="transformers") as model_test: + model_config = model_test.llm.llm_engine.model_config + assert model_config.using_transformers_backend() + + @pytest.mark.parametrize( "model", ["jason9693/Qwen2.5-1.5B-apeach"], @@ -169,8 +186,7 @@ def test_classify( dtype=dtype, model_impl="transformers") as vllm_model: model_config = vllm_model.llm.llm_engine.model_config - assert model_config.architecture == ( - model_config._get_transformers_backend_cls()) + assert model_config.using_transformers_backend() vllm_outputs = vllm_model.classify(example_prompts) diff --git a/vllm/config.py b/vllm/config.py index 124d62b699771..dabb4b524dfd8 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -812,12 +812,17 @@ class ModelConfig: def _get_transformers_backend_cls(self) -> str: """Determine which Transformers backend class will be used if `model_impl` is set to `transformers` or `auto`.""" + if getattr(self, "runner_type", self.runner) == "pooling": + return "TransformersModel" if self.hf_config != self.hf_text_config: # If 
'hf_text_config' is the same as 'hf_config'. If not, it is # probably a composite config, i.e. multimodal return "TransformersForMultimodalLM" - else: - return "TransformersForCausalLM" + return "TransformersForCausalLM" + + def using_transformers_backend(self) -> bool: + """Check if the model is using the Transformers backend class.""" + return self.architecture == self._get_transformers_backend_cls() @property def registry(self): diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 848c04b9b32f7..0c5d87a7dc472 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -270,8 +270,9 @@ _TRANSFORMERS_SUPPORTED_MODELS = { } _TRANSFORMERS_BACKEND_MODELS = { - "TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501 + "TransformersModel": ("transformers", "TransformersModel"), "TransformersForCausalLM": ("transformers", "TransformersForCausalLM"), + "TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501 } # yapf: enable diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index e67548800c354..5059d1e1d9fea 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -651,6 +651,18 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) +@support_torch_compile +class TransformersModel(TransformersBase): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + # Add `model.` prefix for base model checkpoints + "": "model.", + # Remove `model.` from places it should not be + "model.model.": "model.", + "model.score": "score", + }) + + @support_torch_compile class TransformersForCausalLM(TransformersBase): From 8d705996dffbb2299750b7b2b50bbcd5ccb4a5ad Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 2 Aug 2025 01:35:30 
+0800 Subject: [PATCH 138/224] [Misc] Minor enhancement of benchmark_moe (#22068) Signed-off-by: Jee Jee Li --- benchmarks/kernels/benchmark_moe.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c350aaf5d3ad2..72250e2fb6d2b 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -22,6 +22,13 @@ from vllm.utils import FlexibleArgumentParser FP8_DTYPE = current_platform.fp8_dtype() +def ensure_divisibility(numerator, denominator): + """Ensure that numerator is divisible by the denominator.""" + assert numerator % denominator == 0, ( + "intermediate_size {} is not divisible by tp {}.".format(numerator, denominator) + ) + + class BenchmarkConfig(TypedDict): BLOCK_SIZE_M: int BLOCK_SIZE_N: int @@ -603,7 +610,7 @@ def main(args: argparse.Namespace): topk = config.num_experts_per_tok intermediate_size = config.intermediate_size shard_intermediate_size = 2 * intermediate_size // args.tp_size - + ensure_divisibility(intermediate_size, args.tp_size) hidden_size = config.hidden_size dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" From 3277e8f9e19c396d6dd92a0901d2e3f8fb8982d4 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 1 Aug 2025 13:36:07 -0400 Subject: [PATCH 139/224] Fix pre-commit failure for SECURTIY.md (#22102) Signed-off-by: mgoin --- SECURITY.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/SECURITY.md b/SECURITY.md index 4f338557da1a0..414669fb3712e 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,6 +1,6 @@ # Security Policy -## Reporting security issues: +## Reporting security issues Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). 
@@ -19,15 +19,19 @@ Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/m We will determine the risk of each issue, taking into account our experience dealing with past issues, versions affected, common defaults, and use cases. We use the following severity categories: ### CRITICAL Severity + Vulnerabilities that allow remote attackers to execute arbitrary code, take full control of the system, or significantly compromise confidentiality, integrity, or availability without any interaction or privileges needed, examples include remote code execution via network, deserialization issues that allow exploit chains. Generally those issues which are rated as CVSS ≥ 9.0. ### HIGH Severity + Serious security flaws that allow elevated impact—like RCE in specific, limited contexts or significant data loss—but require advanced conditions or some trust, examples include RCE in advanced deployment modes (e.g. multi-node), or high impact issues where some sort of privileged network access is required. These issues typically have CVSS scores between 7.0 and 8.9 ### MODERATE Severity + Vulnerabilities that cause denial of service or partial disruption, but do not allow arbitrary code execution or data breach and have limited impact. These issues have a CVSS rating between 4.0 and 6.9 ### LOW Severity + Minor issues such as informational disclosures, logging errors, non-exploitable flaws, or weaknesses that require local or high-privilege access and offer negligible impact. Examples include side channel attacks or hash collisions. 
These issues often have CVSS scores less than 4.0 ## Prenotification policy From 9659bc7f271ec640da780b5ca739e261764b954b Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 1 Aug 2025 10:38:52 -0700 Subject: [PATCH 140/224] [compile][startup] Disable C++ compilation of symbolic shapes (#20836) Signed-off-by: Animesh Jain --- vllm/compilation/decorators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 1370862d580a5..0d2c432497c40 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -267,8 +267,15 @@ def _support_torch_compile( code.co_filename) return inline_call(parent, func, args, kwargs) - with patch.object(InliningInstructionTranslator, 'inline_call', - patched_inline_call): + # Disable the C++ compilation of symbolic shape guards. C++-fication + # of symbolic shape guards can improve guard overhead. But, since + # vllm skip guards anyways, setting this flag to False can improve + # compile time. 
+ with torch._dynamo.config.patch("enable_cpp_symbolic_shape_guards", + False), patch.object( + InliningInstructionTranslator, + 'inline_call', + patched_inline_call): output = self.compiled_callable(*args, **kwargs) return output From d331759488eb7627d2454549eeb01d14f83f1c41 Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Fri, 1 Aug 2025 11:50:58 -0700 Subject: [PATCH 141/224] Introduce RayPPCommunicator for ray-based PP (#21660) Signed-off-by: Rui Qiao --- .../device_communicators/ray_communicator.py | 257 ++++++++++++++++++ vllm/envs.py | 8 + vllm/executor/ray_distributed_executor.py | 15 + 3 files changed, 280 insertions(+) create mode 100644 vllm/distributed/device_communicators/ray_communicator.py diff --git a/vllm/distributed/device_communicators/ray_communicator.py b/vllm/distributed/device_communicators/ray_communicator.py new file mode 100644 index 0000000000000..e5ba297ebcc1b --- /dev/null +++ b/vllm/distributed/device_communicators/ray_communicator.py @@ -0,0 +1,257 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import uuid +from typing import Any, Optional + +import ray +import torch +from ray.exceptions import RayChannelError +from ray.experimental.channel.communicator import (Communicator, + TorchTensorAllocator) +from torch.distributed import ReduceOp + +from vllm.distributed.device_communicators.base_device_communicator import ( + DeviceCommunicatorBase) +from vllm.distributed.parallel_state import get_pp_group +from vllm.logger import init_logger +from vllm.utils import current_stream + +logger = init_logger(__name__) + + +class RayPPCommunicator(Communicator): + """ + Communicator to be used for pipeline parallelism in Ray Compiled Graph. + This wraps around the vLLM _PP GroupCoordinator. + + This class is not thread-safe. 
+ """ + + _comm: Optional[DeviceCommunicatorBase] + + def __init__( + self, + world_size: int, + comm_id: Any, + rank: Optional[int], + actor_handles: list["ray.actor.ActorHandle"], + cuda_stream: Optional[torch.cuda.Stream], + use_communication_streams: bool = False, + ): + """ + Initialize a RayPPCommunicator that can be used to communicate with + other Ray Compiled Graph actors for pipeline parallelism. + + Args: + world_size: The number of participating actors. + comm_id: A unique communicator ID. This is just to conform with + the Ray Communicator API and is not used. + rank: The rank of this actor. If None, then the caller is not a + participant of the RayPPCommunicator group (e.g., the Ray + driver). + actor_handles: A list of actor handles. + cuda_stream: A CUDA stream to dispatch communication ops to. This + is not supported. + use_communication_streams: Whether to use communication streams. + This is not supported. + """ + self._world_size = world_size + self._rank: Optional[int] = None + self._actor_handles = actor_handles + if use_communication_streams: + raise NotImplementedError( + "use_communication_streams is not supported") + if cuda_stream is not None and cuda_stream != current_stream(): + raise ValueError( + "cuda_stream other than the current stream is not supported") + + if rank is not None: + # Rank is not None, this is Ray worker + assert ray.get_gpu_ids(), "RayPPCommunicator has no GPUs assigned" + + self._comm = get_pp_group().device_communicator + + # Since we wrap around the vLLM _PP communicator, we use + # the rank from the vLLM communicator, and ignore the rank + # passed in from Ray. + # TODO(rui): refactor the Ray Communicator API so that + # it also supports no rank passed in. 
+ self._rank = self._comm.rank_in_group + + self._build_actor_rank_mapping() + else: + # Rank is None, this is Ray driver + self._comm = None + + self._closed = False + + def _build_actor_rank_mapping(self): + """ + Use collective communication to build a mapping from actor IDs to ranks. + This should be called once during initialization. + """ + if self._comm is None: + return {} + + current_actor = ray.get_runtime_context().current_actor + actor_id_str = current_actor._actor_id.hex() + + # Ray actor IDs are 32-character hex strings (128 bits) + ACTOR_ID_LEN = 32 + actor_id_bytes = actor_id_str.encode('utf-8') + assert len( + actor_id_bytes + ) == ACTOR_ID_LEN, f"Unexpected actor ID length: {len(actor_id_bytes)}" + + actor_id_tensor = torch.frombuffer( + actor_id_bytes, dtype=torch.uint8).to(self._comm.device) + + # All-gather full actor IDs from all actors + gathered_ids = self._comm.all_gather(actor_id_tensor, dim=0) + + # Build mapping: actor_id -> device_comm_rank + self._actor_id_to_rank = {} + for rank in range(self._world_size): + start_idx = rank * ACTOR_ID_LEN + end_idx = (rank + 1) * ACTOR_ID_LEN + actor_bytes = gathered_ids[start_idx:end_idx].cpu().numpy( + ).tobytes() + actor_id = actor_bytes.decode('utf-8') + self._actor_id_to_rank[actor_id] = rank + + def initialize(self, rank: int) -> None: + # No additional initialization is needed. + pass + + def get_actor_handles(self) -> list["ray.actor.ActorHandle"]: + return self._actor_handles + + def get_rank(self, actor: ray.actor.ActorHandle) -> int: + """ + Return the given actor's rank using device communicator collective ops. + """ + assert hasattr(self, '_actor_id_to_rank'), ( + "Actor rank mapping not built. 
" + "This should have been done during initialization.") + + actor_id_str = actor._actor_id.hex() + + if actor_id_str in self._actor_id_to_rank: + return self._actor_id_to_rank[actor_id_str] # type: ignore + else: + raise ValueError(f"Actor {actor} not found in communicator group") + + def get_self_rank(self) -> Optional[int]: + """ + Return this actor's rank. + """ + return self._rank + + def get_world_size(self) -> int: + """ + Return the number of ranks in the RayPPCommunicator group. + """ + return self._world_size + + def send(self, buf: "torch.Tensor", peer_rank: int) -> None: + """ + Send a torch.Tensor to a peer. + + This returns when the send kernel has been queued, but the kernel may + not have completed. Therefore, the caller should ensure that there are + no concurrent writes to the sent `buf` until the send has finished. + That is, either all writes should be submitted on the current stream + (self._cuda_stream) or, if on a different stream, that stream should + synchronize with the current stream. + + Args: + buf: The torch.Tensor to send. It should already be on this + actor's default device. + peer_rank: The rank of the actor to send to. + """ + if self._closed: + raise RayChannelError("RayPPCommunicator has been destroyed.") + + assert self._comm is not None + self._comm.send(buf, peer_rank) + + def recv( + self, + shape: tuple[int], + dtype: "torch.dtype", + peer_rank: int, + allocator: TorchTensorAllocator, + ) -> "torch.Tensor": + """ + Receive a torch.Tensor from a peer and synchronize the current stream. + + After this call returns, the receive buffer is safe to read from + any stream. A RayChannelError will be raised if an error occurred + (e.g., remote actor died), and the buffer is not safe to read. + + Args: + shape: The shape of the tensor to receive. + dtype: The dtype of the tensor to receive. + peer_rank: The rank of the actor to receive from. + allocator: The allocator to use to create the received tensor. 
+ This is ignored for this implementation. + """ + if self._closed: + raise RayChannelError("RayPPCommunicator has been destroyed.") + + assert self._comm is not None + size = torch.Size(shape) + buf = self._comm.recv(size, dtype, src=peer_rank) + + # Buffer values are undefined if NCCL ops are aborted. Therefore, we + # need to synchronize here and check that the channel is still + # open to ensure that the receive buffer is valid. + # TODO(swang): Avoid CUDA synchronization. + current_stream().synchronize() + + if self._closed: + raise RayChannelError("RayPPCommunicator has been destroyed.") + return buf + + def allgather( + self, + send_buf: "torch.Tensor", + recv_buf: "torch.Tensor", + ): + raise NotImplementedError("allgather is not supported") + + def allreduce( + self, + send_buf: "torch.Tensor", + recv_buf: "torch.Tensor", + op: ReduceOp = ReduceOp.SUM, + ): + raise NotImplementedError("allreduce is not supported") + + def reducescatter( + self, + send_buf: "torch.Tensor", + recv_buf: "torch.Tensor", + op: ReduceOp = ReduceOp.SUM, + ): + raise NotImplementedError("reducescatter is not supported") + + @property + def recv_stream(self): + return torch.cuda.StreamContext(current_stream()) + + @property + def send_stream(self): + return torch.cuda.StreamContext(current_stream()) + + def destroy(self) -> None: + # Just sets a flag, vLLM manages the lifecycle of the underlying + # _PP GroupCoordinator. 
+ self._closed = True + + def get_transport_name(self) -> str: + return "nccl" + + @classmethod + def generate_communicator_id(cls) -> Any: + return uuid.uuid4() diff --git a/vllm/envs.py b/vllm/envs.py index 7553eccf16ea9..2fda2903179b5 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -55,6 +55,7 @@ if TYPE_CHECKING: VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto" VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False + VLLM_USE_RAY_WRAPPED_PP_COMM: bool = True VLLM_XLA_USE_SPMD: bool = False VLLM_WORKER_MULTIPROC_METHOD: str = "fork" VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets") @@ -498,6 +499,13 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0")) ), + # If the env var is set, it uses a Ray Communicator wrapping + # vLLM's pipeline parallelism communicator to interact with Ray's + # Compiled Graph. Otherwise, it uses Ray's NCCL communicator. + # This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set. + "VLLM_USE_RAY_WRAPPED_PP_COMM": + lambda: bool(int(os.getenv("VLLM_USE_RAY_WRAPPED_PP_COMM", "1"))), + # Use dedicated multiprocess context for workers. 
# Both spawn and fork work "VLLM_WORKER_MULTIPROC_METHOD": diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index e9ad62aeb99a8..37c3fe59c65dd 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -608,6 +608,21 @@ class RayDistributedExecutor(DistributedExecutorBase): forward_dag = MultiOutputNode(outputs) + if envs.VLLM_USE_RAY_WRAPPED_PP_COMM: + from ray.experimental.channel.accelerator_context import ( + register_accelerator_context) + + from vllm.distributed.device_communicators.ray_communicator import ( + RayPPCommunicator) + register_accelerator_context(torch_module_name="cuda", + communicator_cls=RayPPCommunicator) + logger.info("Using RayPPCommunicator " + "(which wraps vLLM _PP GroupCoordinator) " + "for Ray Compiled Graph communication.") + else: + logger.info("Using Ray's NCCL communicator for " + "Ray Compiled Graph communication.") + return forward_dag.experimental_compile( enable_asyncio=enable_asyncio, _overlap_gpu_communication=envs. From d84b97a3e33ed79aaba7552bfe5889d363875562 Mon Sep 17 00:00:00 2001 From: XiongfeiWei Date: Fri, 1 Aug 2025 11:56:08 -0700 Subject: [PATCH 142/224] Add lora test for tp>1 case for TPU. 
(#21970) Signed-off-by: Xiongfei Wei --- tests/tpu/lora/test_lora.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/tests/tpu/lora/test_lora.py b/tests/tpu/lora/test_lora.py index b26bdd34d890e..4c47b8c43caff 100644 --- a/tests/tpu/lora/test_lora.py +++ b/tests/tpu/lora/test_lora.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest +from torch_xla._internal import tpu import vllm from vllm.lora.request import LoRARequest @@ -27,25 +28,31 @@ def use_v1_only(monkeypatch: pytest.MonkeyPatch): yield -def setup_vllm(num_loras: int) -> vllm.LLM: +def setup_vllm(num_loras: int, tp: int) -> vllm.LLM: return vllm.LLM(model="Qwen/Qwen2.5-3B-Instruct", num_scheduler_steps=1, max_model_len=256, max_seq_len_to_capture=256, max_num_seqs=8, + tensor_parallel_size=tp, enable_lora=True, max_loras=num_loras, max_lora_rank=8) -def test_single_lora(): +TPU_TENSOR_PARALLEL_SIZES = [1, tpu.num_available_chips() + ] if tpu.num_available_chips() > 1 else [1] + + +@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES) +def test_single_lora(tp: int): """ This test ensures we can run a single LoRA adapter on the TPU backend. We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter" which will force Qwen2.5-3B-Instruct to claim 1+1=1. """ - llm = setup_vllm(1) + llm = setup_vllm(1, tp) prompt = "What is 1+1? \n" @@ -63,7 +70,8 @@ def test_single_lora(): assert int(answer) == 1 -def test_lora_hotswapping(): +@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES) +def test_lora_hotswapping(tp: int): """ This test ensures we can run multiple LoRA adapters on the TPU backend, even if we only have space to store 1. @@ -79,7 +87,7 @@ def test_lora_hotswapping(): for i in range(1, 5) ] - llm = setup_vllm(1) + llm = setup_vllm(1, tp) prompt = "What is 1+1? 
\n" @@ -94,7 +102,8 @@ def test_lora_hotswapping(): assert int(answer) == i + 1 -def test_multi_lora(): +@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES) +def test_multi_lora(tp: int): """ This test ensures we can run multiple LoRA adapters on the TPU backend, when we have enough space to store all of them. @@ -109,7 +118,7 @@ def test_multi_lora(): for i in range(1, 5) ] - llm = setup_vllm(4) + llm = setup_vllm(4, tp) prompt = "What is 1+1? \n" From 881e1af43a1bb7b4bedd373e413eb7ad9dc9f920 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 1 Aug 2025 22:40:45 +0100 Subject: [PATCH 143/224] [BugFix] Harden distributed DP startup (#21538) Signed-off-by: Nick Hill --- vllm/utils/__init__.py | 3 ++ vllm/v1/engine/coordinator.py | 12 +++++++ vllm/v1/engine/core.py | 61 +++++++++++++++++++++++------------ 3 files changed, 56 insertions(+), 20 deletions(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 7405f3986df8d..0d3fa6b059beb 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2794,6 +2794,9 @@ def make_zmq_socket( if linger is not None: socket.setsockopt(zmq.LINGER, linger) + if socket_type == zmq.XPUB: + socket.setsockopt(zmq.XPUB_VERBOSE, True) + # Determine if the path is a TCP socket with an IPv6 address. # Enable IPv6 on the zmq socket if so. scheme, host, _ = split_zmq_path(path) diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py index 440628576bcb7..8d8d1689e61e3 100644 --- a/vllm/v1/engine/coordinator.py +++ b/vllm/v1/engine/coordinator.py @@ -172,6 +172,18 @@ class DPCoordinatorProc: bind=True, ) as publish_back: + # Wait until all engines subscribe. + for _ in self.engines: + if publish_back.recv() != b'\x01': + logger.error( + "DP Coordinator received unexpected message while " + "waiting for engines to subscribe") + return + # Send ready message to engines. 
+ publish_back.send(b"READY") + + logger.info("All engine subscriptions received by DP coordinator") + poller = zmq.Poller() poller.register(publish_front, zmq.POLLIN) poller.register(output_back, zmq.POLLIN) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 6ae5736df98b8..0a889b2a0a184 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -461,8 +461,11 @@ class EngineCoreProc(EngineCore): self.has_coordinator = addresses.coordinator_output is not None self.frontend_stats_publish_address = ( addresses.frontend_stats_publish_address) + logger.debug("Has DP Coordinator: %s, stats publish address: %s", + self.has_coordinator, + self.frontend_stats_publish_address) # Only publish request queue stats to coordinator for "internal" - # LB mode. + # and "hybrid" LB modes . self.publish_dp_lb_stats = ( self.has_coordinator and not vllm_config.parallel_config.data_parallel_external_lb) @@ -472,25 +475,38 @@ class EngineCoreProc(EngineCore): super().__init__(vllm_config, executor_class, log_stats, executor_fail_callback) + # Background Threads and Queues for IO. These enable us to + # overlap ZMQ socket IO with GPU since they release the GIL, + # and to overlap some serialization/deserialization with the + # model forward pass. + # Threads handle Socket <-> Queues and core_busy_loop uses Queue. + ready_event = threading.Event() + input_thread = threading.Thread(target=self.process_input_sockets, + args=(addresses.inputs, + addresses.coordinator_input, + identity, ready_event), + daemon=True) + input_thread.start() + + self.output_thread = threading.Thread( + target=self.process_output_sockets, + args=(addresses.outputs, addresses.coordinator_output, + self.engine_index), + daemon=True) + self.output_thread.start() + + # Don't complete handshake until DP coordinator ready message is + # received. 
+ while not ready_event.wait(timeout=10): + if not input_thread.is_alive(): + raise RuntimeError( + "Input socket thread died during startup") + assert addresses.coordinator_input is not None + logger.info("Waiting for READY message from DP Coordinator...") + self.step_fn = (self.step if self.batch_queue is None else self.step_with_batch_queue) - # Background Threads and Queues for IO. These enable us to - # overlap ZMQ socket IO with GPU since they release the GIL, - # and to overlap some serialization/deserialization with the - # model forward pass. - # Threads handle Socket <-> Queues and core_busy_loop uses Queue. - threading.Thread(target=self.process_input_sockets, - args=(addresses.inputs, addresses.coordinator_input, - identity), - daemon=True).start() - self.output_thread = threading.Thread( - target=self.process_output_sockets, - args=(addresses.outputs, addresses.coordinator_output, - self.engine_index), - daemon=True) - self.output_thread.start() - @contextmanager def _perform_handshakes( self, @@ -505,10 +521,10 @@ class EngineCoreProc(EngineCore): For DP=1 or offline mode, this is with the colocated front-end process. - For DP>1 with internal loadbalancing this is with the shared front-end + For DP>1 with internal load-balancing this is with the shared front-end process which may reside on a different node. - For DP>1 with external or hybrid loadbalancing, two handshakes are + For DP>1 with external or hybrid load-balancing, two handshakes are performed: - With the rank 0 front-end process which retrieves the DP Coordinator ZMQ addresses and DP process group address. @@ -772,7 +788,7 @@ class EngineCoreProc(EngineCore): def process_input_sockets(self, input_addresses: list[str], coord_input_address: Optional[str], - identity: bytes): + identity: bytes, ready_event: threading.Event): """Input socket IO thread.""" # Msgpack serialization decoding. @@ -809,9 +825,14 @@ class EngineCoreProc(EngineCore): # back to us. 
input_socket.send(b'') poller.register(input_socket, zmq.POLLIN) + if coord_socket is not None: + # Wait for ready message from coordinator. + assert coord_socket.recv() == b"READY" poller.register(coord_socket, zmq.POLLIN) + ready_event.set() + del ready_event while True: for input_socket, _ in poller.poll(): # (RequestType, RequestData) From 88faa466d788e25082c02dc9688931d7976361f9 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 1 Aug 2025 19:18:38 -0400 Subject: [PATCH 144/224] [CI] Initial tests for SM100 Blackwell runner (#21877) Signed-off-by: mgoin --- .buildkite/test-pipeline.yaml | 24 ++++++++++++++++--- tests/compile/test_fusion_all_reduce.py | 15 +++++++----- .../quantization/test_cutlass_scaled_mm.py | 5 ---- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 598fd5762985e..cc1223d4c4653 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -647,13 +647,31 @@ steps: - label: Blackwell Test working_dir: "/vllm-workspace/" gpu: b200 - optional: true + # optional: true source_file_dependencies: - - csrc/ - - vllm/ + - csrc/quantization/fp4/ + - csrc/attention/mla/ + - csrc/quantization/cutlass_w8a8/moe/ + - vllm/model_executor/layers/fused_moe/cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/fusion.py commands: - nvidia-smi - python3 examples/offline_inference/basic/chat.py + # Attention + # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 + - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py + - pytest -v -s tests/kernels/test_cutlass_mla_decode.py + # Quantization + - pytest -v -s 
tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py + # Fusion + - pytest -v -s tests/compile/test_fusion_all_reduce.py ##### 1 GPU test ##### ##### multi gpus test ##### diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py index b394e0035c689..4c3cf6c2a10cf 100644 --- a/tests/compile/test_fusion_all_reduce.py +++ b/tests/compile/test_fusion_all_reduce.py @@ -136,12 +136,15 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module): @multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("test_model", [ - TestAllReduceRMSNormModel, - TestAllReduceFusedAddRMSNormModel, - TestAllReduceFusedAddRMSNormStaticQuantFP8Model, - TestAllReduceFusedAddRMSNormStaticQuantFP4Model, -]) +@pytest.mark.parametrize( + "test_model", + [ + TestAllReduceRMSNormModel, + TestAllReduceFusedAddRMSNormModel, + TestAllReduceFusedAddRMSNormStaticQuantFP8Model, + # TODO: Enable with torch==2.8.0 + # TestAllReduceFusedAddRMSNormStaticQuantFP4Model, + ]) @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seq_len", [8]) @pytest.mark.parametrize("hidden_size", [16]) diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py index 544e6dc197904..8730eeaaa761c 100644 --- a/tests/kernels/quantization/test_cutlass_scaled_mm.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -559,8 +559,6 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool, m_a_scales = m_g if per_act_token else 1 n_b_scales = n_g if per_out_ch else 1 - print("shape:", m_g, n_g, k_g) - # Create group-specific A and B (FP8) and output (FP16/FP32) a_g = to_fp8(torch.randn((m_g, k_g), device=device)) b_g = to_fp8(torch.randn((n_g, k_g), device=device).t()) @@ -639,7 +637,4 @@ def 
test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool, for g in range(num_experts): baseline = baseline_tensors[g] c = out_tensors_stacked[expert_offsets[g]:expert_offsets[g + 1]] - print(baseline) - print(c) - print("*") torch.testing.assert_close(c, baseline, rtol=1e-2, atol=5e-4) From eefbf4a68b7b0a5b8364a59647906be1b7f043e2 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 1 Aug 2025 19:18:51 -0400 Subject: [PATCH 145/224] [Perf] Optimize `reshape_and_cache_flash` CUDA Kernel (#22036) Signed-off-by: yewentao256 --- .../benchmark_reshape_and_cache_flash.py | 156 ++++++++++++++++++ csrc/cache_kernels.cu | 92 ++++++++--- 2 files changed, 225 insertions(+), 23 deletions(-) create mode 100644 benchmarks/kernels/benchmark_reshape_and_cache_flash.py diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py new file mode 100644 index 0000000000000..d4648c18f31d5 --- /dev/null +++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py @@ -0,0 +1,156 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from __future__ import annotations + +import random +import time + +import torch +from tabulate import tabulate + +from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils import ( + STR_DTYPE_TO_TORCH_DTYPE, + FlexibleArgumentParser, + create_kv_caches_with_random_flash, +) + +logger = init_logger(__name__) + + +@torch.inference_mode() +def run_benchmark( + num_tokens: int, + num_heads: int, + head_size: int, + block_size: int, + num_blocks: int, + dtype: torch.dtype, + kv_cache_dtype: str, + kv_cache_layout: str, + num_iters: int, + device: str = "cuda", +) -> float: + """Return latency (seconds) for given num_tokens.""" + + if kv_cache_dtype == "fp8" and head_size % 16: + raise ValueError("fp8 kv-cache 
requires head_size to be a multiple of 16.") + + current_platform.seed_everything(42) + torch.set_default_device(device) + + # create random key / value tensors [T, H, D]. + key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device) + value = torch.randn_like(key) + + # prepare the slot mapping. + # each token is assigned a unique slot in the KV-cache. + num_slots = block_size * num_blocks + if num_tokens > num_slots: + raise ValueError("num_tokens cannot exceed the total number of cache slots") + slot_mapping_lst = random.sample(range(num_slots), num_tokens) + slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device) + + key_caches, value_caches = create_kv_caches_with_random_flash( + num_blocks, + block_size, + 1, # num_layers + num_heads, + head_size, + kv_cache_dtype, + dtype, + device=device, + cache_layout=kv_cache_layout, + ) + key_cache, value_cache = key_caches[0], value_caches[0] + + # compute per-kernel scaling factors for fp8 conversion (if used). 
+ k_scale = (key.amax() / 64.0).to(torch.float32) + v_scale = (value.amax() / 64.0).to(torch.float32) + + def run_cuda_benchmark(n_iters: int) -> float: + nonlocal key, value, key_cache, value_cache, slot_mapping + torch.cuda.synchronize() + start = time.perf_counter() + for _ in range(n_iters): + ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + slot_mapping, + kv_cache_dtype, + k_scale, + v_scale, + ) + torch.cuda.synchronize() + end = time.perf_counter() + return (end - start) / n_iters + + # warm-up + run_cuda_benchmark(3) + + lat = run_cuda_benchmark(num_iters) + + # free tensors to mitigate OOM when sweeping + del key, value, key_cache, value_cache, slot_mapping + torch.cuda.empty_cache() + + return lat + + +def main(args): + rows = [] + for layout in ["NHD", "HND"]: + for exp in range(1, 17): + n_tok = 2**exp + lat = run_benchmark( + num_tokens=n_tok, + num_heads=args.num_heads, + head_size=args.head_size, + block_size=args.block_size, + num_blocks=args.num_blocks, + dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], + kv_cache_dtype=args.kv_cache_dtype, + kv_cache_layout=layout, + num_iters=args.iters, + device="cuda", + ) + rows.append([n_tok, layout, f"{lat * 1e6:.3f}"]) + + print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"])) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser() + + parser.add_argument("--num-heads", type=int, default=128) + parser.add_argument( + "--head-size", + type=int, + choices=[64, 80, 96, 112, 120, 128, 192, 256], + default=128, + ) + parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) + parser.add_argument("--num-blocks", type=int, default=128 * 512) + + parser.add_argument( + "--dtype", + type=str, + choices=["half", "bfloat16", "float"], + default="bfloat16", + ) + + parser.add_argument( + "--kv-cache-dtype", + type=str, + choices=["auto", "fp8"], + default="auto", + ) + + parser.add_argument("--iters", type=int, default=100) + args = 
parser.parse_args() + + main(args) diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 88559c8fe7183..131dcb15cd7e9 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -5,6 +5,7 @@ #include "cuda_utils.h" #include "cuda_compat.h" #include "dispatch_utils.h" +#include "quantization/vectorization_utils.cuh" #ifdef USE_ROCM #include "quantization/fp8/amd/quant_utils.cuh" @@ -261,14 +262,26 @@ __global__ void reshape_and_cache_kernel( } } +// Used by vectorization_utils to copy/convert one element +template +struct CopyWithScaleOp { + float scale; + + __device__ __forceinline__ void operator()(OutT& dst, const InT src) const { + if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { + dst = static_cast(src); + } else { + dst = fp8::scaled_convert(src, scale); + } + } +}; + template __global__ void reshape_and_cache_flash_kernel( const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] - cache_t* __restrict__ key_cache, // [num_blocks, block_size, num_heads, - // head_size] - cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads, - // head_size] + cache_t* __restrict__ key_cache, // NHD or HND, shape see comments below + cache_t* __restrict__ value_cache, // same above const int64_t* __restrict__ slot_mapping, // [num_tokens] const int64_t block_stride, const int64_t page_stride, const int64_t head_stride, const int64_t key_stride, @@ -282,25 +295,58 @@ __global__ void reshape_and_cache_flash_kernel( } const int64_t block_idx = slot_idx / block_size; const int64_t block_offset = slot_idx % block_size; - const int n = num_heads * head_size; - for (int i = threadIdx.x; i < n; i += blockDim.x) { - const int64_t src_key_idx = token_idx * key_stride + i; - const int64_t src_value_idx = token_idx * value_stride + i; - const int head_idx = i / head_size; - const int head_offset = i % head_size; - const int64_t tgt_key_value_idx = block_idx * 
block_stride + - block_offset * page_stride + - head_idx * head_stride + head_offset; - scalar_t tgt_key = key[src_key_idx]; - scalar_t tgt_value = value[src_value_idx]; - if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { - key_cache[tgt_key_value_idx] = tgt_key; - value_cache[tgt_key_value_idx] = tgt_value; - } else { - key_cache[tgt_key_value_idx] = - fp8::scaled_convert(tgt_key, *k_scale); - value_cache[tgt_key_value_idx] = - fp8::scaled_convert(tgt_value, *v_scale); + const int n_elems = num_heads * head_size; + + // pointers to the beginning of the source row for this token. + const scalar_t* __restrict__ key_src = key + token_idx * key_stride; + const scalar_t* __restrict__ value_src = value + token_idx * value_stride; + + // find the start position inside the kv-cache for this token. + cache_t* __restrict__ key_dst = + key_cache + block_idx * block_stride + block_offset * page_stride; + cache_t* __restrict__ value_dst = + value_cache + block_idx * block_stride + block_offset * page_stride; + + // this is true for the NHD layout where `head_stride == head_size` + const bool is_contiguous_heads = (head_stride == head_size); + + float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale; + float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale; + constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 
8 : 4; + CopyWithScaleOp k_op{k_scale_val}; + CopyWithScaleOp v_op{v_scale_val}; + if (is_contiguous_heads) { + // NHD layout + // kv cache: [num_blocks, block_size, num_heads, head_size] + vectorize_with_alignment(key_src, key_dst, n_elems, threadIdx.x, + blockDim.x, k_op); + + vectorize_with_alignment(value_src, value_dst, n_elems, + threadIdx.x, blockDim.x, v_op); + + } else { + // HND layout: heads are strided, but each head_size segment is contiguous + // kv cache: [num_blocks, num_heads, block_size, head_size] + const int lane = threadIdx.x & 31; // 0..31 within warp + const int warp_id = threadIdx.x >> 5; // warp index within block + const int warps_per_block = blockDim.x >> 5; + + for (int head = warp_id; head < num_heads; head += warps_per_block) { + const scalar_t* __restrict__ k_src_h = key_src + head * head_size; + const scalar_t* __restrict__ v_src_h = value_src + head * head_size; + + cache_t* __restrict__ k_dst_h = + key_dst + static_cast(head) * head_stride; + cache_t* __restrict__ v_dst_h = + value_dst + static_cast(head) * head_stride; + + // within each head, let the 32 threads of the warp perform the vector + // copy + vectorize_with_alignment(k_src_h, k_dst_h, head_size, lane, 32, + k_op); + + vectorize_with_alignment(v_src_h, v_dst_h, head_size, lane, 32, + v_op); } } } From 3654847db5a9b9a0955f8416292d94fa1c827f77 Mon Sep 17 00:00:00 2001 From: JartX Date: Sat, 2 Aug 2025 03:12:19 +0200 Subject: [PATCH 146/224] feat: Add Support GPTQ Quantization MOE on ROCM vllm serve (#21733) --- .../layers/fused_moe/fused_moe.py | 4 ++-- .../layers/quantization/gptq.py | 22 ++++++++++++++++--- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index b69575c7e96de..56d1dfe135b3b 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -761,8 +761,8 @@ def 
get_moe_wna16_block_config(config: dict[str, def should_moe_wna16_use_cuda(num_valid_tokens: int, group_size: int, num_experts: int, bit: int): - return bit == 4 and group_size in [32, 64, 128] and \ - num_valid_tokens / num_experts <= 6 + return current_platform.is_cuda() and bit == 4 and \ + group_size in [32, 64, 128] and num_valid_tokens / num_experts <= 6 def get_default_config( diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index d3ab1be3bee01..f18c936bac605 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -10,10 +10,11 @@ import torch from torch.nn.parameter import Parameter from vllm import _custom_ops as ops +from vllm.model_executor.layers.fused_moe.layer import FusedMoE from vllm.model_executor.layers.linear import LinearMethodBase from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) + QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.utils.gptq_utils import ( get_linear_quant_method) from vllm.model_executor.parameter import (ChannelQuantScaleParameter, @@ -110,8 +111,23 @@ class GPTQConfig(QuantizationConfig): return cls(weight_bits, group_size, desc_act, lm_head_quantized, dynamic) - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["GPTQLinearMethod"]: + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional[Union["GPTQLinearMethod", "QuantizeMethodBase"]]: + if isinstance(layer, FusedMoE): + # GPTQ MoE support: fall back to MoeWNA16 for broad compatibility + from .moe_wna16 import MoeWNA16Config + + config = { + "quant_method": "gptq", + "bits": self.weight_bits, + "group_size": self.group_size, + "sym": True, # GPTQ typically uses symmetric quantization + "lm_head": False, + } + return 
MoeWNA16Config.from_config(config).get_quant_method( + layer, prefix) + return get_linear_quant_method(self, layer, prefix, GPTQLinearMethod) From 23322431c802bb1057426c7ca31b22e859b51644 Mon Sep 17 00:00:00 2001 From: fhl2000 <63384265+fhl2000@users.noreply.github.com> Date: Sat, 2 Aug 2025 09:49:34 +0800 Subject: [PATCH 147/224] [V1][CUDA] Full cudagraph support for FlashInfer (#21367) --- vllm/v1/attention/backends/flash_attn.py | 7 +- vllm/v1/attention/backends/flashinfer.py | 355 ++++++++++++++++-- vllm/v1/attention/backends/mla/flashmla.py | 4 +- .../attention/backends/mla/rocm_aiter_mla.py | 4 +- vllm/v1/attention/backends/triton_attn.py | 6 +- vllm/v1/attention/backends/utils.py | 18 +- vllm/v1/worker/gpu_model_runner.py | 24 +- vllm/v1/worker/gpu_worker.py | 5 + 8 files changed, 376 insertions(+), 47 deletions(-) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 3f9afa67aef70..f086bab2556eb 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -25,7 +25,8 @@ if is_flash_attn_varlen_func_available(): from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger from vllm.utils import cdiv -from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, +from vllm.v1.attention.backends.utils import (AttentionCGSupport, + AttentionMetadataBuilder, CommonAttentionMetadata, get_kv_cache_layout) from vllm.v1.kv_cache_interface import AttentionSpec @@ -153,7 +154,9 @@ def _get_sliding_window_configs( class FlashAttentionMetadataBuilder( AttentionMetadataBuilder[FlashAttentionMetadata]): - full_cudagraph_supported: ClassVar[bool] = get_flash_attn_version() == 3 + attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.NEVER if get_flash_attn_version() == 2 \ + else AttentionCGSupport.ALWAYS def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: 
torch.device): diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index f8af1d7e41831..0aaad02b5b840 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -4,26 +4,28 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, ClassVar, Optional, Union import torch from flashinfer import (BatchDecodeWithPagedKVCacheWrapper, BatchPrefillWithPagedKVCacheWrapper, MultiLevelCascadeAttentionWrapper) -from flashinfer.decode import trtllm_batch_decode_with_kv_cache +from flashinfer.decode import (_get_range_buf, get_seq_lens, + trtllm_batch_decode_with_kv_cache) import vllm.envs as envs from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionType) from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import cdiv +from vllm.utils import cdiv, is_pin_memory_available from vllm.utils.flashinfer import use_trtllm_decode_attention from vllm.v1.attention.backends.flash_attn import use_cascade_attention from vllm.v1.attention.backends.utils import ( - AttentionMetadataBuilder, CommonAttentionMetadata, get_kv_cache_layout, - get_per_layer_parameters, infer_global_hyperparameters, - reorder_batch_to_split_decodes_and_prefills, split_decodes_and_prefills) + AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, + get_kv_cache_layout, get_per_layer_parameters, + infer_global_hyperparameters, reorder_batch_to_split_decodes_and_prefills, + split_decodes_and_prefills) from vllm.v1.kv_cache_interface import AttentionSpec if TYPE_CHECKING: @@ -174,26 +176,66 @@ class FlashInferMetadata: class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): + attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.PURE_DECODE_ONLY def __init__(self, kv_cache_spec: AttentionSpec, layer_names: 
list[str], vllm_config: VllmConfig, device: torch.device): self.device = device + self.vllm_config = vllm_config + self.cache_config = vllm_config.cache_config + self.kv_cache_spec = kv_cache_spec self._workspace_buffer = None self._prefill_wrapper = None # Wrapper for prefill/append - self._decode_wrapper = None # Wrapper for decode + self._decode_wrapper = None # Wrapper for decode (general shape) + + self.compilation_config = vllm_config.compilation_config + max_num_pages_per_req = cdiv(vllm_config.model_config.max_model_len, + self.kv_cache_spec.block_size) + max_num_reqs = vllm_config.scheduler_config.max_num_seqs + max_num_pages = max_num_reqs * max_num_pages_per_req + self.enable_cuda_graph = self.compilation_config.full_cuda_graph + if self.enable_cuda_graph: + # For full cudagraph capture, one `decode_wrapper` for each batch + # size is needed for FlashInfer. + self._decode_wrappers_cudagraph: dict[ + int, BatchDecodeWithPagedKVCacheWrapper] = {} + self._decode_cudagraph_max_bs = min( + max_num_reqs, self.compilation_config.max_capture_size) + self._cascade_wrapper = None # Wrapper for cascade attention # Global hyperparameters shared by all attention layers self.global_hyperparameters = infer_global_hyperparameters( get_per_layer_parameters(vllm_config, layer_names, FlashInferImpl)) - self.vllm_config = vllm_config - self.cache_config = vllm_config.cache_config - self.kv_cache_spec = kv_cache_spec - max_num_blocks_per_request = cdiv( - vllm_config.model_config.max_model_len, - self.kv_cache_spec.block_size) - self.block_table_arange = torch.arange(max_num_blocks_per_request, + # Preparing persistent buffers (device-side) + self.paged_kv_indptr = torch.zeros(max_num_reqs + 1, + dtype=torch.int32, + device=self.device) + self.paged_kv_indices = torch.zeros( + max_num_pages, # max num pages possible + dtype=torch.int32, + device=self.device) + self.paged_kv_last_page_len = torch.zeros(max_num_reqs, + dtype=torch.int32, + device=self.device) + # host-side 
buffer + pin_memory = is_pin_memory_available() + self.paged_kv_indptr_cpu = torch.zeros(max_num_reqs + 1, + dtype=torch.int32, + device="cpu", + pin_memory=pin_memory) + self.paged_kv_indices_cpu = torch.zeros(max_num_pages, + dtype=torch.int32, + device="cpu", + pin_memory=pin_memory) + self.paged_kv_last_page_len_cpu = torch.zeros(max_num_reqs, + dtype=torch.int32, + device="cpu", + pin_memory=pin_memory) + + self.block_table_arange = torch.arange(max_num_pages_per_req, dtype=torch.int32, device=self.device) @@ -217,8 +259,16 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self._get_workspace_buffer(), get_kv_cache_layout()) return self._prefill_wrapper - def _get_decode_wrapper(self): - if self._decode_wrapper is None: + def _get_decode_wrapper(self, + batch_size: int, + use_cudagraph: bool = False): + if use_cudagraph: + decode_wrapper = self._decode_wrappers_cudagraph.get( + batch_size, None) + else: + decode_wrapper = self._decode_wrapper + + if decode_wrapper is None: num_qo_heads = ( self.vllm_config.model_config.get_num_attention_heads( self.vllm_config.parallel_config)) @@ -226,11 +276,32 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.vllm_config.parallel_config) use_tensor_cores = envs.VLLM_FLASHINFER_FORCE_TENSOR_CORES or ( num_qo_heads // num_kv_heads > 4) - self._decode_wrapper = BatchDecodeWithPagedKVCacheWrapper( + + if use_cudagraph: + paged_kv_indptr = self.paged_kv_indptr[:batch_size + 1] + paged_kv_indices = self.paged_kv_indices + paged_kv_last_page_len = self.paged_kv_last_page_len[: + batch_size] + else: + paged_kv_indptr = None + paged_kv_indices = None + paged_kv_last_page_len = None + decode_wrapper = BatchDecodeWithPagedKVCacheWrapper( self._get_workspace_buffer(), get_kv_cache_layout(), + use_cuda_graph=use_cudagraph, + paged_kv_indptr_buffer=paged_kv_indptr, + paged_kv_indices_buffer=paged_kv_indices, + paged_kv_last_page_len_buffer=paged_kv_last_page_len, 
use_tensor_cores=use_tensor_cores) - return self._decode_wrapper + + # save the decode wrapper + if use_cudagraph: + self._decode_wrappers_cudagraph[batch_size] = decode_wrapper + else: + self._decode_wrapper = decode_wrapper + + return decode_wrapper def _get_cascade_wrapper(self): if self._cascade_wrapper is None: @@ -308,16 +379,44 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): ) if num_decodes > 0: - attn_metadata.decode_wrapper = self._get_decode_wrapper() + pure_decode = num_prefills == 0 + # possible required padding for cudagraph replay + use_cudagraph = (self.enable_cuda_graph and pure_decode and + num_decodes <= self._decode_cudagraph_max_bs) + if use_cudagraph: + num_input_tokens = ( + self.vllm_config.pad_for_cudagraph(num_decodes)) + # Carefully fulfill the padding region with reasonable value + # on cpu. + # Make sure paged_kv_indptr_cpu is not decreasing + self.paged_kv_indptr_cpu[1 + num_decodes:1 + + num_input_tokens].fill_( + attn_metadata. + paged_kv_indptr_cpu[-1]) + # Fill the remaining paged_kv_last_page_len_cpu with 1. + # This is because flashinfer treats 0 as a full page + # instead of empty. + self.paged_kv_last_page_len_cpu[ + num_decodes:num_input_tokens].fill_(1) + + else: + num_input_tokens = num_decodes + + attn_metadata.decode_wrapper = self._get_decode_wrapper( + num_input_tokens, use_cudagraph) if not use_trtllm_decode_attention( num_decodes, attn_metadata.max_seq_len, self.cache_config.cache_dtype, attn_metadata.num_qo_heads, attn_metadata.num_kv_heads, attn_metadata.head_dim): - attn_metadata.decode_wrapper.plan( - attn_metadata.paged_kv_indptr_cpu[:num_decodes + 1], + # Use the persistent buffer with padding length, + # instead of the same address but chunked version + # in atten_metadata when using cudagraph. 
+ fast_plan_decode( + attn_metadata.decode_wrapper, + self.paged_kv_indptr_cpu[:num_input_tokens + 1], attn_metadata.paged_kv_indices, - attn_metadata.paged_kv_last_page_len_cpu[:num_decodes], + self.paged_kv_last_page_len_cpu[:num_input_tokens], attn_metadata.num_qo_heads, attn_metadata.num_kv_heads, attn_metadata.head_dim, @@ -336,6 +435,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata, fast_build: bool = False) -> FlashInferMetadata: + num_reqs = common_attn_metadata.num_reqs num_actual_tokens = common_attn_metadata.num_actual_tokens num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens =\ split_decodes_and_prefills(common_attn_metadata) @@ -381,18 +481,26 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): non_blocking=True) mask = (self.block_table_arange[:max_num_blocks].unsqueeze(0) < block_table_bounds.unsqueeze(1)) - paged_kv_indices = block_table_tensor[:, :max_num_blocks][mask] + # write self.paged_kv_indices inplace + num_actual_pages = torch.sum(mask) + paged_kv_indices = self.paged_kv_indices[:num_actual_pages] + torch.masked_select(block_table_tensor[:, :max_num_blocks], + mask, + out=paged_kv_indices) - paged_kv_indptr_cpu = torch.zeros(len(block_table_bounds_cpu) + 1, - dtype=torch.int32, - device='cpu') - paged_kv_indptr_cpu[1:] = block_table_bounds_cpu.cumsum( - dim=0, dtype=torch.int32) + # write self.paged_kv_indptr_cpu inplace (0-index is always 0) + torch.cumsum(block_table_bounds_cpu, + dim=0, + dtype=torch.int32, + out=self.paged_kv_indptr_cpu[1:1 + num_reqs]) paged_kv_last_page_len_cpu = seq_lens_cpu % page_size - paged_kv_last_page_len_cpu = torch.where( - paged_kv_last_page_len_cpu == 0, page_size, - paged_kv_last_page_len_cpu) + # write self.paged_kv_last_page_len_cpu inplace + torch.where(paged_kv_last_page_len_cpu == 0, + torch.tensor(page_size), + paged_kv_last_page_len_cpu, + 
out=self.paged_kv_last_page_len_cpu[:num_reqs]) + cache_dtype = self.cache_config.cache_dtype if cache_dtype.startswith("fp8"): kv_cache_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer( @@ -402,9 +510,10 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): attn_metadata = FlashInferMetadata( num_actual_tokens=num_actual_tokens, qo_indptr_cpu=common_attn_metadata.query_start_loc_cpu, - paged_kv_indptr_cpu=paged_kv_indptr_cpu, + paged_kv_indptr_cpu=self.paged_kv_indptr_cpu[:1 + num_reqs], paged_kv_indices=paged_kv_indices, - paged_kv_last_page_len_cpu=paged_kv_last_page_len_cpu, + paged_kv_last_page_len_cpu=self. + paged_kv_last_page_len_cpu[:num_reqs], num_qo_heads=self.vllm_config.model_config.get_num_attention_heads( self.vllm_config.parallel_config), num_kv_heads=self.kv_cache_spec.num_kv_heads, @@ -431,6 +540,26 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): return attn_metadata + def build_for_cudagraph_capture( + self, common_attn_metadata: CommonAttentionMetadata): + """ + This method builds the metadata for full cudagraph capture. + Currently, only decode is supported for full cudagraphs with FlashInfer. + """ + m = common_attn_metadata + + assert m.num_reqs == m.num_actual_tokens, \ + "FlashInfer only supports decode-only full CUDAGraph capture. " \ + "Make sure all cudagraph capture sizes <= max_num_seq." 
+ + m.max_query_len = 1 # decode-only + + return self.build(0, m) + + def can_run_in_cudagraph( + self, common_attn_metadata: CommonAttentionMetadata) -> bool: + return common_attn_metadata.max_query_len == 1 + def use_cascade_attention(self, *args, **kwargs) -> bool: if self.kv_cache_spec.dtype != self.vllm_config.model_config.dtype: # TODO: The cascade wrapper currently does not support setting @@ -638,3 +767,163 @@ class FlashInferImpl(AttentionImpl): out=output[:num_decode_tokens], ) return output_padded + + +def fast_plan_decode( + self, # decode wrapper + indptr_cpu: torch.Tensor, + indices: torch.Tensor, + last_page_len_cpu: torch.Tensor, + num_qo_heads: int, + num_kv_heads: int, + head_dim: int, + page_size: int, + pos_encoding_mode: str = "NONE", + window_left: int = -1, + logits_soft_cap: Optional[float] = None, + q_data_type: Optional[Union[str, torch.dtype]] = "float16", + kv_data_type: Optional[Union[str, torch.dtype]] = None, + data_type: Optional[Union[str, torch.dtype]] = None, + sm_scale: Optional[float] = None, + rope_scale: Optional[float] = None, + rope_theta: Optional[float] = None, + non_blocking: bool = True, +) -> None: + """ + A faster version of BatchDecodeWithPagedKVCacheWrapper::plan used for + cudagraph capture/replay, while the no cudagraph version turns back + to the original plan. + using original plan after passing host-side buffers: + - only host-to-device copy of indptr and last_page_len buffers + Modifications for cudagraph: + - only host-to-device copy of indptr and last_page_len buffers. + - avoid device-to-device copy of indices buffer. + + Part of the code get inspiration from the original plan from FlashInfer repo + and the implementation of fast_decode_plan for FlashInfer in SGlang repo. + """ + # Warm up with the original plan if it is first call, and always run the + # original plan if we run for dynamic shape. For fixed shape (cudagraph), + # this warm up is to generate the _cached_module for the decode wrapper. 
+ if not self.is_cuda_graph_enabled or \ + getattr(self, "vllm_first_call", True): + self.plan( + indptr_cpu, + indices, + last_page_len_cpu, + num_qo_heads, + num_kv_heads, + head_dim, + page_size, + pos_encoding_mode, + window_left, + logits_soft_cap, + q_data_type, + kv_data_type, + data_type, + sm_scale, + rope_scale, + rope_theta, + non_blocking, + ) + self.vllm_first_call = False + return + + assert self.is_cuda_graph_enabled, "Should be cudagraph only here" + + batch_size = len(last_page_len_cpu) + if logits_soft_cap is None: + logits_soft_cap = 0.0 + + # Handle data types consistently + if data_type is not None: + if q_data_type is None: + q_data_type = data_type + if kv_data_type is None: + kv_data_type = data_type + elif q_data_type is None: + q_data_type = "float16" + + if kv_data_type is None: + kv_data_type = q_data_type + q_data_type = getattr(torch, q_data_type) if isinstance( + q_data_type, str) else q_data_type + kv_data_type = getattr(torch, kv_data_type) if isinstance( + kv_data_type, str) else kv_data_type + + if self.use_tensor_cores: + qo_indptr_host = _get_range_buf(batch_size + 1, "cpu") + + if batch_size != self._fixed_batch_size: + raise ValueError( + "The batch size should be fixed in cudagraph mode, the runtime " + "batch size {} mismatches the batch size set during " + "initialization {}".format(batch_size, self._fixed_batch_size)) + if len(indices) > len(self._paged_kv_indices_buf): + raise ValueError( + "The size of indices should be less than or equal to the " + "allocated buffer") + + # host-to-device copy for the indptr buffer + self._paged_kv_indptr_buf.copy_(indptr_cpu, non_blocking=True) + # host-to-device copy for the last_page_len buffer + self._paged_kv_last_page_len_buf.copy_(last_page_len_cpu, + non_blocking=True) + + indptr_host = indptr_cpu + last_page_len_host = last_page_len_cpu + + if self.use_tensor_cores: + kv_lens_arr_host = get_seq_lens(indptr_host, last_page_len_host, + page_size) + + try: + # Make sure we pass 
exactly 15 arguments for tensor core version + self._plan_info = self._cached_module.plan( + self._float_workspace_buffer, + self._int_workspace_buffer, + self._pin_memory_int_workspace_buffer, + qo_indptr_host, + indptr_host, + kv_lens_arr_host, + batch_size, # total_num_rows + batch_size, + num_qo_heads, + num_kv_heads, + page_size, + self.is_cuda_graph_enabled, + head_dim, + head_dim, + False, # causal + ) + except Exception as e: + raise RuntimeError(f"Error in tensor core plan: {e}") from e + else: + try: + # Make sure we pass exactly 15 arguments for standard version + self._plan_info = self._cached_module.plan( + self._float_workspace_buffer, + self._int_workspace_buffer, + self._pin_memory_int_workspace_buffer, + indptr_host, + batch_size, + num_qo_heads, + num_kv_heads, + page_size, + self.is_cuda_graph_enabled, + window_left, + logits_soft_cap, + head_dim, + head_dim, + torch.empty(0, dtype=q_data_type), + torch.empty(0, dtype=kv_data_type), + ) + except Exception as e: + raise RuntimeError(f"Error in standard plan: {e}") from e + + self._pos_encoding_mode = pos_encoding_mode + self._window_left = window_left + self._logits_soft_cap = logits_soft_cap + self._sm_scale = sm_scale + self._rope_scale = rope_scale + self._rope_theta = rope_theta diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 39463b9c06164..b5aecff9937f3 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -18,6 +18,7 @@ from vllm.v1.attention.backends.mla.common import (MLACommonBackend, MLACommonImpl, MLACommonMetadata, MLACommonMetadataBuilder) +from vllm.v1.attention.backends.utils import AttentionCGSupport from vllm.v1.kv_cache_interface import AttentionSpec logger = init_logger(__name__) @@ -54,7 +55,8 @@ class FlashMLAMetadata(MLACommonMetadata[FlashMLADecodeMetadata]): class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): - full_cudagraph_supported: 
ClassVar[bool] = True # Decode-only + attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.PURE_DECODE_ONLY def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 5c5891f035ae2..8b55e1a301992 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -17,6 +17,7 @@ from vllm.v1.attention.backends.mla.common import (MLACommonBackend, MLACommonImpl, MLACommonMetadata, MLACommonMetadataBuilder) +from vllm.v1.attention.backends.utils import AttentionCGSupport from vllm.v1.kv_cache_interface import AttentionSpec # yapf: enable @@ -64,7 +65,8 @@ class AiterMLAMetadata(MLACommonMetadata[AiterMLADecodeMetadata]): class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): - full_cudagraph_supported: ClassVar[bool] = True # decode only + attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.PURE_DECODE_ONLY def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 195fbd3b1b9c4..942cb95eefa2f 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -18,7 +18,8 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata -from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, +from vllm.v1.attention.backends.utils import (AttentionCGSupport, + AttentionMetadataBuilder, CommonAttentionMetadata) from vllm.v1.kv_cache_interface import AttentionSpec @@ -57,7 +58,8 @@ class TritonAttentionMetadata: class 
TritonAttentionMetadataBuilder( AttentionMetadataBuilder[TritonAttentionMetadata]): - full_cudagraph_supported: ClassVar[bool] = True + attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.ALWAYS def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 36bacf0cb36f8..d39cc0a39f45c 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import abc +import enum import functools from abc import abstractmethod from dataclasses import dataclass, make_dataclass @@ -65,9 +66,24 @@ class CommonAttentionMetadata: M = TypeVar("M") +class AttentionCGSupport(enum.Enum): + """ Constants for the cudagraph support of the attention backend + Here we do not consider the cascade attention, as currently + it is never cudagraph supported.""" + + NEVER = 0 + """NO cudagraph support""" + PURE_DECODE_ONLY = 1 + """Cudagraph supported for pure decode, need to run without + cudagraph for mixed prefill-decode batches""" + ALWAYS = 2 + """Cudagraph always supported""" + + class AttentionMetadataBuilder(abc.ABC, Generic[M]): # Does this backend/builder support CUDA Graphs for attention. 
- full_cudagraph_supported: ClassVar[bool] = False + attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.NEVER @abstractmethod def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 29cda4d837bf3..d5a5799efb47c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -47,7 +47,7 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, is_pin_memory_available, round_up, supports_dynamo) from vllm.v1.attention.backends.mamba_selectors import get_mamba_attn_backend from vllm.v1.attention.backends.utils import ( - AttentionMetadataBuilder, CommonAttentionMetadata, + AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, make_kv_sharing_fast_prefill_attention_metadata, make_local_attention_virtual_batches) from vllm.v1.core.encoder_cache_manager import compute_encoder_budget @@ -2619,12 +2619,22 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.device, ) - if (self.full_cuda_graph - and not attn_metadata_builder_i.full_cudagraph_supported): - raise ValueError( - f"Full CUDAGraph not supported for " - f"{attn_backend_i.__name__}. Turn off CompilationConfig." - f"full_cuda_graph or use a different attention backend.") + if self.full_cuda_graph: + if attn_metadata_builder_i.attn_cudagraph_support == \ + AttentionCGSupport.NEVER: + raise ValueError(f"Full CUDAGraph not supported for " + f"{attn_backend_i.__name__}. Turn off " + f"CompilationConfig.full_cuda_graph or use a " + f" different attention backend.") + if attn_metadata_builder_i.attn_cudagraph_support == \ + AttentionCGSupport.PURE_DECODE_ONLY: + # Limit the max cudagraph size to the max number of + # sequences for pure decode only cudagraph backend, + # whose max_query_len is 1. 
+ self.cudagraph_batch_sizes = [ + size for size in self.cudagraph_batch_sizes + if size <= self.scheduler_config.max_num_seqs + ] return attn_backend_i, attn_metadata_builder_i def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 0f46ed223ab88..4bc4ece9a0df4 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -321,11 +321,16 @@ class Worker(WorkerBase): if get_pp_group().is_last_rank: max_num_reqs = min(self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens) + # activate building attn_metadata for this dummy run to avoid + # potential illegal memory access for full cudagraph relay. + attn_cudagraph = self.compilation_config.full_cuda_graph and\ + not self.model_config.enforce_eager # We skip EPLB here since we don't want to record dummy metrics hidden_states, last_hidden_states = \ self.model_runner._dummy_run( num_tokens=max_num_reqs, + capture_attn_cudagraph=attn_cudagraph, skip_eplb=True, ) if self.model_runner.is_pooling_model: From ee2eb6ecd86be4b47e334f74feb7874b9a41ca25 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Sat, 2 Aug 2025 10:34:37 +0800 Subject: [PATCH 148/224] [Model] Qwen2.5 VL SiLU-and-Mul (#22066) Signed-off-by: kf Signed-off-by: vllmellm Co-authored-by: kf --- vllm/model_executor/models/qwen2_5_vl.py | 44 +++++++++++------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index c4c4650f569e1..04e64422d2e0b 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -43,9 +43,10 @@ from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY +from 
vllm.model_executor.layers.activation import get_act_and_mul_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig @@ -171,16 +172,12 @@ class Qwen2_5_VisionMLP(nn.Module): quant_config: Optional[QuantizationConfig] = None, prefix: str = ""): super().__init__() - self.gate_proj = ColumnParallelLinear(in_features, - hidden_features, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.gate_proj") - self.up_proj = ColumnParallelLinear(in_features, - hidden_features, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.up_proj") + self.gate_up_proj = MergedColumnParallelLinear( + input_size=in_features, + output_sizes=[hidden_features] * 2, # [gate_proj, up_proj] + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") self.down_proj = RowParallelLinear(hidden_features, in_features, bias=bias, @@ -189,10 +186,9 @@ class Qwen2_5_VisionMLP(nn.Module): self.act_fn = act_fn def forward(self, x: torch.Tensor): - x_gate, _ = self.gate_proj(x) - x_gate = self.act_fn(x_gate) - x_up, _ = self.up_proj(x) - x_down, _ = self.down_proj(x_gate * x_up) + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x_down, _ = self.down_proj(x) return x_down @@ -540,14 +536,14 @@ class Qwen2_5_VisionTransformer(nn.Module): self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2) self.blocks = nn.ModuleList([ - Qwen2_5_VisionBlock( - dim=self.hidden_size, - num_heads=self.num_heads, - mlp_hidden_dim=vision_config.intermediate_size, - act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], - norm_layer=norm_layer, - quant_config=quant_config, - prefix=f"{prefix}.blocks.{layer_idx}") + Qwen2_5_VisionBlock(dim=self.hidden_size, + num_heads=self.num_heads, + mlp_hidden_dim=vision_config.intermediate_size, + 
act_fn=get_act_and_mul_fn( + vision_config.hidden_act), + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{layer_idx}") for layer_idx in range(depth) ]) self.merger = Qwen2_5_VisionPatchMerger( @@ -752,6 +748,8 @@ class Qwen2_5_VisionTransformer(nn.Module): ("attn.qkv.", "attn.q.", "q"), ("attn.qkv.", "attn.k.", "k"), ("attn.qkv.", "attn.v.", "v"), + ("mlp.gate_up_proj.", "mlp.gate_proj.", 0), + ("mlp.gate_up_proj.", "mlp.up_proj.", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) loaded_params: set[str] = set() From 57393715e804387588241fbdb4ec94a7570230b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Sat, 2 Aug 2025 04:41:40 +0200 Subject: [PATCH 149/224] [Misc] `VLLM_TARGET_DEVICE.lower()` (#22101) Signed-off-by: NickLucche --- vllm/envs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index 2fda2903179b5..c161fa0dff6ba 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -213,7 +213,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # Target device of vLLM, supporting [cuda (by default), # rocm, neuron, cpu] "VLLM_TARGET_DEVICE": - lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"), + lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(), # Maximum number of compilation jobs to run in parallel. 
# By default this is the number of CPUs From a65f46be5ea9a92dde48df2b951c1915aa1d9595 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Sat, 2 Aug 2025 08:12:03 +0530 Subject: [PATCH 150/224] [Misc] DeepGemmExperts : Avoid JIT generation in the hot-path (#21955) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- vllm/envs.py | 9 +++ .../layers/fused_moe/deep_gemm_moe.py | 77 ++++++++++++++++++- vllm/utils/deep_gemm.py | 7 ++ 3 files changed, 92 insertions(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index c161fa0dff6ba..2d470c6dccbfd 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -126,6 +126,7 @@ if TYPE_CHECKING: VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None VLLM_TPU_USING_PATHWAYS: bool = False VLLM_USE_DEEP_GEMM: bool = False + VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False VLLM_USE_FLASHINFER_MOE_FP8: bool = False VLLM_USE_FLASHINFER_MOE_FP4: bool = False VLLM_XGRAMMAR_CACHE_MB: int = 0 @@ -910,6 +911,14 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_DEEP_GEMM": lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))), + # DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm + # JIT all the required kernels before model execution so there is no + # JIT'ing in the hot-path. However, this warmup increases the engine + # startup time by a couple of minutes. + # Set `VLLM_SKIP_DEEP_GEMM_WARMUP` to disable the warmup. + "VLLM_SKIP_DEEP_GEMM_WARMUP": + lambda: bool(int(os.getenv("VLLM_SKIP_DEEP_GEMM_WARMUP", "0"))), + # Allow use of FlashInfer MoE kernels for fused moe ops. 
"VLLM_USE_FLASHINFER_MOE_FP8": lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))), diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index b89e5ac6f093e..bd3605378b6dc 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -4,7 +4,9 @@ import functools from typing import Any, Optional import torch +from tqdm import tqdm +import vllm.envs as env import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig @@ -17,7 +19,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( from vllm.model_executor.layers.fused_moe.utils import _resize_cache from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8) -from vllm.utils import has_deep_gemm +from vllm.utils import has_deep_gemm, run_once from vllm.utils.deep_gemm import m_grouped_fp8_gemm_nt_contiguous logger = init_logger(__name__) @@ -82,6 +84,65 @@ def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor, return True +@run_once +def warmup_deepgemm_gg_contiguous_kernels(w1: torch.Tensor, w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + num_topk: int): + """ + DeepGemm JITs the grouped-gemm kernels. The JIT'ing happens based on the + input tensor shapes. In this function, we construct all possible input + tensor shapes so all the kernels are JIT'ed and cached. + Note that this warmup is expected to happen during the model profile + call and not during actual model inference. + """ + + assert w1.size(0) == w2.size(0), ( + "w1 and w2 must have the same number of experts") + + block_m = deep_gemm_block_shape()[0] + num_experts = w1.size(0) + device = w1.device + + # This is the maximum GroupedGemm M size that we expect to run + # the grouped_gemm with. 
+ MAX_M = compute_aligned_M(env.VLLM_FUSED_MOE_CHUNK_SIZE, + num_topk, + num_experts, + block_m, + expert_tokens_meta=None) + # Distribute expert-ids evenly. + MAX_BLOCKS = MAX_M // block_m + expert_ids_block = torch.randint(low=0, + high=num_experts, + size=(MAX_BLOCKS, ), + device=device, + dtype=torch.int32) + expert_ids = torch.repeat_interleave(expert_ids_block, block_m, dim=0) + + def _warmup(w: torch.Tensor, w_scale: torch.Tensor): + + _, n, k = w.size() + a1q = torch.empty((MAX_M, k), device=device).to(torch.float8_e4m3fn) + a1q_scales = torch.empty((MAX_M, k // block_m), + device=device, + dtype=torch.float32) + out = torch.empty((MAX_M, n), device=device, dtype=torch.bfloat16) + + pbar = tqdm(total=MAX_BLOCKS, + desc=f"DeepGemmExperts GEMM warmup (MAX_M={MAX_M})") + num_tokens = MAX_M + while num_tokens > 0: + m_grouped_fp8_gemm_nt_contiguous( + (a1q[:num_tokens], a1q_scales[:num_tokens]), (w, w_scale), + out[:num_tokens], expert_ids[:num_tokens]) + pbar.update(1) + num_tokens = num_tokens - block_m + + _warmup(w1, w1_scale) + _warmup(w2, w2_scale) + + class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): def __init__(self): @@ -156,6 +217,20 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): ): assert self.block_shape is not None assert a1q_scale is not None + assert w1_scale is not None + assert w2_scale is not None + + if not env.VLLM_SKIP_DEEP_GEMM_WARMUP: + # DeepGemm JITs the grouped-gemm kernels. We don't want the JIT'ing + # to happen during actual model-inference. The + # `warmup_deepgemm_kernels` function is a `run_once` decorated + # function that executes during the model profile run. This warmup + # should create all the required JITs for the current model. 
+ warmup_deepgemm_gg_contiguous_kernels(w1, + w2, + w1_scale, + w2_scale, + num_topk=topk_ids.size(1)) a1q = hidden_states _, N, K = w1.size() diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 4dedee2a3f862..8ab34e7505ee2 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -8,6 +8,7 @@ from __future__ import annotations import functools import importlib +import os from typing import Any, Callable, NoReturn import torch @@ -77,6 +78,12 @@ def _lazy_init() -> None: if not has_deep_gemm(): return + # Set up deep_gemm cache path + DEEP_GEMM_JIT_CACHE_ENV_NAME = 'DG_JIT_CACHE_DIR' + if not os.environ.get(DEEP_GEMM_JIT_CACHE_ENV_NAME, None): + os.environ[DEEP_GEMM_JIT_CACHE_ENV_NAME] = os.path.join( + envs.VLLM_CACHE_ROOT, "deep_gemm") + _dg = importlib.import_module("deep_gemm") _fp8_gemm_nt_impl = _resolve_symbol(_dg, "fp8_gemm_nt", From 9f9c38c392476fd35b9154221c00a2255dcfd010 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 1 Aug 2025 22:43:37 -0400 Subject: [PATCH 151/224] [Speculators][Speculative Decoding] Add Qwen Eagle3 Support (#21835) Signed-off-by: Dipika Sikka --- .../speculators/test_eagle3.py | 14 +++++++++++-- vllm/config.py | 15 ++++++++++--- vllm/model_executor/models/qwen2.py | 21 +++++++++++++------ vllm/model_executor/models/qwen3.py | 7 +++++++ 4 files changed, 46 insertions(+), 11 deletions(-) diff --git a/tests/speculative_decoding/speculators/test_eagle3.py b/tests/speculative_decoding/speculators/test_eagle3.py index c58fc8c0dc5f4..c46ac7a88b751 100644 --- a/tests/speculative_decoding/speculators/test_eagle3.py +++ b/tests/speculative_decoding/speculators/test_eagle3.py @@ -6,11 +6,21 @@ import torch @pytest.mark.parametrize( "model_path", - [("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717"), - ("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized")]) + [("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized")]) def test_llama(vllm_runner, example_prompts, model_path): 
with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens=20) print(vllm_outputs) assert vllm_outputs + + +@pytest.mark.parametrize( + "model_path", + [("nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized")]) +def test_qwen(vllm_runner, example_prompts, model_path): + with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens=20) + print(vllm_outputs) + assert vllm_outputs diff --git a/vllm/config.py b/vllm/config.py index dabb4b524dfd8..95dae4275edf3 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3175,10 +3175,19 @@ class SpeculativeConfig: "speculative decoding is > 1, but got " f"{self.disable_by_batch_size=}") - if self.method == "eagle3" and self.target_model_config and \ - "llama" not in self.target_model_config.hf_text_config.model_type: + from vllm.transformers_utils.configs import SpeculatorsConfig + + eagle3_target_supported = ["llama"] + if self.draft_model_config and isinstance( + self.draft_model_config.hf_config, SpeculatorsConfig): + eagle3_target_supported.append("qwen") + + if self.method == "eagle3" and self.target_model_config and not any( + supported_model in + self.target_model_config.hf_text_config.model_type + for supported_model in eagle3_target_supported): raise ValueError( - "Eagle3 is only supported for Llama models. " + f"Eagle3 is only supported for {eagle3_target_supported} models. 
" # noqa: E501 f"Got {self.target_model_config.hf_text_config.model_type=}") return self diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 23f65b99c22ce..0e7507a4570be 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -330,6 +330,8 @@ class Qwen2Model(nn.Module): else: self.norm = PPMissingLayer() + self.aux_hidden_state_layers: tuple[int] = tuple() + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.embed_tokens(input_ids) @@ -350,18 +352,25 @@ class Qwen2Model(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: - hidden_states, residual = layer( - positions, - hidden_states, - residual, - ) + + aux_hidden_states = [] + for idx, layer in enumerate( + self.layers[self.start_layer:self.end_layer]): + if idx in self.aux_hidden_state_layers: + aux_hidden_states.append(hidden_states + residual) + hidden_states, residual = layer(positions, hidden_states, residual) + if not get_pp_group().is_last_rank: return IntermediateTensors({ "hidden_states": hidden_states, "residual": residual }) + hidden_states, _ = self.norm(hidden_states, residual) + + if len(aux_hidden_states) > 0: + return hidden_states, aux_hidden_states + return hidden_states def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 393ce41a91a00..d2ae8959b103d 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -288,6 +288,13 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def set_aux_hidden_state_layers(self, layers: tuple[int]) -> None: + self.model.aux_hidden_state_layers = layers + + def 
get_eagle3_aux_hidden_state_layers(self) -> tuple[int]: + num_layers = len(self.model.layers) + return (2, num_layers // 2, num_layers - 3) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) From 8d524ce79ffd0571d6a576cb9a5c21feab187246 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sat, 2 Aug 2025 03:45:27 +0100 Subject: [PATCH 152/224] [BugFix] Improve internal DP load balancing (#21617) Signed-off-by: Nick Hill --- vllm/entrypoints/openai/api_server.py | 3 + vllm/v1/engine/async_llm.py | 4 + vllm/v1/engine/coordinator.py | 110 +++++++++++++++++--------- vllm/v1/engine/core.py | 13 +-- vllm/v1/engine/core_client.py | 46 +++++++---- vllm/v1/metrics/stats.py | 4 + vllm/v1/utils.py | 1 + 7 files changed, 122 insertions(+), 59 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index b8ec5461f7719..9bf4702320788 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -199,6 +199,8 @@ async def build_async_engine_client_from_engine_args( from vllm.v1.engine.async_llm import AsyncLLM async_llm: Optional[AsyncLLM] = None + client_count = client_config.pop( + "client_count") if client_config else 1 client_index = client_config.pop( "client_index") if client_config else 0 try: @@ -208,6 +210,7 @@ async def build_async_engine_client_from_engine_args( enable_log_requests=engine_args.enable_log_requests, disable_log_stats=engine_args.disable_log_stats, client_addresses=client_config, + client_count=client_count, client_index=client_index) # Don't keep the dummy data in memory diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 308ca32105ba9..45f450291ab63 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -57,6 +57,7 @@ class AsyncLLM(EngineClient): start_engine_loop: bool = True, stat_loggers: Optional[list[StatLoggerFactory]] = None, client_addresses: 
Optional[dict[str, str]] = None, + client_count: int = 1, client_index: int = 0, ) -> None: """ @@ -120,6 +121,7 @@ class AsyncLLM(EngineClient): executor_class=executor_class, log_stats=self.log_stats, client_addresses=client_addresses, + client_count=client_count, client_index=client_index, ) @@ -156,6 +158,7 @@ class AsyncLLM(EngineClient): enable_log_requests: bool = False, disable_log_stats: bool = False, client_addresses: Optional[dict[str, str]] = None, + client_count: int = 1, client_index: int = 0, disable_log_requests: bool = True, # Deprecated, will be removed ) -> "AsyncLLM": @@ -176,6 +179,7 @@ class AsyncLLM(EngineClient): log_stats=not disable_log_stats, usage_context=usage_context, client_addresses=client_addresses, + client_count=client_count, client_index=client_index, ) diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py index 8d8d1689e61e3..596edfdbe24f8 100644 --- a/vllm/v1/engine/coordinator.py +++ b/vllm/v1/engine/coordinator.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import copy import multiprocessing import time import weakref @@ -65,18 +66,14 @@ class DPCoordinator: # Assume coordinator is colocated with front-end procs when not in # either external or hybrid DP LB mode. + local_only = not (external_lb or hybrid_lb) front_publish_address = get_engine_client_zmq_addr( - local_only=not external_lb and not hybrid_lb, host=host) + local_only=local_only, host=host) local_only_eng = dp_size == parallel_config.data_parallel_size_local back_publish_address = get_engine_client_zmq_addr(local_only_eng, host) back_output_address = get_engine_client_zmq_addr(local_only_eng, host) - # When in external LB mode, load stats aren't published, only changes - # to request wave / running state, so we don't need to rate-limit the - # updates to the front-end proc(s). 
- min_stats_update_interval_ms = 0 if external_lb else 100 - context = get_mp_context() self.proc: multiprocessing.Process = context.Process( target=DPCoordinatorProc.run_coordinator, @@ -86,7 +83,6 @@ class DPCoordinator: "front_publish_address": front_publish_address, "back_output_address": back_output_address, "back_publish_address": back_publish_address, - "min_stats_update_interval_ms": min_stats_update_interval_ms, }, daemon=True) self.proc.start() @@ -125,10 +121,6 @@ class DPCoordinatorProc: self.stats_update_interval_ms = min_stats_update_interval_ms - self.current_wave = 0 - self.engines_running = False - self.stats_changed = False - @staticmethod def run_coordinator( engine_count: int, @@ -155,6 +147,16 @@ class DPCoordinatorProc: decoder = MsgpackDecoder(EngineCoreOutputs) + # For tracking request wave progression. + current_wave = 0 + engines_running = False + + # For tracking request counts for internal load-balancing. + stats_changed = False + last_stats_step = -1 + last_stats_wave = -1 + last_step_counts: Optional[list[list[int]]] = None + with make_zmq_socket( path=front_publish_address, # IPC ctx=self.ctx, @@ -191,21 +193,33 @@ class DPCoordinatorProc: while True: elapsed = int(time.time() * 1000) - last_publish_time # Send at stats_update_interval_ms interval if the stats have - # changed, or otherwise every 4 seconds. + # changed, or otherwise every 5 seconds. wait_for = (self.stats_update_interval_ms - if self.stats_changed else 4000) - events = poller.poll(timeout=max(0, wait_for - elapsed)) + if stats_changed else 5000) + + # Wait at least 50ms to ensure we've received all stats for + # the current step. + min_timeout = 50 if last_step_counts is None else 0 + + events = poller.poll(timeout=max(min_timeout, wait_for - + elapsed)) if not events: # Poller timeout - publish current stats to front-ends. 
- engine_req_counts_list = self._get_engine_counts() - to_publish = (engine_req_counts_list, self.current_wave, - self.engines_running) + if last_step_counts is not None: + engine_req_counts_list = last_step_counts + last_step_counts = None + else: + engine_req_counts_list = self._get_engine_counts() + stats_changed = False + + to_publish = (engine_req_counts_list, current_wave, + engines_running) publish_front.send(msgspec.msgpack.encode(to_publish)) last_publish_time = int(time.time() * 1000) - self.stats_changed = False continue events = dict(events) + wave_state_changed = False if publish_front in events: buffer = publish_front.recv() @@ -232,7 +246,7 @@ class DPCoordinatorProc: # current_wave # we note that 0 is the wave number for the new # engine - self.engines_running = False + engines_running = False logger.info( "DPCoordinator scaled up from %s to %s " "engines", current_count, new_engine_count) @@ -248,15 +262,15 @@ class DPCoordinatorProc: # engines are paused, so that we can wake the other # engines. engine_to_exclude, wave = decoded - if not self.engines_running: - if wave < self.current_wave: + if not engines_running: + if wave < current_wave: # If the wave number is stale, ensure the message # is handled by all the engines. engine_to_exclude = None - self.engines_running = True - self.stats_changed = True - self._send_start_wave(publish_back, self.current_wave, + engines_running = True + wave_state_changed = True + self._send_start_wave(publish_back, current_wave, engine_to_exclude) if output_back in events: @@ -274,36 +288,56 @@ class DPCoordinatorProc: # 1. Updated request load stats - update our local # state with these. 
stats = self.engines[eng_index].request_counts + stats_step = scheduler_stats.step_counter + stats_wave = scheduler_stats.current_wave + if (stats_wave > last_stats_wave + or stats_wave == last_stats_wave + and stats_step > last_stats_step): + if stats_changed: + last_step_counts = self._get_engine_counts( + do_copy=True) + last_stats_step = stats_step + last_stats_wave = stats_wave + elif stats_wave != last_stats_wave or ( + stats_step != last_stats_step): + logger.warning( + "Received stats for out-of-order " + "step (%d, %d) from engine %d (expected " + "> (%d, %d))", stats_wave, stats_step, + eng_index, last_stats_wave, last_stats_step) stats[0] = scheduler_stats.num_waiting_reqs stats[1] = scheduler_stats.num_running_reqs - self.stats_changed = True + stats_changed = True if (wave := outputs.wave_complete) is not None: # 2. Notification from rank 0 engine that we've # moved into the global paused state # (engines_running==False). - if self.current_wave <= wave: + if current_wave <= wave: new_wave = wave + 1 logger.debug("Moving DP wave from %d to %d.", - self.current_wave, new_wave) - self.current_wave = new_wave - self.engines_running = False - self.stats_changed = True + current_wave, new_wave) + current_wave = new_wave + engines_running = False + wave_state_changed = True elif (wave := outputs.start_wave) is not None and ( - wave > self.current_wave or - (wave == self.current_wave - and not self.engines_running)): + wave > current_wave or + (wave == current_wave and not engines_running)): # 3. The engine received request for a non-current wave # so we must ensure that other engines progress to the # next wave (race condition handling). 
logger.debug( "Starting wave %d after notification of " "stale wave request from engine.", wave) - self.current_wave = wave - self.engines_running = True - self.stats_changed = True + current_wave = wave + engines_running = True + wave_state_changed = True self._send_start_wave(publish_back, wave, eng_index) + if wave_state_changed: + message = (None, current_wave, engines_running) + publish_front.send(msgspec.msgpack.encode(message)) + @staticmethod def _send_start_wave(socket: zmq.Socket, wave: int, exclude_engine_index: Optional[int]): @@ -316,6 +350,8 @@ class DPCoordinatorProc: socket.send_multipart( (EngineCoreRequestType.START_DP_WAVE.value, wave_encoded)) - def _get_engine_counts(self) -> list[list[int]]: + def _get_engine_counts(self, do_copy=False) -> list[list[int]]: """Return list of [waiting, running] count lists for each engine.""" + if do_copy: + return [copy.copy(e.request_counts) for e in self.engines] return [e.request_counts for e in self.engines] diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 0a889b2a0a184..79c47e1028882 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -928,7 +928,7 @@ class DPEngineCoreProc(EngineCoreProc): ): # Counts forward-passes of the model so that we can synchronize # finished with DP peers every N steps. - self.counter = 0 + self.step_counter = 0 self.current_wave = 0 self.last_counts = (0, 0) @@ -999,7 +999,9 @@ class DPEngineCoreProc(EngineCoreProc): counts = self.scheduler.get_request_counts() if counts != self.last_counts: self.last_counts = counts - stats = SchedulerStats(*counts) + stats = SchedulerStats(*counts, + step_counter=self.step_counter, + current_wave=self.current_wave) self.output_queue.put_nowait( (-1, EngineCoreOutputs(scheduler_stats=stats))) @@ -1041,15 +1043,16 @@ class DPEngineCoreProc(EngineCoreProc): self.output_queue.put_nowait( (client_index, EngineCoreOutputs(wave_complete=self.current_wave))) + # Increment wave count and reset step counter. 
self.current_wave += 1 + self.step_counter = 0 def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool: # Optimization - only perform finish-sync all-reduce every 32 steps. - self.counter += 1 - if self.counter != 32: + self.step_counter += 1 + if self.step_counter % 32 != 0: return True - self.counter = 0 return ParallelConfig.has_unfinished_dp(self.dp_group, local_unfinished) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 26985df6f62df..4d30bb6b74466 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -86,11 +86,12 @@ class EngineCoreClient(ABC): executor_class: type[Executor], log_stats: bool, client_addresses: Optional[dict[str, str]] = None, + client_count: int = 1, client_index: int = 0, ) -> "MPClient": parallel_config = vllm_config.parallel_config client_args = (vllm_config, executor_class, log_stats, - client_addresses, client_index) + client_addresses, client_count, client_index) if parallel_config.data_parallel_size > 1: if parallel_config.data_parallel_external_lb: # External load balancer - client per DP rank. @@ -727,6 +728,7 @@ class AsyncMPClient(MPClient): executor_class: type[Executor], log_stats: bool, client_addresses: Optional[dict[str, str]] = None, + client_count: int = 1, client_index: int = 0): super().__init__( asyncio_mode=True, @@ -929,11 +931,12 @@ class DPAsyncMPClient(AsyncMPClient): executor_class: type[Executor], log_stats: bool, client_addresses: Optional[dict[str, str]] = None, + client_count: int = 1, client_index: int = 0): self.current_wave = 0 super().__init__(vllm_config, executor_class, log_stats, - client_addresses, client_index) + client_addresses, client_count, client_index) # List of [waiting, running] pair per engine. # Used only by DPLBAsyncMPClient subclass. 
@@ -1029,7 +1032,11 @@ class DPAsyncMPClient(AsyncMPClient): counts, wave, running = msgspec.msgpack.decode(buf) self.current_wave = wave self.engines_running = running - self.lb_engines = counts[count_slice] + if counts is not None: + sliced_counts = counts[count_slice] + self.lb_engines = sliced_counts + logger.debug("Received counts: %s (%s)", sliced_counts, + count_slice) resources.stats_update_task = asyncio.create_task( run_engine_stats_update_task()) @@ -1065,40 +1072,45 @@ class DPLBAsyncMPClient(DPAsyncMPClient): executor_class: type[Executor], log_stats: bool, client_addresses: Optional[dict[str, str]] = None, + client_count: int = 1, client_index: int = 0): + self.client_count = client_count + # To route aborts to the correct engine. self.reqs_in_flight: dict[str, EngineIdentity] = {} super().__init__(vllm_config, executor_class, log_stats, - client_addresses, client_index) + client_addresses, client_count, client_index) assert len(self.core_engines) > 1 + self.eng_start_index = (len(self.core_engines) * + self.client_index) // client_count + def get_core_engine_for_request( self, request: EngineCoreRequest) -> EngineIdentity: # Engines are in rank order. + current_counts = self.lb_engines if (eng_index := request.data_parallel_rank) is None: - if not self.lb_engines: + if not current_counts: return self.core_engine # TODO use P2C alg for larger DP sizes - num_engines = len(self.lb_engines) - min_counts = [sys.maxsize, sys.maxsize] + num_engines = len(current_counts) + min_score = sys.maxsize eng_index = 0 for i in range(num_engines): # Start from client_index to help with balancing when engines # are empty. 
- idx = (self.client_index + i) % num_engines - counts = self.lb_engines[idx] - if counts < min_counts: - min_counts = counts + idx = (self.eng_start_index + i) % num_engines + waiting, running = current_counts[idx] + score = waiting * 4 + running + if score < min_score: + min_score = score eng_index = idx - # Adjust local counts for better balancing between stats updates - # from the coordinator (which happen every 100ms). - if min_counts[0]: - min_counts[0] += 1 - else: - min_counts[1] += 1 + # Increment local waiting count for better balancing between stats + # updates from the coordinator (which happen every 100ms). + current_counts[eng_index][0] += self.client_count chosen_engine = self.core_engines[eng_index] # Record which engine is chosen for this request, to handle aborts. diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 1eb10ccb6c493..9a80460261e02 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -33,6 +33,10 @@ class SchedulerStats: num_running_reqs: int = 0 num_waiting_reqs: int = 0 + # These are used for internal DP load-balancing. 
+ step_counter: int = 0 + current_wave: int = 0 + kv_cache_usage: float = 0.0 prefix_cache_stats: PrefixCacheStats = field( diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index c74d8c543f76c..d0175695c1d0f 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -154,6 +154,7 @@ class APIServerProcessManager: client_config = { "input_address": in_addr, "output_address": out_addr, + "client_count": num_servers, "client_index": i } if stats_update_address is not None: From 6e8d8c4afbddf725b34ef938616701869f5b3462 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 1 Aug 2025 22:45:46 -0400 Subject: [PATCH 153/224] [Test] Add Unit Test for Batched DeepGEMM (#21559) Signed-off-by: yewentao256 --- tests/kernels/moe/test_batched_deepgemm.py | 103 +++++++++++++++++++++ tests/kernels/moe/test_deepgemm.py | 8 +- vllm/utils/deep_gemm.py | 4 +- 3 files changed, 107 insertions(+), 8 deletions(-) create mode 100644 tests/kernels/moe/test_batched_deepgemm.py diff --git a/tests/kernels/moe/test_batched_deepgemm.py b/tests/kernels/moe/test_batched_deepgemm.py new file mode 100644 index 0000000000000..018d4c224f75e --- /dev/null +++ b/tests/kernels/moe/test_batched_deepgemm.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import torch + +from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( + BatchedDeepGemmExperts) +from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( + BatchedPrepareAndFinalize, BatchedTritonExperts) +from vllm.model_executor.layers.fused_moe.modular_kernel import ( + FusedMoEModularKernel) +from vllm.utils.deep_gemm import calc_diff, is_deep_gemm_supported + +from .test_deepgemm import make_block_quant_fp8_weights + +BLOCK_SIZE = [128, 128] + + +@pytest.mark.skipif(not is_deep_gemm_supported(), + reason="Requires deep_gemm kernels") +@pytest.mark.parametrize("E", [16, 32]) # number 
of experts +@pytest.mark.parametrize("T", [256, 512]) # tokens per expert +@pytest.mark.parametrize("K", [128, 256]) # hidden dim +@pytest.mark.parametrize("N", [512, 1024]) # intermediate dim per expert +@pytest.mark.parametrize("topk", [2, 4]) +def test_batched_deepgemm_vs_triton(E: int, T: int, K: int, N: int, topk: int, + monkeypatch): + """Compare BatchedDeepGemmExperts to BatchedTritonExperts.""" + + monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1") + + device = "cuda" + w1, w2, w1_s, w2_s = make_block_quant_fp8_weights(E, N, K, BLOCK_SIZE) + + M = E * T # total tokens + a = torch.randn(M, K, device=device, dtype=torch.bfloat16) / 10.0 + fp8_info = torch.finfo(torch.float8_e4m3fn) + a.clamp_(fp8_info.min, fp8_info.max) + + # random router outputs → top-k indices / weights + router_logits = torch.randn(M, E, device=device, dtype=torch.float32) + topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1) + topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1) + + # token number for each expert + cnt = torch.bincount(topk_ids.flatten(), minlength=E) + max_cnt = int(cnt.max().item()) + # next power of 2 for max token number + max_num_tokens = 1 << (max_cnt - 1).bit_length() + + prep_finalize = BatchedPrepareAndFinalize( + max_num_tokens=max_num_tokens, + num_local_experts=E, + num_dispatchers=1, + rank=0, + ) + + # triton (reference) + triton_experts = BatchedTritonExperts( + max_num_tokens=max_num_tokens, + num_dispatchers=1, + use_fp8_w8a8=True, + per_act_token_quant=False, + block_shape=BLOCK_SIZE, + ) + mk_triton = FusedMoEModularKernel(prep_finalize, triton_experts) + + out_triton = mk_triton( + hidden_states=a, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=False, + w1_scale=w1_s, + w2_scale=w2_s, + global_num_experts=E, + ) + + # deepgemm + deepgemm_experts = BatchedDeepGemmExperts( + max_num_tokens=max_num_tokens, + num_dispatchers=1, + block_shape=BLOCK_SIZE, + per_act_token_quant=False, + ) + mk_deepgemm = 
FusedMoEModularKernel(prep_finalize, deepgemm_experts) + + out_deepgemm = mk_deepgemm( + hidden_states=a, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=False, + w1_scale=w1_s, + w2_scale=w2_s, + global_num_experts=E, + ) + + diff = calc_diff(out_deepgemm, out_triton) + assert diff < 1e-3, f"Output diff too large: {diff}" diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py index b6ea4ee2324c9..b2b78662c9ded 100644 --- a/tests/kernels/moe/test_deepgemm.py +++ b/tests/kernels/moe/test_deepgemm.py @@ -20,11 +20,6 @@ from vllm.utils.deep_gemm import (calc_diff, is_deep_gemm_supported, BLOCK_SIZE = [128, 128] -requires_deep_gemm = pytest.mark.skipif( - not is_deep_gemm_supported(), - reason="Requires deep_gemm kernels", -) - def make_block_quant_fp8_weights( e: int, @@ -152,7 +147,8 @@ NUM_EXPERTS = [32] @pytest.mark.parametrize("mnk", MNKs) @pytest.mark.parametrize("topk", TOPKS) @pytest.mark.parametrize("num_experts", NUM_EXPERTS) -@requires_deep_gemm +@pytest.mark.skipif(not is_deep_gemm_supported(), + reason="Requires deep_gemm kernels") def test_deepgemm_vs_triton(mnk, topk, num_experts, monkeypatch): with monkeypatch.context() as m: diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 8ab34e7505ee2..0edfb01cde9d6 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -23,10 +23,10 @@ def is_deep_gemm_supported() -> bool: """Return ``True`` if DeepGEMM is supported on the current platform. Currently, only Hopper and Blackwell GPUs are supported. 
""" - supported_arch = current_platform.is_cuda() and ( + is_supported_arch = current_platform.is_cuda() and ( current_platform.is_device_capability(90) or current_platform.is_device_capability(100)) - return has_deep_gemm() and supported_arch + return has_deep_gemm() and is_supported_arch @functools.cache From 0edaf752d7482a3c170c25376c466e730ab87ddd Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 1 Aug 2025 19:47:53 -0700 Subject: [PATCH 154/224] [Attention][DBO] Add support for "splitting" the CommonAttentionMetadata (#21153) Signed-off-by: Sage Moore --- .../v1/attention/test_attention_splitting.py | 157 ++++++++++++++++++ vllm/v1/attention/backends/utils.py | 83 +++++++++ 2 files changed, 240 insertions(+) create mode 100644 tests/v1/attention/test_attention_splitting.py diff --git a/tests/v1/attention/test_attention_splitting.py b/tests/v1/attention/test_attention_splitting.py new file mode 100644 index 0000000000000..3fc1011d5042e --- /dev/null +++ b/tests/v1/attention/test_attention_splitting.py @@ -0,0 +1,157 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import torch + +from tests.v1.attention.test_attention_backends import BATCH_SPECS +from tests.v1.attention.utils import create_common_attn_metadata +from vllm.v1.attention.backends.utils import (UbatchSlice, + _make_metadata_with_slice, + slice_query_start_locs, + split_attn_metadata) + + +@pytest.fixture +def sample_query_start_loc(): + """Sample query_start_loc tensor for testing""" + return torch.tensor([0, 5, 12, 20, 35, 50]) + + +def test_basic_slice_middle(sample_query_start_loc): + """Test slicing from middle of tensor""" + req_slice = slice(1, 3) # slice from index 1 to 3 + result = slice_query_start_locs(sample_query_start_loc, req_slice) + + expected = torch.tensor([0, 7, 15]) + assert torch.equal(result, expected) + + +def test_slice_from_beginning(sample_query_start_loc): + """Test slicing from the 
beginning of tensor""" + req_slice = slice(0, 2) # slice from index 0 to 2 + result = slice_query_start_locs(sample_query_start_loc, req_slice) + + expected = torch.tensor([0, 5, 12]) + assert torch.equal(result, expected) + + +def test_slice_to_end(sample_query_start_loc): + """Test slicing to the end of tensor""" + req_slice = slice(3, 5) # slice from index 3 to 5 (last index) + result = slice_query_start_locs(sample_query_start_loc, req_slice) + + expected = torch.tensor([0, 15, 30]) + assert torch.equal(result, expected) + + +def test_single_element_slice(sample_query_start_loc): + """Test slice that results in single element""" + req_slice = slice(2, 3) # slice from index 2 to 3 + result = slice_query_start_locs(sample_query_start_loc, req_slice) + + expected = torch.tensor([0, 8]) + assert torch.equal(result, expected) + + +def test_full_tensor_slice(sample_query_start_loc): + """Test slicing the entire tensor""" + req_slice = slice(0, 5) # slice entire tensor + result = slice_query_start_locs(sample_query_start_loc, req_slice) + + expected = torch.tensor([0, 5, 12, 20, 35, 50]) + assert torch.equal(result, expected) + + +def test_slice_bounds_edge_cases(sample_query_start_loc): + # Test slice that goes exactly to the last element + req_slice = slice(4, 5) # Last index + result = slice_query_start_locs(sample_query_start_loc, req_slice) + + expected = torch.tensor([0, 15]) + assert torch.equal(result, expected) + + +@pytest.fixture +def small_decode_metadata(): + """Create metadata for small decode batch""" + batch_spec = BATCH_SPECS["small_decode"] + device = torch.device("cpu") + return create_common_attn_metadata(batch_spec, + block_size=16, + device=device) + + +@pytest.fixture +def large_decode_metadata(): + """Create metadata for large decode batch""" + batch_spec = BATCH_SPECS["large_decode"] + device = torch.device("cpu") + return create_common_attn_metadata(batch_spec, + block_size=16, + device=device) + + +@pytest.fixture +def 
mixed_small_metadata(): + """Create metadata for mixed small batch""" + batch_spec = BATCH_SPECS["mixed_small"] + device = torch.device("cpu") + return create_common_attn_metadata(batch_spec, + block_size=16, + device=device) + + +# Tests for _make_metadata_with_slice +def test_make_metadata_with_slice_decode_batch(small_decode_metadata): + """Test slicing decode batch metadata""" + # Split first request only + ubatch_slice = UbatchSlice(slice(0, 1), slice(0, 1)) + + result = _make_metadata_with_slice(ubatch_slice, small_decode_metadata) + + # Check sliced results + assert result.num_reqs == 1 # slice(0, 1) gives 1 request + assert result.num_actual_tokens == 1 # slice(0, 1) gives 1 token + assert result.max_query_len == 1 + assert torch.equal(result.query_start_loc, torch.tensor([0, 1])) + assert torch.equal(result.seq_lens, torch.tensor([32])) + + +def test_make_metadata_with_slice_mixed_batch(mixed_small_metadata): + """Test slicing mixed batch metadata""" + ubatch_slice = UbatchSlice(slice(1, 3), + slice(1, 7)) # Requests 1-3, tokens 1-7 + + result = _make_metadata_with_slice(ubatch_slice, mixed_small_metadata) + + assert result.num_reqs == 2 # slice(1, 3) gives 2 requests + assert result.num_actual_tokens == 6 # slice(1, 7) gives 6 tokens + assert result.max_query_len == 5 + assert torch.equal(result.query_start_loc, torch.tensor([0, 1, 6])) + assert torch.equal(result.seq_lens, torch.tensor([40, 48])) + + +def test_split_attn_metadata_decode_batch(large_decode_metadata): + """Test splitting decode batch into two equal parts""" + num_tokens = large_decode_metadata.num_reqs + mid_point = num_tokens // 2 + ubatch_slices = [ + UbatchSlice(slice(0, mid_point), slice(0, mid_point)), + UbatchSlice(slice(mid_point, num_tokens), slice(mid_point, + num_tokens)), + ] + + results = split_attn_metadata(ubatch_slices, large_decode_metadata) + + assert len(results) == 2 + + # Check first split + assert results[0].num_reqs == mid_point + assert results[0].num_actual_tokens 
== mid_point + assert torch.equal(results[0].seq_lens, torch.tensor([2048] * mid_point)) + + # Check second split + assert results[1].num_reqs == mid_point + assert results[1].num_actual_tokens == mid_point + assert torch.equal(results[1].seq_lens, torch.tensor([2048] * mid_point)) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index d39cc0a39f45c..0f041573e9d20 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -63,6 +63,89 @@ class CommonAttentionMetadata: causal: bool = True +@dataclass +class UbatchSlice: + request_slice: slice + token_slice: slice + + +def slice_query_start_locs( + query_start_loc: torch.Tensor, + request_slice: slice, +) -> torch.Tensor: + """ + Creates a new query_start_loc that corresponds to the requests in + request_slice. + + Note: This function creates a new tensor to hold the new query_start_locs. + This will break cudagraph compatibility. + """ + return query_start_loc[request_slice.start: request_slice.stop + 1] -\ + query_start_loc[request_slice.start] + + +def _make_metadata_with_slice( + ubatch_slice: UbatchSlice, + attn_metadata: CommonAttentionMetadata) -> CommonAttentionMetadata: + """ + This function creates a new CommonAttentionMetadata that corresponds to + the requests included in ubatch_slice + """ + + request_slice = ubatch_slice.request_slice + token_slice = ubatch_slice.token_slice + + query_start_loc = slice_query_start_locs(attn_metadata.query_start_loc, + request_slice) + assert len(query_start_loc >= 2) + query_start_loc_cpu = slice_query_start_locs( + attn_metadata.query_start_loc_cpu, request_slice) + + seq_lens = attn_metadata.seq_lens[request_slice] + seq_lens_cpu = attn_metadata.seq_lens_cpu[request_slice] + num_computed_tokens_cpu = attn_metadata.num_computed_tokens_cpu[ + request_slice] + + num_requests = request_slice.stop - request_slice.start + num_actual_tokens = token_slice.stop - token_slice.start + max_query_len = int( + 
torch.max(torch.abs(query_start_loc_cpu[1:] - + query_start_loc_cpu[:-1])).item()) + + block_table_tensor = attn_metadata.block_table_tensor[request_slice] + slot_mapping = attn_metadata.slot_mapping[token_slice] + + return CommonAttentionMetadata( + query_start_loc=query_start_loc, + query_start_loc_cpu=query_start_loc_cpu, + seq_lens=seq_lens, + seq_lens_cpu=seq_lens_cpu, + num_computed_tokens_cpu=num_computed_tokens_cpu, + num_reqs=num_requests, + num_actual_tokens=num_actual_tokens, + max_query_len=max_query_len, + block_table_tensor=block_table_tensor, + slot_mapping=slot_mapping, + ) + + +def split_attn_metadata( + ubatch_slices: list[UbatchSlice], + common_attn_metadata: CommonAttentionMetadata, +) -> list[CommonAttentionMetadata]: + """ + Creates a new CommonAttentionMetadata instance that corresponds to the + requests for each UbatchSlice in ubatch_slices. + + Note: This function does not modify common_attn_metadata + """ + results = [] + for ubatch_slice in ubatch_slices: + results.append( + _make_metadata_with_slice(ubatch_slice, common_attn_metadata)) + return results + + M = TypeVar("M") From d3a6f2120bb6b67fc58a3f1000d624cfb351eb05 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Sat, 2 Aug 2025 14:53:18 +0800 Subject: [PATCH 155/224] [FEAT][ROCm] Enable running Flash Attention as ViT attn backend for Qwen-VL models on ROCm platform. 
(#22069) Signed-off-by: tjtanaavllm Signed-off-by: vllmellm Co-authored-by: tjtanaavllm --- vllm/model_executor/models/qwen2_5_vl.py | 18 ++++++++---- vllm/model_executor/models/qwen2_vl.py | 18 ++++++++---- vllm/model_executor/models/vision.py | 36 +++++------------------- vllm/platforms/cuda.py | 14 +++++++++ vllm/platforms/interface.py | 5 ++++ vllm/platforms/rocm.py | 12 ++++++++ 6 files changed, 64 insertions(+), 39 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 04e64422d2e0b..45fb7f9580ae4 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -246,11 +246,15 @@ class Qwen2_5_VisionAttention(nn.Module): # Detect attention implementation. self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) if self.attn_backend not in { - _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS + _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS, + _Backend.ROCM_AITER_FA }: raise RuntimeError( f"Qwen2.5-VL does not support {self.attn_backend} backend now." ) + self.is_flash_attn_backend = self.attn_backend in { + _Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA + } def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: # [s, b, 3 * head * head_dim] @@ -297,10 +301,13 @@ class Qwen2_5_VisionAttention(nn.Module): q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) - if self.attn_backend == _Backend.FLASH_ATTN: + if self.is_flash_attn_backend: # from vllm_flash_attn.flash_attn_interface import ( # flash_attn_varlen_func) - from flash_attn import flash_attn_varlen_func + if self.attn_backend == _Backend.ROCM_AITER_FA: + from aiter import flash_attn_varlen_func + else: + from flash_attn import flash_attn_varlen_func q, k, v = (rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) @@ -311,7 +318,7 @@ class Qwen2_5_VisionAttention(nn.Module): cu_seqlens_k=cu_seqlens, max_seqlen_q=max_seqlen, max_seqlen_k=max_seqlen, - dropout_p=0, + dropout_p=0.0, causal=False) context_layer = rearrange(output, @@ -635,7 +642,8 @@ class Qwen2_5_VisionTransformer(nn.Module): cu_seqlens: torch.Tensor, ) -> tuple[Optional[int], Optional[list[int]]]: max_seqlen, seqlens = None, None - if self.attn_backend == _Backend.FLASH_ATTN: + if (self.attn_backend == _Backend.FLASH_ATTN + or self.attn_backend == _Backend.ROCM_AITER_FA): max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() elif self.attn_backend == _Backend.XFORMERS: seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 4e8ea8e449133..40d77312b72c2 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -274,10 +274,14 @@ class Qwen2VisionAttention(nn.Module): # Detect attention implementation. 
self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) if self.attn_backend not in { - _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS + _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS, + _Backend.ROCM_AITER_FA }: raise RuntimeError( f"Qwen2-VL does not support {self.attn_backend} backend now.") + self.is_flash_attn_backend = self.attn_backend in { + _Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA + } def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: # [s, b, 3 * head * head_dim] @@ -324,10 +328,13 @@ class Qwen2VisionAttention(nn.Module): q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) - if self.attn_backend == _Backend.FLASH_ATTN: + if self.is_flash_attn_backend: # from vllm_flash_attn.flash_attn_interface import ( # flash_attn_varlen_func) - from flash_attn import flash_attn_varlen_func + if self.attn_backend == _Backend.ROCM_AITER_FA: + from aiter import flash_attn_varlen_func + else: + from flash_attn import flash_attn_varlen_func q, k, v = (rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) @@ -338,7 +345,7 @@ class Qwen2VisionAttention(nn.Module): cu_seqlens_k=cu_seqlens, max_seqlen_q=max_seqlen, max_seqlen_k=max_seqlen, - dropout_p=0, + dropout_p=0.0, causal=False) context_layer = rearrange(output, @@ -620,7 +627,8 @@ class Qwen2VisionTransformer(nn.Module): self, cu_seqlens: torch.Tensor ) -> tuple[Optional[int], Optional[list[int]]]: max_seqlen, seqlens = None, None - if self.attn_backend == _Backend.FLASH_ATTN: + if (self.attn_backend == _Backend.FLASH_ATTN + or self.attn_backend == _Backend.ROCM_AITER_FA): max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() elif self.attn_backend == _Backend.XFORMERS: seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index ac6a659bbaa32..de30509b1ccb4 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -7,9 +7,7 @@ from typing import Final, Generic, Optional, Protocol, TypeVar, Union import torch from transformers import PretrainedConfig -import vllm.envs as envs -from vllm.attention.selector import (backend_name_to_enum, - get_global_forced_attn_backend) +from vllm.attention.selector import get_env_variable_attn_backend from vllm.logger import init_logger from vllm.platforms import _Backend, current_platform @@ -75,32 +73,12 @@ def get_vit_attn_backend(support_fa: bool = False) -> _Backend: Get the available attention backend for Vision Transformer. """ # TODO(Isotr0py): Remove `support_fa` after support FA for all ViTs attn. 
- selected_backend: Optional[_Backend] = get_global_forced_attn_backend() - if selected_backend is None: - backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND - if backend_by_env_var is not None: - selected_backend = backend_name_to_enum(backend_by_env_var) - if selected_backend is None: - if current_platform.is_cuda(): - device_available = current_platform.has_device_capability(80) - if device_available and support_fa: - from transformers.utils import is_flash_attn_2_available - if is_flash_attn_2_available(): - selected_backend = _Backend.FLASH_ATTN - else: - logger.warning_once( - "Current `vllm-flash-attn` has a bug inside vision " - "module, so we use xformers backend instead. You can " - "run `pip install flash-attn` to use flash-attention " - "backend.") - selected_backend = _Backend.XFORMERS - else: - # For Volta and Turing GPUs, use xformers instead. - selected_backend = _Backend.XFORMERS - else: - # Default to torch SDPA for other non-GPU platforms. - selected_backend = _Backend.TORCH_SDPA - return selected_backend + + selected_backend: Optional[_Backend] = get_env_variable_attn_backend() + if selected_backend is not None: + return selected_backend + + return current_platform.get_vit_attn_backend(support_fa) def resolve_visual_encoder_outputs( diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 87ff6b385809a..a90910639f784 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -206,6 +206,20 @@ class CudaPlatformBase(Platform): torch.cuda.reset_peak_memory_stats(device) return torch.cuda.max_memory_allocated(device) + @classmethod + def get_vit_attn_backend(cls, support_fa: bool = False) -> _Backend: + if cls.has_device_capability(80) and support_fa: + from transformers.utils import is_flash_attn_2_available + if is_flash_attn_2_available(): + return _Backend.FLASH_ATTN + logger.warning_once( + "Current `vllm-flash-attn` has a bug inside vision " + "module, so we use xformers backend instead. 
You can " + "run `pip install flash-attn` to use flash-attention " + "backend.") + # Fallback for Volta/Turing GPUs or FA not supported + return _Backend.XFORMERS + @classmethod def get_attn_backend_cls(cls, selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1, diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 6bae0fe25c797..997aee7063f57 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -46,6 +46,7 @@ class _Backend(enum.Enum): ROCM_FLASH = enum.auto() ROCM_AITER_MLA = enum.auto() # Supported by V1 ROCM_AITER_MLA_VLLM_V1 = enum.auto() + ROCM_AITER_FA = enum.auto() # used for ViT attn backend TORCH_SDPA = enum.auto() FLASHINFER = enum.auto() FLASHINFER_VLLM_V1 = enum.auto() @@ -186,6 +187,10 @@ class Platform: else: return device_id + @classmethod + def get_vit_attn_backend(cls, support_fa: bool = False) -> _Backend: + return _Backend.TORCH_SDPA + @classmethod def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, dtype: torch.dtype, kv_cache_dtype: Optional[str], diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index b2e69f60343f6..54ffc83cd565a 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -173,6 +173,18 @@ class RocmPlatform(Platform): "quark", "ptpc_fp8" ] + @classmethod + def get_vit_attn_backend(cls, support_fa: bool = False) -> _Backend: + if support_fa: + if (envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA + and on_gfx9()): + # Note: AITER FA is only supported for Qwen-VL models. + # TODO: Add support for other VL models in their model class. 
+ return _Backend.ROCM_AITER_FA + if on_gfx9(): + return _Backend.FLASH_ATTN + return _Backend.TORCH_SDPA + @classmethod def get_attn_backend_cls(cls, selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1, From 4ac8437352a8945262e877d64162d741404768e2 Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Fri, 1 Aug 2025 23:54:40 -0700 Subject: [PATCH 156/224] [Misc] Getting and passing ray runtime_env to workers (#22040) Signed-off-by: Rui Qiao --- tests/config/test_config_generation.py | 33 ++++++++++++++++++++++++++ vllm/config.py | 5 ++++ vllm/engine/arg_utils.py | 11 +++++++++ vllm/executor/ray_utils.py | 7 ++++-- vllm/ray/lazy_utils.py | 22 +++++++++++++++++ vllm/utils/__init__.py | 12 +--------- 6 files changed, 77 insertions(+), 13 deletions(-) create mode 100644 vllm/ray/lazy_utils.py diff --git a/tests/config/test_config_generation.py b/tests/config/test_config_generation.py index 024e81fccc5f1..e37b6b95941e9 100644 --- a/tests/config/test_config_generation.py +++ b/tests/config/test_config_generation.py @@ -36,3 +36,36 @@ def test_cuda_empty_vs_unset_configs(monkeypatch: pytest.MonkeyPatch): assert deep_compare(normal_config_dict, empty_config_dict), ( "Configs with normal CUDA_VISIBLE_DEVICES and CUDA_VISIBLE_DEVICES=\"\"" " should be equivalent") + + +def test_ray_runtime_env(monkeypatch: pytest.MonkeyPatch): + # In testing, this method needs to be nested inside as ray does not + # see the test module. 
+ def create_config(): + engine_args = EngineArgs(model="deepseek-ai/DeepSeek-V2-Lite", + trust_remote_code=True) + return engine_args.create_engine_config() + + config = create_config() + parallel_config = config.parallel_config + assert parallel_config.ray_runtime_env is None + + import ray + ray.init() + + runtime_env = { + "env_vars": { + "TEST_ENV_VAR": "test_value", + }, + } + + config_ref = ray.remote(create_config).options( + runtime_env=runtime_env).remote() + + config = ray.get(config_ref) + parallel_config = config.parallel_config + assert parallel_config.ray_runtime_env is not None + assert parallel_config.ray_runtime_env.env_vars().get( + "TEST_ENV_VAR") == "test_value" + + ray.shutdown() diff --git a/vllm/config.py b/vllm/config.py index 95dae4275edf3..ee8f3dd98dd86 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -57,6 +57,7 @@ from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, if TYPE_CHECKING: from _typeshed import DataclassInstance + from ray.runtime_env import RuntimeEnv from ray.util.placement_group import PlacementGroup from transformers.configuration_utils import PretrainedConfig @@ -74,6 +75,7 @@ if TYPE_CHECKING: else: DataclassInstance = Any PlacementGroup = Any + RuntimeEnv = Any PretrainedConfig = Any ExecutorBase = Any QuantizationConfig = Any @@ -2098,6 +2100,9 @@ class ParallelConfig: ray_workers_use_nsight: bool = False """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.""" + ray_runtime_env: Optional["RuntimeEnv"] = None + """Ray runtime environment to pass to distributed workers.""" + placement_group: Optional["PlacementGroup"] = None """ray distributed model workers placement group.""" diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 0d38b5b5302c1..47b3efa6af726 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -36,6 +36,7 @@ from vllm.config import (BlockSize, CacheConfig, 
CacheDType, CompilationConfig, from vllm.logger import init_logger from vllm.platforms import CpuArchEnum, current_platform from vllm.plugins import load_general_plugins +from vllm.ray.lazy_utils import is_ray_initialized from vllm.reasoning import ReasoningParserManager from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 from vllm.transformers_utils.utils import check_gguf_file @@ -1099,6 +1100,15 @@ class EngineArgs: kv_sharing_fast_prefill=self.kv_sharing_fast_prefill, ) + ray_runtime_env = None + if is_ray_initialized(): + # Ray Serve LLM calls `create_engine_config` in the context + # of a Ray task, therefore we check is_ray_initialized() + # as opposed to is_in_ray_actor(). + import ray + ray_runtime_env = ray.get_runtime_context().runtime_env + logger.info("Using ray runtime env: %s", ray_runtime_env) + # Get the current placement group if Ray is initialized and # we are in a Ray actor. If so, then the placement group will be # passed to spawned processes. @@ -1211,6 +1221,7 @@ class EngineArgs: max_parallel_loading_workers=self.max_parallel_loading_workers, disable_custom_all_reduce=self.disable_custom_all_reduce, ray_workers_use_nsight=self.ray_workers_use_nsight, + ray_runtime_env=ray_runtime_env, placement_group=placement_group, distributed_executor_backend=self.distributed_executor_backend, worker_cls=self.worker_cls, diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 033ecc00853ba..7abaffa54c089 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -295,9 +295,12 @@ def initialize_ray_cluster( logger.warning( "No existing RAY instance detected. 
" "A new instance will be launched with current node resources.") - ray.init(address=ray_address, num_gpus=parallel_config.world_size) + ray.init(address=ray_address, + num_gpus=parallel_config.world_size, + runtime_env=parallel_config.ray_runtime_env) else: - ray.init(address=ray_address) + ray.init(address=ray_address, + runtime_env=parallel_config.ray_runtime_env) device_str = current_platform.ray_device_key if not device_str: diff --git a/vllm/ray/lazy_utils.py b/vllm/ray/lazy_utils.py new file mode 100644 index 0000000000000..bb3535579cfdf --- /dev/null +++ b/vllm/ray/lazy_utils.py @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +def is_ray_initialized(): + """Check if Ray is initialized.""" + try: + import ray + return ray.is_initialized() + except ImportError: + return False + + +def is_in_ray_actor(): + """Check if we are in a Ray actor.""" + + try: + import ray + return (ray.is_initialized() + and ray.get_runtime_context().get_actor_id() is not None) + except ImportError: + return False diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 0d3fa6b059beb..3318ae5106377 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -72,6 +72,7 @@ from typing_extensions import Never, ParamSpec, TypeIs, assert_never import vllm.envs as envs from vllm.logger import enable_trace_function_call, init_logger +from vllm.ray.lazy_utils import is_in_ray_actor if TYPE_CHECKING: from argparse import Namespace @@ -2835,17 +2836,6 @@ def zmq_socket_ctx( ctx.destroy(linger=linger) -def is_in_ray_actor(): - """Check if we are in a Ray actor.""" - - try: - import ray - return (ray.is_initialized() - and ray.get_runtime_context().get_actor_id() is not None) - except ImportError: - return False - - def _maybe_force_spawn(): """Check if we need to force the use of the `spawn` multiprocessing start method. 
From 8564dc9448ed8648088c25248313933308ae36d8 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Fri, 1 Aug 2025 23:55:34 -0700 Subject: [PATCH 157/224] Fix test_kv_sharing_fast_prefill flakiness (#22038) Signed-off-by: Yong Hoon Shin --- tests/v1/e2e/test_kv_sharing_fast_prefill.py | 35 +++++++++++++++++--- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py index 616fc7a860599..f5a7b9cc276b3 100644 --- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py +++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import gc import random from typing import Optional, Union @@ -10,6 +9,7 @@ import torch from vllm import LLM, SamplingParams from vllm.config import CompilationConfig, CompilationLevel +from vllm.distributed import cleanup_dist_env_and_memory from vllm.forward_context import get_forward_context from vllm.model_executor.models.gemma3n import Gemma3nForConditionalGeneration from vllm.model_executor.models.registry import ModelRegistry @@ -18,6 +18,9 @@ from vllm.sequence import IntermediateTensors from ...utils import fork_new_process_for_each_test +# global seed +SEED = 42 + class TestGemma3nForConditionalGeneration(Gemma3nForConditionalGeneration): @@ -95,8 +98,25 @@ def test_prompts(): return prompts +def cleanup(llm: LLM, compilation_config: CompilationConfig): + # hacky: below lines are required to free up memory for the next test + # when setting VLLM_ENABLE_V1_MULTIPROCESSING=0, del llm is not sufficient + # TODO(sarckk): when enforce_eager=False, memory is not freed: + # find out why and re-enable test for enforce_eager=False case + llm_engine = llm.llm_engine.engine_core.engine_core + model_runner = llm_engine.model_executor.driver_worker.worker.model_runner + del model_runner.model + del 
model_runner.kv_caches + del compilation_config.static_forward_context + compilation_config.static_forward_context = {} + + del llm + torch.cuda.empty_cache() + cleanup_dist_env_and_memory() + + @fork_new_process_for_each_test -@pytest.mark.parametrize("enforce_eager", [True, False]) +@pytest.mark.parametrize("enforce_eager", [True]) def test_kv_sharing_fast_prefill( monkeypatch: pytest.MonkeyPatch, enforce_eager: bool, @@ -115,23 +135,28 @@ def test_kv_sharing_fast_prefill( with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") + # Make scheduling deterministic for reproducibility + m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") + llm = LLM( model="google/gemma-3n-E2B-it", enforce_eager=enforce_eager, compilation_config=compilation_config, + seed=SEED, ) ref_responses = llm.generate(test_prompts, sampling_params) - del llm - gc.collect() - torch.cuda.empty_cache() + cleanup(llm, compilation_config) llm = LLM(model="google/gemma-3n-E2B-it", enforce_eager=enforce_eager, compilation_config=compilation_config, + seed=SEED, kv_sharing_fast_prefill=True) optimized_responses = llm.generate(test_prompts, sampling_params) + cleanup(llm, compilation_config) + misses = 0 for ref_response, optimized_response in zip(ref_responses, From c64861d63c1a5362bfad443daf7a096f1bcfd1e4 Mon Sep 17 00:00:00 2001 From: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com> Date: Sat, 2 Aug 2025 02:55:57 -0400 Subject: [PATCH 158/224] [Bugfix] Mamba2 remove bugged initial state condition in chunk scan (#22034) Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com> --- .../model_executor/layers/mamba/ops/ssd_chunk_scan.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py index 365e1c54b555a..61eff0c008f60 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +++ 
b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py @@ -476,15 +476,8 @@ def _chunk_scan_fwd( # with initial states, we need to take care of how # seq_idx crosses the boundaries assert batch == 1, "chunk scan only supports initial states with batch 1" - - if initial_states.shape[0] == 1: - # no in this case no point to use initial states - initial_states = None - else: - assert chunk_indices is not None and chunk_offsets is not None, \ - ( - "chunk_indices and chunk_offsets should have been set" - ) + assert chunk_indices is not None and chunk_offsets is not None, \ + "chunk_indices and chunk_offsets should have been set" else: chunk_indices, chunk_offsets = None, None else: From 067c34a1559400e956311f067ddd185f54207a2b Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 2 Aug 2025 00:19:48 -0700 Subject: [PATCH 159/224] docs: remove deprecated disable-log-requests flag (#22113) Signed-off-by: Roger Wang --- .buildkite/scripts/tpu/run_bm.sh | 1 - benchmarks/README.md | 10 +++++----- benchmarks/auto_tune/auto_tune.sh | 1 - benchmarks/benchmark_serving.py | 3 +-- benchmarks/benchmark_serving_structured_output.py | 2 +- docs/design/p2p_nccl_connector.md | 8 -------- docs/models/supported_models.md | 2 +- .../disagg_example_p2p_nccl_xpyd.sh | 2 -- examples/online_serving/prometheus_grafana/README.md | 3 +-- .../disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh | 2 -- tests/entrypoints/openai/correctness/test_lmeval.py | 2 +- tests/entrypoints/openai/test_chunked_prompt.py | 2 -- tests/models/quantization/test_bitsandbytes.py | 1 - .../kv_connector/nixl_integration/run_accuracy_test.sh | 2 -- .../nixl_integration/run_edge_case_test.sh | 2 -- .../nixl_integration/run_tpu_disagg_accuracy_test.sh | 3 --- .../nixl_integration/run_tpu_edge_case_test.sh | 2 -- tests/v1/sample/test_logprobs_e2e.py | 2 +- vllm/utils/__init__.py | 5 +++-- 19 files changed, 14 insertions(+), 41 deletions(-) diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh 
index beecaf7a740ae..b1e17b438578d 100755 --- a/.buildkite/scripts/tpu/run_bm.sh +++ b/.buildkite/scripts/tpu/run_bm.sh @@ -44,7 +44,6 @@ echo VLLM_USE_V1=1 vllm serve $MODEL \ --seed 42 \ - --disable-log-requests \ --max-num-seqs $MAX_NUM_SEQS \ --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ --tensor-parallel-size $TENSOR_PARALLEL_SIZE \ diff --git a/benchmarks/README.md b/benchmarks/README.md index 644517235b122..d6442a4fc3872 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -91,7 +91,7 @@ become available. First start serving your model ```bash -vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests +vllm serve NousResearch/Hermes-3-Llama-3.1-8B ``` Then run the benchmarking script @@ -146,7 +146,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you ```bash # start server -VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests +VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct ``` ```bash @@ -171,7 +171,7 @@ You can skip applying chat template if your data already has it by using `--cust ```bash # need a model with vision capability here -vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests +vllm serve Qwen/Qwen2-VL-7B-Instruct ``` ```bash @@ -205,7 +205,7 @@ vllm bench serve \ ### Other HuggingFaceDataset Examples ```bash -vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests +vllm serve Qwen/Qwen2-VL-7B-Instruct ``` `lmms-lab/LLaVA-OneVision-Data`: @@ -430,7 +430,7 @@ Benchmark the performance of structured output generation (JSON, grammar, regex) ### Server Setup ```bash -vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests +vllm serve NousResearch/Hermes-3-Llama-3.1-8B ``` ### JSON Schema Benchmark diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index 3cd8580e065dd..df26376504b95 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -60,7 +60,6 @@ start_server() 
{ pkill -f vllm VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \ - --disable-log-requests \ --port 8004 \ --gpu-memory-utilization $gpu_memory_utilization \ --max-num-seqs $max_num_seqs \ diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 3affa18ae3a4f..93b72211eb332 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -5,8 +5,7 @@ r"""Benchmark online serving throughput. On the server side, run one of the following commands: vLLM OpenAI API server vllm serve \ - --swap-space 16 \ - --disable-log-requests + --swap-space 16 On the client side, run: python benchmarks/benchmark_serving.py \ diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 2a22f122c78e6..ca6843a72aa36 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -4,7 +4,7 @@ r"""Benchmark online serving throughput with structured outputs. 
On the server side, run one of the following commands: (vLLM OpenAI API server) - vllm serve --disable-log-requests + vllm serve On the client side, run: python benchmarks/benchmark_serving_structured_output.py \ diff --git a/docs/design/p2p_nccl_connector.md b/docs/design/p2p_nccl_connector.md index 94af8bedd24d2..adf838306bc77 100644 --- a/docs/design/p2p_nccl_connector.md +++ b/docs/design/p2p_nccl_connector.md @@ -109,7 +109,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20001"}}' > /var/vllm.log 2>&1 & ``` @@ -131,7 +130,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20002"}}' > /var/vllm.log 2>&1 & ``` @@ -153,7 +151,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003"}}' > /var/vllm.log 2>&1 & ``` @@ -175,7 +172,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ - --disable-log-request \ --kv-transfer-config \ 
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20004"}}' > /var/vllm.log 2>&1 & ``` @@ -206,7 +202,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20001"}}' > /var/vllm.log 2>&1 & ``` @@ -228,7 +223,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20002"}}' > /var/vllm.log 2>&1 & ``` @@ -250,7 +244,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003"}}' > /var/vllm.log 2>&1 & ``` @@ -272,7 +265,6 @@ python3 disagg_proxy_p2p_nccl_xpyd.py & --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ - --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20004"}}' > /var/vllm.log 2>&1 & ``` diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 56c77a1e5f118..bd7a57b436213 100644 --- 
a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -255,7 +255,7 @@ export https_proxy=http://your.proxy.server:port https_proxy=http://your.proxy.server:port huggingface-cli download # or use vllm cmd directly -https_proxy=http://your.proxy.server:port vllm serve --disable-log-requests +https_proxy=http://your.proxy.server:port vllm serve ``` - Set the proxy in Python interpreter: diff --git a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh index 568f7a43b4962..7b0b12bb34d25 100644 --- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh +++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh @@ -178,7 +178,6 @@ main() { --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ - --disable-log-request \ --kv-transfer-config \ "{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_size\":\"1e1\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > prefill$((i+1)).log 2>&1 & PIDS+=($!) @@ -207,7 +206,6 @@ main() { --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ - --disable-log-request \ --kv-transfer-config \ "{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_size\":\"8e9\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > decode$((i+1)).log 2>&1 & PIDS+=($!) 
diff --git a/examples/online_serving/prometheus_grafana/README.md b/examples/online_serving/prometheus_grafana/README.md index 7c4e649e6d029..5cd4dab5a8fa7 100644 --- a/examples/online_serving/prometheus_grafana/README.md +++ b/examples/online_serving/prometheus_grafana/README.md @@ -13,8 +13,7 @@ Prometheus metric logging is enabled by default in the OpenAI-compatible server. ```bash vllm serve mistralai/Mistral-7B-v0.1 \ - --max-model-len 2048 \ - --disable-log-requests + --max-model-len 2048 ``` Launch Prometheus and Grafana servers with `docker compose`: diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh index 5719fa8212923..1284466a45580 100644 --- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh +++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh @@ -28,7 +28,6 @@ if [[ $1 == "prefiller" ]]; then CUDA_VISIBLE_DEVICES=0 \ vllm serve $MODEL \ --port 8100 \ - --disable-log-requests \ --enforce-eager \ --kv-transfer-config \ '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_producer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "producer1"}}' @@ -46,7 +45,6 @@ elif [[ $1 == "decoder" ]]; then CUDA_VISIBLE_DEVICES=1 \ vllm serve $MODEL \ --port 8200 \ - --disable-log-requests \ --enforce-eager \ --kv-transfer-config \ '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_consumer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "consumer1"}}' diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index a07a147cdc2b2..d75731637d282 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -22,7 +22,7 @@ TASK = "gsm8k" FILTER = "exact_match,strict-match" RTOL = 0.03 EXPECTED_VALUE = 0.54 -DEFAULT_ARGS 
= ["--max-model-len", "4096", "--disable-log-requests"] +DEFAULT_ARGS = ["--max-model-len", "4096"] MORE_ARGS_LIST = [ [], # Default ["--enable-chunked-prefill"], # Chunked diff --git a/tests/entrypoints/openai/test_chunked_prompt.py b/tests/entrypoints/openai/test_chunked_prompt.py index 3c8ed955a65a2..c8160c5f2d0e3 100644 --- a/tests/entrypoints/openai/test_chunked_prompt.py +++ b/tests/entrypoints/openai/test_chunked_prompt.py @@ -26,8 +26,6 @@ def server(): "--enable-chunked-prefill", "--max-num-batched-tokens", "1000", - # large prompts create a lot of output - "--disable-log-requests", ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: diff --git a/tests/models/quantization/test_bitsandbytes.py b/tests/models/quantization/test_bitsandbytes.py index 8cb269d7e9496..e0e919b62b217 100644 --- a/tests/models/quantization/test_bitsandbytes.py +++ b/tests/models/quantization/test_bitsandbytes.py @@ -102,7 +102,6 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts, def test_load_pp_4bit_bnb_model(model_name, description) -> None: common_args = [ "--disable-log-stats", - "--disable-log-requests", "--dtype", "bfloat16", "--enable-prefix-caching", diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index b48655d80eefd..9322410ec99e9 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -88,7 +88,6 @@ run_tests_for_model() { BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --enforce-eager \ - --disable-log-requests \ --gpu-memory-utilization 0.2 \ --tensor-parallel-size $PREFILLER_TP_SIZE \ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'" @@ -121,7 +120,6 @@ run_tests_for_model() { BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID 
VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --enforce-eager \ - --disable-log-requests \ --gpu-memory-utilization 0.2 \ --tensor-parallel-size $DECODER_TP_SIZE \ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'" diff --git a/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh b/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh index 98903a176e28b..b64461292910d 100644 --- a/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh @@ -57,7 +57,6 @@ run_tests_for_model() { BASE_CMD="CUDA_VISIBLE_DEVICES=0 VLLM_NIXL_SIDE_CHANNEL_PORT=5559 vllm serve $model_name \ --port $PREFILL_PORT \ --enforce-eager \ - --disable-log-requests \ --gpu-memory-utilization 0.2 \ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'" @@ -76,7 +75,6 @@ run_tests_for_model() { BASE_CMD="CUDA_VISIBLE_DEVICES=1 VLLM_NIXL_SIDE_CHANNEL_PORT=6000 vllm serve $model_name \ --port $DECODE_PORT \ --enforce-eager \ - --disable-log-requests \ --gpu-memory-utilization 0.2 \ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'" diff --git a/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh index 45779d16914f0..ea125f99fc42c 100644 --- a/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh @@ -63,7 +63,6 @@ launch_baseline() { --seed 42 \ --block-size ${BLOCK_SIZE} \ --gpu-memory-utilization 0.5 \ - --disable-log-requests \ --enforce-eager" echo ${BASELINE_BASE_CMD} ssh -tt ${BASELINE_HOST} "${BASELINE_BASE_CMD}" & @@ -87,7 +86,6 @@ launch_pd() { --block-size ${BLOCK_SIZE} \ --enforce-eager \ --gpu-memory-utilization 0.5 \ - --disable-log-requests \ --kv-transfer-config 
'{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" @@ -106,7 +104,6 @@ launch_pd() { --block-size ${BLOCK_SIZE} \ --enforce-eager \ --gpu-memory-utilization 0.5 \ - --disable-log-requests \ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" echo ${PREFILL_BASE_CMD} diff --git a/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh b/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh index c37c92fdf5d3f..8ba653770c4f0 100644 --- a/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh @@ -68,7 +68,6 @@ launch_pd() { --block-size ${BLOCK_SIZE} \ --enforce-eager \ --gpu-memory-utilization 0.5 \ - --disable-log-requests \ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" @@ -87,7 +86,6 @@ launch_pd() { --block-size ${BLOCK_SIZE} \ --enforce-eager \ --gpu-memory-utilization 0.5 \ - --disable-log-requests \ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" echo ${PREFILL_BASE_CMD} diff --git a/tests/v1/sample/test_logprobs_e2e.py b/tests/v1/sample/test_logprobs_e2e.py index 50b14a15dc164..7f41355ff7ce4 100644 --- a/tests/v1/sample/test_logprobs_e2e.py +++ b/tests/v1/sample/test_logprobs_e2e.py @@ -15,7 +15,7 @@ EXPECTED_VALUE = 0.62 MODEL = "meta-llama/Llama-3.2-1B-Instruct" MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False,gpu_memory_utilization=0.8" # noqa: E501 SERVER_ARGS = [ - "--enforce_eager", "--no_enable_prefix_caching", "--disable-log-requests", + "--enforce_eager", "--no_enable_prefix_caching", "--gpu-memory-utilization=0.8" ] NUM_CONCURRENT = 100 diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 3318ae5106377..ce62282c2199f 100644 --- a/vllm/utils/__init__.py +++ 
b/vllm/utils/__init__.py @@ -1673,8 +1673,9 @@ class FlexibleArgumentParser(ArgumentParser): # Special case warning because the warning below won't trigger # if –-disable-log-requests because its value is default. logger.warning_once( - "argument '--disable-log-requests' is deprecated. This " - "will be removed in v0.12.0.") + "argument '--disable-log-requests' is deprecated and " + "replaced with '--enable-log-requests'. This will be " + "removed in v0.12.0.") namespace, args = super().parse_known_args(args, namespace) for action in FlexibleArgumentParser._deprecated: if (hasattr(namespace, dest := action.dest) From 58eee5f2e05b74eb2cb1a3bbda9c04df4805e4cc Mon Sep 17 00:00:00 2001 From: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com> Date: Sat, 2 Aug 2025 12:43:52 +0400 Subject: [PATCH 160/224] [PERF] Use faster way of decode in tokenizer: avoid useless list-to-list conversion (#20000) Signed-off-by: Vadim Gimpelson --- vllm/transformers_utils/tokenizer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 24ddd35abea60..6a31a41980695 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -50,11 +50,12 @@ def decode_tokens( `skip_special_tokens=None` means to use the backend's default settings. 
""" + decode_method = getattr(tokenizer, "_decode", tokenizer.decode) if skip_special_tokens is not None: - return tokenizer.decode(token_ids, - skip_special_tokens=skip_special_tokens) + return decode_method(token_ids, + skip_special_tokens=skip_special_tokens) - return tokenizer.decode(token_ids) + return decode_method(token_ids) def encode_tokens( From 25373b6c6cc2068e3914fa906d3240088f7af157 Mon Sep 17 00:00:00 2001 From: Yuxuan Zhang <2448370773@qq.com> Date: Sat, 2 Aug 2025 16:46:57 +0800 Subject: [PATCH 161/224] for glm-4.1V update (#22000) Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> --- docs/models/supported_models.md | 3 ++- tests/models/registry.py | 11 +++++----- tests/tool_use/test_glm4_moe_tool_parser.py | 2 +- .../model_executor/layers/rotary_embedding.py | 2 +- vllm/model_executor/models/glm4_1v.py | 21 ++++++++++++------- vllm/model_executor/models/registry.py | 1 + 6 files changed, 24 insertions(+), 16 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index bd7a57b436213..c058c20f1ed73 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -591,7 +591,8 @@ See [this page](generative_models.md) for more information on how to use generat | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ | | `GLM4VForCausalLM`^ | GLM-4V | T + I | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `THUDM/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4MoeForCausalLM` | GLM-4.5 | T + IE+ + VE+ | `THUDM/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Glm4MoeForCausalLM` | GLM-4.5 | T + IE+ + VE+ | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Glm4v_moeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V-Air`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index fdc7888c85efb..d88d77cddcca5 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -377,9 +377,10 @@ _MULTIMODAL_EXAMPLE_MODELS = { "GLM4VForCausalLM": _HfExamplesInfo("THUDM/glm-4v-9b", trust_remote_code=True, hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501 - "Glm4vForConditionalGeneration": _HfExamplesInfo("THUDM/GLM-4.1V-9B-Thinking", min_transformers_version="4.53"), # noqa: E501 - "Glm4MoeForCausalLM": _HfExamplesInfo("THUDM/GLM-4.5", - min_transformers_version="4.54", + "Glm4vForConditionalGeneration": _HfExamplesInfo("THUDM/GLM-4.1V-9B-Thinking"), # noqa: E501 + "Glm4MoeForCausalLM": _HfExamplesInfo("zai-org/GLM-4.5", + min_transformers_version="4.54"), # noqa: E501 + "Glm4v_moeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V-Air", is_available_online=False), # noqa: E501 "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m", extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501 @@ -515,8 +516,8 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { is_available_online=False, speculative_model="openbmb/MiniCPM-2B-sft-bf16", tokenizer="openbmb/MiniCPM-2B-sft-bf16"), - "Glm4MoeMTPModel": _HfExamplesInfo("THUDM/GLM-4.5", - speculative_model="THUDM/GLM-4.5", + "Glm4MoeMTPModel": _HfExamplesInfo("zai-org/GLM-4.5", + speculative_model="zai-org/GLM-4.5", min_transformers_version="4.54", is_available_online=False), "MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", diff --git a/tests/tool_use/test_glm4_moe_tool_parser.py b/tests/tool_use/test_glm4_moe_tool_parser.py index 
478f4b9166725..91913c933184e 100644 --- a/tests/tool_use/test_glm4_moe_tool_parser.py +++ b/tests/tool_use/test_glm4_moe_tool_parser.py @@ -12,7 +12,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer pytest.skip("skip glm4_moe parser test", allow_module_level=True) # Use a common model that is likely to be available -MODEL = "THUDM/GLM-4.5" +MODEL = "zai-org/GLM-4.5" @pytest.fixture(scope="module") diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index dddd4d6a71170..24dd86620fe91 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -1096,7 +1096,7 @@ class MRotaryEmbedding(RotaryEmbedding): audio_feature_lengths=audio_feature_lengths, use_audio_in_video=use_audio_in_video, ) - elif "glm4v" in hf_config.model_type: + elif hf_config.model_type in ["glm4v", "glm4v_moe"]: return cls._glm4v_get_input_positions_tensor( input_tokens=input_tokens, hf_config=hf_config, diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 7c9840790fe3e..7983895687a38 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -37,8 +37,7 @@ import torch.nn as nn import torch.nn.functional as F from einops import rearrange from transformers import BatchFeature -from transformers.models.glm4v.configuration_glm4v import (Glm4vConfig, - Glm4vVisionConfig) +from transformers.models.glm4v.configuration_glm4v import Glm4vVisionConfig from transformers.models.glm4v.image_processing_glm4v import ( Glm4vImageProcessor, smart_resize) from transformers.models.glm4v.video_processing_glm4v import ( @@ -801,7 +800,7 @@ class Glm4vVisionTransformer(nn.Module): class Glm4vProcessingInfo(BaseProcessingInfo): def get_hf_config(self): - return self.ctx.get_hf_config(Glm4vConfig) + return self.ctx.get_hf_config() def get_tokenizer(self): return self.ctx.tokenizer @@ -1253,7 +1252,7 @@ class 
Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - config: Glm4vConfig = vllm_config.model_config.hf_config + config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config @@ -1267,12 +1266,18 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, prefix=maybe_prefix(prefix, "visual"), ) + if config.model_type == "glm4v": + architectures = ["Glm4ForCausalLM"] + elif config.model_type == "glm4v_moe": + architectures = ["Glm4MoeForCausalLM"] + else: + architectures = None + self.language_model = init_vllm_registered_model( vllm_config=vllm_config, - prefix=maybe_prefix(prefix, ""), - architectures=["Glm4ForCausalLM"], - hf_config=self.config.get_text_config(), - ) + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + architectures=architectures) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 0c5d87a7dc472..9b6ab52d86805 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -206,6 +206,7 @@ _MULTIMODAL_MODELS = { "Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501 "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"), "Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501 + "Glm4v_moeForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501 "GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"), # noqa: E501 "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"), From b690e34824fd5a5c4054a0c0468ebfb6aa1dd215 Mon Sep 17 00:00:00 2001 From: Chih-Chieh Yang 
<7364402+cyang49@users.noreply.github.com> Date: Sat, 2 Aug 2025 04:59:34 -0400 Subject: [PATCH 162/224] [Model] Mamba2 preallocate SSM output tensor to avoid d2d copy overhead (#21075) Signed-off-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com> Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com> --- tests/kernels/mamba/test_mamba_ssm.py | 74 ++++++++++--------- tests/kernels/mamba/test_mamba_ssm_ssd.py | 23 +++--- .../layers/mamba/mamba_mixer.py | 6 +- .../layers/mamba/mamba_mixer2.py | 54 ++++++++------ .../layers/mamba/ops/mamba_ssm.py | 16 ++-- .../layers/mamba/ops/ssd_chunk_scan.py | 19 ++--- .../layers/mamba/ops/ssd_combined.py | 28 ++++--- vllm/model_executor/models/phi4flash.py | 6 +- vllm/model_executor/models/plamo2.py | 36 +++++---- 9 files changed, 144 insertions(+), 118 deletions(-) diff --git a/tests/kernels/mamba/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py index 8dece26ddb29c..4c32ae81b34c5 100644 --- a/tests/kernels/mamba/test_mamba_ssm.py +++ b/tests/kernels/mamba/test_mamba_ssm.py @@ -365,6 +365,7 @@ def test_selective_state_update(dim, dstate, has_z, itype): batch_size = 1 state = torch.randn(batch_size, dim, dstate, dtype=itype, device=device) x = torch.randn(batch_size, dim, device=device, dtype=itype) + out = torch.empty_like(x) dt = torch.randn(batch_size, dim, device=device, dtype=itype) dt_bias = torch.rand(dim, device=device) - 4.0 A = -torch.rand(dim, dstate, device=device) - 1.0 @@ -373,16 +374,17 @@ def test_selective_state_update(dim, dstate, has_z, itype): D = torch.randn(dim, device=device) z = torch.randn_like(x) if has_z else None state_ref = state.detach().clone() - out = selective_state_update(state, - x, - dt, - A, - B, - C, - D=D, - z=z, - dt_bias=dt_bias, - dt_softplus=True) + selective_state_update(state, + x, + dt, + A, + B, + C, + D=D, + z=z, + dt_bias=dt_bias, + dt_softplus=True, + out=out) out_ref = selective_state_update_ref(state_ref, x, dt, @@ -581,6 +583,7 @@ def 
test_selective_state_update_with_batch_indices(with_padding, dim, dstate, ], dim=0) x = torch.randn(padded_batch_size, dim, device=device, dtype=itype) + out = torch.empty_like(x) dt = torch.randn(padded_batch_size, dim, device=device, dtype=itype) dt_bias = torch.rand(dim, device=device) - 4.0 A = -torch.rand(dim, dstate, device=device) - 1.0 @@ -590,18 +593,19 @@ def test_selective_state_update_with_batch_indices(with_padding, dim, dstate, z = torch.randn_like(x) if has_z else None state_ref = state[state_indices, :].clone() state_before = state.clone() - out = selective_state_update(state, - x, - dt, - A, - B, - C, - D=D, - z=z, - dt_bias=dt_bias, - dt_softplus=True, - state_batch_indices=padded_state_indices, - pad_slot_id=PAD_SLOT_ID) + selective_state_update(state, + x, + dt, + A, + B, + C, + D=D, + z=z, + dt_bias=dt_bias, + dt_softplus=True, + state_batch_indices=padded_state_indices, + pad_slot_id=PAD_SLOT_ID, + out=out) out_ref = selective_state_update_ref(state_ref, x[:batch_size], dt[:batch_size], @@ -665,6 +669,7 @@ def test_selective_state_update_with_heads_with_batch_indices( dtype=torch.int32, device=device) x = torch.randn(batch_size, nheads, headdim, device=device, dtype=itype) + out = torch.empty_like(x) if not tie_hdim: dt = torch.randn(batch_size, nheads, @@ -691,18 +696,19 @@ def test_selective_state_update_with_heads_with_batch_indices( C = torch.randn(batch_size, ngroups, dstate, device=device) z = torch.randn_like(x) if has_z else None state_ref = state[state_indices, :].detach().clone() - out = selective_state_update(state, - x, - dt, - A, - B, - C, - D=D, - z=z, - dt_bias=dt_bias, - dt_softplus=True, - state_batch_indices=state_indices, - pad_slot_id=PAD_SLOT_ID) + selective_state_update(state, + x, + dt, + A, + B, + C, + D=D, + z=z, + dt_bias=dt_bias, + dt_softplus=True, + state_batch_indices=state_indices, + pad_slot_id=PAD_SLOT_ID, + out=out) out_ref = selective_state_update_ref(state_ref, x, dt, diff --git 
a/tests/kernels/mamba/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py index 00c1a2911d7db..67b14a7faa89f 100644 --- a/tests/kernels/mamba/test_mamba_ssm_ssd.py +++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py @@ -212,15 +212,16 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1), A * dt, B, C, chunk_size) - - Y, final_state = mamba_chunk_scan_combined(X, - dt, - A, - B, - C, - chunk_size, - D=None, - return_final_states=True) + Y = torch.empty_like(X) + final_state = mamba_chunk_scan_combined(X, + dt, + A, + B, + C, + chunk_size, + D=None, + return_final_states=True, + out=Y) # just test the last in sequence torch.testing.assert_close(Y[:, -1], Y_min[:, -1], atol=atol, rtol=rtol) @@ -292,7 +293,8 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, _query_start_loc_to_chunk_indices_offsets( cu_seqlens, chunk_size, cu_seqlens[-1]) - Y, new_states = mamba_chunk_scan_combined( + Y = torch.empty_like(X) + new_states = mamba_chunk_scan_combined( X, dt, A, @@ -306,6 +308,7 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, chunk_offsets=chunk_offsets, return_varlen_states=True, initial_states=states, + out=Y, ) # just test the last in sequence diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index 796c8d9375727..60cf3e11885a1 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -220,7 +220,8 @@ class MambaMixer(CustomOp): has_initial_state=attn_metadata.context_lens_tensor > 0, query_start_loc=attn_metadata.query_start_loc) else: - scan_outputs = selective_state_update( + scan_outputs = torch.empty_like(hidden_states.transpose(0, 1)) + selective_state_update( mamba_cache_params.ssm_state, hidden_states.transpose(0, 1), discrete_time_step.transpose(0, 1), @@ -231,7 +232,8 @@ 
class MambaMixer(CustomOp): gate.transpose(0, 1), time_proj_bias, dt_softplus=True, - state_batch_indices=mamba_cache_params.state_indices_tensor) + state_batch_indices=mamba_cache_params.state_indices_tensor, + out=scan_outputs) scan_outputs = scan_outputs.transpose(0, 1) # 4. Final linear projection diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 36edac2375d0e..5ac9a7f9ab3e4 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -541,7 +541,6 @@ class MambaMixer2(MambaBase, CustomOp): # NOTE: V0 put prefill before decode, v1 puts decode before prefill # Separate prefill and decode by splitting varlen input # Split along token dimension - # NOTE: V0 put prefill before decode, v1 puts decode before prefill if envs.VLLM_USE_V1: hidden_states_B_C_d, hidden_states_B_C_p = torch.split( hidden_states_B_C[:num_actual_tokens], @@ -583,7 +582,28 @@ class MambaMixer2(MambaBase, CustomOp): 1] if has_prefill else None) - ssd_output_list = [] + # Preallocate output tensor to avoid memcpy cost for merging prefill + # and decode outputs + preallocated_ssm_out = torch.empty( + [ + num_prefill_tokens + num_decodes, + (self.num_heads // self.tp_size) * self.head_dim + ], + dtype=hidden_states.dtype, + device=hidden_states.device, + ) + if envs.VLLM_USE_V1: + preallocated_ssm_out_d, preallocated_ssm_out_p = torch.split( + preallocated_ssm_out, + [num_decodes, num_prefill_tokens], + dim=0, + ) + else: + preallocated_ssm_out_p, preallocated_ssm_out_d = torch.split( + preallocated_ssm_out, + [num_prefill_tokens, num_decodes], + dim=0, + ) # Process prefill requests if has_prefill: @@ -623,7 +643,8 @@ class MambaMixer2(MambaBase, CustomOp): has_initial_states_p[:num_prefills, None, None, None], ssm_state[state_indices_tensor_p], 0) - scan_output, varlen_state = mamba_chunk_scan_combined( + # NOTE: final output is an in-place update of out tensor + 
varlen_state = mamba_chunk_scan_combined( hidden_states_p.view(1, num_prefill_tokens, self.num_heads // self.tp_size, self.head_dim), @@ -646,15 +667,14 @@ class MambaMixer2(MambaBase, CustomOp): return_final_states=False, dt_softplus=True, dt_limit=(0.0, float("inf")), + out=preallocated_ssm_out_p.view(1, num_prefill_tokens, -1, + self.head_dim), ) # update ssm states # - varlen state is a (num_prefills, nheads, headdim, dstate) tensor ssm_state[state_indices_tensor_p] = varlen_state - # - reshape - ssd_output_list.append(scan_output.view(num_prefill_tokens, -1)) - # Process decode requests if has_decode: # 2. Convolution sequence transformation @@ -684,8 +704,8 @@ class MambaMixer2(MambaBase, CustomOp): # - the hidden is reshaped into (bs, num_heads, head_dim) # - mamba_cache_params.ssm_state's slots will be selected # using state_indices_tensor_d - - hidden_states_d = selective_state_update( + # NOTE: final output is an in-place update of out tensor + selective_state_update( ssm_state, hidden_states_d, dt_d, @@ -697,26 +717,16 @@ class MambaMixer2(MambaBase, CustomOp): dt_bias=dt_bias, dt_softplus=True, state_batch_indices=state_indices_tensor_d, + out=preallocated_ssm_out_d.view(num_decodes, -1, + self.head_dim), ) - if envs.VLLM_USE_V1: - ssd_output_list.insert( - 0, - hidden_states_d.view(-1, (self.num_heads // self.tp_size) * - self.head_dim)) - else: - ssd_output_list.append( - hidden_states_d.view(-1, (self.num_heads // self.tp_size) * - self.head_dim)) - - # Merge prefill and decode outputs before passing to gated MLP - hidden_states = torch.vstack(ssd_output_list) - # 4. gated MLP # GatedRMSNorm internally applying SiLU to the gate # SiLU is applied internally before normalization, unlike standard # norm usage - hidden_states = self.norm(hidden_states, gate[:num_actual_tokens]) + hidden_states = self.norm(preallocated_ssm_out, + gate[:num_actual_tokens]) # 5. 
Final linear projection output[:num_actual_tokens], _ = self.out_proj(hidden_states) diff --git a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py index 3f67fc35afdfc..838290a9f5fb2 100644 --- a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py +++ b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py @@ -205,7 +205,8 @@ def selective_state_update(state, dt_bias=None, dt_softplus=False, state_batch_indices=None, - pad_slot_id=PAD_SLOT_ID): + pad_slot_id=PAD_SLOT_ID, + out=None): """ Argument: state: (batch, dim, dstate) or (batch, nheads, dim, dstate) @@ -223,10 +224,9 @@ def selective_state_update(state, for example: cache_indices = [pad_slot_id, 1, 20, pad_slot_id] in this case, the kernel will not process entries at indices 0 and 3 - Return: - out: (batch, dim) or (batch, nheads, dim) + out: Preallocated ssm output tensor. Assume same shape as x. + In-place updated. """ - has_heads = state.dim() > 3 if state.dim() == 3: state = state.unsqueeze(1) if x.dim() == 2: @@ -245,6 +245,8 @@ def selective_state_update(state, z = z.unsqueeze(1) if dt_bias is not None and dt_bias.dim() == 1: dt_bias = dt_bias.unsqueeze(0) + if out.dim() == 2: + out = out.unsqueeze(1) _, nheads, dim, dstate = state.shape batch = x.shape[0] @@ -264,7 +266,8 @@ def selective_state_update(state, assert dt_bias.shape == (nheads, dim) if state_batch_indices is not None: assert state_batch_indices.shape == (batch, ) - out = torch.empty_like(x) + assert out.shape == x.shape + grid = lambda META: (triton.cdiv(dim, META['BLOCK_SIZE_M']), batch, nheads) z_strides = ((z.stride(0), z.stride(1), z.stride(2)) if z is not None else (0, 0, 0)) @@ -328,9 +331,6 @@ def selective_state_update(state, BLOCK_SIZE_M, num_warps=num_warps, ) - if not has_heads: - out = out.squeeze(1) - return out def selective_scan_fn(u, diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py index 
61eff0c008f60..fc2b3b25fd0a8 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py @@ -454,6 +454,7 @@ def _chunk_scan_fwd( chunk_indices=None, chunk_offsets=None, initial_states=None, + out=None, ): batch, seqlen, nheads, headdim = x.shape _, _, nchunks, chunk_size = dt.shape @@ -483,20 +484,10 @@ def _chunk_scan_fwd( else: chunk_indices, chunk_offsets = None, None - # Allocates output. - out = torch.empty(batch, - seqlen, - nheads, - headdim, - device=x.device, - dtype=x.dtype) + assert out.shape == x.shape + if z is not None: - out_x = torch.empty(batch, - seqlen, - nheads, - headdim, - device=x.device, - dtype=x.dtype) + out_x = torch.empty_like(x) assert out_x.stride() == out.stride() else: out_x = None @@ -579,4 +570,4 @@ def _chunk_scan_fwd( IS_TRITON_22=TRITON_22, HAS_INITSTATES=initial_states is not None, ) - return out, out_x + return out_x diff --git a/vllm/model_executor/layers/mamba/ops/ssd_combined.py b/vllm/model_executor/layers/mamba/ops/ssd_combined.py index b121275e9eb38..ad2853a3d8a8b 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_combined.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_combined.py @@ -36,7 +36,8 @@ def _mamba_chunk_scan_combined_fwd(x, chunk_offsets=None, cu_seqlens=None, dt_softplus=False, - dt_limit=(0.0, float("inf"))): + dt_limit=(0.0, float("inf")), + out=None): batch, seqlen, nheads, headdim = x.shape _, _, ngroups, dstate = B.shape assert nheads % ngroups == 0 @@ -134,7 +135,7 @@ def _mamba_chunk_scan_combined_fwd(x, # - in each (pseudo) chunk, we detect if the previous (pseudo) chunk had # a seq_idx change, in which case we take states information from # init_states. 
- out, out_x = _chunk_scan_fwd( + out_x = _chunk_scan_fwd( CB, x, dt, @@ -147,9 +148,10 @@ def _mamba_chunk_scan_combined_fwd(x, chunk_indices=chunk_indices, chunk_offsets=chunk_offsets, initial_states=initial_states, + out=out, ) if cu_seqlens is None: - return out, out_x, dt, dA_cumsum, states, final_states + return out_x, dt, dA_cumsum, states, final_states else: assert batch == 1, "passing cu_seqlens to get the varlen states is only supported if batch dimension is 1" varlen_states = chunk_state_varlen( @@ -161,7 +163,7 @@ def _mamba_chunk_scan_combined_fwd(x, states.squeeze(0), initial_states=initial_states, ) - return out, out_x, dt, dA_cumsum, states, final_states, varlen_states + return out_x, dt, dA_cumsum, states, final_states, varlen_states def mamba_chunk_scan_combined(x, @@ -180,6 +182,7 @@ def mamba_chunk_scan_combined(x, cu_seqlens=None, dt_softplus=False, dt_limit=(0.0, float("inf")), + out=None, return_final_states=False, return_varlen_states=False): """ @@ -197,15 +200,14 @@ def mamba_chunk_scan_combined(x, seq_idx: (batch, seqlen) cu_seqlens: (num_sequences + 1) or None, only used if return_varlen_states is True dt_softplus: Whether to apply softplus to dt - Return: - out: (batch, seqlen, nheads, headdim) + out: Preallocated output tensor """ if not return_varlen_states: cu_seqlens = None else: assert cu_seqlens is not None, "cu_seqlens must be provided if return_varlen_states is True" - out, out_x, dt_out, dA_cumsum, states, final_states, *rest = _mamba_chunk_scan_combined_fwd( + out_x, dt_out, dA_cumsum, states, final_states, *rest = _mamba_chunk_scan_combined_fwd( x, dt, A, @@ -221,12 +223,14 @@ def mamba_chunk_scan_combined(x, chunk_offsets=chunk_offsets, cu_seqlens=cu_seqlens, dt_softplus=dt_softplus, - dt_limit=dt_limit) + dt_limit=dt_limit, + out=out) if not return_varlen_states: - return out if not return_final_states else (out, final_states) + if not return_final_states: + return + else: + return final_states else: varlen_states = rest[0] 
- return (out, - varlen_states) if not return_final_states else (out, - final_states, + return (varlen_states) if not return_final_states else (final_states, varlen_states) diff --git a/vllm/model_executor/models/phi4flash.py b/vllm/model_executor/models/phi4flash.py index a4ded2b7a3047..1a761d01fc066 100644 --- a/vllm/model_executor/models/phi4flash.py +++ b/vllm/model_executor/models/phi4flash.py @@ -387,7 +387,8 @@ class Phi4Mamba(nn.Module): has_initial_state=attn_metadata.context_lens_tensor > 0, query_start_loc=attn_metadata.query_start_loc) else: - scan_outputs = selective_state_update( + scan_outputs = torch.empty_like(hidden_states.transpose(0, 1)) + selective_state_update( mamba_cache_params.ssm_state, hidden_states.transpose(0, 1), discrete_time_step.transpose(0, 1), @@ -400,7 +401,8 @@ class Phi4Mamba(nn.Module): None if self.yoco_kv else gate.transpose(0, 1), time_proj_bias, dt_softplus=True, - state_batch_indices=mamba_cache_params.state_indices_tensor) + state_batch_indices=mamba_cache_params.state_indices_tensor, + out=scan_outputs) scan_outputs = scan_outputs.transpose(0, 1) # 4. 
Final linear projection diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 9bc577cfe3a3e..8b1df66f02805 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -257,7 +257,21 @@ class Plamo2MambaMixer(nn.Module): query_start_loc_p = (attn_metadata.query_start_loc[:num_prefills + 1] if has_prefill else None) - ssd_output_list = [] + # Preallocate output tensor to avoid memcpy cost for merging prefill + # and decode outputs + preallocated_ssm_out = torch.empty( + [ + num_prefill_tokens + num_decodes, + (self.num_heads // self.tp_size) * self.head_dim + ], + dtype=hidden_states.dtype, + device=hidden_states.device, + ) + preallocated_ssm_out_p, preallocated_ssm_out_d = torch.split( + preallocated_ssm_out, + [num_prefill_tokens, num_decodes], + dim=0, + ) # Process prefill requests if has_prefill: @@ -290,7 +304,7 @@ class Plamo2MambaMixer(nn.Module): initial_states = torch.where( mamba2_metadata.has_initial_states[:, None, None, None], mamba_cache_params.ssm_state[state_indices_tensor_p], 0) - scan_output, varlen_state = mamba_chunk_scan_combined( + varlen_state = mamba_chunk_scan_combined( hidden_states_p.view(1, num_prefill_tokens, self.num_heads // self.tp_size, self.head_dim), @@ -312,15 +326,14 @@ class Plamo2MambaMixer(nn.Module): return_final_states=False, dt_softplus=True, dt_limit=(0.0, float("inf")), + out=preallocated_ssm_out_p.view(1, num_prefill_tokens, -1, + self.head_dim), ) # update ssm states # - varlen state is a (batch, nheads, headdim, dstate) tensor mamba_cache_params.ssm_state[state_indices_tensor_p] = varlen_state - # - reshape - ssd_output_list.append(scan_output.view(num_prefill_tokens, -1)) - # Process decode requests if has_decode: # 2. 
Convolution sequence transformation @@ -349,8 +362,7 @@ class Plamo2MambaMixer(nn.Module): # - the hidden is reshaped into (bs, num_heads, head_dim) # - mamba_cache_params.ssm_state's slots will be selected # using state_indices_tensor_d - - hidden_states_d = selective_state_update( + selective_state_update( mamba_cache_params.ssm_state, hidden_states_d, dt, @@ -362,17 +374,13 @@ class Plamo2MambaMixer(nn.Module): dt_bias=dt_bias, dt_softplus=True, state_batch_indices=state_indices_tensor_d, + out=preallocated_ssm_out_d.view(num_decodes, -1, + self.head_dim), ) assert self.num_heads % self.tp_size == 0 - ssd_output_list.append( - hidden_states_d.view(-1, (self.num_heads // self.tp_size) * - self.head_dim)) - - # Merge prefill and decode outputs before passing to MLP - hidden_states = torch.vstack(ssd_output_list) # 4. Final linear projection - out = self.out_proj(hidden_states) + out = self.out_proj(preallocated_ssm_out) return out From f5d0f4784fdd93f1032f3bb81220af10d7588f5a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 2 Aug 2025 17:20:38 +0800 Subject: [PATCH 163/224] [Frontend] Improve error message for too many mm items (#22114) Signed-off-by: DarkLight1337 --- tests/entrypoints/test_chat_utils.py | 10 ++---- tests/multimodal/test_processing.py | 10 +++--- vllm/entrypoints/chat_utils.py | 27 ++++++-------- vllm/multimodal/processing.py | 54 ++++++++++++++++++---------- vllm/multimodal/profiling.py | 2 +- 5 files changed, 52 insertions(+), 51 deletions(-) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 54daf1a91d645..647f1c7b7f34f 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -579,10 +579,7 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message( warnings.filterwarnings( "ignore", message="coroutine 'async_get_and_parse_image' was never awaited") - with pytest.raises( - ValueError, - match="At most 2 image\\(s\\) may be provided in one 
request\\." - ): + with pytest.raises(ValueError, match="At most"): parse_chat_messages( [{ "role": @@ -622,10 +619,7 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages( warnings.filterwarnings( "ignore", message="coroutine 'async_get_and_parse_image' was never awaited") - with pytest.raises( - ValueError, - match="At most 2 image\\(s\\) may be provided in one request\\." - ): + with pytest.raises(ValueError, match="At most"): parse_chat_messages( [{ "role": diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 659ee9af9ddec..508c773b8aedf 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -3,7 +3,6 @@ from contextlib import nullcontext from typing import Optional, cast -from unittest.mock import MagicMock import numpy as np import pytest @@ -957,15 +956,14 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): ) processor = MULTIMODAL_REGISTRY.create_processor(model_config) - profiler = MultiModalProfiler(processor) + processor._supported_mm_limits = {"image": num_supported} - mock_supported_mm_limits = MagicMock(return_value={"image": num_supported}) - processor.info.get_supported_mm_limits = mock_supported_mm_limits + profiler = MultiModalProfiler(processor) if is_valid: exc_ctx = nullcontext() else: - exc_ctx = pytest.raises(ValueError, match="The model only supports") + exc_ctx = pytest.raises(ValueError, match="At most") with exc_ctx: profiler.get_decoder_dummy_data( @@ -1002,7 +1000,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): if is_valid: exc_ctx = nullcontext() else: - exc_ctx = pytest.raises(ValueError, match=f"passed {num_images} image") + exc_ctx = pytest.raises(ValueError, match="At most") with exc_ctx: processor.apply( diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 6485ed6b148b4..a658d97cc8c5e 100644 --- a/vllm/entrypoints/chat_utils.py +++ 
b/vllm/entrypoints/chat_utils.py @@ -535,9 +535,10 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): return self._model_config @cached_property - def model_cls(self): + def model_cls(self) -> type[SupportsMultiModal]: from vllm.model_executor.model_loader import get_model_cls - return get_model_cls(self.model_config) + model_cls = get_model_cls(self.model_config) + return cast(type[SupportsMultiModal], model_cls) @property def allowed_local_media_path(self): @@ -547,31 +548,23 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): def mm_registry(self): return MULTIMODAL_REGISTRY + @cached_property + def mm_processor(self): + return self.mm_registry.create_processor(self.model_config) + def add(self, modality: ModalityStr, item: _T) -> Optional[str]: """ Add a multi-modal item to the current prompt and returns the placeholder string to use, if any. """ - mm_registry = self.mm_registry - model_config = self.model_config - model_cls = cast(SupportsMultiModal, self.model_cls) - input_modality = modality.replace("_embeds", "") + num_items = len(self._items_by_modality[modality]) + 1 - mm_processor = mm_registry.create_processor(model_config) - allowed_counts = mm_processor.info.get_allowed_mm_limits() - allowed_count = allowed_counts.get(input_modality, 0) - - current_count = len(self._items_by_modality[modality]) + 1 - if current_count > allowed_count: - raise ValueError( - f"At most {allowed_count} {modality}(s) may be provided in " - "one request. 
You can set `--limit-mm-per-prompt` to " - "increase this limit if the model supports it.") + self.mm_processor.validate_num_items(input_modality, num_items) self._items_by_modality[modality].append(item) - return model_cls.get_placeholder_str(modality, current_count) + return self.model_cls.get_placeholder_str(modality, num_items) @abstractmethod def create_parser(self) -> "BaseMultiModalContentParser": diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 78d244a6b4fc8..46240855d12a2 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import json import sys from abc import ABC, abstractmethod from collections import defaultdict @@ -1156,6 +1155,18 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): self.data_parser = self._get_data_parser() + # Avoid unnecessary recomputation + self._supported_mm_limits = self.info.get_supported_mm_limits() + self._allowed_mm_limits = self.info.get_allowed_mm_limits() + + @property + def supported_mm_limits(self): + return self._supported_mm_limits + + @property + def allowed_mm_limits(self): + return self._allowed_mm_limits + def __call__( self, prompt: str, @@ -1176,6 +1187,28 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): """ return MultiModalDataParser() + def validate_num_items( + self, + modality: str, + num_items: int, + ) -> None: + supported_limit = self.supported_mm_limits.get(modality, 0) + allowed_limit = self.allowed_mm_limits.get(modality, 0) + + if supported_limit is None: + supported_limit = allowed_limit + + limit = min(supported_limit, allowed_limit) + + if num_items > limit: + msg = (f"At most {limit} {modality}(s) may be provided in " + "one prompt.") + + if num_items <= supported_limit: + msg += " Set `--limit-mm-per-prompt` to increase this limit." 
+ + raise ValueError(msg) + def _to_mm_items( self, mm_data: MultiModalDataDict, @@ -1188,26 +1221,9 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data]. """ mm_items = self.data_parser.parse_mm_data(mm_data) - supported_mm_limits = self.info.get_supported_mm_limits() - allowed_mm_limits = self.info.get_allowed_mm_limits() for modality, items in mm_items.items(): - supported_limit = supported_mm_limits.get(modality, 0) - allowed_limit = allowed_mm_limits.get(modality, 0) - num_items = len(items) - - if supported_limit is not None and num_items > supported_limit: - raise ValueError( - f"The model only supports at most {supported_limit} " - f"{modality} items, but you passed {num_items} " - f"{modality} items in the same prompt.") - - if num_items > allowed_limit: - raise ValueError( - "You set or defaulted to " - f"'{json.dumps({modality: allowed_limit})}' in " - f"`--limit-mm-per-prompt`, but passed {num_items} " - f"{modality} items in the same prompt.") + self.validate_num_items(modality, len(items)) return mm_items diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index d96803b643ff2..d876887fc155d 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -156,7 +156,7 @@ class MultiModalProfiler(Generic[_I]): return self.processor.dummy_inputs def get_mm_limits(self) -> Mapping[str, int]: - return self.processing_info.get_allowed_mm_limits() + return self.processor.allowed_mm_limits def _get_dummy_mm_inputs( self, From 4abfd8796f37adc8fccc9481f37f20de1bce62e4 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Sat, 2 Aug 2025 14:29:40 +0200 Subject: [PATCH 164/224] [V1] [Hybrid] Validate compatibility of attention backend batch reordering at init time (#21557) Signed-off-by: Thomas Parnell --- vllm/v1/attention/backends/flashinfer.py | 28 +++++------- vllm/v1/attention/backends/mamba_attn.py | 20 +++------ 
vllm/v1/attention/backends/mla/common.py | 22 +++------ vllm/v1/attention/backends/rocm_aiter_fa.py | 3 -- vllm/v1/attention/backends/utils.py | 12 ++--- vllm/v1/worker/cpu_model_runner.py | 34 +++++++++++++- vllm/v1/worker/gpu_model_runner.py | 49 ++++++++++++++------- 7 files changed, 96 insertions(+), 72 deletions(-) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 0aaad02b5b840..3697cb9387a92 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -4,7 +4,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Optional, Union +from typing import ClassVar, Optional, Union import torch from flashinfer import (BatchDecodeWithPagedKVCacheWrapper, @@ -21,17 +21,17 @@ from vllm.logger import init_logger from vllm.utils import cdiv, is_pin_memory_available from vllm.utils.flashinfer import use_trtllm_decode_attention from vllm.v1.attention.backends.flash_attn import use_cascade_attention -from vllm.v1.attention.backends.utils import ( - AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, - get_kv_cache_layout, get_per_layer_parameters, - infer_global_hyperparameters, reorder_batch_to_split_decodes_and_prefills, - split_decodes_and_prefills) +# yapf conflicts with isort for this block +# yapf: disable +from vllm.v1.attention.backends.utils import (AttentionCGSupport, + AttentionMetadataBuilder, + CommonAttentionMetadata, + get_kv_cache_layout, + get_per_layer_parameters, + infer_global_hyperparameters, + split_decodes_and_prefills) from vllm.v1.kv_cache_interface import AttentionSpec -if TYPE_CHECKING: - from vllm.v1.core.sched.output import SchedulerOutput - from vllm.v1.worker.gpu_input_batch import InputBatch - FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024 logger = init_logger(__name__) @@ -179,6 +179,8 @@ class 
FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ AttentionCGSupport.PURE_DECODE_ONLY + reorder_batch_threshold: ClassVar[int] = 1 + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): self.device = device @@ -239,12 +241,6 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): dtype=torch.int32, device=self.device) - def reorder_batch(self, input_batch: InputBatch, - scheduler_output: SchedulerOutput) -> bool: - return reorder_batch_to_split_decodes_and_prefills(input_batch, - scheduler_output, - decode_threshold=1) - def _get_workspace_buffer(self): if self._workspace_buffer is None: self._workspace_buffer = torch.empty( diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py index 8b702e28d67c0..66a8d91db89c2 100644 --- a/vllm/v1/attention/backends/mamba_attn.py +++ b/vllm/v1/attention/backends/mamba_attn.py @@ -2,21 +2,17 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional +from typing import ClassVar, Optional import torch from vllm.attention.backends.abstract import AttentionBackend from vllm.config import VllmConfig -from vllm.v1.attention.backends.utils import ( - AttentionMetadataBuilder, CommonAttentionMetadata, - reorder_batch_to_split_decodes_and_prefills, split_decodes_and_prefills) +from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, + CommonAttentionMetadata, + split_decodes_and_prefills) from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec -if TYPE_CHECKING: - from vllm.v1.core.sched.output import SchedulerOutput - from vllm.v1.worker.gpu_input_batch import InputBatch - def _query_start_loc_to_chunk_indices_offsets(query_start_loc: torch.Tensor, chunk_size: int, @@ -87,6 +83,8 @@ class 
Mamba2AttentionMetadata: class Mamba2AttentionMetadataBuilder( AttentionMetadataBuilder[Mamba2AttentionMetadata]): + reorder_batch_threshold: ClassVar[int] = 1 + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): assert isinstance(kv_cache_spec, MambaSpec) @@ -95,12 +93,6 @@ class Mamba2AttentionMetadataBuilder( assert self.chunk_size is not None, ( "chunk_size needs to be set in the model config for Mamba2 models") - def reorder_batch(self, input_batch: "InputBatch", - scheduler_output: "SchedulerOutput") -> bool: - return reorder_batch_to_split_decodes_and_prefills(input_batch, - scheduler_output, - decode_threshold=1) - def build(self, common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata, diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index d112468f1c91d..badff67656c24 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -190,7 +190,7 @@ return curr_o @ W_O import functools from abc import abstractmethod from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Generic, Optional, TypeVar, Union +from typing import ClassVar, Generic, Optional, TypeVar, Union import torch @@ -210,10 +210,11 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.platforms import current_platform from vllm.utils import cdiv, round_down from vllm.utils.flashinfer import has_nvidia_artifactory -from vllm.v1.attention.backends.utils import ( - AttentionMetadataBuilder, CommonAttentionMetadata, - get_per_layer_parameters, infer_global_hyperparameters, - reorder_batch_to_split_decodes_and_prefills, split_decodes_and_prefills) +from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, + CommonAttentionMetadata, + get_per_layer_parameters, + infer_global_hyperparameters, + split_decodes_and_prefills) from vllm.v1.kv_cache_interface import 
AttentionSpec try: @@ -233,10 +234,6 @@ try: except ImportError: flashinfer_available = False -if TYPE_CHECKING: - from vllm.v1.core.sched.output import SchedulerOutput - from vllm.v1.worker.gpu_input_batch import InputBatch - logger = init_logger(__name__) CUDNN_WORKSPACE_SIZE = 12800 @@ -403,6 +400,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): NOTE: Please read the comment at the top of the file before trying to understand this class """ + reorder_batch_threshold: ClassVar[int] = 1 def __init__(self, kv_cache_spec: AttentionSpec, @@ -559,12 +557,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): prefill.prefill_main = self._fi_prefill_main prefill.prefill_chunks = self._fi_prefill_chunks - def reorder_batch(self, input_batch: "InputBatch", - scheduler_output: "SchedulerOutput") -> bool: - return reorder_batch_to_split_decodes_and_prefills(input_batch, - scheduler_output, - decode_threshold=1) - def _build_decode(self, block_table_tensor: torch.Tensor, seq_lens: torch.Tensor): return MLACommonDecodeMetadata( diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index dd10b7f02730a..abe05174507ff 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -251,9 +251,6 @@ class AiterFlashAttentionMetadataBuilder( self.aot_sliding_window: Optional[tuple[int, int]] = None self.total_tokens: int = 0 - def reorder_batch(self, input_batch, scheduler_output) -> bool: - return False - def build_for_cudagraph_capture( self, common_attn_metadata: CommonAttentionMetadata): self.total_tokens = self.model_config.max_model_len \ diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 0f041573e9d20..6defd211f4cfa 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -167,6 +167,10 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]): # Does this backend/builder support 
CUDA Graphs for attention. attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ AttentionCGSupport.NEVER + # Does this backend/builder reorder the batch? + # If not, set this to None. Otherwise set it to the query + # length that will be pulled into the front of the batch. + reorder_batch_threshold: ClassVar[Optional[int]] = None @abstractmethod def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], @@ -221,14 +225,6 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]): ) -> bool: return False - def reorder_batch(self, input_batch: "InputBatch", - scheduler_output: "SchedulerOutput") -> bool: - """ - This method can reorder the batch if desired by the backend. - :return: Has the batch been reordered (default False). - """ - return False - @functools.lru_cache def get_kv_cache_layout(): diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index 6b2b50a57e1f8..d8f3e0d89a960 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import contextmanager -from typing import Any +from typing import TYPE_CHECKING, Any import torch import torch.nn as nn @@ -9,8 +9,12 @@ import torch.nn as nn from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model +from vllm.v1.attention.backends.cpu_attn import TorchSDPAMetadataBuilderV1 from vllm.v1.worker.gpu_model_runner import GPUModelRunner +if TYPE_CHECKING: + from vllm.v1.core.sched.output import SchedulerOutput + logger = init_logger(__name__) @@ -27,6 +31,34 @@ class CPUModelRunner(GPUModelRunner): self._postprocess_tenosrs() + def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None: + """ + Update the order of requests in the batch based on the attention + backend's needs. 
For example, some attention backends (namely MLA) may + want to separate requests based on if the attention computation will be + compute-bound or memory-bound. + + Args: + scheduler_output: The scheduler output. + """ + # Attention free models have zero kv_cache_goups, however models + # like Mamba are also attention free but use the kv_cache for + # keeping its internal state. This is why we check the number + # of kv_cache groups instead of solely checking + # for self.model_config.is_attention_free. + if len(self.kv_cache_config.kv_cache_groups) == 0: + return + + if len(self.kv_cache_config.kv_cache_groups) > 1: + raise ValueError("Multiple KVCacheGroups is not" + "currently supported with CPU model runner.") + + assert type( + self.attn_metadata_builders[0]) is TorchSDPAMetadataBuilderV1 + + self.attn_metadata_builders[0].reorder_batch(self.input_batch, + scheduler_output) + def _postprocess_tenosrs(self) -> None: # Note: replace device tensors with cpu tensors def replace_tensor(obj: Any, cpu_attr_name: str, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d5a5799efb47c..42cef6c5733d2 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -49,7 +49,8 @@ from vllm.v1.attention.backends.mamba_selectors import get_mamba_attn_backend from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, make_kv_sharing_fast_prefill_attention_metadata, - make_local_attention_virtual_batches) + make_local_attention_virtual_batches, + reorder_batch_to_split_decodes_and_prefills) from vllm.v1.core.encoder_cache_manager import compute_encoder_budget from vllm.v1.kv_cache_interface import (AttentionSpec, ChunkedLocalAttentionSpec, @@ -329,6 +330,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.kv_sharing_fast_prefill_logits_indices = torch.zeros( self.max_num_tokens, dtype=torch.int32, device=self.device) + 
self.reorder_batch_threshold: Optional[int] = None + def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None: """ Update the order of requests in the batch based on the attention @@ -347,20 +350,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): if len(self.kv_cache_config.kv_cache_groups) == 0: return - self.attn_metadata_builders[0].reorder_batch(self.input_batch, - scheduler_output) - - # For models with multiple KV cache groups, the groups should agree on - # the same order of requests. We ensure this by only allowing the first - # group to reorder the batch and asserting that all other groups do not - # reorder the batch. - # TODO(tdoublep): make this more flexible so that any group can - # re-order the batch (not only the first). - # TODO(tdoublep): verify this during engine init instead of at runtime - for i in range(1, len(self.kv_cache_config.kv_cache_groups)): - batch_reordered = self.attn_metadata_builders[i].reorder_batch( - self.input_batch, scheduler_output) - assert not batch_reordered + if self.reorder_batch_threshold is not None: + reorder_batch_to_split_decodes_and_prefills( + self.input_batch, + scheduler_output, + decode_threshold=self.reorder_batch_threshold) # Note: used for model runner override. 
def _init_device_properties(self) -> None: @@ -2654,6 +2648,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.attn_backends.append(attn_backend_i) self.attn_metadata_builders.append(attn_metadata_builder_i) + # Calculate reorder batch threshold (if neeeded) + self.calculate_reorder_batch_threshold() + if len(self.attn_backends) > 0: return @@ -2688,6 +2685,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.attn_metadata_builders.append(attn_metadata_builder) self.is_encoder_only_model = True + def calculate_reorder_batch_threshold(self) -> None: + """ + Check that if any backends reorder batches; that the reordering + is compatible (e.g., decode threshold is the same) + """ + for attn_metadata_builder_i in self.attn_metadata_builders: + # check that if any backends reorder batches; that the reordering + # is compatible (e.g., decode threshold is the same) + reorder_batch_threshold_i = ( + attn_metadata_builder_i.reorder_batch_threshold) + if reorder_batch_threshold_i is not None: + if self.reorder_batch_threshold is not None: + if reorder_batch_threshold_i != \ + self.reorder_batch_threshold: + raise ValueError( + f"Attention backend reorders decodes with " + f"threshold {reorder_batch_threshold_i} but other " + f"backend uses threshold " + f"{self.reorder_batch_threshold}") + else: + self.reorder_batch_threshold = reorder_batch_threshold_i + def may_reinitialize_input_batch(self, kv_cache_config: KVCacheConfig) -> None: """ From 73e1b9b1d4cd478eb9d715b637683c000207de67 Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Sat, 2 Aug 2025 22:49:08 +0800 Subject: [PATCH 165/224] [xpu]support moe models on XPU platform (#21643) Signed-off-by: yan Signed-off-by: Yan Ma --- vllm/model_executor/layers/fused_moe/layer.py | 47 ++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 
e16fc13c945cf..c2039adad99c3 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -327,7 +327,14 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): layer.w13_weight.data = shuffled_w13 layer.w2_weight.data = shuffled_w2 - if current_platform.is_cpu(): + if current_platform.is_xpu(): + import intel_extension_for_pytorch as ipex + layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE( + layer.w13_weight, + layer.w2_weight, + use_prepack=True, + ) + elif current_platform.is_cpu(): if current_platform.get_cpu_architecture() == CpuArchEnum.X86: from vllm.model_executor.layers.fused_moe import cpu_fused_moe dtype = layer.w13_weight.dtype @@ -509,6 +516,44 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): activation, ) + def forward_xpu( + self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: Optional[torch.Tensor] = None, + logical_to_physical_map: Optional[torch.Tensor] = None, + logical_replica_count: Optional[torch.Tensor] = None, + ): + if enable_eplb is not False or expert_load_view is not None or \ + logical_to_physical_map is not None or \ + logical_replica_count is not None: + raise NotImplementedError("Expert load balancing is not supported " + "for XPU.") + assert custom_routing_function is None + return layer.ipex_fusion( + x, + use_grouped_topk, + top_k, + router_logits, + renormalize, + topk_group, + num_expert_group, + ) + def forward_tpu( self, layer: torch.nn.Module, 
From 554df8a6a2ed9007086f64768803ae4c780127bd Mon Sep 17 00:00:00 2001 From: Xiao Date: Sat, 2 Aug 2025 09:03:30 -0700 Subject: [PATCH 166/224] Revert "[compile][startup] Disable C++ compilation of symbolic shapes" (#22122) Signed-off-by: Xiao Liu --- vllm/compilation/decorators.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 0d2c432497c40..1370862d580a5 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -267,15 +267,8 @@ def _support_torch_compile( code.co_filename) return inline_call(parent, func, args, kwargs) - # Disable the C++ compilation of symbolic shape guards. C++-fication - # of symbolic shape guards can improve guard overhead. But, since - # vllm skip guards anyways, setting this flag to False can improve - # compile time. - with torch._dynamo.config.patch("enable_cpp_symbolic_shape_guards", - False), patch.object( - InliningInstructionTranslator, - 'inline_call', - patched_inline_call): + with patch.object(InliningInstructionTranslator, 'inline_call', + patched_inline_call): output = self.compiled_callable(*args, **kwargs) return output From 2ff46b882694cc3eb6cde48f6b9251ccbc5fdb04 Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Sat, 2 Aug 2025 19:42:00 -0700 Subject: [PATCH 167/224] [Misc] Bump ray to 2.48.0 (#22123) Signed-off-by: Rui Qiao --- requirements/cuda.txt | 2 +- requirements/nightly_torch_test.txt | 2 +- requirements/test.in | 2 +- requirements/test.txt | 22 +++++++++++++++------- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 75008dc20df48..fb30e493f80b3 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -5,7 +5,7 @@ numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. 
Req numba == 0.61.2; python_version > '3.9' # Dependencies for NVIDIA GPUs -ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1. +ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1. torch==2.7.1 torchaudio==2.7.1 # These must be updated alongside torch diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index 0a72ddefda79c..7ae5e6f2f409a 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -16,7 +16,7 @@ librosa # required for audio tests vocos # required for minicpmo_26 test peft pqdm -ray[cgraph,default]>=2.43.0, !=2.44.* # Ray Compiled Graph, required by pipeline parallelism tests +ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests sentence-transformers # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests diff --git a/requirements/test.in b/requirements/test.in index 3c5e3c0204bfb..9ecaaae92727f 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -17,7 +17,7 @@ vector_quantize_pytorch # required for minicpmo_26 test vocos # required for minicpmo_26 test peft>=0.15.0 # required for phi-4-mm test pqdm -ray[cgraph,default]>=2.43.0, !=2.44.* # Ray Compiled Graph, required by pipeline parallelism tests +ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests sentence-transformers # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests diff --git a/requirements/test.txt b/requirements/test.txt index d45048aae5809..691420df87c48 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -22,9 +22,7 @@ aiohttp==3.10.11 aiohttp-cors==0.8.1 # via ray aiosignal==1.3.1 - # via - # aiohttp - # ray + # via aiohttp albucore==0.0.16 # via terratorch albumentations==1.4.6 @@ -226,7 +224,6 @@ frozenlist==1.5.0 # via # aiohttp # aiosignal - # ray 
fsspec==2024.9.0 # via # datasets @@ -603,10 +600,18 @@ opencv-python-headless==4.11.0.86 opentelemetry-api==1.35.0 # via # mlflow-skinny + # opentelemetry-exporter-prometheus # opentelemetry-sdk # opentelemetry-semantic-conventions +opentelemetry-exporter-prometheus==0.56b0 + # via ray +opentelemetry-proto==1.36.0 + # via ray opentelemetry-sdk==1.35.0 - # via mlflow-skinny + # via + # mlflow-skinny + # opentelemetry-exporter-prometheus + # ray opentelemetry-semantic-conventions==0.56b0 # via opentelemetry-sdk packaging==24.2 @@ -697,7 +702,9 @@ pqdm==0.2.0 pretrainedmodels==0.7.4 # via segmentation-models-pytorch prometheus-client==0.22.0 - # via ray + # via + # opentelemetry-exporter-prometheus + # ray propcache==0.2.0 # via yarl proto-plus==1.26.1 @@ -707,6 +714,7 @@ protobuf==5.28.3 # google-api-core # googleapis-common-protos # mlflow-skinny + # opentelemetry-proto # proto-plus # ray # tensorboardx @@ -854,7 +862,7 @@ rasterio==1.4.3 # rioxarray # terratorch # torchgeo -ray==2.43.0 +ray==2.48.0 # via -r requirements/test.in redis==5.2.0 # via tensorizer From 337eb23bcca6257a75e2c8677c4698bbff9f4a81 Mon Sep 17 00:00:00 2001 From: jiahanc <173873397+jiahanc@users.noreply.github.com> Date: Sun, 3 Aug 2025 00:50:34 -0700 Subject: [PATCH 168/224] [Fix] Fix llama4 modelopt weight loading error (#22107) Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com> Co-authored-by: mgoin --- vllm/model_executor/models/mllama4.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 924f10d82b381..e73dc0c2be82e 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -906,11 +906,13 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, def _rename_weight_for_modelopt_checkpoint(self, name: str) -> str: """Rename weights from ModelOpt llama4 fp8 checkpoints to vLLM format.""" - if 
name.startswith("model."): + if name.startswith("model.") or name.startswith( + "language_model.model."): + renamed = name.replace("model.", "language_model.model.", + 1) if name.startswith("model.") else name # Handle expert scale parameters with flat naming if "feed_forward.experts." in name and ("_input_scale" in name or "_weight_scale" in name): - renamed = name.replace("model.", "language_model.model.", 1) # Map checkpoint naming to vLLM's expected naming if "down_proj_input_scale" in renamed: return renamed.replace("down_proj_input_scale", @@ -929,7 +931,6 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, # Handle attention scale parameters elif "self_attn." in name and (".k_scale" in name or ".v_scale" in name): - renamed = name.replace("model.", "language_model.model.", 1) if ".k_proj.k_scale" in renamed: return renamed.replace(".k_proj.k_scale", ".attn.k_scale") elif ".v_proj.v_scale" in renamed: @@ -937,7 +938,7 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, return renamed # Standard model.* to language_model.model.* renaming - return name.replace("model.", "language_model.model.", 1) + return renamed elif name.startswith("lm_head.weight"): return name.replace("lm_head.weight", From 3dddbf1f2545740659a9cb975b7becca2c3dc0e6 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 3 Aug 2025 15:52:14 +0800 Subject: [PATCH 169/224] [Misc] Add tensor schema test coverage for multimodal models (#21754) Signed-off-by: Isotr0py Signed-off-by: Isotr0py <2037008807@qq.com> --- .buildkite/test-pipeline.yaml | 3 +- tests/conftest.py | 2 +- tests/models/multimodal/test_tensor_schema.py | 199 ++++++++++++++++++ tests/models/registry.py | 7 +- vllm/model_executor/models/deepseek_vl2.py | 3 +- vllm/model_executor/models/keye.py | 17 +- .../processors/deepseek_vl2.py | 6 +- 7 files changed, 222 insertions(+), 15 deletions(-) create mode 100644 tests/models/multimodal/test_tensor_schema.py diff --git a/.buildkite/test-pipeline.yaml 
b/.buildkite/test-pipeline.yaml index cc1223d4c4653..88e1197d703a4 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -581,7 +581,8 @@ steps: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip freeze | grep -E 'torch' - pytest -v -s models/multimodal/processing - - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model + - pytest -v -s --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/test_tensor_schema.py models/multimodal -m core_model + - pytest -v -s models/multimodal/test_tensor_schema.py -m core_model # Needs mp_method="spawn" - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - label: Multi-Modal Models Test (Extended) 1 diff --git a/tests/conftest.py b/tests/conftest.py index 67f0e7424038c..3f3790cab8d35 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -775,7 +775,7 @@ class VllmRunner: tokenizer_mode: str = "auto", trust_remote_code: bool = True, seed: Optional[int] = 0, - max_model_len: int = 1024, + max_model_len: Optional[int] = 1024, dtype: str = "auto", disable_log_stats: bool = True, tensor_parallel_size: int = 1, diff --git a/tests/models/multimodal/test_tensor_schema.py b/tests/models/multimodal/test_tensor_schema.py new file mode 100644 index 0000000000000..bdc62b1d2682d --- /dev/null +++ b/tests/models/multimodal/test_tensor_schema.py @@ -0,0 +1,199 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from functools import partial +from typing import Any +from unittest.mock import patch + +import pytest +from transformers import PretrainedConfig + +from vllm.config import ModelConfig +from vllm.engine.llm_engine import LLMEngine as V0LLMEngine +from vllm.inputs import InputProcessingContext +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from 
vllm.multimodal.processing import BaseMultiModalProcessor +from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config +from vllm.utils import GiB_bytes, set_default_torch_num_threads +from vllm.v1.core.kv_cache_utils import get_kv_cache_config +from vllm.v1.engine.core import EngineCore as V1EngineCore + +from ...conftest import VllmRunner +from ..registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS + +ARCH_TO_SKIP = { + "MolmoForCausalLM": "incompatible requirements", + "MiniMaxVL01ForConditionalGeneration": "broken model", +} + + +def create_batched_mm_kwargs( + model_config: ModelConfig, + processor: BaseMultiModalProcessor, +) -> MultiModalKwargs: + processing_info = processor.info + dummy_inputs = processor.dummy_inputs + supported_mm_limits = processing_info.get_supported_mm_limits() + mm_counts = { + modality: 3 if limit is None else limit + for modality, limit in supported_mm_limits.items() + } + processor_inputs = dummy_inputs.get_dummy_processor_inputs( + seq_len=model_config.max_model_len, + mm_counts=mm_counts, + ) + mm_kwargs = processor.apply( + prompt=processor_inputs.prompt, + mm_data=processor_inputs.mm_data, + hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, + tokenization_kwargs=processor_inputs.tokenization_kwargs, + )["mm_kwargs"] + mm_kwargs = MultiModalKwargs.batch([mm_kwargs]) + return mm_kwargs + + +# Avoid OOM and reduce initialization time by only using 1 layer +def hf_overrides(hf_config: PretrainedConfig, + exist_overrides: dict[str, Any]) -> PretrainedConfig: + hf_config.update(exist_overrides) + text_config = hf_config.get_text_config() + # Ensure at least 2 expert per group + # Since `grouped_topk` assumes top-2 + n_group = getattr(text_config, 'n_group', None) + num_experts = n_group * 2 if n_group is not None else 2 + # we use three layers for Gemma-3n to check + # both normal layer and kv_shared_layer + text_config.update({ + "num_layers": 1, + "num_hidden_layers": 1, + "num_experts": 
num_experts, + "num_experts_per_tok": 2, + "num_local_experts": num_experts, + # Otherwise there will not be any expert layers + "first_k_dense_replace": 0, + # To avoid OOM on DeepSeek-V3 + "n_routed_experts": num_experts, + # For Gemma-3n + "num_kv_shared_layers": 1, + }) + if hasattr(hf_config, "vision_config"): + hf_config.vision_config.update({ + "num_layers": 1, + "num_hidden_layers": 1, + }) + # e.g.: ibm-granite/granite-speech-3.3-2b + if hasattr(hf_config, "encoder_config"): + hf_config.encoder_config.update({ + "num_layers": 1, + "num_hidden_layers": 1, + }) + # e.g.: Qwen/Qwen2-Audio-7B-Instruct + if hasattr(hf_config, "audio_config"): + hf_config.audio_config.update({ + "num_layers": 1, + "num_hidden_layers": 1, + "encoder_layers": 1, + }) + return hf_config + + +@pytest.mark.core_model +@pytest.mark.parametrize("model_arch", list(_MULTIMODAL_EXAMPLE_MODELS.keys())) +def test_model_tensor_schema(model_arch: str, vllm_runner: type[VllmRunner], + monkeypatch): + if model_arch in ARCH_TO_SKIP: + pytest.skip(f"Skipping {model_arch} due to {ARCH_TO_SKIP[model_arch]}") + + model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) + model_info.check_available_online(on_fail="skip") + + model_id = model_info.default + + hf_overrides_fn = partial(hf_overrides, + exist_overrides=model_info.hf_overrides) + + model_config = ModelConfig( + model_id, + tokenizer=model_info.tokenizer or model_id, + tokenizer_mode=model_info.tokenizer_mode, + revision=model_info.revision, + trust_remote_code=model_info.trust_remote_code, + hf_overrides=model_info.hf_overrides, + ) + model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) + factories = MULTIMODAL_REGISTRY._processor_factories[model_cls] + + if not any( + hasattr(model_cls, f"_parse_and_validate_{m}_input") + for m in ["image", "video", "audio"]): + pytest.skip(f"{model_arch} does not support tensor schema validation.") + + ctx = InputProcessingContext( + model_config, + 
tokenizer=cached_tokenizer_from_config(model_config), + ) + processing_info = factories.info(ctx) + supported_mm_limits = processing_info.get_supported_mm_limits() + limit_mm_per_prompt = { + modality: 3 if limit is None else limit + for modality, limit in supported_mm_limits.items() + } + + # Avoid calling model.forward() + def _initialize_kv_caches_v0(self) -> None: + self.cache_config.num_gpu_blocks = 0 + self.cache_config.num_cpu_blocks = 0 + + def _initialize_kv_caches_v1(self, vllm_config): + kv_cache_specs = self.model_executor.get_kv_cache_specs() + scheduler_kv_cache_config = get_kv_cache_config( + vllm_config, + kv_cache_specs[0], + 10 * GiB_bytes, + ) + + # gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config + return 1, 0, scheduler_kv_cache_config + + with (patch.object(V0LLMEngine, "_initialize_kv_caches", + _initialize_kv_caches_v0), + patch.object(V1EngineCore, "_initialize_kv_caches", + _initialize_kv_caches_v1), monkeypatch.context() as m): + m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + if model_info.v0_only: + m.setenv("VLLM_USE_V1", "0") + + with ( + set_default_torch_num_threads(1), + vllm_runner( + model_id, + tokenizer_name=model_info.tokenizer, + tokenizer_mode=model_info.tokenizer_mode, + revision=model_info.revision, + trust_remote_code=model_info.trust_remote_code, + max_model_len=model_info.max_model_len, + load_format="dummy", + hf_overrides=hf_overrides_fn, + limit_mm_per_prompt=limit_mm_per_prompt, + enforce_eager=True, + ) as vllm_model, + ): + model_config = vllm_model.llm.llm_engine.model_config + llm_engine = vllm_model.llm.llm_engine + + if hasattr(llm_engine, "processor"): + # v1 processor + mm_registry = llm_engine.processor.mm_registry + else: + # v0 input_preprocessor + mm_registry = llm_engine.input_preprocessor.mm_registry + + processor = mm_registry.create_processor(model_config) + mm_kwargs = create_batched_mm_kwargs(model_config, processor) + + def validate_model_input(model): + for modality in ("audio", 
"image", "video"): + method_name = f"_parse_and_validate_{modality}_input" + if hasattr(model, method_name): + getattr(model, method_name)(**mm_kwargs) + + vllm_model.apply_model(validate_model_input) diff --git a/tests/models/registry.py b/tests/models/registry.py index d88d77cddcca5..8fc870cf85642 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -383,6 +383,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Glm4v_moeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V-Air", is_available_online=False), # noqa: E501 "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m", + trust_remote_code=True, extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501 max_transformers_version="4.48", # noqa: E501 transformers_version_reason="HF model is not compatible."), # noqa: E501 @@ -432,6 +433,9 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True), "Llama_Nemotron_Nano_VL" : _HfExamplesInfo("nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1", # noqa: E501 trust_remote_code=True), + "Ovis": _HfExamplesInfo("AIDC-AI/Ovis2-1B", trust_remote_code=True, + extras={"1.6-llama": "AIDC-AI/Ovis1.6-Llama3.2-3B", + "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}), # noqa: E501 "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224", # noqa: E501 extras={"v2": "google/paligemma2-3b-ft-docci-448"}), # noqa: E501 "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct", @@ -439,9 +443,6 @@ _MULTIMODAL_EXAMPLE_MODELS = { max_transformers_version="4.48", transformers_version_reason="Use of deprecated imports which have been removed.", # noqa: E501 extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"}), # noqa: E501 - "Ovis": _HfExamplesInfo("AIDC-AI/Ovis2-1B", trust_remote_code=True, - extras={"1.6-llama": "AIDC-AI/Ovis1.6-Llama3.2-3B", - "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}), # noqa: E501 "Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct", trust_remote_code=True), "Phi4MultimodalForCausalLM": 
_HfExamplesInfo("microsoft/Phi-4-multimodal-instruct", # noqa: E501 diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 531018625478b..e0acca75d9dd6 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -51,13 +51,14 @@ class DeepseekVL2ImagePixelInputs(TensorSchema): """ Dimensions: - bn: Batch size * number of images + - p: Number of patches - c: Number of channels (3) - h: Height of each image - w: Width of each image """ type: Literal["pixel_values"] data: Annotated[Union[torch.Tensor, list[torch.Tensor]], - TensorShape("bn", 3, "h", "w")] + TensorShape("bn", "p", 3, "h", "w", dynamic_dims={"p"})] images_spatial_crop: Annotated[torch.Tensor, TensorShape("bn", 2)] diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 4d8aa8de0f0b1..40c66c2268507 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -104,13 +104,16 @@ def smart_resize( class KeyeImagePixelInputs(TensorSchema): """ Dimensions: + - b: Batch size - np: Number of patches - - cps: Number of channels * patch_size * patch_size + - c: Number of channels + - ps: Patch size - ni: Number of images - g: Grid dimensions (3 for t, h, w) """ type: Literal["pixel_values"] - pixel_values: Annotated[torch.Tensor, TensorShape("np", "cps")] + pixel_values: Annotated[torch.Tensor, + TensorShape("b", "np", 3, "ps", "ps")] image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)] @@ -134,14 +137,16 @@ KeyeImageInputs = Union[KeyeImagePixelInputs, KeyeImageEmbeddingInputs] class KeyeVideoPixelInputs(TensorSchema): """ Dimensions: + - b: Batch size - np: Number of patches - - ctps: Number of channels * temporal_patch_size * patch_size * - patch_size - - nv: Number of videos + - c: Number of channels + - ps: Patch size + - ni: Number of images - g: Grid dimensions (3 for t, h, w) """ type: Literal["pixel_values_videos"] - 
pixel_values_videos: Annotated[torch.Tensor, TensorShape("np", "ctps")] + pixel_values_videos: Annotated[torch.Tensor, + TensorShape("b", "np", 3, "ps", "ps")] video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", 3)] diff --git a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py index b4669d12fa213..5896bde312657 100644 --- a/vllm/transformers_utils/processors/deepseek_vl2.py +++ b/vllm/transformers_utils/processors/deepseek_vl2.py @@ -256,7 +256,7 @@ class DeepseekVLV2Processor(ProcessorMixin): def __call__( self, *, - prompt: str, + text: str, images: list[Image.Image], inference_mode: bool = True, **kwargs, @@ -264,7 +264,7 @@ class DeepseekVLV2Processor(ProcessorMixin): """ Args: - prompt (str): the formatted prompt; + text (str): the formatted prompt; images (list[ImageType]): the list of images; inference_mode (bool): if True, then remove the last eos token; **kwargs: @@ -278,7 +278,7 @@ class DeepseekVLV2Processor(ProcessorMixin): """ prepare = self.process_one( - prompt=prompt, + prompt=text, images=images, inference_mode=inference_mode, ) From 3f36c325fa6cd086ab3dea40866f8ab0d7f8ef6e Mon Sep 17 00:00:00 2001 From: "Ye (Charlotte) Qi" Date: Sun, 3 Aug 2025 00:52:38 -0700 Subject: [PATCH 170/224] [Benchmark] Support ready check timeout in `vllm bench serve` (#21696) Signed-off-by: Ye (Charlotte) Qi Co-authored-by: Roger Wang --- vllm/benchmarks/latency.py | 4 +- vllm/benchmarks/lib/__init__.py | 3 + .../{ => lib}/endpoint_request_func.py | 0 vllm/benchmarks/lib/ready_checker.py | 70 +++++++++++++++++++ vllm/benchmarks/{ => lib}/utils.py | 0 vllm/benchmarks/serve.py | 24 +++++-- vllm/benchmarks/throughput.py | 4 +- 7 files changed, 94 insertions(+), 11 deletions(-) create mode 100644 vllm/benchmarks/lib/__init__.py rename vllm/benchmarks/{ => lib}/endpoint_request_func.py (100%) create mode 100644 vllm/benchmarks/lib/ready_checker.py rename vllm/benchmarks/{ => lib}/utils.py (100%) diff --git 
a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py index 5c6124db80b4f..cebdf56c45b1b 100644 --- a/vllm/benchmarks/latency.py +++ b/vllm/benchmarks/latency.py @@ -14,8 +14,8 @@ from tqdm import tqdm import vllm.envs as envs from vllm import LLM, SamplingParams -from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format, - write_to_json) +from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, + write_to_json) from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType from vllm.sampling_params import BeamSearchParams diff --git a/vllm/benchmarks/lib/__init__.py b/vllm/benchmarks/lib/__init__.py new file mode 100644 index 0000000000000..005e87af61949 --- /dev/null +++ b/vllm/benchmarks/lib/__init__.py @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Benchmark library utilities.""" diff --git a/vllm/benchmarks/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py similarity index 100% rename from vllm/benchmarks/endpoint_request_func.py rename to vllm/benchmarks/lib/endpoint_request_func.py diff --git a/vllm/benchmarks/lib/ready_checker.py b/vllm/benchmarks/lib/ready_checker.py new file mode 100644 index 0000000000000..a663f85b629d2 --- /dev/null +++ b/vllm/benchmarks/lib/ready_checker.py @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Utilities for checking endpoint readiness.""" + +import asyncio +import time + +import aiohttp +from tqdm.asyncio import tqdm + +from .endpoint_request_func import RequestFuncInput, RequestFuncOutput + + +async def wait_for_endpoint( + request_func, + test_input: RequestFuncInput, + timeout_seconds: int = 600, + retry_interval: int = 5, +) -> RequestFuncOutput: + """ + Wait for an endpoint to become available before starting benchmarks. 
+ + Args: + request_func: The async request function to call + test_input: The RequestFuncInput to test with + timeout_seconds: Maximum time to wait in seconds (default: 10 minutes) + retry_interval: Time between retries in seconds (default: 5 seconds) + + Returns: + RequestFuncOutput: The successful response + + Raises: + ValueError: If the endpoint doesn't become available within the timeout + """ + deadline = time.perf_counter() + timeout_seconds + output = RequestFuncOutput(success=False) + print(f"Waiting for endpoint to become up in {timeout_seconds} seconds") + + with tqdm( + total=timeout_seconds, + bar_format="{desc} |{bar}| {elapsed} elapsed, {remaining} remaining", + unit="s", + ) as pbar: + + while True: + # update progress bar + remaining = deadline - time.perf_counter() + elapsed = timeout_seconds - remaining + update_amount = min(elapsed - pbar.n, timeout_seconds - pbar.n) + pbar.update(update_amount) + pbar.refresh() + if remaining <= 0: + pbar.close() + break + + # ping the endpoint using request_func + try: + output = await request_func(request_func_input=test_input) + if output.success: + pbar.close() + return output + except aiohttp.ClientConnectorError: + pass + + # retry after a delay + sleep_duration = min(retry_interval, remaining) + if sleep_duration > 0: + await asyncio.sleep(sleep_duration) + + return output diff --git a/vllm/benchmarks/utils.py b/vllm/benchmarks/lib/utils.py similarity index 100% rename from vllm/benchmarks/utils.py rename to vllm/benchmarks/lib/utils.py diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index bd2b1e5990c83..45798547ac719 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -34,12 +34,12 @@ from transformers import PreTrainedTokenizerBase from vllm.benchmarks.datasets import (SampleRequest, add_dataset_parser, get_samples) -from vllm.benchmarks.endpoint_request_func import (ASYNC_REQUEST_FUNCS, - OPENAI_COMPATIBLE_BACKENDS, - RequestFuncInput, - RequestFuncOutput) -from 
vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format, - write_to_json) +from vllm.benchmarks.lib.endpoint_request_func import ( + ASYNC_REQUEST_FUNCS, OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput, + RequestFuncOutput) +from vllm.benchmarks.lib.ready_checker import wait_for_endpoint +from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, + write_to_json) from vllm.transformers_utils.tokenizer import get_tokenizer MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -331,6 +331,7 @@ async def benchmark( ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None, ramp_up_start_rps: Optional[int] = None, ramp_up_end_rps: Optional[int] = None, + ready_check_timeout_sec: int = 600, ): if endpoint_type in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[endpoint_type] @@ -359,7 +360,8 @@ async def benchmark( extra_body=extra_body, ) - test_output = await request_func(request_func_input=test_input) + test_output = await wait_for_endpoint( + request_func, test_input, timeout_seconds=ready_check_timeout_sec) if not test_output.success: raise ValueError( "Initial test run failed - Please make sure benchmark arguments " @@ -907,6 +909,13 @@ def add_cli_args(parser: argparse.ArgumentParser): help="The ending request rate for ramp-up (RPS). 
" "Needs to be specified when --ramp-up-strategy is used.", ) + parser.add_argument( + "--ready-check-timeout-sec", + type=int, + default=600, + help="Maximum time to wait for the endpoint to become ready " + "in seconds (default: 600 seconds / 10 minutes).", + ) def main(args: argparse.Namespace): @@ -1012,6 +1021,7 @@ def main(args: argparse.Namespace): ramp_up_strategy=args.ramp_up_strategy, ramp_up_start_rps=args.ramp_up_start_rps, ramp_up_end_rps=args.ramp_up_end_rps, + ready_check_timeout_sec=args.ready_check_timeout_sec, )) # Save config and results to json diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index 0fe042e2736da..bbd18ca3ae22e 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -21,8 +21,8 @@ from vllm.benchmarks.datasets import (AIMODataset, BurstGPTDataset, InstructCoderDataset, RandomDataset, SampleRequest, ShareGPTDataset, SonnetDataset, VisionArenaDataset) -from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format, - write_to_json) +from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, + write_to_json) from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args) From 789562c28c143201a1d2ca35f7adcdf54ef832e5 Mon Sep 17 00:00:00 2001 From: "Roberto L. 
Castro" <38211239+LopezCastroRoberto@users.noreply.github.com> Date: Sun, 3 Aug 2025 09:54:22 +0200 Subject: [PATCH 171/224] Support CUTLASS NVFP4 (w4a4) for Blackwell Geforce GPUs (SM120) (#21309) Signed-off-by: LopezCastroRoberto --- CMakeLists.txt | 21 +- .../fp4/nvfp4_blockwise_moe_kernel.cu | 6 +- csrc/quantization/fp4/nvfp4_quant_entry.cu | 14 +- csrc/quantization/fp4/nvfp4_quant_kernels.cu | 2 +- .../quantization/fp4/nvfp4_scaled_mm_entry.cu | 14 +- .../fp4/nvfp4_scaled_mm_sm120_kernels.cu | 285 ++++++++++++++++++ 6 files changed, 329 insertions(+), 13 deletions(-) create mode 100644 csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index ea56b8451f228..e2cc0ccdef515 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -529,6 +529,25 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() endif() + # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require + # CUDA 12.8 or later + cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) + set(SRCS + "csrc/quantization/fp4/nvfp4_quant_kernels.cu" + "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${FP4_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1") + message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}") + else() + message(STATUS "Not building NVFP4 as no compatible archs were found.") + # clear FP4_ARCHS + set(FP4_ARCHS) + endif() + # FP4 Archs and flags cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) @@ -541,7 +560,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") SRCS "${SRCS}" CUDA_ARCHS "${FP4_ARCHS}") list(APPEND VLLM_EXT_SRC "${SRCS}") - list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM100=1") list(APPEND 
VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1") message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}") else() diff --git a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu index a21ee55b65862..03db5cc196d59 100644 --- a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu +++ b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu @@ -335,7 +335,7 @@ void run_fp4_blockwise_scaled_group_mm( TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM"); } -#if defined ENABLE_NVFP4 && ENABLE_NVFP4 +#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte; constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn; #endif @@ -356,7 +356,7 @@ void cutlass_fp4_group_mm( const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales, const torch::Tensor& alphas, const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets) { -#if defined ENABLE_NVFP4 && ENABLE_NVFP4 +#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 // Input validation CHECK_INPUT(a, FLOAT4_E2M1X2, "a"); CHECK_INPUT(b, FLOAT4_E2M1X2, "b"); @@ -398,7 +398,7 @@ void cutlass_fp4_group_mm( TORCH_CHECK_NOT_IMPLEMENTED( false, "No compiled cutlass_fp4_group_mm kernel, vLLM must " - "be compiled with ENABLE_NVFP4 for SM100+ and CUDA " + "be compiled with ENABLE_NVFP4_SM100 for SM100+ and CUDA " "12.8 or above."); #endif } diff --git a/csrc/quantization/fp4/nvfp4_quant_entry.cu b/csrc/quantization/fp4/nvfp4_quant_entry.cu index badbb7e310df0..1b61bd4519fc3 100644 --- a/csrc/quantization/fp4/nvfp4_quant_entry.cu +++ b/csrc/quantization/fp4/nvfp4_quant_entry.cu @@ -16,14 +16,15 @@ #include -#if defined ENABLE_NVFP4 && ENABLE_NVFP4 -void scaled_fp4_quant_sm100a(torch::Tensor const& output, +#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ + (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120) +void scaled_fp4_quant_sm1xxa(torch::Tensor const& 
output, torch::Tensor const& input, torch::Tensor const& output_sf, torch::Tensor const& input_sf); #endif -#if defined ENABLE_NVFP4 && ENABLE_NVFP4 +#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 void scaled_fp4_experts_quant_sm100a( torch::Tensor& output, torch::Tensor& output_scale, torch::Tensor const& input, torch::Tensor const& input_global_scale, @@ -33,8 +34,9 @@ void scaled_fp4_experts_quant_sm100a( void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input, torch::Tensor& output_sf, torch::Tensor const& input_sf) { -#if defined ENABLE_NVFP4 && ENABLE_NVFP4 - return scaled_fp4_quant_sm100a(output, input, output_sf, input_sf); +#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ + (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120) + return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf); #endif TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization kernel"); } @@ -44,7 +46,7 @@ void scaled_fp4_experts_quant( torch::Tensor const& input, torch::Tensor const& input_global_scale, torch::Tensor const& input_offset_by_experts, torch::Tensor const& output_scale_offset_by_experts) { -#if defined ENABLE_NVFP4 && ENABLE_NVFP4 +#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 return scaled_fp4_experts_quant_sm100a( output, output_scale, input, input_global_scale, input_offset_by_experts, output_scale_offset_by_experts); diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu index d32911357a953..4e080de151648 100644 --- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu +++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu @@ -332,7 +332,7 @@ template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input, int multiProcessorCount, cudaStream_t stream); -void scaled_fp4_quant_sm100a(torch::Tensor const& output, +void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, torch::Tensor const& input, torch::Tensor const& output_sf, torch::Tensor const& input_sf) 
{ diff --git a/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu b/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu index 61b75e92dfaa0..9cba2828aac2e 100644 --- a/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu +++ b/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu @@ -16,7 +16,7 @@ #include -#if defined ENABLE_NVFP4 && ENABLE_NVFP4 +#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A, torch::Tensor const& B, torch::Tensor const& A_sf, @@ -24,12 +24,22 @@ void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A, torch::Tensor const& alpha); #endif +#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120 +void cutlass_scaled_fp4_mm_sm120a(torch::Tensor& D, torch::Tensor const& A, + torch::Tensor const& B, + torch::Tensor const& A_sf, + torch::Tensor const& B_sf, + torch::Tensor const& alpha); +#endif + void cutlass_scaled_fp4_mm(torch::Tensor& D, torch::Tensor const& A, torch::Tensor const& B, torch::Tensor const& A_sf, torch::Tensor const& B_sf, torch::Tensor const& alpha) { -#if defined ENABLE_NVFP4 && ENABLE_NVFP4 +#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 return cutlass_scaled_fp4_mm_sm100a(D, A, B, A_sf, B_sf, alpha); +#elif defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120 + return cutlass_scaled_fp4_mm_sm120a(D, A, B, A_sf, B_sf, alpha); #endif TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 mm kernel, vLLM should " diff --git a/csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu b/csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu new file mode 100644 index 0000000000000..89de23b76e65d --- /dev/null +++ b/csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include "cutlass_extensions/common.hpp" + +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" + +#include "cutlass/util/packed_stride.hpp" + +#include "core/math.hpp" + +using namespace cute; + +#define CHECK_TYPE(x, st, m) \ + TORCH_CHECK(x.scalar_type() == st, ": Inconsistency of Tensor type:", m) +#define CHECK_TH_CUDA(x, m) \ + TORCH_CHECK(x.is_cuda(), m, ": must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x, m) \ + TORCH_CHECK(x.is_contiguous(), m, ": must be contiguous") +#define CHECK_INPUT(x, st, m) \ + CHECK_TH_CUDA(x, m); \ + CHECK_CONTIGUOUS(x, m); \ + CHECK_TYPE(x, st, m) + +constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte; +constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn; + +struct sm120_fp4_config_M256 { + using ClusterShape = Shape<_1, _1, _1>; + using MmaTileShape = Shape<_128, _128, _128>; + using PerSmTileShape_MNK = Shape<_128, _128, _128>; +}; + +struct sm120_fp4_config_default { + using ClusterShape = Shape<_1, _1, _1>; + using MmaTileShape = Shape<_256, _128, _128>; + using PerSmTileShape_MNK = Shape<_256, _128, _128>; +}; + +template +struct Fp4GemmSm120 { + using ElementA = cutlass::nv_float4_t; + using LayoutATag = cutlass::layout::RowMajor; + static constexpr int AlignmentA = 32; + + using ElementB = cutlass::nv_float4_t; + using LayoutBTag = 
cutlass::layout::ColumnMajor; + static constexpr int AlignmentB = 32; + + using ElementD = OutType; + using ElementC = OutType; + using LayoutCTag = cutlass::layout::RowMajor; + using LayoutDTag = cutlass::layout::RowMajor; + static constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; + static constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; + + using ElementAccumulator = float; + using ArchTag = cutlass::arch::Sm120; + using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; + + using MmaTileShape = typename Config::MmaTileShape; + using ClusterShape = typename Config::ClusterShape; + using PerSmTileShape_MNK = typename Config::PerSmTileShape_MNK; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, OperatorClass, PerSmTileShape_MNK, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, + ElementAccumulator, ElementC, LayoutCTag, AlignmentC, ElementD, + LayoutDTag, AlignmentD, + cutlass::epilogue::collective::EpilogueScheduleAuto>::CollectiveOp; + + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, ElementA, LayoutATag, AlignmentA, ElementB, + LayoutBTag, AlignmentB, ElementAccumulator, MmaTileShape, + ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + cutlass::gemm::collective::KernelScheduleAuto>::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, CollectiveMainloop, CollectiveEpilogue, void>; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; +}; + +template +typename Gemm::Arguments args_from_options(at::Tensor& D, at::Tensor const& A, + at::Tensor const& B, + at::Tensor const& A_sf, + at::Tensor const& B_sf, + torch::Tensor const& alpha, int M, + int N, int K) { + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementD = typename 
Gemm::ElementD; + using ElementSFA = cutlass::float_ue4m3_t; + using ElementSFB = cutlass::float_ue4m3_t; + using ElementCompute = float; + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using Sm1xxBlkScaledConfig = + typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig; + + auto stride_A = cutlass::make_cute_packed_stride(StrideA{}, {M, K, 1}); + auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {N, K, 1}); + auto stride_D = cutlass::make_cute_packed_stride(StrideD{}, {M, N, 1}); + + auto layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA( + cute::make_shape(M, N, K, 1)); + auto layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB( + cute::make_shape(M, N, K, 1)); + + typename Gemm::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + {M, N, K, 1}, + {static_cast(A.data_ptr()), stride_A, + static_cast(B.data_ptr()), stride_B, + static_cast(A_sf.data_ptr()), layout_SFA, + static_cast(B_sf.data_ptr()), layout_SFB}, + {{}, + static_cast(D.data_ptr()), + stride_D, + static_cast(D.data_ptr()), + stride_D}}; + auto& fusion_args = arguments.epilogue.thread; + fusion_args.alpha_ptr = static_cast(alpha.data_ptr()); + + return arguments; +} + +template +void runGemm(at::Tensor& D, at::Tensor const& A, at::Tensor const& B, + at::Tensor const& A_sf, at::Tensor const& B_sf, + torch::Tensor const& alpha, int M, int N, int K, + cudaStream_t stream) { + Gemm gemm; + + auto arguments = args_from_options(D, A, B, A_sf, B_sf, alpha, M, N, K); + + size_t workspace_size = Gemm::get_workspace_size(arguments); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(A.device()); + auto workspace = torch::empty(workspace_size, workspace_options); + + CUTLASS_CHECK(gemm.can_implement(arguments)); + + CUTLASS_CHECK(gemm.initialize(arguments, 
workspace.data_ptr(), stream)); + + CUTLASS_CHECK(gemm.run(arguments, workspace.data_ptr(), stream)); +} + +void cutlass_fp4_bf16_gemm_dispatch(torch::Tensor& D, torch::Tensor const& A, + torch::Tensor const& B, + torch::Tensor const& A_sf, + torch::Tensor const& B_sf, + torch::Tensor const& alpha, int m, int n, + int k, cudaStream_t stream) { + uint32_t const mp2 = std::max(static_cast(16), next_pow_2(m)); + if (mp2 <= 256) { + runGemm::Gemm>( + D, A, B, A_sf, B_sf, alpha, m, n, k, stream); + } else { + runGemm::Gemm>( + D, A, B, A_sf, B_sf, alpha, m, n, k, stream); + } +} + +void cutlass_fp4_f16_gemm_dispatch(torch::Tensor& D, torch::Tensor const& A, + torch::Tensor const& B, + torch::Tensor const& A_sf, + torch::Tensor const& B_sf, + torch::Tensor const& alpha, int m, int n, + int k, cudaStream_t stream) { + uint32_t const mp2 = std::max(static_cast(16), next_pow_2(m)); + if (mp2 <= 256) { + runGemm::Gemm>( + D, A, B, A_sf, B_sf, alpha, m, n, k, stream); + } else { + runGemm::Gemm>( + D, A, B, A_sf, B_sf, alpha, m, n, k, stream); + } +} + +void cutlass_scaled_fp4_mm_sm120a(torch::Tensor& D, torch::Tensor const& A, + torch::Tensor const& B, + torch::Tensor const& A_sf, + torch::Tensor const& B_sf, + torch::Tensor const& alpha) { +#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) + CHECK_INPUT(A, FLOAT4_E2M1X2, "a"); + CHECK_INPUT(B, FLOAT4_E2M1X2, "b"); + + CHECK_INPUT(A_sf, SF_DTYPE, "scale_a"); + CHECK_INPUT(B_sf, SF_DTYPE, "scale_b"); + + CHECK_INPUT(alpha, at::ScalarType::Float, "alpha"); + + TORCH_CHECK(A.dim() == 2, "a must be a matrix"); + TORCH_CHECK(B.dim() == 2, "b must be a matrix"); + TORCH_CHECK(A.sizes()[1] == B.sizes()[1], + "a and b shapes cannot be multiplied (", A.sizes()[0], "x", + A.sizes()[1], " and ", B.sizes()[0], "x", B.sizes()[1], ")"); + + auto const m = A.sizes()[0]; + auto const n = B.sizes()[0]; + auto const k = A.sizes()[1] * 2; + + constexpr int alignment = 32; + TORCH_CHECK(k % alignment == 0, "Expected k to be divisible by ", 
alignment, + ", but got a shape: (", A.sizes()[0], "x", A.sizes()[1], + "), k: ", k, "."); + TORCH_CHECK(n % alignment == 0, "Expected n to be divisible by ", alignment, + ", but got b shape: (", B.sizes()[0], "x", B.sizes()[1], ")."); + + auto round_up = [](int x, int y) { return (x + y - 1) / y * y; }; + int rounded_m = round_up(m, 128); + int rounded_n = round_up(n, 128); + // Since k is divisible by 32 (alignment), k / 16 is guaranteed to be an + // integer. + int rounded_k = round_up(k / 16, 4); + + TORCH_CHECK(A_sf.dim() == 2, "scale_a must be a matrix"); + TORCH_CHECK(B_sf.dim() == 2, "scale_b must be a matrix"); + TORCH_CHECK(A_sf.sizes()[1] == B_sf.sizes()[1], + "scale_a and scale_b shapes cannot be multiplied (", + A_sf.sizes()[0], "x", A_sf.sizes()[1], " and ", B_sf.sizes()[0], + "x", B_sf.sizes()[1], ")"); + TORCH_CHECK(A_sf.sizes()[0] == rounded_m && A_sf.sizes()[1] == rounded_k, + "scale_a must be padded and swizzled to a shape (", rounded_m, + "x", rounded_k, "), but got a shape (", A_sf.sizes()[0], "x", + A_sf.sizes()[1], ")"); + TORCH_CHECK(B_sf.sizes()[0] == rounded_n && B_sf.sizes()[1] == rounded_k, + "scale_b must be padded and swizzled to a shape (", rounded_n, + "x", rounded_k, "), but got a shape (", B_sf.sizes()[0], "x", + B_sf.sizes()[1], ")"); + + auto out_dtype = D.dtype(); + const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(A.get_device()); + + if (out_dtype == at::ScalarType::BFloat16) { + return cutlass_fp4_bf16_gemm_dispatch(D, A, B, A_sf, B_sf, alpha, m, n, k, + stream); + } else if (out_dtype == at::ScalarType::Half) { + return cutlass_fp4_f16_gemm_dispatch(D, A, B, A_sf, B_sf, alpha, m, n, k, + stream); + } else { + TORCH_CHECK(false, "Unsupported output data type of nvfp4 mm sm120 (", + out_dtype, ")"); + } +#else + TORCH_CHECK(false, + "Unsupported CUTLASS version. 
Set VLLM_CUTLASS_SRC_DIR to " + "a CUTLASS 3.8 source directory to enable support."); +#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) +} \ No newline at end of file From 7de45db9a5b95073c3f99eec75ae510d347d625f Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Sun, 3 Aug 2025 15:55:20 +0800 Subject: [PATCH 172/224] [Misc] update doc comment for send (#22026) Signed-off-by: Andy Xie --- .../device_communicators/base_device_communicator.py | 2 +- vllm/distributed/device_communicators/cuda_communicator.py | 2 +- vllm/distributed/parallel_state.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py index dc5923cdc5a0d..127a340fc6c6d 100644 --- a/vllm/distributed/device_communicators/base_device_communicator.py +++ b/vllm/distributed/device_communicators/base_device_communicator.py @@ -219,7 +219,7 @@ class DeviceCommunicatorBase: return output_tensor def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None: - """Sends a tensor to the destination rank in a non-blocking way""" + """Sends a tensor to the destination rank in a blocking way""" """NOTE: `dst` is the local rank of the destination rank.""" if dst is None: dst = (self.rank_in_group + 1) % self.world_size diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index e4804691f0f65..4ab8f3d938fcf 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ b/vllm/distributed/device_communicators/cuda_communicator.py @@ -179,7 +179,7 @@ class CudaCommunicator(DeviceCommunicatorBase): return output.movedim(0, dim).contiguous() def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None: - """Sends a tensor to the destination rank in a non-blocking way""" + """Sends a tensor to the destination rank in a blocking way""" """NOTE: `dst` is the local rank 
of the destination rank.""" if dst is None: dst = (self.rank_in_group + 1) % self.world_size diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 1f7a14920c418..ee581124db510 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -782,7 +782,7 @@ class GroupCoordinator: torch.distributed.barrier(group=self.cpu_group) def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None: - """Sends a tensor to the destination rank in a non-blocking way""" + """Sends a tensor to the destination rank in a blocking way""" """NOTE: `dst` is the local rank of the destination rank.""" self.device_communicator.send(tensor, dst) From 24d1dffbeb0d27cf42904153f56e919fb01b5a07 Mon Sep 17 00:00:00 2001 From: H Date: Sun, 3 Aug 2025 03:04:45 -0700 Subject: [PATCH 173/224] [executor] feat: add supports_pp attr to executors (#21786) Signed-off-by: Haibin Lin --- vllm/engine/arg_utils.py | 20 ++++++++++++-------- vllm/executor/executor_base.py | 1 + vllm/v1/executor/multiproc_executor.py | 2 ++ vllm/v1/executor/ray_distributed_executor.py | 2 ++ 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 47b3efa6af726..c94e440e5c845 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1490,14 +1490,18 @@ class EngineArgs: and _warn_or_fallback("Engine in background thread")): return False - if (self.pipeline_parallel_size > 1 - and self.distributed_executor_backend - not in (ParallelConfig.distributed_executor_backend, "ray", - "mp", "external_launcher")): - name = "Pipeline Parallelism without Ray distributed executor " \ - "or multiprocessing executor or external launcher" - _raise_or_fallback(feature_name=name, recommend_to_remove=False) - return False + if self.pipeline_parallel_size > 1: + supports_pp = getattr(self.distributed_executor_backend, + 'supports_pp', False) + if not supports_pp and 
self.distributed_executor_backend not in ( + ParallelConfig.distributed_executor_backend, "ray", "mp", + "external_launcher"): + name = "Pipeline Parallelism without Ray distributed " \ + "executor or multiprocessing executor or external " \ + "launcher" + _raise_or_fallback(feature_name=name, + recommend_to_remove=False) + return False # The platform may be supported on V1, but off by default for now. if not current_platform.default_v1( # noqa: SIM103 diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 97d0d6f08b81e..813232cd19281 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -35,6 +35,7 @@ class ExecutorBase(ABC): """ uses_ray: bool # whether the executor uses Ray for orchestration. + supports_pp: bool = False # whether the executor supports PP def __init__( self, diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index d90051c3224fd..0db3bcd7fb408 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -41,6 +41,8 @@ logger = init_logger(__name__) class MultiprocExecutor(Executor): + supports_pp: bool = True + def _init_executor(self) -> None: # Call self.shutdown at exit to clean up # and ensure workers will be terminated. 
diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py index b86ac048f5206..c05ad1966d611 100644 --- a/vllm/v1/executor/ray_distributed_executor.py +++ b/vllm/v1/executor/ray_distributed_executor.py @@ -43,6 +43,8 @@ class FutureWrapper(Future): class RayDistributedExecutor(RayDistributedExecutorV0, Executor): """Ray distributed executor using Ray Compiled Graphs.""" + supports_pp: bool = True + def _init_executor(self) -> None: super()._init_executor() From aefeea0fde0fbe5871a0799fad583e6ed6fdf903 Mon Sep 17 00:00:00 2001 From: David Ben-David Date: Sun, 3 Aug 2025 14:03:40 +0300 Subject: [PATCH 174/224] [V1] [P/D] Refactor KV Connector Path (#21980) Signed-off-by: David Ben-David Co-authored-by: David Ben-David --- .../unit/test_output_aggreagator.py | 20 +++++- .../unit/test_remote_decode_lifecycle.py | 8 ++- .../unit/test_remote_prefill_lifecycle.py | 8 ++- tests/v1/kv_connector/unit/utils.py | 8 ++- .../kv_transfer/kv_connector/utils.py | 16 +++-- vllm/sequence.py | 13 ++-- vllm/v1/core/sched/scheduler.py | 12 ++-- vllm/v1/outputs.py | 13 ++-- vllm/v1/worker/gpu_model_runner.py | 30 +++------ vllm/v1/worker/gpu_worker.py | 22 ++++--- .../worker/kv_connector_model_runner_mixin.py | 63 ++++++++++++++++--- vllm/v1/worker/tpu_model_runner.py | 9 +-- 12 files changed, 142 insertions(+), 80 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_output_aggreagator.py b/tests/v1/kv_connector/unit/test_output_aggreagator.py index cad73f68e9f15..5d2b27a9eb4da 100644 --- a/tests/v1/kv_connector/unit/test_output_aggreagator.py +++ b/tests/v1/kv_connector/unit/test_output_aggreagator.py @@ -4,7 +4,7 @@ from concurrent.futures import Future from typing import Optional from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator -from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput class DummyModelRunnerOutput(ModelRunnerOutput): @@ -12,8 +12,16 
@@ class DummyModelRunnerOutput(ModelRunnerOutput): def __init__(self, finished_sending: Optional[set[str]] = None, finished_recving: Optional[set[str]] = None): - self.finished_sending = finished_sending - self.finished_recving = finished_recving + self.kv_connector_output = KVConnectorOutput( + finished_sending=finished_sending, + finished_recving=finished_recving, + ) + + def __repr__(self): + return ( + f"DummyModelRunnerOutput(" + f"finished_sending={self.kv_connector_output.finished_sending}," + f"finished_recving={self.kv_connector_output.finished_recving})") def test_aggregate_workers_output(): @@ -27,6 +35,7 @@ def test_aggregate_workers_output(): aggregated = aggregator.aggregate([output1, output2]) assert aggregated is output1 + aggregated = aggregated.kv_connector_output assert aggregated.finished_sending is None assert aggregated.finished_recving is None @@ -38,6 +47,7 @@ def test_aggregate_workers_output(): aggregated = aggregator.aggregate([output1, output2]) assert aggregated is output1 + aggregated = aggregated.kv_connector_output assert aggregated.finished_sending == {'req1'} assert aggregated.finished_recving is None @@ -49,6 +59,7 @@ def test_aggregate_workers_output(): aggregated = aggregator.aggregate([output1, output2]) assert aggregated is output1 + aggregated = aggregated.kv_connector_output assert aggregated.finished_sending is None assert aggregated.finished_recving == {'req2'} @@ -70,6 +81,7 @@ def test_async_aggregate_workers_output(): assert result_future.done() aggregated = result_future.result() assert aggregated is output1 + aggregated = aggregated.kv_connector_output assert aggregated.finished_sending is None assert aggregated.finished_recving is None @@ -87,6 +99,7 @@ def test_async_aggregate_workers_output(): assert result_future.done() aggregated = result_future.result() assert aggregated is output1 + aggregated = aggregated.kv_connector_output assert aggregated.finished_sending == {'req1'} assert aggregated.finished_recving is 
None @@ -104,5 +117,6 @@ def test_async_aggregate_workers_output(): assert result_future.done() aggregated = result_future.result() assert aggregated is output1 + aggregated = aggregated.kv_connector_output assert aggregated.finished_sending is None assert aggregated.finished_recving == {'req2'} diff --git a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py index 12a71d97e8d29..76394a540aacd 100644 --- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy -from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT +from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput from vllm.v1.request import FinishReason, RequestStatus from .utils import (assert_scheduler_empty, create_model_runner_output, @@ -86,7 +86,8 @@ def test_basic_lifecycle(): # (3b): execute_model() model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) - model_runner_output.finished_sending = [request_id] + model_runner_output.kv_connector_output = KVConnectorOutput( + finished_sending=[request_id]) # (3c): update_from_output() scheduler.update_from_output(scheduler_output, model_runner_output) @@ -176,7 +177,8 @@ def test_prefix_cache_lifecycle(): scheduler_output = scheduler.schedule() scheduler.schedule() model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) - model_runner_output.finished_sending = [request_remote.request_id] + model_runner_output.kv_connector_output = KVConnectorOutput( + finished_sending=[request_remote.request_id]) scheduler.update_from_output(scheduler_output, model_runner_output) _ = scheduler.schedule() assert_scheduler_empty(scheduler) diff --git a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py index f89970bf2c807..3d52ea526d96b 100644 
--- a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy -from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT +from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput from vllm.v1.request import FinishReason, RequestStatus from .utils import (assert_scheduler_empty, create_model_runner_output, @@ -72,7 +72,8 @@ def test_basic_lifecycle(): # (2b): forward(): request finishes recv. model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) - model_runner_output.finished_recving = [request_id] + model_runner_output.kv_connector_output = KVConnectorOutput( + finished_recving=[request_id]) # (2c): update_from_output(): engine_core_outputs = scheduler.update_from_output(scheduler_output, @@ -309,7 +310,8 @@ def test_full_block_prompt(): # # STEP (2): Recv. scheduler_output = scheduler.schedule() model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) - model_runner_output.finished_recving = [request_id] + model_runner_output.kv_connector_output = KVConnectorOutput( + finished_recving=[request_id]) scheduler.update_from_output(scheduler_output, model_runner_output) assert len(scheduler.waiting) == 1 assert (request_id in scheduler.finished_recving_kv_req_ids) diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 480a7074cdf4e..291c84d117cb6 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -17,7 +17,7 @@ from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.sched.scheduler import Scheduler from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec) -from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput from vllm.v1.request import Request from vllm.v1.structured_output import 
StructuredOutputManager @@ -188,8 +188,10 @@ def create_model_runner_output( logprobs=None, prompt_logprobs_dict={}, pooler_output=None, - finished_sending=finished_sending, - finished_recving=finished_recving, + kv_connector_output=KVConnectorOutput( + finished_sending=finished_sending, + finished_recving=finished_recving, + ), ) diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index 559c233947ce8..1a11cb6d0189a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -16,7 +16,7 @@ from vllm.config import VllmConfig, get_current_vllm_config from vllm.distributed.kv_transfer.kv_connector.factory import ( KVConnectorFactory) from vllm.logger import init_logger -from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput logger = init_logger(__name__) @@ -129,7 +129,7 @@ class KVOutputAggregator: def aggregate(self, outputs: list[ModelRunnerOutput], output_rank: int = 0) -> ModelRunnerOutput: - # aggregate finished_sending, finished_recving from all workers + # aggregate kv_connector_output from all workers def update_finished_set(req_ids: Optional[set[str]], remaining_count_dict: dict[str, int], @@ -143,6 +143,7 @@ class KVOutputAggregator: finished_sending = set[str]() finished_recving = set[str]() for output in outputs: + output = output.kv_connector_output update_finished_set(output.finished_sending, self._send_remaining_count, finished_sending) update_finished_set(output.finished_recving, @@ -151,13 +152,10 @@ class KVOutputAggregator: # select output of the worker specified by output_rank output = outputs[output_rank] - # set the aggregated finished_sending / finished_recving - # if output.finished_sending/recving is not empty, but the other ranks - # still have unfinished send/recv, we want to set the aggregated - # finished_sending/recving to None until all ranks have 
finished - # send/recv - output.finished_sending = finished_sending if finished_sending else None - output.finished_recving = finished_recving if finished_recving else None + output.kv_connector_output = KVConnectorOutput( + finished_sending=finished_sending or None, + finished_recving=finished_recving or None, + ) return output diff --git a/vllm/sequence.py b/vllm/sequence.py index fe87b52f9df15..6e65a2bd03189 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -10,7 +10,7 @@ from collections.abc import Mapping from collections.abc import Sequence as GenericSequence from dataclasses import dataclass, field from functools import reduce -from typing import Any, Callable, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Optional, Union import msgspec import torch @@ -21,6 +21,10 @@ from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict from vllm.pooling_params import PoolingParams from vllm.sampling_params import RequestOutputKind, SamplingParams +if TYPE_CHECKING: + from vllm.v1.worker.kv_connector_model_runner_mixin import ( + KVConnectorOutput) + VLLM_TOKEN_ID_ARRAY_TYPE = "l" VLLM_INVALID_TOKEN_ID = -1 @@ -1159,14 +1163,11 @@ class IntermediateTensors: states and residuals to be sent to the next stage. This data structure contains the hidden states and residuals for a request. - Each stage also needs to handle its own finished_sending and - finished_recving in case of kv transfer. + Each stage also needs to handle its own kv_connector_output. 
""" tensors: dict[str, torch.Tensor] - # [req_ids] - finished_sending: Optional[set[str]] = None - finished_recving: Optional[set[str]] = None + kv_connector_output: Optional["KVConnectorOutput"] def __init__(self, tensors): # manually define this function, so that diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 446f98034cb8b..49a744cfec69a 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -30,7 +30,7 @@ from vllm.v1.engine import (EngineCoreEventType, EngineCoreOutput, EngineCoreOutputs) from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.metrics.stats import SchedulerStats -from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.spec_decode.metrics import SpecDecodingStats from vllm.v1.structured_output import StructuredOutputManager @@ -884,7 +884,9 @@ class Scheduler(SchedulerInterface): self.waiting.remove_requests(stopped_preempted_reqs) # KV Connector: update state for finished KV Transfers. - self._update_from_kv_xfer_finished(model_runner_output) + if model_runner_output.kv_connector_output: + self._update_from_kv_xfer_finished( + model_runner_output.kv_connector_output) # Create EngineCoreOutputs for all clients that have requests with # outputs in this step. @@ -1128,7 +1130,7 @@ class Scheduler(SchedulerInterface): return True def _update_from_kv_xfer_finished(self, - model_runner_output: ModelRunnerOutput): + kv_connector_output: KVConnectorOutput): """ KV Connector: update the scheduler state based on the output. @@ -1139,9 +1141,9 @@ class Scheduler(SchedulerInterface): scheduler the request during the next step. """ # KV Connector:: update recv and send status from last step. 
- for req_id in (model_runner_output.finished_recving or ()): + for req_id in (kv_connector_output.finished_recving or ()): logger.debug("Finished recving KV transfer for request %s", req_id) self.finished_recving_kv_req_ids.add(req_id) - for req_id in (model_runner_output.finished_sending or ()): + for req_id in (kv_connector_output.finished_sending or ()): logger.debug("Finished sending KV transfer for request %s", req_id) self._free_blocks(self.requests[req_id]) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index f78623f571b2d..7d7cd0c94dd04 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -71,6 +71,13 @@ class SamplerOutput: logprobs_tensors: Optional[LogprobsTensors] +@dataclass +class KVConnectorOutput: + # [req_ids] + finished_sending: Optional[set[str]] = None + finished_recving: Optional[set[str]] = None + + # ModelRunnerOutput is serialized and sent to the scheduler process. # This is expensive for torch.Tensor so prefer to use list instead. @dataclass @@ -104,9 +111,7 @@ class ModelRunnerOutput: # [num_reqs, hidden_size] pooler_output: list[Optional[torch.Tensor]] - # [req_ids] - finished_sending: Optional[set[str]] = None - finished_recving: Optional[set[str]] = None + kv_connector_output: Optional[KVConnectorOutput] = None # req_id -> num_nans_in_logits num_nans_in_logits: Optional[dict[str, int]] = None @@ -119,6 +124,4 @@ EMPTY_MODEL_RUNNER_OUTPUT = ModelRunnerOutput(req_ids=[], logprobs=None, prompt_logprobs_dict={}, pooler_output=[], - finished_sending=None, - finished_recving=None, num_nans_in_logits=None) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 42cef6c5733d2..041687ae28b20 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -69,7 +69,7 @@ from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.spec_decode.ngram_proposer import NgramProposer from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from 
vllm.v1.worker.kv_connector_model_runner_mixin import ( - KVConnectorModelRunnerMixin) + KVConnectorModelRunnerMixin, KVConnectorOutput) from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin from ..sample.logits_processor import LogitsProcessorManager @@ -1423,8 +1423,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): hidden_states: torch.Tensor, num_scheduled_tokens: int, num_scheduled_tokens_np: np.ndarray, - finished_sending: Optional[set[str]], - finished_recving: Optional[set[str]], + kv_connector_output: Optional[KVConnectorOutput], ) -> ModelRunnerOutput: assert self.input_batch.num_reqs ==\ len(self.input_batch.pooling_params), \ @@ -1459,8 +1458,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logprobs=None, prompt_logprobs_dict={}, pooler_output=pooler_output, - finished_sending=finished_sending, - finished_recving=finished_recving, + kv_connector_output=kv_connector_output, ) @torch.inference_mode() @@ -1564,8 +1562,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_tokens=num_input_tokens, num_tokens_across_dp=num_tokens_across_dp, skip_cuda_graphs=skip_cuda_graphs, - ): - self.maybe_setup_kv_connector(scheduler_output) + ), self.maybe_get_kv_connector_output( + scheduler_output) as kv_connector_output: model_output = self.model( input_ids=input_ids, @@ -1578,10 +1576,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ), ) - self.maybe_wait_for_kv_save() - finished_sending, finished_recving = ( - self.get_finished_kv_transfers(scheduler_output)) - if self.use_aux_hidden_state_outputs: hidden_states, aux_hidden_states = model_output else: @@ -1597,20 +1591,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): == "external_launcher" and len(get_pp_group().ranks) > 0 if not get_pp_group().is_last_rank: # For mid-pipeline stages, return the hidden states. 
- if not broadcast_pp_output: - if finished_sending or finished_recving: - hidden_states.finished_sending = finished_sending - hidden_states.finished_recving = finished_recving - return hidden_states assert isinstance(hidden_states, IntermediateTensors) + if not broadcast_pp_output: + hidden_states.kv_connector_output = kv_connector_output + return hidden_states get_pp_group().send_tensor_dict(hidden_states.tensors, all_gather_group=get_tp_group()) logits = None else: if self.input_batch.pooling_params: return self._pool(hidden_states, num_scheduled_tokens, - num_scheduled_tokens_np, finished_sending, - finished_recving) + num_scheduled_tokens_np, kv_connector_output) sample_hidden_states = hidden_states[logits_indices] logits = self.model.compute_logits(sample_hidden_states, None) @@ -1760,8 +1751,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logprobs=logprobs_lists, prompt_logprobs_dict=prompt_logprobs_dict, pooler_output=[], - finished_sending=finished_sending, - finished_recving=finished_recving, + kv_connector_output=kv_connector_output, num_nans_in_logits=num_nans_in_logits, ) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 4bc4ece9a0df4..7fca245c1bef8 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -16,8 +16,7 @@ from vllm.config import VllmConfig from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment, set_custom_all_reduce) -from vllm.distributed.kv_transfer import (ensure_kv_transfer_initialized, - has_kv_transfer_group) +from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized from vllm.distributed.parallel_state import get_pp_group, get_tp_group from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -369,17 +368,20 @@ class Worker(WorkerBase): assert isinstance(output, IntermediateTensors) get_pp_group().send_tensor_dict(output.tensors, all_gather_group=get_tp_group()) - if not 
has_kv_transfer_group(): + + kv_connector_output = output.kv_connector_output + if not kv_connector_output: return None # In case of PP with kv transfer, we need to pass through the - # finished_sending and finished_recving buffers. - new_output = EMPTY_MODEL_RUNNER_OUTPUT - if output.finished_sending or output.finished_recving: - new_output = copy.copy(new_output) - new_output.finished_sending = output.finished_sending - new_output.finished_recving = output.finished_recving - output = new_output + # kv_connector_output + if (not kv_connector_output.finished_sending + and not kv_connector_output.finished_recving): + return EMPTY_MODEL_RUNNER_OUTPUT + + output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT) + output.kv_connector_output = kv_connector_output + return output assert isinstance(output, ModelRunnerOutput) return output diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py index 5a3186058fcfe..343befe176797 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py +++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -4,6 +4,8 @@ Define KV connector functionality mixin for model runners. 
""" import copy +from contextlib import AbstractContextManager, contextmanager, nullcontext +from typing import Generator # noqa: UP035 from typing import TYPE_CHECKING, Optional from vllm.config import VllmConfig @@ -12,7 +14,8 @@ from vllm.distributed.kv_transfer import (get_kv_transfer_group, from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 from vllm.forward_context import get_forward_context, set_forward_context from vllm.logger import init_logger -from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, ModelRunnerOutput +from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput, + ModelRunnerOutput) if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput @@ -53,18 +56,60 @@ class KVConnectorModelRunnerMixin: scheduler_output.finished_req_ids) return None, None - def kv_connector_no_forward(self, scheduler_output: "SchedulerOutput", + @staticmethod + def kv_connector_no_forward(scheduler_output: "SchedulerOutput", vllm_config: VllmConfig) -> ModelRunnerOutput: # KV send/recv even if no work to do. 
+ with set_forward_context( + None, vllm_config + ), KVConnectorModelRunnerMixin._get_kv_connector_output( + scheduler_output, wait_for_save=False) as kv_connector_output: + pass - if not finished_sending and not finished_recving: + if (not kv_connector_output.finished_sending + and not kv_connector_output.finished_recving): return EMPTY_MODEL_RUNNER_OUTPUT output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT) - output.finished_sending = finished_sending - output.finished_recving = finished_recving + output.kv_connector_output = kv_connector_output return output + + @staticmethod + def maybe_get_kv_connector_output( + scheduler_output: "SchedulerOutput" + ) -> AbstractContextManager[Optional[KVConnectorOutput]]: + return KVConnectorModelRunnerMixin._get_kv_connector_output( + scheduler_output) if has_kv_transfer_group() else nullcontext() + + # This context manager must be used within an active forward context. + # It encapsulates the entire KV connector lifecycle within execute_model + @staticmethod + @contextmanager + def _get_kv_connector_output( + scheduler_output: "SchedulerOutput", + wait_for_save: bool = True + ) -> Generator[KVConnectorOutput, None, None]: + output = KVConnectorOutput() + + # Update KVConnector with the KVConnector metadata forward(). + kv_connector = get_kv_transfer_group() + assert isinstance(kv_connector, KVConnectorBase_V1) + assert scheduler_output.kv_connector_metadata is not None + kv_connector.bind_connector_metadata( + scheduler_output.kv_connector_metadata) + + # Background KV cache transfers happen here. + # These transfers are designed to be async and the requests + # involved may be disjoint from the running requests. + # Do this here to save a collective_rpc. 
+ kv_connector.start_load_kv(get_forward_context()) + try: + yield output + finally: + if wait_for_save: + kv_connector.wait_for_save() + + output.finished_sending, output.finished_recving = ( + kv_connector.get_finished(scheduler_output.finished_req_ids)) + + kv_connector.clear_connector_metadata() diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 59cbb0150570b..67cb2f9dd810e 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -51,7 +51,7 @@ from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsLists, from vllm.v1.sample.tpu.metadata import TPUSupportedSamplingMetadata from vllm.v1.sample.tpu.sampler import Sampler as TPUSampler from vllm.v1.worker.kv_connector_model_runner_mixin import ( - KVConnectorModelRunnerMixin) + KVConnectorModelRunnerMixin, KVConnectorOutput) from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin from vllm.v1.worker.tpu_input_batch import CachedRequestState, InputBatch @@ -1175,9 +1175,10 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logprobs=logprobs_lists, prompt_logprobs_dict=prompt_logprobs_dict, pooler_output=[], - finished_sending=finished_sending, - finished_recving=finished_recving, - ) + kv_connector_output=KVConnectorOutput( + finished_sending=finished_sending, + finished_recving=finished_recving, + )) # Check there are no new graphs compiled - all the graphs should be # captured and compiled during warm up. 
From 6d98843b31fb6d12fa682fecf584a5b7a4e98491 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 3 Aug 2025 04:04:21 -0700 Subject: [PATCH 175/224] [Responses API] Disable response store by default (#22137) Signed-off-by: Woosuk Kwon --- .../entrypoints/openai/responses/conftest.py | 12 ++++++--- .../openai/responses/test_image.py | 7 ++++-- vllm/entrypoints/openai/serving_responses.py | 25 ++++++++++++++++--- vllm/envs.py | 12 +++++++++ 4 files changed, 46 insertions(+), 10 deletions(-) diff --git a/tests/v1/entrypoints/openai/responses/conftest.py b/tests/v1/entrypoints/openai/responses/conftest.py index 2dcdda04ecb57..2d677a00b646a 100644 --- a/tests/v1/entrypoints/openai/responses/conftest.py +++ b/tests/v1/entrypoints/openai/responses/conftest.py @@ -21,12 +21,16 @@ def default_server_args(): @pytest.fixture(scope="module") -def server(default_server_args): - with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: +def server_with_store(default_server_args): + with RemoteOpenAIServer( + MODEL_NAME, + default_server_args, + env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"}, + ) as remote_server: yield remote_server @pytest_asyncio.fixture -async def client(server): - async with server.get_async_client() as async_client: +async def client(server_with_store): + async with server_with_store.get_async_client() as async_client: yield async_client diff --git a/tests/v1/entrypoints/openai/responses/test_image.py b/tests/v1/entrypoints/openai/responses/test_image.py index f3bce91e97cdf..c8d09fd39fb13 100644 --- a/tests/v1/entrypoints/openai/responses/test_image.py +++ b/tests/v1/entrypoints/openai/responses/test_image.py @@ -37,8 +37,11 @@ def default_image_server_args(): @pytest.fixture(scope="module") def image_server(default_image_server_args): - with RemoteOpenAIServer(MODEL_NAME, - default_image_server_args) as remote_server: + with RemoteOpenAIServer( + MODEL_NAME, + default_image_server_args, + env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": 
"1"}, + ) as remote_server: yield remote_server diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 64880a3a5377f..5e9401cbd7473 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -11,6 +11,7 @@ import jinja2 from fastapi import Request from openai.types.responses import ResponseOutputMessage, ResponseOutputText +from vllm import envs from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, @@ -89,15 +90,17 @@ class OpenAIServingResponses(OpenAIServing): logger.info("Using default chat sampling params from %s: %s", source, self.default_sampling_params) + # False by default. + self.enable_store = envs.VLLM_ENABLE_RESPONSES_API_STORE # HACK(woosuk): This is a hack. We should use a better store. - # FIXME: This causes a memory leak since we never remove responses - # from the store. + # FIXME: If enable_store=True, this may cause a memory leak since we + # never remove responses from the store. self.response_store: dict[str, ResponsesResponse] = {} self.response_store_lock = asyncio.Lock() # HACK(woosuk): This is a hack. We should use a better store. - # FIXME: This causes a memory leak since we never remove messages - # from the store. + # FIXME: If enable_store=True, this may cause a memory leak since we + # never remove messages from the store. self.msg_store: dict[str, list[ChatCompletionMessageParam]] = {} self.background_tasks: dict[str, asyncio.Task] = {} @@ -118,6 +121,10 @@ class OpenAIServingResponses(OpenAIServing): if self.engine_client.errored: raise self.engine_client.dead_error + # If store is not enabled, return an error. + if request.store and not self.enable_store: + return self._make_store_not_supported_error() + # Handle the previous response ID. 
prev_response_id = request.previous_response_id if prev_response_id is not None: @@ -456,3 +463,13 @@ class OpenAIServingResponses(OpenAIServing): message=f"Response with id '{response_id}' not found.", status_code=HTTPStatus.NOT_FOUND, ) + + def _make_store_not_supported_error(self) -> ErrorResponse: + return self.create_error_response( + err_type="invalid_request_error", + message=("`store=True` (default) is not supported. Please set " + "`store=False` in Responses API or set " + "`VLLM_ENABLE_RESPONSES_API_STORE=1` in the env var when " + "starting the vLLM server."), + status_code=HTTPStatus.BAD_REQUEST, + ) diff --git a/vllm/envs.py b/vllm/envs.py index 2d470c6dccbfd..8d3c7eab471cf 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -151,6 +151,7 @@ if TYPE_CHECKING: VLLM_ENABLE_CUDAGRAPH_GC: bool = False VLLM_LOOPBACK_IP: str = "" VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False + VLLM_ENABLE_RESPONSES_API_STORE: bool = False def get_default_cache_root(): @@ -1056,6 +1057,17 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE": lambda: bool(int(os.getenv(\ "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))), + + # Enables support for the "store" option in the OpenAI Responses API. + # When set to 1, vLLM's OpenAI server will retain the input and output + # messages for those requests in memory. By default, this is disabled (0). + # NOTE/WARNING: + # 1. Messages are kept in memory only (not persisted to disk) and will be + # lost when the vLLM server shuts down. + # 2. Enabling this option will cause a memory leak, as stored messages are + # never removed from memory until the server terminates. 
+ "VLLM_ENABLE_RESPONSES_API_STORE": + lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))), } # --8<-- [end:env-vars-definition] From b5dfb94fa013d4488e6678ae2b0cd08576a12326 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Sun, 3 Aug 2025 20:34:04 +0800 Subject: [PATCH 176/224] [CI/Build][Bugfix] Fix Qwen2.5 tests in CPU CI via fallback silu_and_mul to torch native implementation (#22145) Signed-off-by: jiang1.li --- vllm/model_executor/layers/activation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 1fd96fe405b9a..7ce44174ead6d 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -65,11 +65,13 @@ class SiluAndMul(CustomOp): def __init__(self): super().__init__() - if current_platform.is_cuda_alike() or current_platform.is_cpu(): + if current_platform.is_cuda_alike(): self.op = torch.ops._C.silu_and_mul elif current_platform.is_xpu(): from vllm._ipex_ops import ipex_ops self.op = ipex_ops.silu_and_mul + elif current_platform.is_cpu(): + self._forward_method = self.forward_native def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" From 83f7bbb3180fe7503cfbc4fb49b06200fb64cdf0 Mon Sep 17 00:00:00 2001 From: TankNee Date: Sun, 3 Aug 2025 22:47:55 +0800 Subject: [PATCH 177/224] Add chat doc in quick start (#21213) Co-authored-by: Cyrus Leung --- docs/getting_started/quickstart.md | 37 ++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index 3a93497fab137..f833807666460 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -98,6 +98,43 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` +!!! 
note + The `llm.generate` method does not automatically apply the model's chat template to the input prompt. Therefore, if you are using an Instruct model or Chat model, you should manually apply the corresponding chat template to ensure the expected behavior. Alternatively, you can use the `llm.chat` method and pass a list of messages which have the same format as those passed to OpenAI's `client.chat.completions`: + + ??? code + + ```python + # Using tokenizer to apply chat template + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained("/path/to/chat_model") + messages_list = [ + [{"role": "user", "content": prompt}] + for prompt in prompts + ] + texts = tokenizer.apply_chat_template( + messages_list, + tokenize=False, + add_generation_prompt=True, + ) + + # Generate outputs + outputs = llm.generate(texts, sampling_params) + + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + # Using chat interface. 
+ outputs = llm.chat(messages_list, sampling_params) + for idx, output in enumerate(outputs): + prompt = prompts[idx] + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + ``` + [](){ #quickstart-online } ## OpenAI-Compatible Server From d3c18c9cb0b6c42eab4ed7251adbf68dde4da39a Mon Sep 17 00:00:00 2001 From: Yuxuan Zhang <2448370773@qq.com> Date: Mon, 4 Aug 2025 00:04:54 +0800 Subject: [PATCH 178/224] fuse fp32 for GLM-4.5 e_score_correction_bias (#22143) Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> --- vllm/model_executor/models/glm4_moe.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 6a196fef572de..c702684c6caa1 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -125,9 +125,8 @@ class Glm4MoE(nn.Module): quant_config=None, prefix=f"{prefix}.gate") - # noaux_tc is not set in transformers new config now - self.gate.e_score_correction_bias = (nn.Parameter( - torch.empty(config.n_routed_experts))) + self.gate.e_score_correction_bias = nn.Parameter( + torch.empty(config.n_routed_experts, dtype=torch.float32)) # Load balancing settings. 
vllm_config = get_current_vllm_config() From 6a39ba85fe0f2fff9494b5eccea717c93510c230 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Mon, 4 Aug 2025 03:04:38 +0800 Subject: [PATCH 179/224] [Bugfix] Fix failing multimodal standard test (#22153) Signed-off-by: Isotr0py --- tests/models/multimodal/test_tensor_schema.py | 2 ++ tests/models/registry.py | 12 ++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/models/multimodal/test_tensor_schema.py b/tests/models/multimodal/test_tensor_schema.py index bdc62b1d2682d..f80e8456f02e3 100644 --- a/tests/models/multimodal/test_tensor_schema.py +++ b/tests/models/multimodal/test_tensor_schema.py @@ -105,6 +105,8 @@ def test_model_tensor_schema(model_arch: str, vllm_runner: type[VllmRunner], model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip", + check_max_version=False) model_id = model_info.default diff --git a/tests/models/registry.py b/tests/models/registry.py index 8fc870cf85642..25cfa267d1815 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -80,6 +80,8 @@ class _HfExamplesInfo: self, *, on_fail: Literal["error", "skip"], + check_min_version: bool = True, + check_max_version: bool = True, ) -> None: """ If the installed transformers version does not meet the requirements, @@ -96,9 +98,11 @@ class _HfExamplesInfo: msg = f"`transformers=={current_version}` installed, but `transformers" # Only check the base version for the min/max version, otherwise preview # models cannot be run because `x.yy.0.dev0`<`x.yy.0` - if min_version and Version(cur_base_version) < Version(min_version): + if (check_min_version and min_version + and Version(cur_base_version) < Version(min_version)): msg += f">={min_version}` is required to run this model." 
- elif max_version and Version(cur_base_version) > Version(max_version): + elif (check_max_version and max_version + and Version(cur_base_version) > Version(max_version)): msg += f"<={max_version}` is required to run this model." else: return @@ -185,6 +189,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { min_transformers_version="4.53"), "GlmForCausalLM": _HfExamplesInfo("THUDM/glm-4-9b-chat-hf"), "Glm4ForCausalLM": _HfExamplesInfo("THUDM/GLM-4-9B-0414"), + "Glm4MoeForCausalLM": _HfExamplesInfo("zai-org/GLM-4.5", + min_transformers_version="4.54"), # noqa: E501 "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}), "GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder", @@ -378,8 +384,6 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True, hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501 "Glm4vForConditionalGeneration": _HfExamplesInfo("THUDM/GLM-4.1V-9B-Thinking"), # noqa: E501 - "Glm4MoeForCausalLM": _HfExamplesInfo("zai-org/GLM-4.5", - min_transformers_version="4.54"), # noqa: E501 "Glm4v_moeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V-Air", is_available_online=False), # noqa: E501 "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m", From 6f5478298ddd8e6aa330f171c70811f667b8699b Mon Sep 17 00:00:00 2001 From: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com> Date: Sun, 3 Aug 2025 19:23:32 -0700 Subject: [PATCH 180/224] Use `aiohttp` connection pool for benchmarking (#21981) Signed-off-by: Seiji Eicher --- vllm/benchmarks/lib/endpoint_request_func.py | 469 +++++++++---------- vllm/benchmarks/lib/ready_checker.py | 4 +- vllm/benchmarks/serve.py | 40 +- 3 files changed, 271 insertions(+), 242 deletions(-) diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py index 60ae520db3862..2d64cc115f00f 100644 --- a/vllm/benchmarks/lib/endpoint_request_func.py +++ b/vllm/benchmarks/lib/endpoint_request_func.py @@ -50,6 +50,7 @@ class 
RequestFuncOutput: async def async_request_openai_completions( request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, pbar: Optional[tqdm] = None, ) -> RequestFuncOutput: """The async request function for the OpenAI Completions API. @@ -66,96 +67,94 @@ async def async_request_openai_completions( ("completions", "profile") ), "OpenAI Completions API URL must end with 'completions' or 'profile'." - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: - payload = { - "model": request_func_input.model_name \ - if request_func_input.model_name else request_func_input.model, - "prompt": request_func_input.prompt, - "temperature": 0.0, - "repetition_penalty": 1.0, - "max_tokens": request_func_input.output_len, - "logprobs": request_func_input.logprobs, - "stream": True, - "stream_options": { - "include_usage": True, - }, - } - if request_func_input.ignore_eos: - payload["ignore_eos"] = request_func_input.ignore_eos - if request_func_input.extra_body: - payload.update(request_func_input.extra_body) - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" - } + payload = { + "model": request_func_input.model_name \ + if request_func_input.model_name else request_func_input.model, + "prompt": request_func_input.prompt, + "temperature": 0.0, + "repetition_penalty": 1.0, + "max_tokens": request_func_input.output_len, + "logprobs": request_func_input.logprobs, + "stream": True, + "stream_options": { + "include_usage": True, + }, + } + if request_func_input.ignore_eos: + payload["ignore_eos"] = request_func_input.ignore_eos + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + } - output = RequestFuncOutput() - output.prompt_len = request_func_input.prompt_len + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len - generated_text = "" - st = time.perf_counter() - 
most_recent_timestamp = st - try: - async with session.post(url=api_url, json=payload, - headers=headers) as response: - if response.status == 200: - first_chunk_received = False - async for chunk_bytes in response.content: - chunk_bytes = chunk_bytes.strip() - if not chunk_bytes: - continue - chunk_bytes = chunk_bytes.decode("utf-8") - # NOTE: SSE comments (often used as pings) start with - # a colon. These are not JSON data payload and should - # be skipped. - if chunk_bytes.startswith(":"): - continue + generated_text = "" + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload, + headers=headers) as response: + if response.status == 200: + first_chunk_received = False + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + chunk_bytes = chunk_bytes.decode("utf-8") + # NOTE: SSE comments (often used as pings) start with + # a colon. These are not JSON data payload and should + # be skipped. + if chunk_bytes.startswith(":"): + continue - chunk = chunk_bytes.removeprefix("data: ") + chunk = chunk_bytes.removeprefix("data: ") - if chunk != "[DONE]": - data = json.loads(chunk) + if chunk != "[DONE]": + data = json.loads(chunk) - # NOTE: Some completion API might have a last - # usage summary response without a token so we - # want to check a token was generated - if choices := data.get("choices"): - # Note that text could be empty here - # e.g. for special tokens - text = choices[0].get("text") - timestamp = time.perf_counter() - # First token - if not first_chunk_received: - first_chunk_received = True - ttft = time.perf_counter() - st - output.ttft = ttft + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated + if choices := data.get("choices"): + # Note that text could be empty here + # e.g. 
for special tokens + text = choices[0].get("text") + timestamp = time.perf_counter() + # First token + if not first_chunk_received: + first_chunk_received = True + ttft = time.perf_counter() - st + output.ttft = ttft - # Decoding phase - else: - output.itl.append(timestamp - - most_recent_timestamp) + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) - most_recent_timestamp = timestamp - generated_text += text or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") - if first_chunk_received: - output.success = True - else: - output.success = False - output.error = ( - "Never received a valid chunk to calculate TTFT." - "This response will be marked as failed!") - output.generated_text = generated_text - output.latency = most_recent_timestamp - st + most_recent_timestamp = timestamp + generated_text += text or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") + if first_chunk_received: + output.success = True else: - output.error = response.reason or "" output.success = False - except Exception: - output.success = False - exc_info = sys.exc_info() - output.error = "".join(traceback.format_exception(*exc_info)) + output.error = ( + "Never received a valid chunk to calculate TTFT." 
+ "This response will be marked as failed!") + output.generated_text = generated_text + output.latency = most_recent_timestamp - st + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) if pbar: pbar.update(1) @@ -164,45 +163,158 @@ async def async_request_openai_completions( async def async_request_openai_chat_completions( request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, pbar: Optional[tqdm] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith(("chat/completions", "profile")), ( "OpenAI Chat Completions API URL must end with 'chat/completions'.") - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: - content = [{"type": "text", "text": request_func_input.prompt}] - if request_func_input.multi_modal_content: - content.append(request_func_input.multi_modal_content) - payload = { - "model": - request_func_input.model_name - if request_func_input.model_name else request_func_input.model, - "messages": [ - { - "role": "user", - "content": content - }, - ], - "temperature": - 0.0, - "max_completion_tokens": - request_func_input.output_len, - "stream": - True, - "stream_options": { - "include_usage": True, + content = [{"type": "text", "text": request_func_input.prompt}] + if request_func_input.multi_modal_content: + content.append(request_func_input.multi_modal_content) + payload = { + "model": + request_func_input.model_name + if request_func_input.model_name else request_func_input.model, + "messages": [ + { + "role": "user", + "content": content }, - } - if request_func_input.ignore_eos: - payload["ignore_eos"] = request_func_input.ignore_eos - if request_func_input.extra_body: - payload.update(request_func_input.extra_body) - headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer 
{os.environ.get('OPENAI_API_KEY')}", - } + ], + "temperature": + 0.0, + "max_completion_tokens": + request_func_input.output_len, + "stream": + True, + "stream_options": { + "include_usage": True, + }, + } + if request_func_input.ignore_eos: + payload["ignore_eos"] = request_func_input.ignore_eos + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload, + headers=headers) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + chunk_bytes = chunk_bytes.decode("utf-8") + # NOTE: SSE comments (often used as pings) start with + # a colon. These are not JSON data payload and should + # be skipped. 
+ if chunk_bytes.startswith(":"): + continue + + chunk = chunk_bytes.removeprefix("data: ") + + if chunk != "[DONE]": + timestamp = time.perf_counter() + data = json.loads(chunk) + + if choices := data.get("choices"): + content = choices[0]["delta"].get("content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") + + most_recent_timestamp = timestamp + + output.generated_text = generated_text + output.success = True + output.latency = most_recent_timestamp - st + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_openai_audio( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + # Lazy import without PlaceholderModule to avoid vllm dep. + import soundfile + + api_url = request_func_input.api_url + assert api_url.endswith(("transcriptions", "translations")), ( + "OpenAI Chat Completions API URL must end with 'transcriptions' ") + "or `translations`." 
+ + content = [{"type": "text", "text": request_func_input.prompt}] + payload = { + "model": + request_func_input.model_name + if request_func_input.model_name else request_func_input.model, + "temperature": + 0.0, + "max_completion_tokens": + request_func_input.output_len, + "stream": + True, + "language": + "en", + # Flattened due to multipart/form-data + "stream_include_usage": + True, + "stream_continuous_usage_stats": + True, + } + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + + # Send audio file + def to_bytes(y, sr): + buffer = io.BytesIO() + soundfile.write(buffer, y, sr, format="WAV") + buffer.seek(0) + return buffer + + with to_bytes(*request_func_input.multi_modal_content["audio"]) as f: + form = aiohttp.FormData() + form.add_field("file", f, content_type="audio/wav") + for key, value in payload.items(): + form.add_field(key, str(value)) output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len @@ -212,28 +324,24 @@ async def async_request_openai_chat_completions( st = time.perf_counter() most_recent_timestamp = st try: - async with session.post(url=api_url, json=payload, + async with session.post(url=api_url, + data=form, headers=headers) as response: if response.status == 200: async for chunk_bytes in response.content: chunk_bytes = chunk_bytes.strip() if not chunk_bytes: continue - chunk_bytes = chunk_bytes.decode("utf-8") - # NOTE: SSE comments (often used as pings) start with - # a colon. These are not JSON data payload and should - # be skipped. 
- if chunk_bytes.startswith(":"): - continue - - chunk = chunk_bytes.removeprefix("data: ") + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") if chunk != "[DONE]": timestamp = time.perf_counter() data = json.loads(chunk) if choices := data.get("choices"): - content = choices[0]["delta"].get("content") + content = choices[0]["delta"].get( + "content") # First token if ttft == 0.0: ttft = timestamp - st @@ -241,8 +349,8 @@ async def async_request_openai_chat_completions( # Decoding phase else: - output.itl.append(timestamp - - most_recent_timestamp) + output.itl.append( + timestamp - most_recent_timestamp) generated_text += content or "" elif usage := data.get("usage"): @@ -267,117 +375,6 @@ async def async_request_openai_chat_completions( return output -async def async_request_openai_audio( - request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, -) -> RequestFuncOutput: - # Lazy import without PlaceholderModule to avoid vllm dep. - import soundfile - - api_url = request_func_input.api_url - assert api_url.endswith(("transcriptions", "translations")), ( - "OpenAI Chat Completions API URL must end with 'transcriptions' ") - "or `translations`." 
- - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: - content = [{"type": "text", "text": request_func_input.prompt}] - payload = { - "model": - request_func_input.model_name - if request_func_input.model_name else request_func_input.model, - "temperature": - 0.0, - "max_completion_tokens": - request_func_input.output_len, - "stream": - True, - "language": - "en", - # Flattened due to multipart/form-data - "stream_include_usage": - True, - "stream_continuous_usage_stats": - True, - } - if request_func_input.extra_body: - payload.update(request_func_input.extra_body) - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - } - - # Send audio file - def to_bytes(y, sr): - buffer = io.BytesIO() - soundfile.write(buffer, y, sr, format="WAV") - buffer.seek(0) - return buffer - - with to_bytes(*request_func_input.multi_modal_content["audio"]) as f: - form = aiohttp.FormData() - form.add_field("file", f, content_type="audio/wav") - for key, value in payload.items(): - form.add_field(key, str(value)) - - output = RequestFuncOutput() - output.prompt_len = request_func_input.prompt_len - - generated_text = "" - ttft = 0.0 - st = time.perf_counter() - most_recent_timestamp = st - try: - async with session.post(url=api_url, - data=form, - headers=headers) as response: - if response.status == 200: - async for chunk_bytes in response.content: - chunk_bytes = chunk_bytes.strip() - if not chunk_bytes: - continue - - chunk = chunk_bytes.decode("utf-8").removeprefix( - "data: ") - if chunk != "[DONE]": - timestamp = time.perf_counter() - data = json.loads(chunk) - - if choices := data.get("choices"): - content = choices[0]["delta"].get( - "content") - # First token - if ttft == 0.0: - ttft = timestamp - st - output.ttft = ttft - - # Decoding phase - else: - output.itl.append( - timestamp - most_recent_timestamp) - - generated_text += content or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get( - 
"completion_tokens") - - most_recent_timestamp = timestamp - - output.generated_text = generated_text - output.success = True - output.latency = most_recent_timestamp - st - else: - output.error = response.reason or "" - output.success = False - except Exception: - output.success = False - exc_info = sys.exc_info() - output.error = "".join(traceback.format_exception(*exc_info)) - - if pbar: - pbar.update(1) - return output - - # TODO: Add more request functions for different API protocols. ASYNC_REQUEST_FUNCS = { "vllm": async_request_openai_completions, diff --git a/vllm/benchmarks/lib/ready_checker.py b/vllm/benchmarks/lib/ready_checker.py index a663f85b629d2..7e836158386a9 100644 --- a/vllm/benchmarks/lib/ready_checker.py +++ b/vllm/benchmarks/lib/ready_checker.py @@ -14,6 +14,7 @@ from .endpoint_request_func import RequestFuncInput, RequestFuncOutput async def wait_for_endpoint( request_func, test_input: RequestFuncInput, + session: aiohttp.ClientSession, timeout_seconds: int = 600, retry_interval: int = 5, ) -> RequestFuncOutput: @@ -55,7 +56,8 @@ async def wait_for_endpoint( # ping the endpoint using request_func try: - output = await request_func(request_func_input=test_input) + output = await request_func( + request_func_input=test_input, session=session) if output.success: pbar.close() return output diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 45798547ac719..ca8d218581e77 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -28,6 +28,7 @@ from dataclasses import dataclass from datetime import datetime from typing import Any, Literal, Optional +import aiohttp import numpy as np from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase @@ -338,6 +339,24 @@ async def benchmark( else: raise ValueError(f"Unknown endpoint_type: {endpoint_type}") + # Reuses connections across requests to reduce TLS handshake overhead. 
+ connector = aiohttp.TCPConnector( + limit=max_concurrency or 0, + limit_per_host=max_concurrency or 0, + ttl_dns_cache=300, + use_dns_cache=True, + keepalive_timeout=60, + enable_cleanup_closed=True, + force_close=False, + ssl=("https://" in api_url), + ) + + session = aiohttp.ClientSession( + connector=connector, + trust_env=True, + timeout=aiohttp.ClientTimeout(total=6 * 60 * 60), + ) + print("Starting initial single prompt test run...") test_prompt, test_prompt_len, test_output_len, test_mm_content = ( input_requests[0].prompt, @@ -361,7 +380,11 @@ async def benchmark( ) test_output = await wait_for_endpoint( - request_func, test_input, timeout_seconds=ready_check_timeout_sec) + request_func, + test_input, + session, + timeout_seconds=ready_check_timeout_sec, + ) if not test_output.success: raise ValueError( "Initial test run failed - Please make sure benchmark arguments " @@ -386,7 +409,8 @@ async def benchmark( multi_modal_content=test_mm_content, ignore_eos=ignore_eos, extra_body=extra_body) - profile_output = await request_func(request_func_input=profile_input) + profile_output = await request_func( + request_func_input=profile_input, session=session) if profile_output.success: print("Profiler started") @@ -412,12 +436,14 @@ async def benchmark( semaphore = (asyncio.Semaphore(max_concurrency) if max_concurrency else None) - async def limited_request_func(request_func_input, pbar): + async def limited_request_func(request_func_input, session, pbar): if semaphore is None: return await request_func(request_func_input=request_func_input, + session=session, pbar=pbar) async with semaphore: - return await request_func(request_func_input=request_func_input, + return await request_func(request_func_input=request_func_input, + session=session, pbar=pbar) benchmark_start_time = time.perf_counter() @@ -469,6 +495,7 @@ async def benchmark( tasks.append( asyncio.create_task( limited_request_func(request_func_input=request_func_input, + session=session, pbar=pbar))) 
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) @@ -580,9 +607,12 @@ async def benchmark( output_len=test_output_len, logprobs=logprobs, ) - profile_output = await request_func(request_func_input=profile_input) + profile_output = await request_func( + request_func_input=profile_input, session=session) if profile_output.success: print("Profiler stopped") + + await session.close() return result From e27d25a0dcbb71a0d2e2a27d7e2b606a8df30320 Mon Sep 17 00:00:00 2001 From: "ZiTian.Zhao" Date: Mon, 4 Aug 2025 10:24:02 +0800 Subject: [PATCH 181/224] [fix] fix correct assertion syntax error in attention utils. (#22154) Signed-off-by: zitian.zhao --- vllm/v1/attention/backends/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 6defd211f4cfa..48bd632227c5b 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -97,7 +97,9 @@ def _make_metadata_with_slice( query_start_loc = slice_query_start_locs(attn_metadata.query_start_loc, request_slice) - assert len(query_start_loc >= 2) + assert len(query_start_loc) >= 2, ( + f"query_start_loc must have at least 2 elements, " + f"got {len(query_start_loc)}") query_start_loc_cpu = slice_query_start_locs( attn_metadata.query_start_loc_cpu, request_slice) From 845420ac2c2bc27ae0f96c25430b4f1cd20063cc Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Sun, 3 Aug 2025 19:43:33 -0700 Subject: [PATCH 182/224] [RLHF] Fix torch.dtype not serializable in example (#22158) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- examples/offline_inference/rlhf.py | 5 ++++- examples/offline_inference/rlhf_utils.py | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py index 752117a4e3623..ed974b90b57ee 100644 --- a/examples/offline_inference/rlhf.py +++ 
b/examples/offline_inference/rlhf.py @@ -126,7 +126,10 @@ for name, p in train_model.named_parameters(): # Synchronize the updated weights to the inference engine. for name, p in train_model.named_parameters(): - handle = llm.collective_rpc.remote("update_weight", args=(name, p.dtype, p.shape)) + dtype_name = str(p.dtype).split(".")[-1] + handle = llm.collective_rpc.remote( + "update_weight", args=(name, dtype_name, p.shape) + ) model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream()) ray.get(handle) diff --git a/examples/offline_inference/rlhf_utils.py b/examples/offline_inference/rlhf_utils.py index c445224d75686..d2a8419ffabcd 100644 --- a/examples/offline_inference/rlhf_utils.py +++ b/examples/offline_inference/rlhf_utils.py @@ -45,7 +45,8 @@ class WorkerExtension: self.device, ) - def update_weight(self, name, dtype, shape): + def update_weight(self, name, dtype_name, shape): + dtype = getattr(torch, dtype_name) weight = torch.empty(shape, dtype=dtype, device="cuda") self.model_update_group.broadcast( weight, src=0, stream=torch.cuda.current_stream() From 0d7db16a92afd9fc005ed0fba73356845586f5e7 Mon Sep 17 00:00:00 2001 From: Abirdcfly Date: Mon, 4 Aug 2025 10:57:03 +0800 Subject: [PATCH 183/224] [PD] add test for chat completions endpoint (#21925) Signed-off-by: Abirdcfly --- .../nixl_integration/test_disagg_accuracy.py | 41 ++++++++++++------- .../nixl_integration/toy_proxy_server.py | 2 + 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/tests/v1/kv_connector/nixl_integration/test_disagg_accuracy.py b/tests/v1/kv_connector/nixl_integration/test_disagg_accuracy.py index 00e62f351ce30..697e101c35926 100644 --- a/tests/v1/kv_connector/nixl_integration/test_disagg_accuracy.py +++ b/tests/v1/kv_connector/nixl_integration/test_disagg_accuracy.py @@ -51,20 +51,31 @@ def check_vllm_server(url: str, timeout=5, retries=3) -> bool: return False -def run_simple_prompt(base_url: str, model_name: str, - input_prompt: str) -> str: +def 
run_simple_prompt(base_url: str, model_name: str, input_prompt: str, + use_chat_endpoint: bool) -> str: client = openai.OpenAI(api_key="EMPTY", base_url=base_url) - completion = client.completions.create(model=model_name, - prompt=input_prompt, - max_tokens=MAX_OUTPUT_LEN, - temperature=0.0, - seed=42) + if use_chat_endpoint: + completion = client.chat.completions.create( + model=model_name, + messages=[{ + "role": "user", + "content": [{ + "type": "text", + "text": input_prompt + }] + }], + max_completion_tokens=MAX_OUTPUT_LEN, + temperature=0.0, + seed=42) + return completion.choices[0].message.content + else: + completion = client.completions.create(model=model_name, + prompt=input_prompt, + max_tokens=MAX_OUTPUT_LEN, + temperature=0.0, + seed=42) - # print("-" * 50) - # print(f"Completion results for {model_name}:") - # print(completion) - # print("-" * 50) - return completion.choices[0].text + return completion.choices[0].text def main(): @@ -125,10 +136,12 @@ def main(): f"vllm server: {args.service_url} is not ready yet!") output_strs = dict() - for prompt in SAMPLE_PROMPTS: + for i, prompt in enumerate(SAMPLE_PROMPTS): + use_chat_endpoint = (i % 2 == 1) output_str = run_simple_prompt(base_url=service_url, model_name=args.model_name, - input_prompt=prompt) + input_prompt=prompt, + use_chat_endpoint=use_chat_endpoint) print(f"Prompt: {prompt}, output: {output_str}") output_strs[prompt] = output_str diff --git a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py index 66e237da0f80a..905ae0ea71722 100644 --- a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py +++ b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py @@ -162,6 +162,8 @@ async def send_request_to_service(client_info: dict, endpoint: str, } req_data["stream"] = False req_data["max_tokens"] = 1 + if "max_completion_tokens" in req_data: + req_data["max_completion_tokens"] = 1 if "stream_options" in req_data: del 
req_data["stream_options"] headers = { From c2e75b3c11047eec0f184577ce134879ce993f77 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Mon, 4 Aug 2025 11:03:58 +0800 Subject: [PATCH 184/224] remove duplicate code within cleanup_dist_env_and_memory (#22147) Signed-off-by: Andy Xie --- vllm/distributed/parallel_state.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index ee581124db510..f31e4766bfdad 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1238,8 +1238,6 @@ def destroy_distributed_environment(): def cleanup_dist_env_and_memory(shutdown_ray: bool = False): destroy_model_parallel() destroy_distributed_environment() - with contextlib.suppress(AssertionError): - torch.distributed.destroy_process_group() if shutdown_ray: import ray # Lazy import Ray ray.shutdown() From aa7012eb6db69baab57c80ac596d088eb81e090f Mon Sep 17 00:00:00 2001 From: Giancarlo Delfin <32987265+TheEpicDolphin@users.noreply.github.com> Date: Sun, 3 Aug 2025 22:13:26 -0700 Subject: [PATCH 185/224] Add tree attention backend for v1 (part 1) (#20401) Signed-off-by: Giancarlo Delfin --- tests/v1/attention/test_attention_backends.py | 2 +- tests/v1/attention/utils.py | 6 +- tests/v1/spec_decode/test_eagle.py | 7 +- tests/v1/spec_decode/test_tree_attention.py | 299 ++++++++++++ .../attention/ops/triton_unified_attention.py | 48 ++ vllm/config.py | 13 + vllm/engine/arg_utils.py | 2 +- vllm/platforms/cuda.py | 4 + vllm/platforms/interface.py | 1 + vllm/v1/attention/backends/tree_attn.py | 452 ++++++++++++++++++ vllm/v1/attention/backends/utils.py | 20 + vllm/v1/spec_decode/eagle.py | 269 ++++++++++- 12 files changed, 1098 insertions(+), 25 deletions(-) create mode 100644 tests/v1/spec_decode/test_tree_attention.py create mode 100644 vllm/v1/attention/backends/tree_attn.py diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py index 
f197cbb7bbba0..ac08b9052cd80 100644 --- a/tests/v1/attention/test_attention_backends.py +++ b/tests/v1/attention/test_attention_backends.py @@ -17,7 +17,7 @@ from vllm.v1.kv_cache_interface import FullAttentionSpec BACKENDS_TO_TEST = [ _Backend.FLASH_ATTN_VLLM_V1, _Backend.FLASHINFER_VLLM_V1, - _Backend.FLEX_ATTENTION, _Backend.TRITON_ATTN_VLLM_V1 + _Backend.FLEX_ATTENTION, _Backend.TRITON_ATTN_VLLM_V1, _Backend.TREE_ATTN ] # Remove flashinfer from the list if it's not available diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index be6cfce6fba8a..78a6509986fcd 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -109,11 +109,11 @@ def create_common_attn_metadata( def get_attention_backend(backend_name: _Backend): """Set up attention backend classes for testing. - + Args: backend_name: Name of the backend ("flash_attn", "flashinfer", etc.) vllm_config: VllmConfig instance - + Returns: Tuple of (backend_builder_class, backend_impl_class) """ @@ -126,6 +126,8 @@ def get_attention_backend(backend_name: _Backend): "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend", _Backend.TRITON_ATTN_VLLM_V1: "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend", + _Backend.TREE_ATTN: + "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend", } if backend_name not in backend_map: diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index a126c7c943ed0..05f6dd40a9ea9 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -202,7 +202,9 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method, @pytest.mark.parametrize("num_speculative_tokens", [1, 3, 8]) -def test_propose(num_speculative_tokens): +@pytest.mark.parametrize("backend", + [_Backend.FLASH_ATTN_VLLM_V1, _Backend.TREE_ATTN]) +def test_propose(num_speculative_tokens, backend): # Use GPU device device = torch.device(current_platform.device_type) @@ -301,8 +303,7 
@@ def test_propose(num_speculative_tokens): device=device) sampling_metadata = mock.MagicMock() - attn_metadata_builder_cls, _ = get_attention_backend( - _Backend.FLASH_ATTN_VLLM_V1) + attn_metadata_builder_cls, _ = get_attention_backend(backend) attn_metadata_builder = attn_metadata_builder_cls( kv_cache_spec=create_standard_kv_cache_spec(proposer.vllm_config), layer_names=proposer.attn_layer_names, diff --git a/tests/v1/spec_decode/test_tree_attention.py b/tests/v1/spec_decode/test_tree_attention.py new file mode 100644 index 0000000000000..42468daa62a9a --- /dev/null +++ b/tests/v1/spec_decode/test_tree_attention.py @@ -0,0 +1,299 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import math +from typing import Optional + +import torch + +from tests.v1.attention.utils import (_Backend, create_standard_kv_cache_spec, + create_vllm_config, + get_attention_backend) +from vllm.config import ParallelConfig, SpeculativeConfig +from vllm.v1.attention.backends.utils import CommonAttentionMetadata + + +class MockAttentionLayer(torch.nn.Module): + _q_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda") + _k_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda") + _v_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda") + + def __init__(self): + super().__init__() + + def forward(self, x): + return x + + +def forward_attention( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + kv_cache: torch.Tensor, + block_table: torch.Tensor, + slot_mapping: torch.Tensor, + seqlen_k: int, + backend: _Backend, + spec_token_tree: Optional[str] = None, + num_spec_tokens: int = 0, +) -> torch.Tensor: + batch_size, q_len, num_heads, dim_per_head = q.shape + num_kv_heads = k.shape[-2] + # Initialize the query and KV sequence lengths. 
+ query_start_loc = q_len * torch.arange( + batch_size + 1, device=q.device, dtype=torch.int32) + query_lens = torch.diff(query_start_loc) + seq_lens = torch.full( + (batch_size, ), + seqlen_k, + device=q.device, + dtype=torch.int32, + ) + context_lens = seq_lens - query_lens + max_query_len = q_len + num_actual_tokens = query_start_loc[-1] + + softmax_scale = q.shape[-1]**(-0.5) + layer = MockAttentionLayer() + + # Build common metadata. + model_name = "meta-llama/Meta-Llama-3-8B" + builder_cls, impl_cls = get_attention_backend(backend) + vllm_config = create_vllm_config(model_name=model_name, + max_model_len=max(seq_lens)) + if spec_token_tree is not None: + # Create speculative config if token tree is specified. + vllm_config.speculative_config = SpeculativeConfig( + target_model_config=vllm_config.model_config, + target_parallel_config=ParallelConfig(), + model=model_name, + method="eagle", + num_speculative_tokens=num_spec_tokens, + speculative_token_tree=spec_token_tree) + kv_cache_spec = create_standard_kv_cache_spec(vllm_config) + builder = builder_cls(kv_cache_spec, [], vllm_config, q.device) + common_attn_metadata = CommonAttentionMetadata( + query_start_loc=query_start_loc, + query_start_loc_cpu=query_start_loc.cpu(), + seq_lens=seq_lens, + seq_lens_cpu=seq_lens.cpu(), + num_computed_tokens_cpu=context_lens.cpu(), + num_reqs=batch_size, + num_actual_tokens=num_actual_tokens, + max_query_len=max_query_len, + block_table_tensor=block_table, + slot_mapping=slot_mapping, + ) + + # Build attention metadata. + attn_metadata = builder.build( + common_prefix_len=0, + common_attn_metadata=common_attn_metadata, + ) + + # Initialize the backend implementation. + instance = impl_cls( + num_heads=num_heads, + head_size=dim_per_head, + scale=softmax_scale, + num_kv_heads=num_kv_heads, + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype="auto", + ) + + # Run forward pass and return output. 
+ query = q.view(-1, num_heads, dim_per_head) + key = k.view(-1, num_kv_heads, dim_per_head) + value = v.view(-1, num_kv_heads, dim_per_head) + output = torch.empty_like(query) + return instance.forward( + layer=layer, + query=query, + key=key, + value=value, + kv_cache=kv_cache.clone(), + attn_metadata=attn_metadata, + output=output, + ) + + +def test_tree_attn_correctness() -> None: + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + + device = "cuda" + tree_attn_masks = { + # Chain. + "[(0,), (0, 0), (0, 0, 0)]": + torch.tensor( + [ + [1, 0, 0, 0], + [1, 1, 0, 0], + [1, 1, 1, 0], + [1, 1, 1, 1], + ], + device=device, + dtype=torch.int32, + ), + # Tree. + "[(0,), (1,), (0, 0), (0, 1), (1, 0), (1, 1)]": + torch.tensor( + [ + [1, 0, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0, 0], + [1, 0, 1, 0, 0, 0, 0], + [1, 1, 0, 1, 0, 0, 0], + [1, 1, 0, 0, 1, 0, 0], + [1, 0, 1, 0, 0, 1, 0], + [1, 0, 1, 0, 0, 0, 1], + ], + device=device, + dtype=torch.int32, + ), + } + + dim_per_head = 128 + num_kv_heads = 2 + block_size = 128 + max_sequence_length = 8192 + randomize_blocks = True + for batch_size in [1, 16, 32]: + for num_heads in [2, 4]: + for sequence_position in [16, 1024, 2048]: + for spec_token_tree, tree_attn_mask in tree_attn_masks.items(): + # Assert that the number of heads is divisible + # by the number of KV heads. + assert num_heads % num_kv_heads == 0 + + # Initialize q, k, and v. + tree_size_q = tree_attn_mask.shape[0] + seqlen_k = sequence_position + tree_size_q + q = torch.randn( + (batch_size, tree_size_q, num_heads, dim_per_head), + device=device, + dtype=torch.bfloat16, + ) + k = torch.randn( + (batch_size, tree_size_q, num_kv_heads, dim_per_head), + device=device, + dtype=torch.bfloat16, + ) + v = torch.randn( + (batch_size, tree_size_q, num_kv_heads, dim_per_head), + device=device, + dtype=torch.bfloat16, + ) + + # Setup the block table and KV cache for paged KV. 
+ assert max_sequence_length % block_size == 0 + max_blocks_per_batch = max_sequence_length // block_size + kv_cache = torch.randn( + ( + 2, + batch_size * max_blocks_per_batch, + block_size, + num_kv_heads, + dim_per_head, + ), + device=q.device, + dtype=torch.bfloat16, + ) + num_alloc_blocks_per_batch = math.ceil(seqlen_k / + block_size) + block_table = torch.zeros( + (batch_size, max_blocks_per_batch), + device=q.device, + dtype=torch.int32, + ) + block_ids = torch.arange( + 0, + batch_size * num_alloc_blocks_per_batch, + device=q.device, + dtype=torch.int32, + ) + if randomize_blocks: + # Randomize the block ids. + block_ids = block_ids[torch.randperm( + block_ids.numel())] + block_table[:, : + num_alloc_blocks_per_batch] = block_ids.view( + -1, num_alloc_blocks_per_batch) + + # Setup the slot mapping for the input KVs. + tree_positions = sequence_position + torch.arange( + 0, + tree_size_q, + device=q.device, + dtype=torch.int64, + ).repeat(batch_size, 1) + tree_slot_mapping = _gen_slot_mapping( + tree_positions, block_table, block_size) + + # Compute attention for the tree. + tree_attn_output = forward_attention( + q=q, + k=k, + v=v, + kv_cache=kv_cache, + block_table=block_table, + slot_mapping=tree_slot_mapping, + seqlen_k=seqlen_k, + backend=_Backend.TREE_ATTN, + spec_token_tree=spec_token_tree, + num_spec_tokens=tree_size_q - 1, + ).view(batch_size, -1, num_heads, dim_per_head) + + # Verify that the chain attention output for each + # branch of the tree (computed using FA3) matches + # the tree attention output. + for q_index in range(tree_size_q): + # Get the q, k, and v for the branch. + branch_mask = tree_attn_mask[q_index, :] + branch_indices = torch.nonzero(branch_mask, + as_tuple=True)[0] + q_len = branch_indices.shape[0] + q_branch = q[:, branch_indices] + k_branch = k[:, branch_indices] + v_branch = v[:, branch_indices] + + # Setup slot mapping for the branch. 
+ branch_positions = sequence_position + torch.arange( + 0, + q_len, + device=q.device, + dtype=torch.int64, + ).repeat(batch_size, 1) + branch_slot_mapping = _gen_slot_mapping( + branch_positions, block_table, block_size) + + # Compute flash attention for the branch. + flash_attn_output = forward_attention( + q=q_branch, + k=k_branch, + v=v_branch, + kv_cache=kv_cache, + block_table=block_table, + slot_mapping=branch_slot_mapping, + seqlen_k=sequence_position + q_len, + backend=_Backend.FLASH_ATTN_VLLM_V1, + ).view(batch_size, -1, num_heads, dim_per_head) + + # Compare the outputs. + assert torch.allclose( + tree_attn_output[:, branch_indices], + flash_attn_output, + atol=7.81e-3, + ), (f"outputs are not close for " + f"batch_size: {batch_size}, " + f"num_heads: {num_heads}, " + f"sequence_position: {sequence_position}, " + f"tree_attn_mask: {tree_attn_mask}, " + f"q_index: {q_index}.") + + +def _gen_slot_mapping(positions: torch.Tensor, block_table: torch.Tensor, + block_size: int): + block_indices = positions // block_size + blocks = block_table.gather(dim=1, index=block_indices) + return (blocks * block_size + positions % block_size).view(-1) diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index eb9c4f1c1030a..0fdba569f93f2 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -55,6 +55,7 @@ def kernel_unified_attention_2d( block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] seq_lens_ptr, # [num_seqs] alibi_slopes_ptr, # [num_query_heads] + qq_bias_ptr, # [num_query_tokens, num_query_tokens] scale, # float32 k_scale, # float32 v_scale, # float32 @@ -66,10 +67,12 @@ def kernel_unified_attention_2d( query_stride_1: tl.int64, # int, should be equal to head_size output_stride_0: tl.int64, # int output_stride_1: tl.int64, # int, should be equal to head_size + qq_bias_stride_0: tl.int64, # int BLOCK_SIZE: tl.constexpr, # int HEAD_SIZE: 
tl.constexpr, # int HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 USE_ALIBI_SLOPES: tl.constexpr, # bool + USE_QQ_BIAS: tl.constexpr, # bool USE_SOFTCAP: tl.constexpr, # bool SLIDING_WINDOW: tl.constexpr, # int stride_k_cache_0: tl.int64, # int @@ -144,6 +147,11 @@ def kernel_unified_attention_2d( mask=query_mask_1, other=0.0) + # query-query attention bias + if USE_QQ_BIAS: + qq_bias_row_ptrs = (qq_bias_ptr + query_pos[:, None] * qq_bias_stride_0 + ) # shape: [BLOCK_M] + # compute the length of the longest sequence prefix spanned by any # query token in the current q_block (q_block_local_idx) max_seq_prefix_len = context_len + q_block_local_idx * BLOCK_Q + ( @@ -223,6 +231,18 @@ def kernel_unified_attention_2d( if USE_ALIBI_SLOPES: S += alibi_slope[:, None] * (seq_offset - context_len) + if USE_QQ_BIAS: + # compute key positions relative to query section + key_rel_pos = seq_offset - context_len # shape: [BLOCK_SIZE] + # load bias only for keys that correspond to queries + is_query_key = key_rel_pos >= 0 and key_rel_pos < qq_bias_stride_0 + qq_bias = tl.load( + qq_bias_row_ptrs + key_rel_pos[None, :], + mask=is_query_key[None, :], # avoid OOB for context keys + other=0.0, + ) + S += qq_bias + # compute running maximum # m_j : (BLOCK_M,) m_j = tl.maximum(M, tl.max(S, axis=1)) @@ -275,6 +295,7 @@ def kernel_unified_attention_3d( block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] seq_lens_ptr, # [num_seqs] alibi_slopes_ptr, # [num_query_heads] + qq_bias_ptr, # [num_query_tokens, num_query_tokens] scale, # float32 k_scale, # float32 v_scale, # float32 @@ -284,10 +305,12 @@ def kernel_unified_attention_3d( block_table_stride: tl.int64, # int query_stride_0: tl.int64, # int query_stride_1: tl.int64, # int, should be equal to head_size + qq_bias_stride_0: tl.int64, # int BLOCK_SIZE: tl.constexpr, # int HEAD_SIZE: tl.constexpr, # int HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 USE_ALIBI_SLOPES: tl.constexpr, # bool + USE_QQ_BIAS: tl.constexpr, 
# bool USE_SOFTCAP: tl.constexpr, # bool SLIDING_WINDOW: tl.constexpr, # int stride_k_cache_0: tl.int64, # int @@ -373,6 +396,11 @@ def kernel_unified_attention_3d( mask=query_mask_1, other=0.0) + # query-query attention bias + if USE_QQ_BIAS: + qq_bias_row_ptrs = (qq_bias_ptr + query_pos[:, None] * qq_bias_stride_0 + ) # shape: [BLOCK_M] + num_blocks = cdiv_fn(seq_len, BLOCK_SIZE) # iterate through tiles within current segment @@ -442,6 +470,18 @@ def kernel_unified_attention_3d( if USE_ALIBI_SLOPES: S += alibi_slope[:, None] * (seq_offset - context_len) + if USE_QQ_BIAS: + # compute key positions relative to query section + key_rel_pos = seq_offset - context_len # shape: [BLOCK_SIZE] + # load bias only for keys that correspond to queries + is_query_key = key_rel_pos >= 0 and key_rel_pos < qq_bias_stride_0 + qq_bias = tl.load( + qq_bias_row_ptrs + key_rel_pos[None, :], + mask=is_query_key[None, :], # avoid OOB for context keys + other=0.0, + ) + S += qq_bias + # compute running maximum # m_j : (BLOCK_M,) m_j = tl.maximum(M, tl.max(S, axis=1)) @@ -586,6 +626,7 @@ def unified_attention( k_descale, v_descale, alibi_slopes=None, + qq_bias=None, ): assert causal, "Only causal attention is supported" assert q_descale is None, "Q scales not supported" @@ -595,6 +636,7 @@ def unified_attention( "Block size must be at least 32 for fp8" use_alibi_slopes = alibi_slopes is not None + use_qq_bias = qq_bias is not None block_size = v.shape[1] num_seqs = len(seqused_k) @@ -630,6 +672,7 @@ def unified_attention( block_tables_ptr=block_table, seq_lens_ptr=seqused_k, alibi_slopes_ptr=alibi_slopes, + qq_bias_ptr=qq_bias, scale=softmax_scale, k_scale=k_descale, v_scale=v_descale, @@ -641,10 +684,12 @@ def unified_attention( query_stride_1=q.stride(1), output_stride_0=out.stride(0), output_stride_1=out.stride(1), + qq_bias_stride_0=qq_bias.stride(0) if use_qq_bias else 0, BLOCK_SIZE=block_size, HEAD_SIZE=head_size, HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), 
USE_ALIBI_SLOPES=use_alibi_slopes, + USE_QQ_BIAS=use_qq_bias, USE_SOFTCAP=(softcap > 0), SLIDING_WINDOW=(1 + window_size[0]), stride_k_cache_0=k.stride(0), @@ -699,6 +744,7 @@ def unified_attention( block_tables_ptr=block_table, seq_lens_ptr=seqused_k, alibi_slopes_ptr=alibi_slopes, + qq_bias_ptr=qq_bias, scale=softmax_scale, k_scale=k_descale, v_scale=v_descale, @@ -708,10 +754,12 @@ def unified_attention( block_table_stride=block_table.stride(0), query_stride_0=q.stride(0), query_stride_1=q.stride(1), + qq_bias_stride_0=qq_bias.stride(0) if use_qq_bias else 0, BLOCK_SIZE=block_size, HEAD_SIZE=head_size, HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), USE_ALIBI_SLOPES=use_alibi_slopes, + USE_QQ_BIAS=use_qq_bias, USE_SOFTCAP=(softcap > 0), SLIDING_WINDOW=(1 + window_size[0]), stride_k_cache_0=k.stride(0), diff --git a/vllm/config.py b/vllm/config.py index ee8f3dd98dd86..871df455ef58f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3049,6 +3049,19 @@ class SpeculativeConfig: f"num_speculative_tokens:{self.num_speculative_tokens}" f" must be divisible by {n_predict=}") + if self.speculative_token_tree is None: + # Generate chain of tokens. + self.speculative_token_tree = str([ + (i + 1) * (0, ) + for i in range(self.num_speculative_tokens) + ]) + else: + # Sort the token tree breadth-first. + tree_choices = ast.literal_eval( + self.speculative_token_tree) + self.speculative_token_tree = str( + sorted(tree_choices, key=lambda t: (len(t), t))) + self.draft_tensor_parallel_size = \ SpeculativeConfig._verify_and_get_draft_tp( self.target_parallel_config, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c94e440e5c845..5eb9660cd1e8c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1454,7 +1454,6 @@ class EngineArgs: "Please consider using other speculative decoding methods " "such as ngram, medusa, eagle, or deepseek_mtp.") - # No XFormers so far. 
V1_BACKENDS = [ "FLASH_ATTN_VLLM_V1", "FLASH_ATTN", @@ -1469,6 +1468,7 @@ class EngineArgs: "ROCM_AITER_MLA", "TORCH_SDPA_VLLM_V1", "FLEX_ATTENTION", + "TREE_ATTN", ] if (envs.is_set("VLLM_ATTENTION_BACKEND") and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS): diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index a90910639f784..b61b39a9274d0 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -270,6 +270,7 @@ class CudaPlatformBase(Platform): FLEX_ATTENTION_V1 = "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend" # noqa: E501 TRITON_ATTN_VLLM_V1 = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend" # noqa: E501 FLASH_ATTN_V1 = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" # noqa: E501 + TREE_ATTN_V1 = "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend" # noqa: E501 if selected_backend == _Backend.FLASHINFER: logger.info_once("Using FlashInfer backend on V1 engine.") @@ -287,6 +288,9 @@ class CudaPlatformBase(Platform): elif selected_backend == _Backend.FLASH_ATTN: logger.info_once("Using Flash Attention backend on V1 engine.") return FLASH_ATTN_V1 + elif selected_backend == _Backend.TREE_ATTN: + logger.info_once("Using Tree Attention backend on V1 engine.") + return TREE_ATTN_V1 from vllm.attention.selector import is_attn_backend_supported diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 997aee7063f57..61ce868c13b47 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -62,6 +62,7 @@ class _Backend(enum.Enum): DIFFERENTIAL_FLASH_ATTN = enum.auto() NO_ATTENTION = enum.auto() FLEX_ATTENTION = enum.auto() + TREE_ATTN = enum.auto() class PlatformEnum(enum.Enum): diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py new file mode 100644 index 0000000000000..4fb7483284053 --- /dev/null +++ b/vllm/v1/attention/backends/tree_attn.py @@ -0,0 +1,452 @@ +# SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Attention layer with TreeAttention.""" + +import ast +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Optional + +import torch + +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata, AttentionType) +from vllm.attention.ops.triton_unified_attention import unified_attention +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.v1.attention.backends.utils import ( + AttentionMetadataBuilder, CommonAttentionMetadata, + reorder_batch_to_split_decodes_and_prefills, split_decodes_and_prefills) +from vllm.v1.kv_cache_interface import AttentionSpec + +if TYPE_CHECKING: + from vllm.v1.core.sched.output import SchedulerOutput + from vllm.v1.worker.gpu_input_batch import InputBatch + +from vllm import _custom_ops as ops + +logger = init_logger(__name__) + + +class TreeAttentionBackend(AttentionBackend): + + accept_output_buffer: bool = True + + @classmethod + def get_supported_dtypes(cls) -> list[torch.dtype]: + return [torch.float16, torch.bfloat16] + + @classmethod + def get_supported_head_sizes(cls) -> list[int]: + return [32, 64, 96, 128, 160, 192, 224, 256] + + @classmethod + def validate_head_size(cls, head_size: int) -> None: + supported_head_sizes = cls.get_supported_head_sizes() + if head_size not in supported_head_sizes: + attn_type = cls.__name__.removesuffix("Backend") + raise ValueError( + f"Head size {head_size} is not supported by {attn_type}. " + f"Supported head sizes are: {supported_head_sizes}. 
" + "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use " + "FlexAttention backend which supports all head sizes.") + + @staticmethod + def get_name() -> str: + return "TREE_ATTN_VLLM_V1" + + @staticmethod + def get_impl_cls() -> type["TreeAttentionImpl"]: + return TreeAttentionImpl + + @staticmethod + def get_metadata_cls() -> type["AttentionMetadata"]: + return TreeAttentionMetadata + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> tuple[int, ...]: + if block_size % 16 != 0: + raise ValueError("Block size must be a multiple of 16.") + return (2, num_blocks, block_size, num_kv_heads, head_size) + + @staticmethod + def get_builder_cls() -> type["TreeAttentionMetadataBuilder"]: + return TreeAttentionMetadataBuilder + + @staticmethod + def use_cascade_attention(*args, **kwargs) -> bool: + return False + + +@dataclass +class TreeAttentionMetadata: + num_actual_tokens: int # Number of tokens excluding padding. + max_query_len: int + query_start_loc: torch.Tensor + max_seq_len: int + seq_lens: torch.Tensor + block_table: torch.Tensor + slot_mapping: torch.Tensor + + num_prefill_tokens: int = 0 + num_decode_tokens: int = 0 + num_prefills: int = 0 + num_decodes: int = 0 + + tree_attn_bias: Optional[torch.Tensor] = None + + # Cached Prefill/decode metadata. 
+ _cached_prefill_metadata: Optional["TreeAttentionMetadata"] = None + _cached_decode_metadata: Optional["TreeAttentionMetadata"] = None + + @property + def prefill_metadata(self) -> Optional["TreeAttentionMetadata"]: + if self.num_prefills == 0: + return None + + if self._cached_prefill_metadata is not None: + # Recover cached prefill-phase attention + # metadata structure + return self._cached_prefill_metadata + + q_start_loc = self.query_start_loc[self.num_decodes:] + q_seqlens = torch.diff(q_start_loc) + kv_seqlens = self.seq_lens[self.num_decodes:] + # Construct & cache prefill-phase attention metadata structure + self._cached_prefill_metadata = TreeAttentionMetadata( + num_actual_tokens=self.num_prefill_tokens, + max_query_len=int(q_seqlens.max().item()), + query_start_loc=q_start_loc - q_start_loc[0], + max_seq_len=int(kv_seqlens.max().item()), + seq_lens=kv_seqlens, + block_table=self.block_table[self.num_decodes:], + slot_mapping=self.slot_mapping[self.num_decode_tokens:], + ) + return self._cached_prefill_metadata + + @property + def decode_metadata(self) -> Optional["TreeAttentionMetadata"]: + if self.num_decode_tokens == 0: + return None + + if self._cached_decode_metadata is not None: + # Recover cached decode-phase attention + # metadata structure + return self._cached_decode_metadata + + q_start_loc = self.query_start_loc[:self.num_decodes + 1] + q_seqlens = torch.diff(q_start_loc) + kv_seqlens = self.seq_lens[:self.num_decodes] + # Construct & cache decode-phase attention metadata structure + self._cached_decode_metadata = TreeAttentionMetadata( + num_actual_tokens=self.num_decode_tokens, + max_query_len=int(q_seqlens.max().item()), + query_start_loc=q_start_loc, + max_seq_len=int(kv_seqlens.max().item()), + seq_lens=kv_seqlens, + block_table=self.block_table[:self.num_decodes], + slot_mapping=self.slot_mapping[:self.num_decode_tokens], + tree_attn_bias=self.tree_attn_bias, + ) + return self._cached_decode_metadata + + +class 
TreeAttentionMetadataBuilder( + AttentionMetadataBuilder[TreeAttentionMetadata]): + + def __init__( + self, + kv_cache_spec: AttentionSpec, + layer_names: list[str], + vllm_config: VllmConfig, + device: torch.device, + ): + self.kv_cache_spec = kv_cache_spec + self.block_size = kv_cache_spec.block_size + + spec_config = vllm_config.speculative_config + spec_token_tree = (spec := spec_config) and spec.speculative_token_tree + tree_choices: list[tuple[int, + ...]] = (ast.literal_eval(spec_token_tree) + if spec_token_tree is not None else + [(0, )]) + # Construct the tree attention bias. + depth_counts = _get_depth_counts(tree_choices) + self.tree_attn_bias = _prepare_tree_attn_bias( + tree_choices, + depth_counts, + dtype=torch.float32, + device=device, + ) + + def reorder_batch(self, input_batch: "InputBatch", + scheduler_output: "SchedulerOutput") -> bool: + return reorder_batch_to_split_decodes_and_prefills( + input_batch, + scheduler_output, + decode_threshold=self.tree_attn_bias.shape[0]) + + def build( + self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False, + ) -> TreeAttentionMetadata: + decode_threshold = self.tree_attn_bias.shape[0] + num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( + split_decodes_and_prefills(common_attn_metadata, + decode_threshold=decode_threshold)) + + num_actual_tokens = common_attn_metadata.num_actual_tokens + q_start_loc = common_attn_metadata.query_start_loc + max_query_len = common_attn_metadata.max_query_len + kv_seqlens = common_attn_metadata.seq_lens + max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + block_table = common_attn_metadata.block_table_tensor + slot_mapping = common_attn_metadata.slot_mapping + + return TreeAttentionMetadata( + num_actual_tokens=num_actual_tokens, + num_prefill_tokens=num_prefill_tokens, + num_decode_tokens=num_decode_tokens, + num_prefills=num_prefills, + num_decodes=num_decodes, + max_query_len=max_query_len, + 
query_start_loc=q_start_loc, + max_seq_len=max_seq_len, + seq_lens=kv_seqlens, + block_table=block_table, + slot_mapping=slot_mapping, + tree_attn_bias=self.tree_attn_bias, + ) + + def build_for_drafting( + self, + common_attn_metadata: CommonAttentionMetadata, + draft_index: int, + ) -> TreeAttentionMetadata: + # Cache the original tree attention bias. + orig_tree_attn_bias = self.tree_attn_bias + + if draft_index == 0: + # Use prefill for drafting at the root level. + self.tree_attn_bias = torch.empty(0) + else: + # Slice the tree attention bias for drafting. + query_len = common_attn_metadata.max_query_len + start, end = draft_index, draft_index + query_len + self.tree_attn_bias = self.tree_attn_bias[start:end, + start:end].contiguous() + + # Build attention bias. + attn_metadata = self.build(0, common_attn_metadata, fast_build=True) + + # Reset the tree attention bias to the original value. + self.tree_attn_bias = orig_tree_attn_bias + return attn_metadata + + +def _get_depth_counts(sorted_tree_choices: list[tuple[int, ...]]) -> list[int]: + # Count the number of choices at each depth of the tree. + depth_counts = [] + prev_depth = 0 + for path in sorted_tree_choices: + depth = len(path) + if depth != prev_depth: + depth_counts.append(0) + depth_counts[depth - 1] += 1 + prev_depth = depth + return depth_counts + + +def _prepare_tree_attn_bias( + sorted_tree_choices: list[tuple[int, ...]], + depth_counts: list[int], + dtype: Optional[torch.dtype], + device: Optional[torch.device], +) -> torch.Tensor: + # +1 comes from the additional root node. + tree_len = len(sorted_tree_choices) + 1 + tree_attn_mask = torch.full((tree_len, tree_len), + -torch.inf, + device=device, + dtype=dtype) + + # Set diagonal to all zeros. Each token should + # attend to itself. + mask_val = 0 + for i in range(tree_len): + tree_attn_mask[i, i] = mask_val + + # Set root to all zeros. All tokens attend to it. + tree_attn_mask[:, 0] = mask_val + + # Set all ancestors to zeros. 
+ start = 0 + for i in range(len(depth_counts)): + for j in range(depth_counts[i]): + cur_tree_choice = sorted_tree_choices[start + j] + # Retrieve ancestor position. + if len(cur_tree_choice) == 1: + continue + ancestor_idx = [] + for c in range(len(cur_tree_choice) - 1): + ancestor_idx.append( + sorted_tree_choices.index(cur_tree_choice[:c + 1]) + 1) + tree_attn_mask[j + start + 1, ancestor_idx] = mask_val + start += depth_counts[i] + return tree_attn_mask + + +class TreeAttentionImpl(AttentionImpl): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[list[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[dict[str, Any]] = None, + logits_soft_cap: Optional[float] = None, + attn_type: AttentionType = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, + use_irope: bool = False, + ) -> None: + if blocksparse_params is not None: + raise ValueError( + "TreeAttention does not support block-sparse attention.") + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_kv_heads + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + self.kv_cache_dtype = kv_cache_dtype + self.kv_sharing_target_layer_name = kv_sharing_target_layer_name + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + if logits_soft_cap is None: + # Setting logits_soft_cap to 0 means no soft cap. 
+ logits_soft_cap = 0 + self.logits_soft_cap = logits_soft_cap + if sliding_window is None: + self.sliding_window = (-1, -1) + else: + self.sliding_window = (sliding_window - 1, 0) + + TreeAttentionBackend.validate_head_size(head_size) + + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "TreeAttentionImpl.") + + def forward( + self, + layer: torch.nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: TreeAttentionMetadata, + output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Forward pass with TreeAttention. + + Args: + query: shape = [num_tokens, num_heads, head_size] + key: shape = [num_tokens, num_kv_heads, head_size] + value: shape = [num_tokens, num_kv_heads, head_size] + kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + attn_metadata: Metadata for attention. + Returns: + shape = [num_tokens, num_heads * head_size] + """ + assert output is not None, "Output tensor must be provided." + + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for TreeAttentionImpl") + + if attn_metadata is None: + # Profiling run. + return output + + # Cache the input KVs. + key_cache, value_cache = kv_cache.unbind(0) + if self.kv_sharing_target_layer_name is None: + # Reshape the input keys and values and store them in the cache. + # Skip this if sharing KV cache with an earlier attention layer. + # NOTE(woosuk): Here, key and value are padded while slot_mapping is + # not padded. However, we don't need to do key[:num_actual_tokens] + # and value[:num_actual_tokens] because the reshape_and_cache_flash + # op uses the slot_mapping's shape to determine the number of + # actual tokens. 
+ ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) + + num_actual_tokens = attn_metadata.num_actual_tokens + num_decode_tokens = attn_metadata.num_decode_tokens + descale_shape = (attn_metadata.query_start_loc.shape[0] - 1, + key.shape[1]) + if prefill_meta := attn_metadata.prefill_metadata: + unified_attention( + q=query[num_decode_tokens:num_actual_tokens], + k=key_cache, + v=value_cache, + out=output[num_decode_tokens:num_actual_tokens], + cu_seqlens_q=prefill_meta.query_start_loc, + max_seqlen_q=prefill_meta.max_query_len, + seqused_k=prefill_meta.seq_lens, + max_seqlen_k=prefill_meta.max_seq_len, + softmax_scale=self.scale, + causal=True, + alibi_slopes=self.alibi_slopes, + window_size=self.sliding_window, + block_table=prefill_meta.block_table, + softcap=self.logits_soft_cap, + q_descale=None, # Not supported + k_descale=layer._k_scale.expand(descale_shape), + v_descale=layer._v_scale.expand(descale_shape), + ) + + if decode_meta := attn_metadata.decode_metadata: + unified_attention( + q=query[:num_decode_tokens], + k=key_cache, + v=value_cache, + out=output[:num_decode_tokens], + cu_seqlens_q=decode_meta.query_start_loc, + max_seqlen_q=decode_meta.max_query_len, + seqused_k=decode_meta.seq_lens, + max_seqlen_k=decode_meta.max_seq_len, + softmax_scale=self.scale, + causal=True, + alibi_slopes=self.alibi_slopes, + qq_bias=decode_meta.tree_attn_bias, + window_size=self.sliding_window, + block_table=decode_meta.block_table, + softcap=self.logits_soft_cap, + q_descale=None, # Not supported + k_descale=layer._k_scale.expand(descale_shape), + v_descale=layer._v_scale.expand(descale_shape), + ) + return output diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 48bd632227c5b..7aeea40b25a67 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -214,6 +214,26 @@ class 
AttentionMetadataBuilder(abc.ABC, Generic[M]): return self.build(common_prefix_len=0, common_attn_metadata=common_attn_metadata) + def build_for_drafting( + self, + common_attn_metadata: CommonAttentionMetadata, + draft_index: int, + ) -> M: + """ + Build attention metadata for draft model. Uses build by default. + + Args: + common_attn_metadata: The common attention metadata. + draft_index: The index of the current draft operation. + When speculating a chain of tokens, this index refers to the + draft attempt for the i-th token. + For tree-based attention, this index instead refers to the + draft attempt for the i-th level in the tree of tokens. + """ + return self.build(common_prefix_len=0, + common_attn_metadata=common_attn_metadata, + fast_build=True) + def use_cascade_attention( self, common_prefix_len: int, diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 302126dbe3d5f..b2380bb3dd5ab 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import ast +from dataclasses import replace from typing import Optional import numpy as np @@ -17,6 +19,8 @@ from vllm.model_executor.models import supports_multimodal from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM from vllm.utils import is_pin_memory_available from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata +from vllm.v1.attention.backends.tree_attn import (TreeAttentionMetadata, + TreeAttentionMetadataBuilder) from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.sample.metadata import SamplingMetadata @@ -74,18 +78,52 @@ class EagleProposer: (self.max_num_tokens, self.hidden_size), dtype=self.dtype, device=device) - # We need +1 here because the arange is used to set query_start_loc, - # which has one more element than 
batch_size. - self.arange = torch.arange(vllm_config.scheduler_config.max_num_seqs + - 1, - device=device, - dtype=torch.int32) + + max_batch_size = vllm_config.scheduler_config.max_num_seqs + self.arange = torch.arange( + # We need +1 here because the arange is used to set query_start_loc, + # which has one more element than batch_size. + max_batch_size + 1, + device=device, + dtype=torch.int32, + ) self.inputs_embeds = torch.zeros( (self.max_num_tokens, self.hidden_size), dtype=self.dtype, device=device) + # Parse the speculative token tree. + spec_token_tree = self.speculative_config.speculative_token_tree + self.tree_choices: list[tuple[int, + ...]] = ast.literal_eval(spec_token_tree) + tree_depth = len(self.tree_choices[-1]) + # Precompute per-level properties of the tree. + num_drafts_per_level = [0] * tree_depth + for node in self.tree_choices: + num_drafts_per_level[len(node) - 1] += 1 + self.cu_drafts_per_level = [num_drafts_per_level[0]] + self.child_drafts_per_level = [num_drafts_per_level[0]] + for level in range(1, tree_depth): + self.cu_drafts_per_level.append(self.cu_drafts_per_level[-1] + + num_drafts_per_level[level]) + self.child_drafts_per_level.append(num_drafts_per_level[level] // + num_drafts_per_level[level - 1]) + # Find the first level where the tree branches off into one or more + # children. + self.first_branching_level = None + for level in range(tree_depth): + if self.cu_drafts_per_level[level] > level + 1: + self.first_branching_level = level + break + # Precompute draft position offsets in flattened tree. 
+ self.tree_draft_pos_offsets = torch.arange( + 1, + len(self.tree_choices) + 1, + device=device, + dtype=torch.int32, + ).repeat(max_batch_size, 1) + def propose( self, # [num_tokens] @@ -120,11 +158,9 @@ class EagleProposer: assert self.runner is not None # FIXME: need to consider multiple kv_cache_groups - attn_metadata = self.runner.attn_metadata_builders[0].build( - common_prefix_len=0, - common_attn_metadata=common_attn_metadata, - fast_build=True, - ) + attn_metadata = self.runner.attn_metadata_builders[ + 0].build_for_drafting(common_attn_metadata=common_attn_metadata, + draft_index=0) # At this moment, we assume all eagle layers belong to the same KV # cache group, thus using the same attention metadata. @@ -167,6 +203,22 @@ class EagleProposer: last_hidden_states, hidden_states = ret_hidden_states sample_hidden_states = last_hidden_states[last_token_indices] logits = self.model.compute_logits(sample_hidden_states, None) + positions = target_positions[last_token_indices] + hidden_states = hidden_states[last_token_indices] + if self.first_branching_level == 0: + # Branching has occurred at the root level. Draft using tree + # attention. + draft_token_ids_list = self.propose_tree( + tree_root_level=0, + batch_size=batch_size, + logits=logits, + positions=positions, + hidden_states=hidden_states, + common_attn_metadata=common_attn_metadata, + ) + # [batch_size, num_tree_tokens] + return torch.cat(draft_token_ids_list, dim=1) + draft_token_ids = logits.argmax(dim=-1) # Early exit if there is only one draft token to be generated. @@ -178,16 +230,15 @@ class EagleProposer: # one layer. Adapt this code to support multiple layers once # there's a multi-layer MTP module. - # Currently FlashAttention is the only backend that supports - # multi-token eagle spec decode. This is because the code below + # Currently, only FlashAttention and TreeAttention support multi-token + # eagle spec decode. 
This is because the code below # makes assumptions about attn_metadata attributes available. - assert isinstance(attn_metadata, FlashAttentionMetadata) + assert isinstance(attn_metadata, + (FlashAttentionMetadata, TreeAttentionMetadata)) # Generate the remaining draft tokens. draft_token_ids_list = [draft_token_ids] - positions = target_positions[last_token_indices] - hidden_states = hidden_states[last_token_indices] if self.use_cuda_graph and \ batch_size <= self.cudagraph_batch_sizes[-1]: input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size) @@ -196,7 +247,7 @@ class EagleProposer: attn_metadata.num_actual_tokens = batch_size attn_metadata.max_query_len = 1 attn_metadata.query_start_loc = self.arange[:batch_size + 1] - for _ in range(self.num_speculative_tokens - 1): + for token_index in range(self.num_speculative_tokens - 1): # Update the inputs. # cast to int32 is crucial when eagle model is compiled. # tensor.argmax() returns int64 by default. @@ -265,7 +316,20 @@ class EagleProposer: logits = self.model.compute_logits(last_hidden_states[:batch_size], None) - # TODO(wenlong): get more than one token for tree attention + if self.first_branching_level == token_index + 1: + # Branching has occurred. The remaining tokens are drafted + # using tree attention. 
+ draft_token_ids_list += self.propose_tree( + tree_root_level=token_index + 1, + batch_size=batch_size, + logits=logits, + positions=positions, + hidden_states=hidden_states, + common_attn_metadata=common_attn_metadata, + ) + # [batch_size, num_tree_tokens] + return torch.cat(draft_token_ids_list, dim=1) + draft_token_ids = logits.argmax(dim=-1) draft_token_ids_list.append(draft_token_ids) @@ -273,6 +337,175 @@ class EagleProposer: draft_token_ids = torch.stack(draft_token_ids_list, dim=1) return draft_token_ids + def propose_tree( + self, + tree_root_level: int, + batch_size: int, + # [num_tokens, vocab_size] + logits: torch.Tensor, + # [num_tokens] + positions: torch.Tensor, + # [num_tokens, hidden_size] + hidden_states: torch.Tensor, + common_attn_metadata: CommonAttentionMetadata, + ) -> list[torch.Tensor]: + tree_attn_metadata_builder = self.runner.attn_metadata_builders[0] + assert isinstance(tree_attn_metadata_builder, + TreeAttentionMetadataBuilder) + + total_num_drafts = self.cu_drafts_per_level[tree_root_level] + level_num_drafts = total_num_drafts + # Sample a draft token for each child at the tree root level. + num_children = self.child_drafts_per_level[tree_root_level] + if num_children == 1: + draft_token_ids = logits.argmax(dim=-1).view(batch_size, -1) + else: + draft_token_ids = torch.topk(logits, num_children, + dim=-1).indices.view(batch_size, -1) + draft_token_ids_list = [draft_token_ids] + draft_hidden_states = hidden_states.view(batch_size, 1, -1) + + # Initialize empty tensors for concatenation with the level outputs. + tree_input_ids = torch.empty(0, + device=self.input_ids.device, + dtype=self.input_ids.dtype) + tree_positions = torch.empty(0, + device=self.positions.device, + dtype=self.positions.dtype) + tree_hidden_states = torch.empty(0, + device=self.hidden_states.device, + dtype=self.hidden_states.dtype) + # Precompute the draft token positions. 
+ flattened_draft_positions = ( + positions.view(batch_size, -1) + + self.tree_draft_pos_offsets[:batch_size, :]) + tree_depth = len(self.cu_drafts_per_level) + for level in range(tree_root_level, tree_depth - 1): + # Get draft positions for RoPE. + draft_positions = positions + (level + 1) + exceeds_max_model_len = (positions + + total_num_drafts) >= self.max_model_len + # Mask out the position ids that exceed the max model length. + # Otherwise, we may get out-of-range error in RoPE. + clamped_draft_positions = torch.where( + exceeds_max_model_len, + 0, + draft_positions, + ) + if level_num_drafts > 1: + # Repeat the positions for each draft at this level. + draft_positions = clamped_draft_positions.repeat_interleave( + level_num_drafts).reshape(batch_size, -1) + + if num_children > 1: + # Repeat draft hidden states for each child. + draft_hidden_states = draft_hidden_states.repeat_interleave( + num_children, dim=1) + + # Concatenate the draft tokens, positions, and hidden states. + tree_input_ids = torch.cat([tree_input_ids, draft_token_ids], + dim=1) + tree_positions = torch.cat([tree_positions, draft_positions], + dim=1) + tree_hidden_states = torch.cat( + [tree_hidden_states, draft_hidden_states], dim=1) + + # Build new attention metadata for the next level of drafts. + # This is necessary to support tree attention. + query_len = total_num_drafts - tree_root_level + common_attn_metadata = replace( + common_attn_metadata, + query_start_loc=query_len * self.arange[:batch_size + 1], + seq_lens=common_attn_metadata.seq_lens + level_num_drafts, + num_actual_tokens=batch_size * query_len, + max_query_len=query_len, + ) + attn_metadata = tree_attn_metadata_builder.build_for_drafting( + common_attn_metadata=common_attn_metadata, + draft_index=tree_root_level + 1, + ) + + # Apply new attention metadata to all layers. 
+ per_layer_attn_metadata = {} + for layer_name in self.attn_layer_names: + per_layer_attn_metadata[layer_name] = attn_metadata + + # Consider max model length. + attn_metadata.max_seq_len = min(attn_metadata.max_seq_len, + self.max_model_len) + # For the requests that exceed the max model length, we set the + # sequence length to 1 to minimize their overheads in attention. + attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len, 1) + + # Compute the slot mapping. + query_positions = flattened_draft_positions[:, level:level + + query_len] + block_numbers = query_positions // self.block_size + block_ids = attn_metadata.block_table.gather(dim=1, + index=block_numbers) + slot_mapping = (block_ids * self.block_size + + query_positions % self.block_size) + # Mask out the slot mappings that exceed the max model length. + # Otherwise, the KV cache will be inadvertently updated with the + # padding tokens. + slot_mapping[exceeds_max_model_len] = PADDING_SLOT_ID + attn_metadata.slot_mapping = slot_mapping.view(-1) + + # Copy inputs to buffer for cudagraph. + num_tokens = attn_metadata.num_actual_tokens + input_ids = tree_input_ids.view(-1) + self.input_ids[:num_tokens] = input_ids + self.positions[:num_tokens] = tree_positions.view(-1) + self.hidden_states[:num_tokens] = tree_hidden_states.view( + num_tokens, -1) + + if self.use_cuda_graph and \ + num_tokens <= self.cudagraph_batch_sizes[-1]: + num_input_tokens = self.vllm_config.pad_for_cudagraph( + num_tokens) + else: + num_input_tokens = num_tokens + # Run the model. + with set_forward_context(per_layer_attn_metadata, + self.vllm_config, + num_tokens=num_input_tokens): + last_hidden_states, hidden_states = self.model( + input_ids=self.input_ids[:num_input_tokens], + positions=self.positions[:num_input_tokens], + hidden_states=self.hidden_states[:num_input_tokens], + inputs_embeds=None, + ) + + # Get the output hidden states for the draft tokens. 
+ draft_hidden_states = hidden_states[:num_tokens].view( + batch_size, query_len, -1)[:, -level_num_drafts:] + draft_last_hidden_states = last_hidden_states[:num_tokens].view( + batch_size, query_len, -1)[:, -level_num_drafts:] + + # Get the output logits for the draft tokens. + logits = self.model.compute_logits( + draft_last_hidden_states.reshape(batch_size * level_num_drafts, + -1), + None, + ) + + # Sample a draft token for each child at the next tree level. + num_children = self.child_drafts_per_level[level + 1] + if num_children == 1: + draft_token_ids = logits.argmax(dim=-1).view(batch_size, -1) + else: + draft_token_ids = torch.topk(logits, num_children, + dim=-1).indices.view( + batch_size, -1) + draft_token_ids_list.append(draft_token_ids) + + # Update the # drafts counters for the next tree level. + level_num_drafts = self.cu_drafts_per_level[level + + 1] - total_num_drafts + total_num_drafts = self.cu_drafts_per_level[level + 1] + + return draft_token_ids_list + def prepare_inputs( self, common_attn_metadata: CommonAttentionMetadata, From 49bcd893e753d89a1c2a95a1c2649819309c1e1b Mon Sep 17 00:00:00 2001 From: "ZiTian.Zhao" Date: Mon, 4 Aug 2025 13:14:49 +0800 Subject: [PATCH 186/224] [refactor] improve ConstantList exception specificity (#22156) Signed-off-by: zitian.zhao --- vllm/v1/utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index d0175695c1d0f..b5750c82db023 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -34,22 +34,22 @@ class ConstantList(Generic[T], Sequence): self._x = x def append(self, item): - raise Exception("Cannot append to a constant list") + raise TypeError("Cannot append to a constant list") def extend(self, item): - raise Exception("Cannot extend a constant list") + raise TypeError("Cannot extend a constant list") def insert(self, item): - raise Exception("Cannot insert into a constant list") + raise TypeError("Cannot insert into a constant 
list") def pop(self, item): - raise Exception("Cannot pop from a constant list") + raise TypeError("Cannot pop from a constant list") def remove(self, item): - raise Exception("Cannot remove from a constant list") + raise TypeError("Cannot remove from a constant list") def clear(self): - raise Exception("Cannot clear a constant list") + raise TypeError("Cannot clear a constant list") def index(self, item: T, @@ -78,10 +78,10 @@ class ConstantList(Generic[T], Sequence): ... def __setitem__(self, item: Union[int, slice], value: Union[T, list[T]]): - raise Exception("Cannot set item in a constant list") + raise TypeError("Cannot set item in a constant list") def __delitem__(self, item): - raise Exception("Cannot delete item from a constant list") + raise TypeError("Cannot delete item from a constant list") def __iter__(self): return iter(self._x) From e5949e5ae013692ba09cc52472cf441675f5a270 Mon Sep 17 00:00:00 2001 From: Chenxi Yang Date: Sun, 3 Aug 2025 22:15:14 -0700 Subject: [PATCH 187/224] Remove index_put from MM embeddings merging (#22105) Co-authored-by: Chenxi Yang --- vllm/model_executor/models/utils.py | 42 ++++++++++++++++------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 62deb68035b92..28508e1bac1ee 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -393,7 +393,7 @@ def merge_multimodal_embeddings_from_map( inputs_embeds: torch.Tensor, multimodal_embeddings: NestedTensors, placeholder_map: MultiModalPlaceholderMap.IndexMap) -> torch.Tensor: """ - Merge ``multimodal_embeddings`` into ``inputs_embeds`` using the provided + Merge ``multimodal_embeddings`` into ``inputs_embeds`` using the provided placeholder map . Note: @@ -418,17 +418,23 @@ def _merge_multimodal_embeddings( Note: This updates ``inputs_embeds`` in place. 
""" - num_expected_tokens = is_multimodal.sum().item() - assert isinstance(num_expected_tokens, int) - flattened = _flatten_embeddings(multimodal_embeddings) - if flattened.shape[0] != num_expected_tokens: - expr = _embedding_count_expression(multimodal_embeddings) - raise ValueError( - f"Attempted to assign {expr} = {flattened.shape[0]} " - f"multimodal tokens to {num_expected_tokens} placeholders") + try: + # This is equivalent to: inputs_embeds[is_multimodal] = flattened. + inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1), flattened) + except RuntimeError as e: + num_expected_tokens = is_multimodal.sum().item() + assert isinstance(num_expected_tokens, int) + + if flattened.shape[0] != num_expected_tokens: + expr = _embedding_count_expression(multimodal_embeddings) + raise ValueError( + f"Attempted to assign {expr} = {flattened.shape[0]} " + f"multimodal tokens to {num_expected_tokens} placeholders" + ) from e + else: + raise ValueError("Error during masked scatter operation") from e - inputs_embeds[is_multimodal] = flattened return inputs_embeds @@ -478,11 +484,11 @@ def merge_multimodal_embeddings( Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the positions in ``inputs_embeds`` corresponding to placeholder tokens in ``input_ids``. - - ``placeholder_token_id`` can be a list of token ids (e.g, token ids - of img_start, img_break, and img_end tokens) when needed: This means - the order of these tokens in the ``input_ids`` MUST MATCH the order of - their embeddings in ``multimodal_embeddings`` since we need to + + ``placeholder_token_id`` can be a list of token ids (e.g, token ids + of img_start, img_break, and img_end tokens) when needed: This means + the order of these tokens in the ``input_ids`` MUST MATCH the order of + their embeddings in ``multimodal_embeddings`` since we need to slice-merge instead of individually scattering. 
For example, if input_ids is "TTTTTSIIIBIIIBIIIETTT", where @@ -491,9 +497,9 @@ def merge_multimodal_embeddings( - I is image embedding token - B is image break token - E is image end token. - - Then the image embeddings (that correspond to I's) from vision encoder - must be padded with embeddings of S, B, and E in the same order of + + Then the image embeddings (that correspond to I's) from vision encoder + must be padded with embeddings of S, B, and E in the same order of input_ids for a correct embedding merge. Note: From 8ecb3e9e9336ce47e47b61417e24161b38079e93 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Mon, 4 Aug 2025 01:19:04 -0400 Subject: [PATCH 188/224] [CI Bugfix] Fix wNa16 kernel not found for test_shared_storage_connector_hashes (#22163) Signed-off-by: Tyler Michael Smith --- tests/v1/kv_connector/unit/test_shared_storage_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/kv_connector/unit/test_shared_storage_connector.py b/tests/v1/kv_connector/unit/test_shared_storage_connector.py index 11b7e378441a4..db203b81f15fc 100644 --- a/tests/v1/kv_connector/unit/test_shared_storage_connector.py +++ b/tests/v1/kv_connector/unit/test_shared_storage_connector.py @@ -10,7 +10,7 @@ from vllm.assets.image import ImageAsset from vllm.config import KVTransferConfig from vllm.multimodal.utils import encode_image_base64 -MODEL_NAME = "RedHatAI/Qwen2.5-VL-3B-Instruct-quantized.w4a16" +MODEL_NAME = "RedHatAI/Qwen2.5-VL-3B-Instruct-quantized.w8a8" SAMPLING_PARAMS = SamplingParams(temperature=0.0, top_k=1, max_tokens=128) From a7b8788d2c2fae6bf52c128916de19e85f2b0a25 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 4 Aug 2025 14:51:20 +0800 Subject: [PATCH 189/224] [Misc] Modify the organization of GLM series (#22171) Signed-off-by: Jee Jee Li --- docs/models/supported_models.md | 10 +++++----- examples/offline_inference/vision_language.py | 4 ++-- tests/distributed/test_pipeline_parallel.py | 4 ++-- 
tests/lora/test_add_lora.py | 2 +- tests/lora/test_chatglm3_tp.py | 2 +- tests/models/language/generation/test_common.py | 2 +- tests/models/multimodal/generation/test_common.py | 6 +++--- tests/models/multimodal/processing/test_common.py | 4 ++-- tests/models/multimodal/processing/test_glm4_1v.py | 2 +- tests/models/registry.py | 10 +++++----- tests/tokenization/test_cached_tokenizer.py | 2 +- vllm/model_executor/models/chatglm.py | 6 +++--- vllm/model_executor/models/glm4v.py | 2 +- vllm/test_utils.py | 2 +- vllm/transformers_utils/configs/chatglm.py | 2 +- vllm/transformers_utils/tokenizer.py | 2 +- 16 files changed, 31 insertions(+), 31 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index c058c20f1ed73..cd1228836b870 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -328,7 +328,7 @@ th { | `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | ✅︎ | | `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | | | `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | | -| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ | | `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ | | `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ | @@ -348,8 +348,8 @@ th { | `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Gemma3nForConditionalGeneration` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | -| `GlmForCausalLM` | GLM-4 | `THUDM/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4ForCausalLM` | GLM-4-0414 | `THUDM/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ | ✅︎ | | `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ | @@ -589,8 +589,8 @@ See [this page](generative_models.md) for more information on how to use generat | `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large`, etc. | | | | | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ | | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ | -| `GLM4VForCausalLM`^ | GLM-4V | T + I | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `THUDM/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `GLM4VForCausalLM`^ | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4MoeForCausalLM` | GLM-4.5 | T + IE+ + VE+ | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4v_moeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V-Air`, etc. 
| ✅︎ | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index a75b8e2b047d8..16bb3712f551e 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -221,7 +221,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData: # GLM-4v def run_glm4v(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" - model_name = "THUDM/glm-4v-9b" + model_name = "zai-org/glm-4v-9b" engine_args = EngineArgs( model=model_name, @@ -250,7 +250,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData: # GLM-4.1V def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData: - model_name = "THUDM/GLM-4.1V-9B-Thinking" + model_name = "zai-org/GLM-4.1V-9B-Thinking" engine_args = EngineArgs( model=model_name, diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index cfb2e2dd15f4d..12dd7c4222630 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -154,7 +154,7 @@ TEXT_GENERATION_MODELS = { "baichuan-inc/Baichuan-7B": PPTestSettings.fast(), "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(), "bigscience/bloomz-1b1": PPTestSettings.fast(), - "THUDM/chatglm3-6b": PPTestSettings.fast(), + "zai-org/chatglm3-6b": PPTestSettings.fast(), "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(load_format="dummy"), "databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"), "Deci/DeciLM-7B-instruct": PPTestSettings.fast(), @@ -224,7 +224,7 @@ MULTIMODAL_MODELS = { "Salesforce/blip2-opt-6.7b": PPTestSettings.fast(), "facebook/chameleon-7b": PPTestSettings.fast(), "adept/fuyu-8b": PPTestSettings.fast(), - "THUDM/glm-4v-9b": PPTestSettings.fast(), + 
"zai-org/glm-4v-9b": PPTestSettings.fast(), "OpenGVLab/InternVL2-1B": PPTestSettings.fast(), "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(), "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(), diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py index cc8160b2860d9..d7b019509fa3e 100644 --- a/tests/lora/test_add_lora.py +++ b/tests/lora/test_add_lora.py @@ -14,7 +14,7 @@ from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams from vllm.utils import merge_async_iterators -MODEL_PATH = "THUDM/chatglm3-6b" +MODEL_PATH = "zai-org/chatglm3-6b" LORA_RANK = 64 DEFAULT_MAX_LORAS = 4 * 3 diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index 5481b413b8f5f..fb00e7b65b04a 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -6,7 +6,7 @@ from vllm.lora.request import LoRARequest from ..utils import create_new_process_for_each_test, multi_gpu_test -MODEL_PATH = "THUDM/chatglm3-6b" +MODEL_PATH = "zai-org/chatglm3-6b" PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. 
concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index ea240d2278895..57382914bfea8 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -53,7 +53,7 @@ AITER_MODEL_LIST = [ marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), pytest.param( - "THUDM/chatglm3-6b", # chatglm (text-only) + "zai-org/chatglm3-6b", # chatglm (text-only) ), pytest.param( "meta-llama/Llama-3.2-1B-Instruct", # llama diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 967228b54a0af..8cb826c1144d2 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -355,7 +355,7 @@ VLM_TEST_SETTINGS = { num_logprobs=10, ), "glm4v": VLMTestInfo( - models=["THUDM/glm-4v-9b"], + models=["zai-org/glm-4v-9b"], test_type=VLMTestType.IMAGE, prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501 single_image_prompts=IMAGE_ASSETS.prompts({ @@ -374,7 +374,7 @@ VLM_TEST_SETTINGS = { marks=[large_gpu_mark(min_gb=32)], ), "glm4_1v": VLMTestInfo( - models=["THUDM/GLM-4.1V-9B-Thinking"], + models=["zai-org/GLM-4.1V-9B-Thinking"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501 img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501 @@ -388,7 +388,7 @@ VLM_TEST_SETTINGS = { marks=[large_gpu_mark(min_gb=32)], ), "glm4_1v-video": VLMTestInfo( - models=["THUDM/GLM-4.1V-9B-Thinking"], + 
models=["zai-org/GLM-4.1V-9B-Thinking"], # GLM4.1V require include video metadata for input test_type=VLMTestType.CUSTOM_INPUTS, max_model_len=4096, diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index f70e03d0f6691..bd1c55d95dac2 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -271,8 +271,8 @@ def _test_processing_correctness_one( "microsoft/Florence-2-base", "adept/fuyu-8b", "google/gemma-3-4b-it", - "THUDM/glm-4v-9b", - "THUDM/GLM-4.1V-9B-Thinking", + "zai-org/glm-4v-9b", + "zai-org/GLM-4.1V-9B-Thinking", "ibm-granite/granite-speech-3.3-2b", "h2oai/h2ovl-mississippi-800m", "internlm/Intern-S1", diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py index d1c5fa8fec6d2..a6d900ec5d895 100644 --- a/tests/models/multimodal/processing/test_glm4_1v.py +++ b/tests/models/multimodal/processing/test_glm4_1v.py @@ -9,7 +9,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from ...utils import build_model_context -@pytest.mark.parametrize("model_id", ["THUDM/GLM-4.1V-9B-Thinking"]) +@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"]) @pytest.mark.parametrize("expected_toks_per_frame", [299]) @pytest.mark.parametrize("num_frames", [32, 128]) @pytest.mark.parametrize("fps, expected_grid_t", [(1, 5), (2, 10)]) diff --git a/tests/models/registry.py b/tests/models/registry.py index 25cfa267d1815..ffa6b755adf43 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -153,7 +153,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}), # noqa: E501 "BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m", {"1b": "bigscience/bloomz-1b1"}), - "ChatGLMModel": _HfExamplesInfo("THUDM/chatglm3-6b", + "ChatGLMModel": _HfExamplesInfo("zai-org/chatglm3-6b", trust_remote_code=True, 
max_transformers_version="4.48"), "ChatGLMForConditionalGeneration": _HfExamplesInfo("thu-coai/ShieldLM-6B-chatglm3", # noqa: E501 @@ -187,8 +187,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"), "Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it", # noqa: E501 min_transformers_version="4.53"), - "GlmForCausalLM": _HfExamplesInfo("THUDM/glm-4-9b-chat-hf"), - "Glm4ForCausalLM": _HfExamplesInfo("THUDM/GLM-4-9B-0414"), + "GlmForCausalLM": _HfExamplesInfo("zai-org/glm-4-9b-chat-hf"), + "Glm4ForCausalLM": _HfExamplesInfo("zai-org/GLM-4-9B-0414"), "Glm4MoeForCausalLM": _HfExamplesInfo("zai-org/GLM-4.5", min_transformers_version="4.54"), # noqa: E501 "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", @@ -380,10 +380,10 @@ _MULTIMODAL_EXAMPLE_MODELS = { "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), "Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"), "GraniteSpeechForConditionalGeneration": _HfExamplesInfo("ibm-granite/granite-speech-3.3-2b"), # noqa: E501 - "GLM4VForCausalLM": _HfExamplesInfo("THUDM/glm-4v-9b", + "GLM4VForCausalLM": _HfExamplesInfo("zai-org/glm-4v-9b", trust_remote_code=True, hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501 - "Glm4vForConditionalGeneration": _HfExamplesInfo("THUDM/GLM-4.1V-9B-Thinking"), # noqa: E501 + "Glm4vForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.1V-9B-Thinking"), # noqa: E501 "Glm4v_moeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V-Air", is_available_online=False), # noqa: E501 "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m", diff --git a/tests/tokenization/test_cached_tokenizer.py b/tests/tokenization/test_cached_tokenizer.py index e218678c4363b..07217611ea4d2 100644 --- a/tests/tokenization/test_cached_tokenizer.py +++ b/tests/tokenization/test_cached_tokenizer.py @@ -10,7 +10,7 @@ from vllm.transformers_utils.tokenizer import (AnyTokenizer, 
get_cached_tokenizer) -@pytest.mark.parametrize("model_id", ["gpt2", "THUDM/chatglm3-6b"]) +@pytest.mark.parametrize("model_id", ["gpt2", "zai-org/chatglm3-6b"]) def test_cached_tokenizer(model_id: str): reference_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 129f0942f14ef..5470ff3e8b612 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from -# https://github.com/THUDM/ChatGLM2-6B +# https://github.com/zai-org/ChatGLM2-6B """Inference-only ChatGLM model compatible with THUDM weights.""" import json from collections.abc import Iterable @@ -86,10 +86,10 @@ class GLMAttention(nn.Module): prefix=f"{prefix}.dense", ) - # https://huggingface.co/THUDM/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141 + # https://huggingface.co/zai-org/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141 rope_ratio = getattr(config, "rope_ratio", 1.0) max_positions = getattr(config, "seq_length", 8192) - # NOTE: THUDM/cogagent-9b-20241220 uses original_rope=False, + # NOTE: zai-org/cogagent-9b-20241220 uses original_rope=False, # which is equivalent to is_neox_style=True is_neox_style = not config.original_rope self.rotary_emb = get_rope( diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 537aeabf72d5a..1751fccd08b06 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from -# https://github.com/THUDM/CogAgent +# https://github.com/zai-org/CogAgent """Inference-only CogAgent model compatible with THUDM weights.""" from argparse import Namespace from 
collections.abc import Mapping, Sequence diff --git a/vllm/test_utils.py b/vllm/test_utils.py index 1e61ca6b3deaf..23679b8228d6f 100644 --- a/vllm/test_utils.py +++ b/vllm/test_utils.py @@ -118,7 +118,7 @@ MODELS_ON_S3 = [ "stabilityai/stablelm-zephyr-3b", "state-spaces/mamba-130m-hf", "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", - "THUDM/glm-4v-9b", + "zai-org/glm-4v-9b", "TIGER-Lab/Mantis-8B-siglip-llama3", "TIGER-Lab/VLM2Vec-Full", "tiiuae/falcon-40b", diff --git a/vllm/transformers_utils/configs/chatglm.py b/vllm/transformers_utils/configs/chatglm.py index 7c5de3e948ed7..176d2b8f63fe4 100644 --- a/vllm/transformers_utils/configs/chatglm.py +++ b/vllm/transformers_utils/configs/chatglm.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from -# https://github.com/THUDM/ChatGLM2-6B +# https://github.com/zai-org/ChatGLM2-6B from transformers import PretrainedConfig diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 6a31a41980695..d2be2ceeeae6d 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -271,7 +271,7 @@ def get_tokenizer( } tokenizer.add_special_tokens(special_tokens_map) - # NOTE: We can remove this after https://github.com/THUDM/ChatGLM3/issues/1324 + # NOTE: We can remove this after https://github.com/zai-org/ChatGLM3/issues/1324 if type(tokenizer).__name__ in ("ChatGLMTokenizer", "ChatGLM4Tokenizer"): assert isinstance(tokenizer, PreTrainedTokenizer) From c1b4eb048a286ea5e7bcca730ae5676625f06541 Mon Sep 17 00:00:00 2001 From: Weixiao Huang Date: Mon, 4 Aug 2025 15:43:06 +0800 Subject: [PATCH 190/224] [feat] move WEIGHT_SCALE_SUPPORTED into raise block to accelerate RLHF weight loading (#21164) Signed-off-by: huangweixiao --- vllm/model_executor/layers/fused_moe/layer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py 
b/vllm/model_executor/layers/fused_moe/layer.py index c2039adad99c3..9e7296feeae1e 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1079,9 +1079,6 @@ class FusedMoE(torch.nn.Module): raise ValueError(f"shard_id must be ['w1','w2','w3'] but " f"got {shard_id}.") - WEIGHT_SCALE_SUPPORTED = [ - e.value for e in FusedMoeWeightScaleSupported - ] # Fetch the dim to shard the parameter/loaded weight # based on the shard id. This will be whatever # dimension intermediate_size_per_partition is used. @@ -1230,6 +1227,9 @@ class FusedMoE(torch.nn.Module): loaded_weight=loaded_weight, expert_id=expert_id) else: + WEIGHT_SCALE_SUPPORTED = [ + e.value for e in FusedMoeWeightScaleSupported + ] raise ValueError( f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}") return True if return_success else None From fed5849d3fd7a5e7454cf87f101a18c2bad0436f Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Mon, 4 Aug 2025 16:27:02 +0800 Subject: [PATCH 191/224] [Bugfix] Fix failing GGUF models test (#22174) Signed-off-by: Isotr0py --- vllm/transformers_utils/config.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 0e633c2c0b6ae..cc41a771d06c2 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -290,20 +290,29 @@ def _maybe_remap_hf_config_attrs(config: PretrainedConfig) -> PretrainedConfig: def maybe_override_with_speculators_target_model( - model: str, - tokenizer: str, - trust_remote_code: bool, - revision: Optional[str] = None) -> tuple[str, str]: + model: str, + tokenizer: str, + trust_remote_code: bool, + revision: Optional[str] = None, + **kwargs, +) -> tuple[str, str]: """ If running a speculators config, override running model with target model """ + is_gguf = check_gguf_file(model) + if is_gguf: + kwargs["gguf_file"] = Path(model).name + gguf_model_repo = 
Path(model).parent + else: + gguf_model_repo = None config_dict, _ = PretrainedConfig.get_config_dict( - model, + model if gguf_model_repo is None else gguf_model_repo, revision=revision, trust_remote_code=trust_remote_code, token=_get_hf_token(), + **kwargs, ) - spec_config = config_dict.get("speculators_config") + spec_config = config_dict.get("speculators_config", None) # Return the target model if spec_config is not None: model = tokenizer = spec_config["verifier"]["name_or_path"] From 54de71d0dfbb6340fdbc620f4ebeb4236d165a37 Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Mon, 4 Aug 2025 03:04:12 -0700 Subject: [PATCH 192/224] [Sampler] Support returning all logprobs or logits (#21792) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- tests/v1/sample/test_logprobs.py | 27 +++++++++++++++++++++++++++ vllm/config.py | 7 ++++--- vllm/sampling_params.py | 6 ++++-- vllm/v1/engine/logprobs.py | 5 +++-- vllm/v1/engine/processor.py | 5 ++++- vllm/v1/worker/gpu_input_batch.py | 4 +++- 6 files changed, 45 insertions(+), 9 deletions(-) diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 680e2ce98bb27..8bd142e87b06e 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -429,6 +429,33 @@ def test_zero_logprobs(vllm_model, example_prompts, assert len(prompt_token_ids) == len(prompt_logprobs) +def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch): + """Engine should return all vocabulary logprobs + + Args: + example_prompts: list of example prompts (test fixture) + """ + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + runner = VllmRunner( + "facebook/opt-125m", + max_logprobs=-1, + enable_prefix_caching=False, + # 2 other llms alive during whole session + gpu_memory_utilization=0.15, + max_model_len=256) + sampling_params_logprobs_all = SamplingParams(max_tokens=5, + logprobs=-1) + results_logprobs_all = 
runner.llm.generate( + example_prompts, sampling_params=sampling_params_logprobs_all) + vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size() + for i in range(len(results_logprobs_all)): + logprobs = results_logprobs_all[i].outputs[0].logprobs + assert logprobs is not None + for logprob in logprobs: + assert len(logprob) == vocab_size + + @pytest.mark.parametrize( "logprobs_mode", ["raw_logprobs", "raw_logits", "processed_logprobs", "processed_logits"]) diff --git a/vllm/config.py b/vllm/config.py index 871df455ef58f..5c300e327397b 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -377,7 +377,8 @@ class ModelConfig: max_logprobs: int = 20 """Maximum number of log probabilities to return when `logprobs` is specified in `SamplingParams`. The default value comes the default for the - OpenAI Chat Completions API.""" + OpenAI Chat Completions API. -1 means no cap, i.e. all (output_length * + vocab_size) logprobs are allowed to be returned and it may cause OOM.""" logprobs_mode: LogprobsMode = "raw_logprobs" """Indicates the content returned in the logprobs and prompt_logprobs. Supported mode: @@ -1585,7 +1586,7 @@ class ModelConfig: """ This method attempts to retrieve the non-default values of the generation config for this model. - + The generation config can contain information about special tokens, as well as sampling parameters. Which is why this method exists separately to `get_diff_sampling_param`. @@ -2066,7 +2067,7 @@ class ParallelConfig: and when data_parallel_size > 0. Enables running an AsyncLLM and API server on a "per-node" basis where vLLM load balances between local data parallel ranks, but an external LB balances - between vLLM nodes/replicas. Set explicitly in conjunction with + between vLLM nodes/replicas. 
Set explicitly in conjunction with --data-parallel-start-rank.""" enable_expert_parallel: bool = False """Use expert parallelism instead of tensor parallelism for MoE layers.""" diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 322e53b753948..52e4cbd096153 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -156,6 +156,7 @@ class SamplingParams( Note that the implementation follows the OpenAI API: The API will always return the log probability of the sampled token, so there may be up to `logprobs+1` elements in the response. + When set to -1, return all `vocab_size` log probabilities. prompt_logprobs: Number of log probabilities to return per prompt token. detokenize: Whether to detokenize the output. Defaults to True. skip_special_tokens: Whether to skip special tokens in the output. @@ -414,9 +415,10 @@ class SamplingParams( raise ValueError( f"min_tokens must be less than or equal to " f"max_tokens={self.max_tokens}, got {self.min_tokens}.") - if self.logprobs is not None and self.logprobs < 0: + if (self.logprobs is not None and self.logprobs != -1 + and self.logprobs < 0): raise ValueError( - f"logprobs must be non-negative, got {self.logprobs}.") + f"logprobs must be non-negative or -1, got {self.logprobs}.") if self.prompt_logprobs is not None and self.prompt_logprobs < 0: raise ValueError(f"prompt_logprobs must be non-negative, got " f"{self.prompt_logprobs}.") diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py index e95da0a5e5aaf..3de7fa6889e55 100644 --- a/vllm/v1/engine/logprobs.py +++ b/vllm/v1/engine/logprobs.py @@ -138,7 +138,7 @@ class LogprobsProcessor: def pop_prompt_logprobs(self) -> Optional[PromptLogprobs]: """Pop and return all request prompt logprobs - + The logprobs processor aggregates prompt chunk logprobs over one or more prefill chunks. This method returns all prompt logprobs at once and then forgets them. 
@@ -176,7 +176,8 @@ class LogprobsProcessor: Returns: dict[token id, Logprob] """ - + if num_logprobs == -1: + num_logprobs = len(logprobs) # We do not need a special case for the sampled token # being in the topk, since inserting duplicated data # into a dictionary twice is the same as doing it once. diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 224acc47feb27..692a7dd5640e0 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -65,8 +65,11 @@ class Processor: params: SamplingParams, ) -> None: max_logprobs = self.model_config.max_logprobs + if max_logprobs == -1: + return # Validate sample logprobs. - if params.logprobs and params.logprobs > max_logprobs: + if params.logprobs and (params.logprobs == -1 + or params.logprobs > max_logprobs): raise ValueError( f"Requested sample logprobs of {params.logprobs}, " f"which is greater than max allowed: {max_logprobs}") diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index c63041600f388..d9d0b4bec871a 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -337,7 +337,9 @@ class InputBatch: self.generators[req_index] = request.generator if sampling_params.logprobs is not None: - self.num_logprobs[req_id] = sampling_params.logprobs + self.num_logprobs[req_id] = (self.vocab_size + if sampling_params.logprobs == -1 + else sampling_params.logprobs) if sampling_params.prompt_logprobs is not None: self.num_prompt_logprobs[ req_id] = sampling_params.prompt_logprobs From 1539ced93a1ac3a78bef57d362cb9707c52f2a29 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 4 Aug 2025 18:37:06 +0800 Subject: [PATCH 193/224] [Doc] Update pooling model docs (#22186) Signed-off-by: DarkLight1337 --- docs/models/pooling_models.md | 2 +- docs/models/supported_models.md | 69 ++++++++++++++++++++------------- 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/docs/models/pooling_models.md 
b/docs/models/pooling_models.md index 1fbbba7ace5e1..c6588363b63fb 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -120,7 +120,7 @@ A code example can be found here: th { white-space: nowrap; @@ -419,7 +421,9 @@ See [this page](./pooling_models.md) for more information on how to use pooling Since some model architectures support both generative and pooling tasks, you should explicitly specify `--runner pooling` to ensure that the model is used in pooling mode instead of generative mode. -#### Text Embedding +#### Embedding + +These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) API. | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| @@ -457,28 +461,10 @@ If your model is not in the above list, we will try to automatically convert the [as_embedding_model][vllm.model_executor.models.adapters.as_embedding_model]. By default, the embeddings of the whole prompt are extracted from the normalized hidden state corresponding to the last token. -#### Reward Modeling - -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | -|--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `LlamaForCausalLM`C | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `*Model`C, `*ForCausalLM`C, etc. 
| Generative models | N/A | \* | \* | \* | - -C Automatically converted into a reward model via `--convert reward`. ([details](./pooling_models.md#model-conversion)) -\* Feature support is the same as that of the original model. - -If your model is not in the above list, we will try to automatically convert the model using -[as_reward_model][vllm.model_executor.models.adapters.as_reward_model]. By default, we return the hidden states of each token directly. - -!!! important - For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, - e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. - #### Classification +These models primarily support the [`LLM.classify`](./pooling_models.md#llmclassify) API. + | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | | @@ -491,7 +477,10 @@ If your model is not in the above list, we will try to automatically convert the If your model is not in the above list, we will try to automatically convert the model using [as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. -#### Sentence Pair Scoring +#### Cross-encoder / Reranker + +Cross-encoder and reranker models are a subset of classification models that accept two prompts as input. +These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API. 
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| @@ -501,6 +490,7 @@ If your model is not in the above list, we will try to automatically convert the | `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | | `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | | | `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | | +| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | C Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion)) \* Feature support is the same as that of the original model. @@ -526,6 +516,28 @@ If your model is not in the above list, we will try to automatically convert the vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' ``` +#### Reward Modeling + +These models primarily support the [`LLM.reward`](./pooling_models.md#llmreward) API. + +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | +|--------------|--------|-------------------|----------------------|---------------------------|---------------------| +| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `LlamaForCausalLM`C | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. 
| ✅︎ | ✅︎ | ✅︎ | +| `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | + +C Automatically converted into a reward model via `--convert reward`. ([details](./pooling_models.md#model-conversion)) +\* Feature support is the same as that of the original model. + +If your model is not in the above list, we will try to automatically convert the model using +[as_reward_model][vllm.model_executor.models.adapters.as_reward_model]. By default, we return the hidden states of each token directly. + +!!! important + For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, + e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. + [](){ #supported-mm-models } ## List of Multimodal Language Models @@ -579,6 +591,8 @@ See [this page](generative_models.md) for more information on how to use generat #### Text Generation +These models primarily accept the [`LLM.generate`](./generative_models.md#llmgenerate) API. Chat/Instruct models additionally support the [`LLM.chat`](./generative_models.md#llmchat) API. + | Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | |--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------| | `AriaForConditionalGeneration` | Aria | T + I+ | `rhymes-ai/Aria` | | | ✅︎ | @@ -720,11 +734,9 @@ Speech2Text models trained specifically for Automatic Speech Recognition. See [this page](./pooling_models.md) for more information on how to use pooling models. -!!! 
important - Since some model architectures support both generative and pooling tasks, - you should explicitly specify `--runner pooling` to ensure that the model is used in pooling mode instead of generative mode. +#### Embedding -#### Text Embedding +These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) API. !!! note To get the best results, you should use pooling models that are specifically trained as such. @@ -742,7 +754,10 @@ The following table lists those that are tested in vLLM. --- -#### Scoring +#### Cross-encoder / Reranker + +Cross-encoder and reranker models are a subset of classification models that accept two prompts as input. +These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API. | Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | |-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|-----------------------| From a5fff3bd49a5ea888cf0dbdfe7ecf140455fa8d4 Mon Sep 17 00:00:00 2001 From: Raghav Ravishankar <113712354+alyosha-swamy@users.noreply.github.com> Date: Mon, 4 Aug 2025 16:39:56 +0530 Subject: [PATCH 194/224] Fix Arcee model weight loading: Add custom load_weights (#21725) Signed-off-by: alyosha-swamy --- tests/models/registry.py | 3 +- vllm/model_executor/models/arcee.py | 83 +++++++++++++++++++++++++++-- 2 files changed, 80 insertions(+), 6 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index ffa6b755adf43..d86bd20fb0e34 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -139,8 +139,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B", trust_remote_code=True), - "ArceeForCausalLM": _HfExamplesInfo("arcee-ai/AFM-4.5B-Base", - is_available_online=False), + "ArceeForCausalLM": 
_HfExamplesInfo("arcee-ai/AFM-4.5B-Base"), "ArcticForCausalLM": _HfExamplesInfo("Snowflake/snowflake-arctic-instruct", trust_remote_code=True), "BaiChuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan-7B", diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py index 4e3ba107ba7e0..4cf73e2e0ea56 100644 --- a/vllm/model_executor/models/arcee.py +++ b/vllm/model_executor/models/arcee.py @@ -24,10 +24,12 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP -from .utils import (AutoWeightsLoader, PPMissingLayer, +from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers) @@ -260,6 +262,81 @@ class ArceeModel(nn.Module): return hidden_states, aux_hidden_states return hidden_states + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + """Load weights, mapping q/k/v projections to fused qkv_proj.""" + stacked_params_mapping = [ + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + ] + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + continue + + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = 
(loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + + if "scale" in name: + remapped_name = maybe_remap_kv_scale_name(name, params_dict) + if remapped_name is None: + continue + name = remapped_name + + mapped = False + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + + name = name.replace(weight_name, param_name) + + if name.endswith(".bias") and name not in params_dict: + mapped = True + break + + if is_pp_missing_parameter(name, self): + mapped = True + break + + param = params_dict[name] + weight_loader = param.weight_loader # type: ignore[attr-defined] + weight_loader(param, loaded_weight, shard_id) + loaded_params.add(name) + mapped = True + break + + if mapped: + continue + + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + + return loaded_params + class ArceeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): """Arcee Model for causal language modeling, integrated with vLLM @@ -304,8 +381,7 @@ class ArceeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): else: # Placeholder for lm_head on non-last ranks self.lm_head = PPMissingLayer() - # Provide a reference to the model's method for generating empty - # tensors (used in pipeline parallel schedule) + self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -316,7 +392,6 @@ class ArceeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None ) -> Union[torch.Tensor, IntermediateTensors]: - # Forward pass through the Arcee model backbone model_output = self.model(input_ids=input_ids, 
positions=positions, intermediate_tensors=intermediate_tensors, From 9af654cc38c74cd51b00c609eaa290e495f225e1 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 4 Aug 2025 05:12:48 -0700 Subject: [PATCH 195/224] [Responses API] Ignore `store=True` and process the request by default (#22185) Signed-off-by: Woosuk Kwon --- vllm/entrypoints/openai/serving_responses.py | 31 ++++++++++++++++++-- vllm/envs.py | 3 +- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 5e9401cbd7473..e009529fbd2ad 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -90,8 +90,17 @@ class OpenAIServingResponses(OpenAIServing): logger.info("Using default chat sampling params from %s: %s", source, self.default_sampling_params) - # False by default. + # If False (default), the "store" option is (silently) ignored and the + # response is not stored. If True, the response is stored in memory. + # NOTE(woosuk): This may not be intuitive for users, as the default + # behavior in OpenAI's Responses API is to store the response, but + # vLLM's default behavior is not. self.enable_store = envs.VLLM_ENABLE_RESPONSES_API_STORE + if self.enable_store: + logger.warning_once( + "`VLLM_ENABLE_RESPONSES_API_STORE` is enabled. This may " + "cause a memory leak since we never remove responses from " + "the store.") # HACK(woosuk): This is a hack. We should use a better store. # FIXME: If enable_store=True, this may cause a memory leak since we # never remove responses from the store. @@ -121,9 +130,25 @@ class OpenAIServingResponses(OpenAIServing): if self.engine_client.errored: raise self.engine_client.dead_error - # If store is not enabled, return an error. 
if request.store and not self.enable_store: - return self._make_store_not_supported_error() + if request.background: + return self.create_error_response( + err_type="invalid_request_error", + message=( + "This vLLM engine does not support `store=True` and " + "therefore does not support the background mode. To " + "enable these features, set the environment variable " + "`VLLM_ENABLE_RESPONSES_API_STORE=1` when launching " + "the vLLM server."), + status_code=HTTPStatus.BAD_REQUEST, + ) + # Disable the store option. + # NOTE(woosuk): Although returning an error is possible, we opted + # to implicitly disable store and process the request anyway, as + # we assume most users do not intend to actually store the response + # (i.e., their request's `store=True` just because it's the default + # value). + request.store = False # Handle the previous response ID. prev_response_id = request.previous_response_id diff --git a/vllm/envs.py b/vllm/envs.py index 8d3c7eab471cf..78f955f78a987 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1060,7 +1060,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # Enables support for the "store" option in the OpenAI Responses API. # When set to 1, vLLM's OpenAI server will retain the input and output - # messages for those requests in memory. By default, this is disabled (0). + # messages for those requests in memory. By default, this is disabled (0), + # and the "store" option is ignored. # NOTE/WARNING: # 1. Messages are kept in memory only (not persisted to disk) and will be # lost when the vLLM server shuts down. From 309c1bb822c94436e8beff60d68404b4cecd62b8 Mon Sep 17 00:00:00 2001 From: ericehanley Date: Mon, 4 Aug 2025 10:12:06 -0500 Subject: [PATCH 196/224] [Bug] Update auto_tune.sh to separate benchmarking and profiling. 
(#21629) Signed-off-by: Eric Hanley --- benchmarks/auto_tune/auto_tune.sh | 123 +++++++++++++++++++----------- 1 file changed, 80 insertions(+), 43 deletions(-) diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index df26376504b95..82c20ffa6554c 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -49,6 +49,7 @@ best_throughput=0 best_max_num_seqs=0 best_num_batched_tokens=0 best_goodput=0 +best_request_rate=0 start_server() { local gpu_memory_utilization=$1 @@ -57,18 +58,35 @@ start_server() { local vllm_log=$4 local profile_dir=$5 - pkill -f vllm + pkill -if vllm - VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \ - --port 8004 \ - --gpu-memory-utilization $gpu_memory_utilization \ - --max-num-seqs $max_num_seqs \ - --max-num-batched-tokens $max_num_batched_tokens \ - --tensor-parallel-size $TP \ - --enable-prefix-caching \ - --load-format dummy \ - --download-dir "$DOWNLOAD_DIR" \ - --max-model-len $MAX_MODEL_LEN > "$vllm_log" 2>&1 & + # Define the common arguments as a bash array. + # Each argument and its value are separate elements. + local common_args_array=( + "$MODEL" + "--disable-log-requests" + "--port" "8004" + "--gpu-memory-utilization" "$gpu_memory_utilization" + "--max-num-seqs" "$max_num_seqs" + "--max-num-batched-tokens" "$max_num_batched_tokens" + "--tensor-parallel-size" "$TP" + "--enable-prefix-caching" + "--load-format" "dummy" + "--download-dir" "$DOWNLOAD_DIR" + "--max-model-len" "$MAX_MODEL_LEN" + ) + + # Use the array expansion "${common_args_array[@]}" + # This correctly passes each element as a separate argument. 
+ if [[ -n "$profile_dir" ]]; then + # Start server with profiling enabled + VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \ + vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & + else + # Start server without profiling + VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \ + vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & + fi # wait for 10 minutes... server_started=0 @@ -82,6 +100,7 @@ start_server() { sleep 10 fi done + if (( ! server_started )); then echo "server did not start within 10 minutes. Please check server log at $vllm_log". return 1 @@ -90,37 +109,20 @@ start_server() { fi } -update_best_profile() { - local profile_dir=$1 - local profile_index=$2 - sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort)) - selected_profile_file= - if [[ "$SYSTEM" == "TPU" ]]; then - selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb" - fi - if [[ "$SYSTEM" == "GPU" ]]; then - selected_profile_file="${sorted_paths[$profile_index]}" - fi - rm -f $PROFILE_PATH/* - cp $selected_profile_file $PROFILE_PATH -} - run_benchmark() { local max_num_seqs=$1 local max_num_batched_tokens=$2 local gpu_memory_utilization=$3 echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt" - local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}" echo "vllm_log: $vllm_log" echo rm -f $vllm_log - mkdir -p $profile_dir - pkill -f vllm - local profile_index=0 + pkill -if vllm echo "starting server..." - start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir + # Call start_server without a profile_dir to avoid profiling overhead + start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log "" result=$? if [[ "$result" -eq 1 ]]; then echo "server failed to start. 
gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" @@ -134,7 +136,8 @@ run_benchmark() { # get a basic qps by using request-rate inf bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) -adjusted_input_len=$(( INPUT_LEN - prefix_len )) + adjusted_input_len=$(( INPUT_LEN - prefix_len )) + # --profile flag is removed from this call vllm bench serve \ --backend vllm \ --model $MODEL \ @@ -148,8 +151,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len )) --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --num-prompts 1000 \ --random-prefix-len $prefix_len \ - --port 8004 \ - --profile &> "$bm_log" + --port 8004 &> "$bm_log" throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') @@ -163,7 +165,6 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len )) # start from request-rate as int(throughput) + 1 request_rate=$((${throughput%.*} + 1)) while ((request_rate > 0)); do - profile_index=$((profile_index+1)) # clear prefix cache curl -X POST http://0.0.0.0:8004/reset_prefix_cache sleep 5 @@ -201,12 +202,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len )) best_max_num_seqs=$max_num_seqs best_num_batched_tokens=$max_num_batched_tokens best_goodput=$goodput - if [[ "$SYSTEM" == "TPU" ]]; then - update_best_profile "$profile_dir/plugins/profile" $profile_index - fi - if [[ "$SYSTEM" == "GPU" ]]; then - update_best_profile "$profile_dir" $profile_index - fi + best_request_rate=$request_rate fi else echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" @@ -215,7 +211,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len )) echo "best_max_num_seqs: $best_max_num_seqs, 
best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" - pkill vllm + pkill -if vllm sleep 10 printf '=%.0s' $(seq 1 20) return 0 @@ -228,7 +224,8 @@ read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST" gpu_memory_utilization=0.98 find_gpu_memory_utilization=0 while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do - start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" + # Pass empty string for profile_dir argument + start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" "" result=$? if [[ "$result" -eq 0 ]]; then find_gpu_memory_utilization=1 @@ -251,5 +248,45 @@ for num_seqs in "${num_seqs_list[@]}"; do done done echo "finish permutations" + +# ================================================================================= +# FINAL PROFILING RUN FOR THE BEST CONFIGURATION +# ================================================================================= +if (( $(echo "$best_throughput > 0" | bc -l) )); then + echo + echo "Benchmark tuning finished. Now running profiling on the best configuration found..." + echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput" + echo + + vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt" + bm_log="$LOG_FOLDER/bm_log_BEST_PROFILE.txt" + + # Start server with the best params and profiling ENABLED + echo "Starting server for profiling..." + start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH" + + # Run benchmark with the best params and the --profile flag + echo "Running benchmark with profiling..." 
+ prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) + adjusted_input_len=$(( INPUT_LEN - prefix_len )) + vllm bench serve \ + --backend vllm \ + --model $MODEL \ + --dataset-name random \ + --random-input-len $adjusted_input_len \ + --random-output-len $OUTPUT_LEN \ + --ignore-eos \ + --disable-tqdm \ + --request-rate $best_request_rate \ + --percentile-metrics ttft,tpot,itl,e2el \ + --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ + --num-prompts 100 \ + --random-prefix-len $prefix_len \ + --port 8004 \ + --profile &> "$bm_log" +else + echo "No configuration met the latency requirements. Skipping final profiling run." +fi +pkill -if vllm echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT" From c09efff9767ad26ecf99a6e6c13243612c278df3 Mon Sep 17 00:00:00 2001 From: Zhonghua Deng Date: Tue, 5 Aug 2025 04:17:05 +0800 Subject: [PATCH 197/224] [Bugfix][V1][P/D]Fix the uneven polling issue in the toy proxy for P2pNcclConnector (#21819) Signed-off-by: Abatom --- .../disagg_proxy_p2p_nccl_xpyd.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py index 73da7af85f1d9..0c7d32d7862e3 100644 --- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py +++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py @@ -46,7 +46,7 @@ def _listen_for_register(poller, router_socket): global prefill_instances global prefill_cv with prefill_cv: - node = prefill_instances.pop(data["http_address"], None) + node = 
prefill_instances.get(data["http_address"], None) prefill_instances[data["http_address"]] = ( data["zmq_address"], time.time() + DEFAULT_PING_SECONDS, @@ -57,7 +57,7 @@ def _listen_for_register(poller, router_socket): global decode_instances global decode_cv with decode_cv: - node = decode_instances.pop(data["http_address"], None) + node = decode_instances.get(data["http_address"], None) decode_instances[data["http_address"]] = ( data["zmq_address"], time.time() + DEFAULT_PING_SECONDS, @@ -69,6 +69,7 @@ def _listen_for_register(poller, router_socket): remote_address, data, ) + return if node is None: print(f"🔵Add [HTTP:{data['http_address']}, ZMQ:{data['zmq_address']}]") From bdcb42e45db5cbbc02b0f69ac304c87d7a8cb6b6 Mon Sep 17 00:00:00 2001 From: "Po-Han Huang (NVIDIA)" <53919306+nvpohanh@users.noreply.github.com> Date: Tue, 5 Aug 2025 09:02:55 +0800 Subject: [PATCH 198/224] [NVIDIA] Auto detect modelopt quant and fix DSR1-FP4 weight loading (#22073) --- vllm/config.py | 15 ++++++ vllm/model_executor/layers/fused_moe/layer.py | 53 +++++++++++++------ vllm/transformers_utils/config.py | 14 +++++ 3 files changed, 67 insertions(+), 15 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 5c300e327397b..dd59526471782 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1108,6 +1108,21 @@ class ModelConfig: if quant_cfg is None: # compressed-tensors uses a "compression_config" key quant_cfg = getattr(self.hf_config, "compression_config", None) + + else: + # Set quant_method for ModelOpt models. 
+ producer_name = quant_cfg.get("producer", {}).get("name") + if producer_name == "modelopt": + quant_algo = quant_cfg.get("quantization", + {}).get("quant_algo") + if quant_algo == "FP8": + quant_cfg["quant_method"] = "modelopt" + elif quant_algo == "NVFP4": + quant_cfg["quant_method"] = "modelopt_fp4" + elif quant_algo is not None: + raise ValueError( + f"Unknown ModelOpt quant algo: {quant_algo}") + return quant_cfg def _verify_quantization(self) -> None: diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 9e7296feeae1e..f155a1b11fbff 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -919,9 +919,13 @@ class FusedMoE(torch.nn.Module): elif shard_id == "w2": param_data[expert_id] = loaded_weight - def _load_w13_weight_scale(self, shard_dim: int, - loaded_weight: torch.Tensor, - param: torch.Tensor, tp_rank: int): + def _load_combined_w13_weight_scale(self, shard_dim: int, + loaded_weight: torch.Tensor, + param: torch.Tensor, tp_rank: int): + """ + Load w13 weight scales assuming that w1 weight scales and w3 weight + scales are stored in the same loaded_weight tensor. + """ shard_size = param.shape[shard_dim] loaded_weight = loaded_weight.narrow(shard_dim, shard_size * tp_rank, shard_size) @@ -1168,24 +1172,43 @@ class FusedMoE(torch.nn.Module): uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern( ) - # For per-tensor, FP4 uses "weight_scale_2", FP8 uses "weight_scale" - per_tensor_conditions = ( - "weight_scale_2" in weight_name if uses_weight_scale_2 else - "weight_scale" in weight_name) or "input_scale" in weight_name - - if "w13_weight_scale" in weight_name: - self._load_w13_weight_scale(shard_dim=shard_dim, - loaded_weight=loaded_weight, - param=param, - tp_rank=self.tp_rank) - elif per_tensor_conditions: + # Call _load_per_tensor_weight_scale() to load per-tensor (scalar) + # weights scales. 
+ # Input scales are always per-tensor. + # Weight scales: FP4 uses "weight_scale_2" and FP8 uses + # "weight_scale" for per-tensor scales. + is_per_tensor = ("weight_scale_2" in weight_name + if uses_weight_scale_2 else "weight_scale" + in weight_name) or "input_scale" in weight_name + if is_per_tensor: self._load_per_tensor_weight_scale( shard_id=shard_id, param=param, loaded_weight=loaded_weight, expert_id=expert_id, ) - elif "weight" in weight_name: + return True if return_success else None + + # If the weight is w13_weight_scale and w13_weight_scales are + # combined into single loaded_weight, call + # _load_combined_w13_weight_scale() to load it. + # This is checked by comparing the hidden_out dims of the + # loaded_weight and the param. + if "w13_weight_scale" in weight_name: + loaded_weight_hidden_out = loaded_weight.shape[-2] + param_hidden_out = param.data.shape[-2] * self.tp_size + if loaded_weight_hidden_out == param_hidden_out: + self._load_combined_w13_weight_scale( + shard_dim=shard_dim, + loaded_weight=loaded_weight, + param=param, + tp_rank=self.tp_rank, + ) + return True if return_success else None + + # For other weights, call _load_model_weight_or_group_weight_scale() + # to load it. + if "weight" in weight_name: self._load_model_weight_or_group_weight_scale( shard_id=shard_id, shard_dim=shard_dim, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index cc41a771d06c2..8fe153464d360 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -449,6 +449,20 @@ def get_config( model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type] config.update({"architectures": [model_type]}) + # ModelOpt 0.31.0 and after saves the quantization config in the model + # config file. 
+ quantization_config = config_dict.get("quantization_config", None) + + # ModelOpt 0.29.0 and before saves the quantization config in a separate + # "hf_quant_config.json" in the same directory as the model config file. + if quantization_config is None \ + and file_or_path_exists(model, "hf_quant_config.json", revision): + quantization_config = get_hf_file_to_dict("hf_quant_config.json", + model, revision) + + if quantization_config is not None: + config.quantization_config = quantization_config + if hf_overrides_kw: logger.debug("Overriding HF config with %s", hf_overrides_kw) config.update(hf_overrides_kw) From 2dffac464c82ac7c509c78f7d12a7c72ea765a63 Mon Sep 17 00:00:00 2001 From: PiteXChen <44110731+CLFutureX@users.noreply.github.com> Date: Tue, 5 Aug 2025 09:34:10 +0800 Subject: [PATCH 199/224] [Bugfix] V1 Fix the cursor leakage issue during request scheduling. (#21173) Signed-off-by: CLFutureX <775523362@qq.com> --- tests/v1/core/test_scheduler.py | 97 ++++++++++++++++++++++++++++++++- vllm/v1/core/sched/scheduler.py | 6 +- 2 files changed, 100 insertions(+), 3 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index c719d1975bba2..3f82261a59a76 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1307,13 +1307,18 @@ def create_requests_with_priority( mm_positions: Optional[list[PlaceholderRange]] = None, max_tokens: int = 16, stop_token_ids: Optional[list[int]] = None, - prompt_logprobs: Optional[int] = None): + prompt_logprobs: Optional[int] = None, + request_ids: Optional[list[str]] = None): """Create requests with specified priorities and arrival times.""" assert len(priorities) == num_requests if arrival_times is not None: assert len(arrival_times) == num_requests else: arrival_times = [float(i) for i in range(num_requests)] + if request_ids is not None: + assert len(request_ids) == num_requests + else: + request_ids = [f"{i}" for i in range(num_requests)] sampling_params = 
SamplingParams(ignore_eos=False, max_tokens=max_tokens, @@ -1328,7 +1333,7 @@ def create_requests_with_priority( mm_position = None mm_inputs = None request = Request( - request_id=f"{i}", + request_id=request_ids[i], prompt_token_ids=[i] * num_tokens, sampling_params=sampling_params, pooling_params=None, @@ -1829,3 +1834,91 @@ def test_schedule_skip_tokenizer_init_structured_output_request(): assert len(output.scheduled_new_reqs) == 0 assert len(scheduler.running) == 0 assert len(scheduler.waiting) == 1 + + +def test_priority_scheduling_preemption_victim_iterator_order(): + """Test that the scheduling order is maintained after + preempting lower-priority requests.""" + scheduler = create_scheduler_with_priority( + max_num_batched_tokens=200, + num_blocks=9, + ) + # Add three priority requests first. + priority_requests = create_requests_with_priority( + num_requests=3, + priorities=[3, 4, 5], + arrival_times=[1.0, 2.0, 3.0], + num_tokens=15, + request_ids=["1", "2", "3"], + ) + + for request in priority_requests: + scheduler.add_request(request) + # After scheduling, transfer from the waiting queue to the running queue. + # At this time, 3 blocks have been allocated, and 5 available blocks remain. + output = scheduler.schedule() + + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in priority_requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(priority_requests) + }, + sampled_token_ids=[[15] for _ in priority_requests], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + ) + scheduler.update_from_output(output, model_output) + + # Add tow high priority requests. 
+ high_priority_requests = create_requests_with_priority( + num_requests=2, + priorities=[1, 2], + arrival_times=[4.0, 5.0], + num_tokens=16, + request_ids=["4", "5"], + ) + for request in high_priority_requests: + scheduler.add_request(request) + + # After scheduling, transfer the two high-priority requests from + # the waiting queue to the running queue. + # the IDs of the requests in the running queue are: 1, 2, 3, 4, 5. + # At this time, 3+2 blocks have been allocated, + # and 3 available blocks remain. + output = scheduler.schedule() + + merge_requests = priority_requests + high_priority_requests + + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in merge_requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(merge_requests) + }, + sampled_token_ids=[[1] for _ in merge_requests], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + ) + scheduler.update_from_output(output, model_output) + + # At this time, the request with the lowest priority + # (request.id = 2) will be preempted, freeing up 2 blocks, + # which exactly meets the resource allocation requirements + # for request.id = 4 and request.id = 5. + output = scheduler.schedule() + + # Should schedule the new request without preemption. 
+ assert len(scheduler.running) == 4 # + assert len(scheduler.waiting) == 1 # + + running_priorities = [req.priority for req in scheduler.running] + running_req_ids = [req.request_id for req in scheduler.running] + + assert running_priorities == [3, 4, 1, 2] + assert running_req_ids == ["1", "2", "4", "5"] + assert scheduler.waiting.peek_request().priority == 5 diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 49a744cfec69a..413a853dfecbc 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -257,7 +257,11 @@ class Scheduler(SchedulerInterface): self.running, key=lambda r: (r.priority, r.arrival_time), ) - self.running.remove(preempted_req) + preempted_index = self.running.index(preempted_req) + if preempted_index <= req_index: + req_index -= 1 + scheduled_running_reqs.remove(preempted_req) + self.running.pop(preempted_index) else: preempted_req = self.running.pop() From 7175817637bde6c668b75cce91c022e3a33b3684 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 4 Aug 2025 18:37:06 -0700 Subject: [PATCH 200/224] Revert "[Bugfix] V1 Fix the cursor leakage issue during request scheduling." 
(#22223) --- tests/v1/core/test_scheduler.py | 97 +-------------------------------- vllm/v1/core/sched/scheduler.py | 6 +- 2 files changed, 3 insertions(+), 100 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 3f82261a59a76..c719d1975bba2 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1307,18 +1307,13 @@ def create_requests_with_priority( mm_positions: Optional[list[PlaceholderRange]] = None, max_tokens: int = 16, stop_token_ids: Optional[list[int]] = None, - prompt_logprobs: Optional[int] = None, - request_ids: Optional[list[str]] = None): + prompt_logprobs: Optional[int] = None): """Create requests with specified priorities and arrival times.""" assert len(priorities) == num_requests if arrival_times is not None: assert len(arrival_times) == num_requests else: arrival_times = [float(i) for i in range(num_requests)] - if request_ids is not None: - assert len(request_ids) == num_requests - else: - request_ids = [f"{i}" for i in range(num_requests)] sampling_params = SamplingParams(ignore_eos=False, max_tokens=max_tokens, @@ -1333,7 +1328,7 @@ def create_requests_with_priority( mm_position = None mm_inputs = None request = Request( - request_id=request_ids[i], + request_id=f"{i}", prompt_token_ids=[i] * num_tokens, sampling_params=sampling_params, pooling_params=None, @@ -1834,91 +1829,3 @@ def test_schedule_skip_tokenizer_init_structured_output_request(): assert len(output.scheduled_new_reqs) == 0 assert len(scheduler.running) == 0 assert len(scheduler.waiting) == 1 - - -def test_priority_scheduling_preemption_victim_iterator_order(): - """Test that the scheduling order is maintained after - preempting lower-priority requests.""" - scheduler = create_scheduler_with_priority( - max_num_batched_tokens=200, - num_blocks=9, - ) - # Add three priority requests first. 
- priority_requests = create_requests_with_priority( - num_requests=3, - priorities=[3, 4, 5], - arrival_times=[1.0, 2.0, 3.0], - num_tokens=15, - request_ids=["1", "2", "3"], - ) - - for request in priority_requests: - scheduler.add_request(request) - # After scheduling, transfer from the waiting queue to the running queue. - # At this time, 3 blocks have been allocated, and 5 available blocks remain. - output = scheduler.schedule() - - model_output = ModelRunnerOutput( - req_ids=[req.request_id for req in priority_requests], - req_id_to_index={ - req.request_id: i - for i, req in enumerate(priority_requests) - }, - sampled_token_ids=[[15] for _ in priority_requests], - spec_token_ids=None, - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[], - ) - scheduler.update_from_output(output, model_output) - - # Add tow high priority requests. - high_priority_requests = create_requests_with_priority( - num_requests=2, - priorities=[1, 2], - arrival_times=[4.0, 5.0], - num_tokens=16, - request_ids=["4", "5"], - ) - for request in high_priority_requests: - scheduler.add_request(request) - - # After scheduling, transfer the two high-priority requests from - # the waiting queue to the running queue. - # the IDs of the requests in the running queue are: 1, 2, 3, 4, 5. - # At this time, 3+2 blocks have been allocated, - # and 3 available blocks remain. 
- output = scheduler.schedule() - - merge_requests = priority_requests + high_priority_requests - - model_output = ModelRunnerOutput( - req_ids=[req.request_id for req in merge_requests], - req_id_to_index={ - req.request_id: i - for i, req in enumerate(merge_requests) - }, - sampled_token_ids=[[1] for _ in merge_requests], - spec_token_ids=None, - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=[], - ) - scheduler.update_from_output(output, model_output) - - # At this time, the request with the lowest priority - # (request.id = 2) will be preempted, freeing up 2 blocks, - # which exactly meets the resource allocation requirements - # for request.id = 4 and request.id = 5. - output = scheduler.schedule() - - # Should schedule the new request without preemption. - assert len(scheduler.running) == 4 # - assert len(scheduler.waiting) == 1 # - - running_priorities = [req.priority for req in scheduler.running] - running_req_ids = [req.request_id for req in scheduler.running] - - assert running_priorities == [3, 4, 1, 2] - assert running_req_ids == ["1", "2", "4", "5"] - assert scheduler.waiting.peek_request().priority == 5 diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 413a853dfecbc..49a744cfec69a 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -257,11 +257,7 @@ class Scheduler(SchedulerInterface): self.running, key=lambda r: (r.priority, r.arrival_time), ) - preempted_index = self.running.index(preempted_req) - if preempted_index <= req_index: - req_index -= 1 - scheduled_running_reqs.remove(preempted_req) - self.running.pop(preempted_index) + self.running.remove(preempted_req) else: preempted_req = self.running.pop() From 5ea71ff46fe503df12f18ad41d40f5c2b18dcfcd Mon Sep 17 00:00:00 2001 From: Giancarlo Delfin <32987265+TheEpicDolphin@users.noreply.github.com> Date: Mon, 4 Aug 2025 19:11:06 -0700 Subject: [PATCH 201/224] =?UTF-8?q?[V1]=20reduce=20block=20size=20for=20tr?= 
=?UTF-8?q?ee=20attention=20correctness=20test=20to=20fix=20'ou=E2=80=A6?= =?UTF-8?q?=20(#22207)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Giancarlo Delfin --- tests/v1/spec_decode/test_tree_attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/spec_decode/test_tree_attention.py b/tests/v1/spec_decode/test_tree_attention.py index 42468daa62a9a..456ce712d36e4 100644 --- a/tests/v1/spec_decode/test_tree_attention.py +++ b/tests/v1/spec_decode/test_tree_attention.py @@ -155,7 +155,7 @@ def test_tree_attn_correctness() -> None: dim_per_head = 128 num_kv_heads = 2 - block_size = 128 + block_size = 32 max_sequence_length = 8192 randomize_blocks = True for batch_size in [1, 16, 32]: From f4f4e7ef273645192fac837718b3fdcf073c597a Mon Sep 17 00:00:00 2001 From: lkchen Date: Mon, 4 Aug 2025 19:11:33 -0700 Subject: [PATCH 202/224] [V0 deprecation][P/D] Deprecate v0 `KVConnectorBase` code (1/2) (#21785) Signed-off-by: Linkun Chen --- .buildkite/test-pipeline.yaml | 1 - tests/kv_transfer/test_disagg.py | 120 ------- .../kv_transfer/kv_connector/base.py | 140 +------- .../kv_transfer/kv_connector/factory.py | 68 +--- .../kv_connector/lmcache_connector.py | 99 ------ .../kv_connector/mooncake_store_connector.py | 203 ----------- .../kv_connector/simple_connector.py | 329 ------------------ .../kv_transfer/kv_connector/utils.py | 9 +- .../kv_connector/v1/multi_connector.py | 8 +- .../kv_transfer/kv_connector_agent.py | 77 ---- .../kv_transfer/kv_transfer_state.py | 9 +- vllm/v1/core/sched/scheduler.py | 2 +- .../worker/kv_connector_model_runner_mixin.py | 6 +- 13 files changed, 31 insertions(+), 1040 deletions(-) delete mode 100644 tests/kv_transfer/test_disagg.py delete mode 100644 vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py delete mode 100644 vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py delete mode 100644 
vllm/distributed/kv_transfer/kv_connector/simple_connector.py delete mode 100644 vllm/distributed/kv_transfer/kv_connector_agent.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 88e1197d703a4..b7a2ca6ca9b24 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -749,7 +749,6 @@ steps: # this test fails consistently. # TODO: investigate and fix - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s models/multimodal/generation/test_maverick.py diff --git a/tests/kv_transfer/test_disagg.py b/tests/kv_transfer/test_disagg.py deleted file mode 100644 index 9f2229cc41dff..0000000000000 --- a/tests/kv_transfer/test_disagg.py +++ /dev/null @@ -1,120 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os -import subprocess -import sys -import time -from subprocess import Popen - -import pytest -import requests -import torch - - -# Fixture to set up environment variables and teardown servers after tests -@pytest.fixture(scope="module", autouse=True) -def setup_servers(): - if torch.cuda.device_count() < 2: - pytest.skip("Skipping test: fewer than 2 GPUs available") - - # Set up environment variables - VLLM_HOST_IP = subprocess.check_output("hostname -I | awk '{print $1}'", - shell=True).decode().strip() - os.environ["VLLM_HOST_IP"] = VLLM_HOST_IP - - # Start prefill instance - prefill_cmd = [ - sys.executable, - "-m", - "vllm.entrypoints.openai.api_server", - "--model", - "meta-llama/Llama-3.2-1B-Instruct", - "--port", - "8100", - "--gpu-memory-utilization", - "0.5", - "--max-model-len", - "1000", - "--kv-transfer-config", - '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer",'\ - '"kv_rank":0,"kv_parallel_size":2}', - ] - prefill_env = os.environ.copy() - 
prefill_env["CUDA_VISIBLE_DEVICES"] = "0" - prefill_proc = Popen(prefill_cmd, env=prefill_env) - - # Start decode instance - decode_cmd = [ - sys.executable, - "-m", - "vllm.entrypoints.openai.api_server", - "--model", - "meta-llama/Llama-3.2-1B-Instruct", - "--port", - "8200", - "--gpu-memory-utilization", - "0.5", - "--max-model-len", - "1000", - "--kv-transfer-config", - '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer",'\ - '"kv_rank":1,"kv_parallel_size":2}', - ] - decode_env = os.environ.copy() - decode_env["CUDA_VISIBLE_DEVICES"] = "1" - decode_proc = Popen(decode_cmd, env=decode_env) - - # Wait for servers to be ready - assert wait_for_server(8100), "Prefill server did not start in time" - assert wait_for_server(8200), "Decode server did not start in time" - - # Yield to the test function and handle teardown after tests - yield - - # Cleanup: kill the processes - prefill_proc.terminate() - decode_proc.terminate() - - # Additional cleanup if needed - prefill_proc.wait() - decode_proc.wait() - - -# Helper function to wait for server -def wait_for_server(port, timeout=240): - start_time = time.time() - while time.time() - start_time < timeout: - try: - response = requests.get(f"http://localhost:{port}/v1/completions") - if response.status_code in [200, 405]: - return True - except requests.ConnectionError: - time.sleep(1) - return False - - -# Test function to send curl requests and validate responses -@pytest.mark.parametrize("prompt", ["San Francisco is a", "Santa Clara is a"]) -def test_disaggregated_prefilling(prompt): - # Send to prefill - response = requests.post("http://localhost:8100/v1/completions", - headers={"Content-Type": "application/json"}, - json={ - "model": "meta-llama/Llama-3.2-1B-Instruct", - "prompt": prompt, - "max_tokens": 1, - "temperature": 0 - }) - assert response.status_code == 200 - - # Send to decode - response = requests.post("http://localhost:8200/v1/completions", - headers={"Content-Type": "application/json"}, - json={ 
- "model": "meta-llama/Llama-3.2-1B-Instruct", - "prompt": prompt, - "max_tokens": 10, - "temperature": 0 - }) - assert response.status_code == 200 diff --git a/vllm/distributed/kv_transfer/kv_connector/base.py b/vllm/distributed/kv_transfer/kv_connector/base.py index 868b227fc8994..011bbb69abb08 100644 --- a/vllm/distributed/kv_transfer/kv_connector/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/base.py @@ -1,142 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -KVConnectorBase Class for Distributed KV Cache & Hidden State communication - -The class provides two primary abstract methods: -1. send_kv_caches_and_hidden_states(): Send KV caches and hidden states -2. recv_kv_caches_and_hidden_states(): Recv KV caches and hidden states -""" - -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Optional, Union - -import torch +"""Defines the base type for KV cache connectors.""" from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 -from vllm.sequence import IntermediateTensors -if TYPE_CHECKING: - from vllm.config import VllmConfig - from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata +KVConnectorBase = KVConnectorBase_V1 +KVConnectorBaseType = KVConnectorBase_V1 - -class KVConnectorBase(ABC): - """ - Abstract base class for a KV connector. - - The class provides two primary abstract methods: - 1. send_kv_caches_and_hidden_states(): Send KV caches and hidden states - 2. recv_kv_caches_and_hidden_states(): Recv KV caches and hidden states - """ - - @abstractmethod - def __init__( - self, - rank: int, - local_rank: int, - config: "VllmConfig", - ): - raise NotImplementedError - - @abstractmethod - def close(self) -> None: - """Close the buffer and release resources. - - This method is responsible for cleaning up resources related to the - connector when it is no longer needed. 
- - Raises: - NotImplementedError: This method must be implemented in subclasses. - """ - raise NotImplementedError - - @abstractmethod - def send_kv_caches_and_hidden_states( - self, - model_executable: torch.nn.Module, - model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: list[torch.Tensor], - hidden_or_intermediate_states: Union[torch.Tensor, - IntermediateTensors], - ) -> None: - """ - Send KV caches and hidden states to the connector. - - This method processes the input tokens, KV caches, and - hidden/intermediate states for a given model and sends the data to the - decode instance. - - Args: - model_executable (torch.nn.Module): The model executable containing - start and end layer information. - model_input (ModelInputForGPUWithSamplingMetadata): The input - metadata from vLLM. - kv_caches (list[torch.Tensor]): List of KV caches (keys and values) - for each layer. - hidden_or_intermediate_states (Union[torch.Tensor, - IntermediateTensors]): - The hidden or intermediate states associated with the tokens. - - Returns: - None - - """ - - raise NotImplementedError - - @abstractmethod - def recv_kv_caches_and_hidden_states( - self, model_executable: torch.nn.Module, - model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: list[torch.Tensor] - ) -> tuple[Union[torch.Tensor, IntermediateTensors], bool, - "ModelInputForGPUWithSamplingMetadata"]: - """ - Receive KV caches and hidden states from the connector. - - This method attempts to retrieve KV caches and hidden states for input - tokens. If all required KV caches and hidden states are received, it - will bypass model input, else it will fall back to normal vLLM model - forwarding. - - Args: - model_executable (torch.nn.Module): - The model executable from vLLM modelrunner. - model_input (ModelInputForGPUWithSamplingMetadata): - The model input from vLLM modelrunner. - kv_caches (list[torch.Tensor]): - List of KV caches for each layer. 
- - Returns: - - hidden_or_intermediate_states (torch.Tensor or - IntermediateTensors): - Concatenated hidden states if all required data is retrieved, - otherwise `None`. - - bypass_model_exec (bool): - Indicates whether the model execution can be skipped (True) or - needs to be redone (False). - - model_input (ModelInputForGPUWithSamplingMetadata): - Optionally adjusted input metadata for re-execution when - `bypass_model_exec=False`. - - """ - - raise NotImplementedError - - @classmethod - def get_required_kvcache_layout( - cls, vllm_config: "VllmConfig") -> Optional[str]: - """ - Get the required KV cache layout for this connector. - Args: - vllm_config (VllmConfig): the vllm config. - - Returns: - str: the required KV cache layout. e.g. HND, or NHD. - None if the connector does not require a specific layout. - """ - return None - - -KVConnectorBaseType = Union[KVConnectorBase, KVConnectorBase_V1] +__all__ = ["KVConnectorBase", "KVConnectorBaseType"] diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index cf7cde2c43771..01673a0d7c876 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -5,14 +5,10 @@ import importlib from typing import TYPE_CHECKING, Callable import vllm.envs as envs -from vllm.config import KVTransferConfig -from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType -from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1, - KVConnectorRole) +from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase +from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorRole from vllm.logger import init_logger -from .base import KVConnectorBase - if TYPE_CHECKING: from vllm.config import VllmConfig @@ -20,7 +16,7 @@ logger = init_logger(__name__) class KVConnectorFactory: - _registry: dict[str, Callable[[], type[KVConnectorBaseType]]] = {} + 
_registry: dict[str, Callable[[], type[KVConnectorBase]]] = {} @classmethod def register_connector(cls, name: str, module_path: str, @@ -29,28 +25,23 @@ class KVConnectorFactory: if name in cls._registry: raise ValueError(f"Connector '{name}' is already registered.") - def loader() -> type[KVConnectorBaseType]: + def loader() -> type[KVConnectorBase]: module = importlib.import_module(module_path) return getattr(module, class_name) cls._registry[name] = loader @classmethod - def create_connector_v0(cls, rank: int, local_rank: int, - config: "VllmConfig") -> KVConnectorBase: - if envs.VLLM_USE_V1: - raise ValueError("Attempting to initialize a V0 Connector, " + def create_connector( + cls, + config: "VllmConfig", + role: KVConnectorRole, + ) -> KVConnectorBase: + if not envs.VLLM_USE_V1: + raise ValueError("Attempting to initialize a V1 Connector, " f"but found {envs.VLLM_USE_V1=}") - connector_cls = cls.get_connector_class(config.kv_transfer_config) - assert issubclass(connector_cls, KVConnectorBase) - return connector_cls(rank, local_rank, config) - - @classmethod - def get_connector_class( - cls, kv_transfer_config: "KVTransferConfig" - ) -> type[KVConnectorBaseType]: - """Get the connector class by name.""" + kv_transfer_config = config.kv_transfer_config connector_name = kv_transfer_config.kv_connector if connector_name in cls._registry: connector_cls = cls._registry[connector_name]() @@ -61,21 +52,7 @@ class KVConnectorFactory: f"Unsupported connector type: {connector_name}") connector_module = importlib.import_module(connector_module_path) connector_cls = getattr(connector_module, connector_name) - return connector_cls - - @classmethod - def create_connector_v1( - cls, - config: "VllmConfig", - role: KVConnectorRole, - ) -> KVConnectorBase_V1: - if not envs.VLLM_USE_V1: - raise ValueError("Attempting to initialize a V1 Connector, " - f"but found {envs.VLLM_USE_V1=}") - - kv_transfer_config = config.kv_transfer_config - connector_cls = 
cls.get_connector_class(kv_transfer_config) - assert issubclass(connector_cls, KVConnectorBase_V1) + assert issubclass(connector_cls, KVConnectorBase) logger.info("Creating v1 connector with name: %s and engine_id: %s", connector_cls.__name__, kv_transfer_config.engine_id) # NOTE(Kuntai): v1 connector is explicitly separated into two roles. @@ -92,25 +69,6 @@ class KVConnectorFactory: # Register various connectors here. # The registration should not be done in each individual file, as we want to # only load the files corresponding to the current connector. -KVConnectorFactory.register_connector( - "PyNcclConnector", - "vllm.distributed.kv_transfer.kv_connector.simple_connector", - "SimpleConnector") - -KVConnectorFactory.register_connector( - "MooncakeConnector", - "vllm.distributed.kv_transfer.kv_connector.simple_connector", - "SimpleConnector") - -KVConnectorFactory.register_connector( - "LMCacheConnector", - "vllm.distributed.kv_transfer.kv_connector.lmcache_connector", - "LMCacheConnector") - -KVConnectorFactory.register_connector( - "MooncakeStoreConnector", - "vllm.distributed.kv_transfer.kv_connector.mooncake_store_connector", - "MooncakeStoreConnector") KVConnectorFactory.register_connector( "SharedStorageConnector", diff --git a/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py deleted file mode 100644 index 78bf3095613a7..0000000000000 --- a/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +++ /dev/null @@ -1,99 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -LMCache KV Cache Connector for Distributed Machine Learning Inference - -The LMCacheConnector can (1) transfer KV caches between prefill vLLM worker -(KV cache producer) and decode vLLM worker (KV cache consumer) using LMCache; -(2) offload and share KV caches. 
-""" - -from typing import TYPE_CHECKING, Union - -import torch - -from vllm.config import VllmConfig -from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase -from vllm.logger import init_logger -from vllm.sequence import IntermediateTensors - -if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata - -logger = init_logger(__name__) - - -class LMCacheConnector(KVConnectorBase): - - def __init__( - self, - rank: int, - local_rank: int, - config: VllmConfig, - ): - - self.transfer_config = config.kv_transfer_config - self.vllm_config = config - - from lmcache.experimental.cache_engine import LMCacheEngineBuilder - from lmcache.integration.vllm.utils import ENGINE_NAME - from lmcache.integration.vllm.vllm_adapter import ( - RetrieveStatus, StoreStatus, init_lmcache_engine, - lmcache_retrieve_kv, lmcache_should_retrieve, lmcache_should_store, - lmcache_store_kv) - logger.info("Initializing LMCacheConfig under kv_transfer_config %s", - self.transfer_config) - - # TODO (Jiayi): Find model_config, parallel_config, and cache_config - self.engine = init_lmcache_engine(config.model_config, - config.parallel_config, - config.cache_config) - self.lmcache_engine_name = ENGINE_NAME - self.lmcache_engine_builder = LMCacheEngineBuilder - - self.model_config = config.model_config - self.parallel_config = config.parallel_config - self.cache_config = config.cache_config - self.lmcache_retrieve_kv = lmcache_retrieve_kv - self.lmcache_store_kv = lmcache_store_kv - self.lmcache_should_retrieve = lmcache_should_retrieve - self.lmcache_should_store = lmcache_should_store - self.store_status = StoreStatus - self.retrieve_status = RetrieveStatus - - def recv_kv_caches_and_hidden_states( - self, model_executable: torch.nn.Module, - model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: list[torch.Tensor] - ) -> tuple[Union[torch.Tensor, IntermediateTensors], bool, - "ModelInputForGPUWithSamplingMetadata"]: - - 
retrieve_status = self.lmcache_should_retrieve(model_input) - model_input, bypass_model_exec, hidden_or_intermediate_states =\ - self.lmcache_retrieve_kv( - model_executable, model_input, self.cache_config, kv_caches, - retrieve_status) - return hidden_or_intermediate_states, bypass_model_exec, model_input - - def send_kv_caches_and_hidden_states( - self, - model_executable: torch.nn.Module, - model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: list[torch.Tensor], - hidden_or_intermediate_states: Union[torch.Tensor, - IntermediateTensors], - ) -> None: - - store_status = self.lmcache_should_store(model_input) - self.lmcache_store_kv( - self.model_config, - self.parallel_config, - self.cache_config, - model_executable, - model_input, - kv_caches, - store_status, - ) - - def close(self): - self.lmcache_engine_builder.destroy(self.lmcache_engine_name) diff --git a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py deleted file mode 100644 index 94a7ce91acf17..0000000000000 --- a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +++ /dev/null @@ -1,203 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -MooncakeStore Connector for Distributed Machine Learning Inference -The MooncakeStoreConnector transfers KV caches between prefill vLLM workers -(KV cache producer) and decode vLLM workers (KV cache consumer) using a -database-style KVStore. 
-""" -import hashlib -from typing import TYPE_CHECKING, Union - -import torch - -from vllm.config import VllmConfig -from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase -from vllm.distributed.kv_transfer.kv_connector.utils import ( - model_aware_kv_ops_helper as kv_helper) -from vllm.logger import init_logger -from vllm.sequence import IntermediateTensors - -if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata - -logger = init_logger(__name__) - - -class MooncakeStoreConnector(KVConnectorBase): - - def __init__( - self, - rank: int, - local_rank: int, - config: VllmConfig, - ): - self.kv_transfer_config = config.kv_transfer_config - self.kv_helper = kv_helper(config) - self.local_tp_rank = local_rank - - # Init kv_store - if self.kv_transfer_config.kv_connector == "MooncakeStoreConnector": - # Check if MOONCAKE_CONFIG_PATH is set - import os - use_mooncake_store = os.getenv('MOONCAKE_CONFIG_PATH') is not None - - if not use_mooncake_store: - raise ValueError( - "To use MooncakeStoreConnector, you need to pass the ENV: " - "'MOONCAKE_CONFIG_PATH=/path/to/mooncake_config.json'.") - else: - from vllm.distributed.kv_transfer.kv_lookup_buffer.mooncake_store import ( # noqa: E501 - MooncakeStore) - logger.info( - "Initializing KVStoreConnector under kv_transfer_config %s", - self.kv_transfer_config) - self.kv_store = MooncakeStore(config) - else: - logger.error("Can not find %s", - self.kv_transfer_config.kv_connector) - - assert self.kv_store is not None - - def close(self) -> None: - """Close the buffer and release resources. - This method is responsible for cleaning up resources related to the - connector when it is no longer needed. - Raises: - NotImplementedError: This method must be implemented in subclasses. 
- """ - self.kv_store.close() - - def send_kv_caches_and_hidden_states( - self, - model_executable: torch.nn.Module, - model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: list[torch.Tensor], - hidden_or_intermediate_states: Union[torch.Tensor, - IntermediateTensors], - ) -> None: - input_tokens_tensor = model_input.input_tokens - seq_lens = model_input.attn_metadata.seq_lens - slot_mapping_flat = model_input.attn_metadata.slot_mapping.flatten() - start_layer = model_executable.model.start_layer - end_layer = model_executable.model.end_layer - num_heads, head_size = self.kv_helper.get_model_args(model_executable) - - for idx, slen in enumerate(seq_lens): - start_pos = sum(seq_lens[:idx]) - end_pos = start_pos + slen - - current_tokens = input_tokens_tensor[start_pos:end_pos] - store_key_prefix = self.tensor_hash(current_tokens) - keys, values = [], [] - - for layer_id in range(start_layer, end_layer): - kv_cache = kv_caches[layer_id - start_layer] - key_cache, value_cache = self.kv_helper.get_kv_from_cache( - kv_cache, num_heads, head_size) - current_slot_mapping = slot_mapping_flat[start_pos:end_pos] - - keys.append(key_cache[current_slot_mapping].unsqueeze(0)) - values.append(value_cache[current_slot_mapping].unsqueeze(0)) - - keys = torch.cat(keys, dim=0) - values = torch.cat(values, dim=0) - kvcache_to_sent = torch.stack((keys, values), dim=0) - store_kvcache_key = f"{store_key_prefix}_{self.local_tp_rank}" - self.kv_store.put(store_kvcache_key, kvcache_to_sent) - - hidden_key = f"{store_key_prefix}_hidden_{self.local_tp_rank}" - self.kv_store.put(hidden_key, - hidden_or_intermediate_states[start_pos:end_pos]) - - logger.debug("[rank%d]: KV send DONE.", torch.distributed.get_rank()) - - def recv_kv_caches_and_hidden_states( - self, model_executable: torch.nn.Module, - model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: list[torch.Tensor] - ) -> tuple[Union[torch.Tensor, IntermediateTensors], bool, - 
"ModelInputForGPUWithSamplingMetadata"]: - bypass_model_exec = True - input_tokens_tensor = model_input.input_tokens - seq_lens = model_input.attn_metadata.seq_lens - num_prefill_tokens = model_input.attn_metadata.num_prefill_tokens - slot_mapping = model_input.attn_metadata.slot_mapping.flatten() - start_layer = model_executable.model.start_layer - end_layer = model_executable.model.end_layer - hidden_or_intermediate_states_for_one_req = [] - - for idx, slen in enumerate(seq_lens): - start_pos = sum(seq_lens[:idx]) - end_pos = start_pos + slen - - if start_pos >= num_prefill_tokens: - # This can happen during inflight batching. See: - # vllm/worker/model_runner.py::_prepare_model_input_tensors: - # - input_tokens[:num_prefill_tokens] contains prefill tokens. - # - input_tokens[num_prefill_tokens:] contains decode tokens. - logger.warning("You should set --enable_chunked_prefill=False " - "and --max_num_batched_tokens " - "should be equal to max_seq_len_to_capture") - bypass_model_exec = False - assert start_pos == num_prefill_tokens - break - - current_tokens = input_tokens_tensor[start_pos:end_pos] - - # get roi for current seq - load_key_prefix = self.tensor_hash(current_tokens) - load_kvcache_key = f"{load_key_prefix}_{self.local_tp_rank}" - remote_kv = self.kv_store.get(load_kvcache_key) - hidden_key = f"{load_key_prefix}_hidden_{self.local_tp_rank}" - hidden = self.kv_store.get(hidden_key) - - if remote_kv is None or hidden is None: - # didn't find any match. - bypass_model_exec = False - continue - - num_computed_tokens = current_tokens.shape[0] - - # update the end position based on how many tokens are cached. 
- end_pos = start_pos + num_computed_tokens - - # call self.kv_store to get kv layer by layer - for layer_id in range(start_layer, end_layer): - layer = model_executable.model.layers[layer_id] - # get kvcache object - kv_cache = kv_caches[layer_id - start_layer] - - # get remote kvcache - remote_k, remote_v = remote_kv[0][layer_id], remote_kv[1][ - layer_id] - - self.kv_helper.put_kv_to_cache(model_executable, remote_k, - remote_v, layer, kv_cache, - slot_mapping, start_pos, - end_pos) - - hidden_or_intermediate_states_for_one_req.append(hidden) - - if not bypass_model_exec: - logger.warning( - "[rank%d]: Failed to receive all KVs and hidden " - "states, redo model forwarding.", torch.distributed.get_rank()) - hidden_or_intermediate_states = None - - else: - logger.debug( - "[rank%d]: Successfully received all KVs and hidden " - "states, skip model forwarding.", torch.distributed.get_rank()) - hidden_or_intermediate_states = torch.cat( - hidden_or_intermediate_states_for_one_req, dim=0) - - return hidden_or_intermediate_states, bypass_model_exec, model_input - - @staticmethod - def tensor_hash(tensor: torch.Tensor) -> int: - """Calculate the hash value of the tensor.""" - tensor_bytes = tensor.clone().detach().cpu().numpy().tobytes() - hash_object = hashlib.blake2b(tensor_bytes) - hash_hex = hash_object.hexdigest() - return int(hash_hex[:16], 16) diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py deleted file mode 100644 index e7c079e1f115c..0000000000000 --- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py +++ /dev/null @@ -1,329 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Simple KV Cache Connector for Distributed Machine Learning Inference - -The SimpleConnector transfers KV caches between prefill vLLM worker (KV cache -producer) and decode vLLM worker (KV cache consumer) using 
PyNcclPipe or -MooncakePipe. - -But the logic can be extended to support other pipe and lookup buffer. -""" -from typing import TYPE_CHECKING, Optional, Union - -import torch - -from vllm.config import VllmConfig -from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase -from vllm.distributed.kv_transfer.kv_connector.utils import ( - model_aware_kv_ops_helper as kv_helper) -from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import ( - SimpleBuffer) -from vllm.logger import init_logger -from vllm.sequence import IntermediateTensors - -if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata - -logger = init_logger(__name__) - - -class SimpleConnector(KVConnectorBase): - - def __init__( - self, - rank: int, - local_rank: int, - config: VllmConfig, - ): - - self.config = config.kv_transfer_config - self.kv_helper = kv_helper(config) - - if self.config.kv_connector == "PyNcclConnector": - from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import ( - PyNcclPipe) - logger.info( - "Initializing PyNcclConfig under kv_transfer_config %s", - self.config) - elif self.config.kv_connector == "MooncakeConnector": - # Check if MOONCAKE_CONFIG_PATH is set - import os - use_mooncake_distributed_pipe = os.getenv( - 'MOONCAKE_CONFIG_PATH') is not None - - if not use_mooncake_distributed_pipe: - raise ValueError( - "To use MooncakeConnector, you need to pass the ENV: " - "'MOONCAKE_CONFIG_PATH=/path/to/mooncake_config.json'.") - else: - from vllm.distributed.kv_transfer.kv_pipe.mooncake_pipe import ( # noqa: E501 - MooncakePipe) - logger.info( - "Initializing MooncakeConfig under kv_transfer_config %s", - self.config) - - self.lookup_buffer_size = self.config.kv_buffer_size - - self.producer_buffer: Optional[SimpleBuffer] = None - self.consumer_buffer: Optional[SimpleBuffer] = None - - self.producer_data_pipe: Union[PyNcclPipe, MooncakePipe] - self.consumer_data_pipe: Union[PyNcclPipe, MooncakePipe] - 
self.producer_signal_pipe: Union[PyNcclPipe, MooncakePipe] - self.consumer_signal_pipe: Union[PyNcclPipe, MooncakePipe] - - # 2 pipes for every rank in the world - port_offset_base = 2 * rank - - # In disaggregated prefill, the prefill vLLM only uses send pipe - # and the decode vLLM only uses recv pipe - if self.config.is_kv_producer: - - if self.config.kv_connector == "PyNcclConnector": - self.producer_data_pipe = PyNcclPipe( - local_rank=local_rank, - config=self.config, - port_offset=port_offset_base, - ) - self.producer_signal_pipe = PyNcclPipe( - local_rank=local_rank, - config=self.config, - port_offset=port_offset_base + 1, - device="cpu", - ) - elif self.config.kv_connector == "MooncakeConnector": - self.producer_data_pipe = MooncakePipe( - local_rank=local_rank, - config=self.config, - ) - # We only need to initialize MooncakePipe once - self.producer_signal_pipe = self.producer_data_pipe - - self.producer_buffer = SimpleBuffer(self.producer_signal_pipe, - self.producer_data_pipe, - self.config.kv_buffer_size) - - else: - - # the current vLLM instance is KV consumer, so it needs to connect - # its recv pipe to the send pipe of KV producer - if self.config.kv_connector == "PyNcclConnector": - self.consumer_data_pipe = PyNcclPipe( - local_rank=local_rank, - config=self.config, - port_offset=port_offset_base, - ) - self.consumer_signal_pipe = PyNcclPipe( - local_rank=local_rank, - config=self.config, - port_offset=port_offset_base + 1, - device="cpu", - ) - elif self.config.kv_connector == "MooncakeConnector": - self.consumer_data_pipe = MooncakePipe( - local_rank=local_rank, - config=self.config, - ) - self.consumer_signal_pipe = self.consumer_data_pipe - - self.consumer_buffer = SimpleBuffer( - self.consumer_signal_pipe, - self.consumer_data_pipe, - self.config.kv_buffer_size, - ) - - def select(self, input_tokens: Optional[torch.Tensor], - roi: Optional[torch.Tensor]) -> list[Optional[torch.Tensor]]: - - assert self.consumer_buffer is not None, "Please 
initialize the "\ - "consumer buffer before calling select." - return self.consumer_buffer.drop_select(input_tokens, roi) - - def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor, - key: torch.Tensor, value: torch.Tensor, - hidden: torch.Tensor) -> None: - - assert self.producer_buffer is not None, "Please initialize the "\ - "producer buffer before calling insert." - - self.producer_buffer.insert(input_tokens, roi, key, value, hidden) - - def send_kv_caches_and_hidden_states( - self, - model_executable: torch.nn.Module, - model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: list[torch.Tensor], - hidden_or_intermediate_states: Union[torch.Tensor, - IntermediateTensors], - ) -> None: - - input_tokens_tensor = model_input.input_tokens - seq_lens = model_input.attn_metadata.seq_lens - slot_mapping_flat = model_input.attn_metadata.slot_mapping.flatten() - num_prefill_tokens = model_input.attn_metadata.num_prefill_tokens - start_layer = model_executable.model.start_layer - end_layer = model_executable.model.end_layer - num_heads, head_size = self.kv_helper.get_model_args(model_executable) - - # query_lens contains new KV caches that are added to vLLM. - # so we will send them to decode instance - # FIXME(Kuntai): This assume that all requests are prefill. - for idx, slen in enumerate(seq_lens): - start_pos = sum(seq_lens[:idx]) - end_pos = start_pos + slen - - if start_pos >= num_prefill_tokens: - # vllm/worker/model_runner.py::_prepare_model_input_tensors: - # - input_tokens[:num_prefill_tokens] contains prefill tokens. - # - input_tokens[num_prefill_tokens:] contains decode tokens. - logger.warning("You have some decode requests while using " - "SimpleConnector. 
Their KVCache won't be sent.") - break - - current_tokens = input_tokens_tensor[start_pos:end_pos] - - keys, values = [], [] - - for layer_id in range(start_layer, end_layer): - kv_cache = kv_caches[layer_id - start_layer] - key_cache, value_cache = self.kv_helper.get_kv_from_cache( - kv_cache, num_heads, head_size) - - current_slot_mapping = slot_mapping_flat[start_pos:end_pos] - - keys.append(key_cache[current_slot_mapping].unsqueeze(0)) - values.append(value_cache[current_slot_mapping].unsqueeze(0)) - - keys = torch.cat(keys, dim=0) - values = torch.cat(values, dim=0) - - self.insert(current_tokens, - torch.ones_like(current_tokens, - dtype=bool), keys, values, - hidden_or_intermediate_states[start_pos:end_pos]) - - logger.debug("[rank%d]: KV send DONE.", torch.distributed.get_rank()) - - def recv_kv_caches_and_hidden_states( - self, model_executable: torch.nn.Module, - model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: list[torch.Tensor] - ) -> tuple[Union[torch.Tensor, IntermediateTensors], bool, - "ModelInputForGPUWithSamplingMetadata"]: - - # When bypass_model_exec is set to False, it means that at least for one - # request its corresponding KV cache or hidden state is missing. - # In this case we need to do prefilling to recompute missing KV cache - # and hidden states. - bypass_model_exec = True - - input_tokens_tensor = model_input.input_tokens - seq_lens = model_input.attn_metadata.seq_lens - num_prefill_tokens = model_input.attn_metadata.num_prefill_tokens - slot_mapping = model_input.attn_metadata.slot_mapping.flatten() - start_layer = model_executable.model.start_layer - end_layer = model_executable.model.end_layer - - hidden_or_intermediate_states_for_one_req = [] - - input_tokens_list = [] - num_computed_tokens_list = [] - start_pos_list = [] - - # enumerate different requests - # FIXME(Kuntai): This impl assumes that all requests are prefill. 
- for idx, slen in enumerate(seq_lens): - start_pos = sum(seq_lens[:idx]) - end_pos = start_pos + slen - - if start_pos >= num_prefill_tokens: - # This can happen during inflight batching. See: - # vllm/worker/model_runner.py::_prepare_model_input_tensors: - # - input_tokens[:num_prefill_tokens] contains prefill tokens. - # - input_tokens[num_prefill_tokens:] contains decode tokens. - logger.warning("You should set --enable_chunked_prefill=False " - "and --max_num_batched_tokens " - "should be equal to --max_seq_len_to_capture") - bypass_model_exec = False - assert start_pos == num_prefill_tokens - break - - current_tokens = input_tokens_tensor[start_pos:end_pos] - num_tokens = slen - - # collecting data for rebuilding the input - input_tokens_list.append(current_tokens) - start_pos_list.append(start_pos) - - ret = self.select(current_tokens, - torch.ones_like(current_tokens, dtype=bool)) - if ret[0] is None: - # didn't find any match. - bypass_model_exec = False - num_computed_tokens_list.append(0) - continue - - roi: torch.Tensor = ret[1] - keys: torch.Tensor = ret[2] - values: torch.Tensor = ret[3] - hidden: torch.Tensor = ret[4] - - num_computed_tokens = roi.shape[0] - num_computed_tokens_list.append(num_computed_tokens) - - # check if both KV cache and the hidden states are received - # If not, need to redo the forwarding to compute missing states - if not all([(num_computed_tokens == num_tokens), hidden is not None - ]): - bypass_model_exec = False - - # update the end position based on how many tokens are cached. 
- end_pos = start_pos + num_computed_tokens - - # put received KV caches into paged memory - for cur_layer in range(start_layer, end_layer): - - layer_id = cur_layer - start_layer - kv_cache = kv_caches[layer_id] - layer = model_executable.model.layers[cur_layer] - - # get remote kvcache - remote_k, remote_v = keys[layer_id], values[layer_id] - - self.kv_helper.put_kv_to_cache(model_executable, remote_k, - remote_v, layer, kv_cache, - slot_mapping, start_pos, - end_pos) - - hidden_or_intermediate_states_for_one_req.append(hidden) - - if not bypass_model_exec: - # Some of the KV cache is not retrieved - # Here we will fall back to normal model forwarding - # But optionally you can adjust model_input so that you only do - # prefilling on those tokens that are missing KV caches. - logger.warning( - "[rank%d]: Failed to receive all KVs and hidden " - "states, redo model forwarding.", torch.distributed.get_rank()) - hidden_or_intermediate_states = None - - else: - logger.debug( - "[rank%d]: Successfully received all KVs and hidden " - "states, skip model forwarding.", torch.distributed.get_rank()) - hidden_or_intermediate_states = torch.cat( - hidden_or_intermediate_states_for_one_req, dim=0) - - return hidden_or_intermediate_states, bypass_model_exec, model_input - - def close(self): - self.producer_data_pipe.close() - self.consumer_data_pipe.close() - if self.config.kv_connector == "PyNcclConnector": - self.producer_signal_pipe.close() - self.consumer_signal_pipe.close() - elif self.config.kv_connector == "MooncakeConnector": - # MooncakePipe reuses data_pipe for signal_pipe, so we only have to - # close the data_pipe. 
- pass diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index 1a11cb6d0189a..1da41790f9fb1 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -13,8 +13,8 @@ import torch import vllm.envs as envs from vllm import _custom_ops as ops from vllm.config import VllmConfig, get_current_vllm_config -from vllm.distributed.kv_transfer.kv_connector.factory import ( - KVConnectorFactory) +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorBase_V1) from vllm.logger import init_logger from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput @@ -106,9 +106,8 @@ def get_kv_connector_cache_layout(): vllm_config = get_current_vllm_config() kv_config = vllm_config.kv_transfer_config if kv_config is not None: - connector_cls = KVConnectorFactory.get_connector_class(kv_config) - required_kvcache_layout = connector_cls.get_required_kvcache_layout( - vllm_config) + required_kvcache_layout = ( + KVConnectorBase_V1.get_required_kvcache_layout(vllm_config)) if required_kvcache_layout is not None: return required_kvcache_layout logger.info_once("Connectors do not specify a " \ diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index 934a03a12ee5e..62a4980bff975 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -52,7 +52,7 @@ class MultiConnector(KVConnectorBase_V1): temp_config.kv_transfer_config = KVTransferConfig( **ktc, engine_id=engine_id) self._connectors.append( - KVConnectorFactory.create_connector_v1(temp_config, role)) + KVConnectorFactory.create_connector(temp_config, role)) # A mapping from request id to the index of the connector chosen to # load the request from (if any). 
@@ -223,9 +223,9 @@ class MultiConnector(KVConnectorBase_V1): for ktc in ktcs: kv_transfer_config = KVTransferConfig(**ktc) temp_vllm_config.kv_transfer_config = kv_transfer_config - required_kvcache_layout = KVConnectorFactory.get_connector_class( - kv_transfer_config).get_required_kvcache_layout( - temp_vllm_config) + required_kvcache_layout = ( + KVConnectorBase_V1.get_required_kvcache_layout( + temp_vllm_config)) if required_kvcache_layout is not None: layouts.add(required_kvcache_layout) diff --git a/vllm/distributed/kv_transfer/kv_connector_agent.py b/vllm/distributed/kv_transfer/kv_connector_agent.py deleted file mode 100644 index 8633fdaf59f8b..0000000000000 --- a/vllm/distributed/kv_transfer/kv_connector_agent.py +++ /dev/null @@ -1,77 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""A centralized entrypoint to perform distributed KV cache transfer. - -This implementation is a shim wrapper on two APIs exposed by `kv_connector`: -1. `send_kv_caches_and_hidden_states` -2. `recv_kv_caches_and_hidden_states -""" -from typing import TYPE_CHECKING, Union - -if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata - from vllm.config import VllmConfig - -import torch - -from vllm.distributed.kv_transfer.kv_connector.factory import ( - KVConnectorFactory) -from vllm.logger import init_logger -from vllm.sequence import IntermediateTensors - -logger = init_logger(__name__) - - -class KVTransferAgent: - """ - A class designated for distributed KV transfer - - Target use cases: - 1. Disaggregated prefill - 2. 
Remote KV cache storage - """ - - def __init__( - self, - rank: int, - local_rank: int, - config: "VllmConfig", - ): - - self.config = config - - if config.kv_transfer_config is None: - raise ValueError("KVTransferConfig is not set in the VllmConfig," - " cannot initialize KVConnector.") - - assert self.config.kv_transfer_config.is_kv_transfer_instance, "KV"\ - "TransferAgent should only be used when kv_connector is set." - - self.connector = KVConnectorFactory.create_connector_v0( - rank, local_rank, config) - - def send_kv_caches_and_hidden_states( - self, - model_executable: torch.nn.Module, - model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: list[torch.Tensor], - hidden_or_intermediate_states: Union[torch.Tensor, - IntermediateTensors], - ) -> None: - - self.connector.send_kv_caches_and_hidden_states( - model_executable, model_input, kv_caches, - hidden_or_intermediate_states) - - def close(self) -> None: - self.connector.close() - - def recv_kv_caches_and_hidden_states( - self, model_executable: torch.nn.Module, - model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: list[torch.Tensor] - ) -> tuple[Union[torch.Tensor, IntermediateTensors], bool, - "ModelInputForGPUWithSamplingMetadata"]: - - return self.connector.recv_kv_caches_and_hidden_states( - model_executable, model_input, kv_caches) diff --git a/vllm/distributed/kv_transfer/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py index 60f1d5d8bca75..5e0f64fca220c 100644 --- a/vllm/distributed/kv_transfer/kv_transfer_state.py +++ b/vllm/distributed/kv_transfer/kv_transfer_state.py @@ -8,7 +8,6 @@ from vllm.distributed.kv_transfer.kv_connector.factory import ( KVConnectorFactory) from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1, KVConnectorRole) -from vllm.distributed.parallel_state import get_world_group if TYPE_CHECKING: from vllm.config import VllmConfig @@ -61,11 +60,7 @@ def ensure_kv_transfer_initialized(vllm_config: 
"VllmConfig") -> None: if (vllm_config.kv_transfer_config.is_kv_transfer_instance and _KV_CONNECTOR_AGENT is None): if envs.VLLM_USE_V1: - _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector_v1( + _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector( config=vllm_config, role=KVConnectorRole.WORKER) else: - _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector_v0( - rank=get_world_group().rank, - local_rank=get_world_group().local_rank, - config=vllm_config, - ) + raise ValueError("V0 is no longer supported") diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 49a744cfec69a..d39aea1f2d116 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -83,7 +83,7 @@ class Scheduler(SchedulerInterface): assert len(self.kv_cache_config.kv_cache_groups) == 1, ( "Multiple KV cache groups are not currently supported " "with KV connectors") - self.connector = KVConnectorFactory.create_connector_v1( + self.connector = KVConnectorFactory.create_connector( config=self.vllm_config, role=KVConnectorRole.SCHEDULER) self.kv_event_publisher = EventPublisherFactory.create( diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py index 343befe176797..a03ebe35d8e0a 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py +++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -11,7 +11,7 @@ from typing import TYPE_CHECKING, Optional from vllm.config import VllmConfig from vllm.distributed.kv_transfer import (get_kv_transfer_group, has_kv_transfer_group) -from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 +from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase from vllm.forward_context import get_forward_context, set_forward_context from vllm.logger import init_logger from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput, @@ -31,7 +31,7 @@ class KVConnectorModelRunnerMixin: # Update 
KVConnector with the KVConnector metadata forward(). if has_kv_transfer_group(): kv_connector = get_kv_transfer_group() - assert isinstance(kv_connector, KVConnectorBase_V1) + assert isinstance(kv_connector, KVConnectorBase) assert scheduler_output.kv_connector_metadata is not None kv_connector.bind_connector_metadata( scheduler_output.kv_connector_metadata) @@ -93,7 +93,7 @@ class KVConnectorModelRunnerMixin: # Update KVConnector with the KVConnector metadata forward(). kv_connector = get_kv_transfer_group() - assert isinstance(kv_connector, KVConnectorBase_V1) + assert isinstance(kv_connector, KVConnectorBase) assert scheduler_output.kv_connector_metadata is not None kv_connector.bind_connector_metadata( scheduler_output.kv_connector_metadata) From 6ad6b8e115b8b46ad918284d862bdadded3af447 Mon Sep 17 00:00:00 2001 From: TJian Date: Mon, 4 Aug 2025 19:12:16 -0700 Subject: [PATCH 203/224] [FEAT] Refactor ROPE into module (#22192) Signed-off-by: tjtanaa --- .../model_executor/layers/rotary_embedding.py | 1967 ----------------- .../layers/rotary_embedding/__init__.py | 190 ++ .../layers/rotary_embedding/base.py | 237 ++ .../layers/rotary_embedding/common.py | 105 + .../rotary_embedding/deepseek_scaling_rope.py | 131 ++ .../rotary_embedding/dual_chunk_rope.py | 188 ++ .../dynamic_ntk_alpha_rope.py | 41 + .../dynamic_ntk_scaling_rope.py | 67 + .../rotary_embedding/linear_scaling_rope.py | 115 + .../layers/rotary_embedding/llama3_rope.py | 54 + .../rotary_embedding/llama4_vision_rope.py | 74 + .../layers/rotary_embedding/mrope.py | 670 ++++++ .../rotary_embedding/ntk_scaling_rope.py | 42 + .../phi3_long_rope_scaled_rope.py | 129 ++ .../rotary_embedding/yarn_scaling_rope.py | 68 + 15 files changed, 2111 insertions(+), 1967 deletions(-) delete mode 100644 vllm/model_executor/layers/rotary_embedding.py create mode 100644 vllm/model_executor/layers/rotary_embedding/__init__.py create mode 100644 vllm/model_executor/layers/rotary_embedding/base.py create mode 100644 
vllm/model_executor/layers/rotary_embedding/common.py create mode 100644 vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py create mode 100644 vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py create mode 100644 vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py create mode 100644 vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py create mode 100644 vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py create mode 100644 vllm/model_executor/layers/rotary_embedding/llama3_rope.py create mode 100644 vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py create mode 100644 vllm/model_executor/layers/rotary_embedding/mrope.py create mode 100644 vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py create mode 100644 vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py create mode 100644 vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py deleted file mode 100644 index 24dd86620fe91..0000000000000 --- a/vllm/model_executor/layers/rotary_embedding.py +++ /dev/null @@ -1,1967 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Rotary Positional Embeddings.""" -import itertools -import math -from typing import Any, Optional, Union - -import numpy as np -import torch -import torch.nn as nn -from transformers import PretrainedConfig - -from vllm.model_executor.custom_op import CustomOp -from vllm.platforms import current_platform - -if current_platform.is_cuda(): - from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb - - -def _rotate_neox(x: torch.Tensor) -> torch.Tensor: - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=-1) - - -def _rotate_gptj(x: torch.Tensor) -> torch.Tensor: - x1 = x[..., ::2] - x2 = x[..., 1::2] - x = torch.stack((-x2, x1), dim=-1) - return x.flatten(-2) - - -def _apply_rotary_emb_torch( - x: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - is_neox_style: bool, -) -> torch.Tensor: - cos = cos.unsqueeze(-2).to(x.dtype) - sin = sin.unsqueeze(-2).to(x.dtype) - if is_neox_style: - x1, x2 = torch.chunk(x, 2, dim=-1) - else: - x1 = x[..., ::2] - x2 = x[..., 1::2] - o1 = x1 * cos - x2 * sin - o2 = x2 * cos + x1 * sin - if is_neox_style: - return torch.cat((o1, o2), dim=-1) - else: - return torch.stack((o1, o2), dim=-1).flatten(-2) - - -def _apply_rotary_emb(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, - is_neox_style: bool) -> torch.Tensor: - """ - Args: - x: [num_tokens, num_heads, head_size] - cos: [num_tokens, head_size // 2] - sin: [num_tokens, 
head_size // 2] - is_neox_style: Whether to use the Neox-style or GPT-J-style rotary - positional embeddings. - """ - if current_platform.is_cuda(): - return apply_rotary_emb(x.unsqueeze(0), cos, sin, - not is_neox_style).squeeze(0) - else: - return _apply_rotary_emb_torch(x, cos, sin, is_neox_style) - - -@CustomOp.register("rotary_embedding") -class RotaryEmbedding(CustomOp): - """Original rotary positional embedding.""" - - def __init__( - self, - head_size: int, - rotary_dim: int, - max_position_embeddings: int, - base: float, - is_neox_style: bool, - dtype: torch.dtype, - ) -> None: - super().__init__() - self.head_size = head_size - self.rotary_dim = rotary_dim - self.max_position_embeddings = max_position_embeddings - self.base = base - self.is_neox_style = is_neox_style - self.dtype = dtype - - cache = self._compute_cos_sin_cache() - cache = cache.to(dtype) - self.cos_sin_cache: torch.Tensor - self.register_buffer("cos_sin_cache", cache, persistent=False) - - def _compute_inv_freq(self, base: float) -> torch.Tensor: - """Compute the inverse frequency.""" - # NOTE(woosuk): To exactly match the HF implementation, we need to - # use CPU to compute the cache and then move it to GPU. However, we - # create the cache on GPU for faster initialization. This may cause - # a slight numerical difference between the HF implementation and ours. 
- inv_freq = 1.0 / (base**(torch.arange( - 0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim)) - return inv_freq - - def _compute_cos_sin_cache(self) -> torch.Tensor: - """Compute the cos and sin cache.""" - inv_freq = self._compute_inv_freq(self.base) - t = torch.arange(self.max_position_embeddings, dtype=torch.float) - - freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = freqs.cos() - sin = freqs.sin() - cache = torch.cat((cos, sin), dim=-1) - return cache - - def forward_native( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - """A PyTorch-native implementation of forward().""" - if offsets is not None: - positions = positions + offsets - positions = positions.flatten() - num_tokens = positions.shape[0] - cos_sin = self.cos_sin_cache.index_select(0, positions) - cos, sin = cos_sin.chunk(2, dim=-1) - - query_shape = query.shape - query = query.view(num_tokens, -1, self.head_size) - query_rot = query[..., :self.rotary_dim] - query_pass = query[..., self.rotary_dim:] - query_rot = _apply_rotary_emb_torch(query_rot, cos, sin, - self.is_neox_style) - query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) - - # key may be None in some cases, e.g. 
cross-layer KV sharing - if key is not None: - key_shape = key.shape - key = key.view(num_tokens, -1, self.head_size) - key_rot = key[..., :self.rotary_dim] - key_pass = key[..., self.rotary_dim:] - key_rot = _apply_rotary_emb_torch(key_rot, cos, sin, - self.is_neox_style) - key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) - return query, key - - def forward_cuda( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - from vllm import _custom_ops as ops - - # __setattr__ in nn.Module (called by `self.cos_sin_cache = ...`) - # is expensive, so avoid calling it if possible - if self.cos_sin_cache.device != query.device or \ - self.cos_sin_cache.dtype != query.dtype: - self.cos_sin_cache = self.cos_sin_cache.to(query.device, - dtype=query.dtype) - - # ops.rotary_embedding()/batched_rotary_embedding() - # are in-place operations that update the query and key tensors. - if offsets is not None: - ops.batched_rotary_embedding(positions, query, key, self.head_size, - self.cos_sin_cache, - self.is_neox_style, self.rotary_dim, - offsets) - else: - ops.rotary_embedding(positions, query, key, self.head_size, - self.cos_sin_cache, self.is_neox_style) - return query, key - - def forward_xpu( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - from vllm._ipex_ops import ipex_ops as ops - - self.cos_sin_cache = self.cos_sin_cache.to(positions.device, - dtype=query.dtype) - # ops.rotary_embedding()/batched_rotary_embedding() - # are in-place operations that update the query and key tensors. 
- if key is None: - # XPU kernel doesn't support key=None so fall back to native impl - # TODO(sarckk): add support for optional key in - # ipex.llm.functional.rotary_embedding_batched - return self.forward_native(positions, query, key, offsets) - else: - if offsets is not None: - ops.batched_rotary_embedding(positions, query, key, - self.head_size, - self.cos_sin_cache, - self.is_neox_style, - self.rotary_dim, offsets) - else: - ops.rotary_embedding(positions, query, key, self.head_size, - self.cos_sin_cache, self.is_neox_style) - return query, key - - def forward_neuron( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - - def _apply_rotary_emb_neuron( - x: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - is_neox_style: bool, - ) -> torch.Tensor: - cos = cos.unsqueeze(-2).to(x.dtype) - sin = sin.unsqueeze(-2).to(x.dtype) - if is_neox_style: - x1, x2 = torch.chunk(x, 2, dim=-1) - else: - # x1 = x[..., ::2] - - # x2 = x[..., 1::2] - d = x.shape[-1] // 2 - x_reshaped = x.view(-1, x.shape[-1]) - x1 = x_reshaped[:, ::2].view(*x.shape[:-1], d) - x2 = x_reshaped[:, 1::2].view(*x.shape[:-1], d) - o1 = x1 * cos - x2 * sin - o2 = x2 * cos + x1 * sin - if is_neox_style: - return torch.cat((o1, o2), dim=-1) - else: - return torch.stack((o1, o2), dim=-1).flatten(-2) - - if offsets is not None: - positions = positions + offsets - - self.cos_sin_cache = self.cos_sin_cache.to(query.device, - dtype=query.dtype) - - positions = positions.flatten() - num_tokens = positions.shape[0] - cos_sin = self.cos_sin_cache.index_select(0, positions) - cos, sin = cos_sin.chunk(2, dim=-1) - - query_shape = query.shape - query = query.view(num_tokens, -1, self.head_size) - if key is not None: - key_shape = key.shape - key = key.view(num_tokens, -1, self.head_size) - - if self.rotary_dim == self.head_size: - query = _apply_rotary_emb(query, cos, 
sin, self.is_neox_style) - query = query.reshape(query_shape) - if key is not None: - key = _apply_rotary_emb(key, cos, sin, self.is_neox_style) - key = key.reshape(key_shape) - else: - head_size = query.shape[-1] - query_reshaped = query.view(-1, head_size) - query_pass = query_reshaped[:, self.rotary_dim:].view( - *query.shape[:-1], head_size - self.rotary_dim) - query_rot = query_reshaped[:, :self.rotary_dim].view( - *query.shape[:-1], self.rotary_dim) - query_rot = _apply_rotary_emb_neuron(query_rot, cos, sin, - self.is_neox_style) - query = torch.cat((query_rot, query_pass), - dim=-1).reshape(query_shape) - - if key is not None: - key_reshaped = key.view(-1, head_size) - key_pass = key_reshaped[:, self.rotary_dim:].view( - *key.shape[:-1], head_size - self.rotary_dim) - key_rot = key_reshaped[:, :self.rotary_dim].view( - *key.shape[:-1], self.rotary_dim) - key_rot = _apply_rotary_emb_neuron(key_rot, cos, sin, - self.is_neox_style) - key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) - return query, key - - def extra_repr(self) -> str: - s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}" - s += f", max_position_embeddings={self.max_position_embeddings}" - s += f", base={self.base}, is_neox_style={self.is_neox_style}" - return s - - -class LinearScalingRotaryEmbedding(RotaryEmbedding): - """RotaryEmbedding extended with linear scaling. - - It supports multiple scaling factors. Since multiple LoRA adapters may have - different scaling factors, we need multiple cos/sin caches. In this way, - instead of running rotary embedding kernel per lora, we can run multiple - lora in a batched way. - - In addition to that, we also keep the cos/sin cache for the scaling factor - of 1 (default) at all times. - - Exemplary for two scaling factors x=1, y and z with embeddings - [[x11, x12, ... x1m], ..., [xn1, xn2, ..., xnm]] and - [[y11, y12, ... y1o], ..., [yn1, yn2, ..., yno]], and - [[z11, z12, ... 
z1p], ..., [zn1, zn2, ..., znp]], - - we construct the cos/sin cache as follows: - [[x11, x12, ... x1m, y11, y12, ... y1o, z11, z12, ... z1p], - ... - [xn1, xn2, ... xnm, yn1, yn2, ... yno, zn1, zn2, ... znp]] - - We then use offsets to index into the cos/sin cache for - the respective scaling factors. - - The offset to cache can be accessed via `scaling_factor_to_offset` API. - - Credits to the Reddit user /u/kaiokendev - """ - - def __init__( - self, - head_size: int, - rotary_dim: int, - max_position_embeddings: int, - base: float, - is_neox_style: bool, - scaling_factors: Union[list[float], float], - dtype: torch.dtype, - ) -> None: - if isinstance(scaling_factors, float): - scaling_factors = [scaling_factors] - self.scaling_factors: list[float] = scaling_factors # noqa - super().__init__(head_size, rotary_dim, max_position_embeddings, base, - is_neox_style, dtype) - # Lazy initialized. - self._scaling_factor_to_offset: dict[float, int] - - def _compute_cos_sin_cache(self) -> torch.Tensor: - inv_freq = self._compute_inv_freq(self.base) - cache_list: list[torch.Tensor] = [] - # offsets to the next cache in a tensor. - # Each offset corresponds to the same index in scaling_factors. - offsets: list[int] = [] - for scaling_factor in self.scaling_factors: - # NOTE(woosuk): self.max_position_embeddings is the original - # maximum length before applying the rope scaling. - # Thus, the maximum length after applying the rope scaling is - # self.max_position_embeddings * self.scaling_factor. 
- max_len = self.max_position_embeddings * scaling_factor - t = torch.arange(max_len, dtype=torch.float) - t = t / scaling_factor - - freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = freqs.cos() - sin = freqs.sin() - cache = torch.cat((cos, sin), dim=-1) - if not cache_list: - offset = 0 - else: - last_offset = offsets[-1] - next_max_len = cache_list[-1].shape[0] - offset = last_offset + next_max_len - offsets.append(offset) - cache_list.append(cache) - self._scaling_factor_to_offset = { - float(scaling_factor): offsets[i] - for i, scaling_factor in enumerate(self.scaling_factors) - } - assert len(self.scaling_factors) == len(offsets) - return torch.cat(cache_list, dim=0) - - @property - def scaling_factor_to_offset(self) -> dict[float, int]: - return self._scaling_factor_to_offset - - -class NTKScalingRotaryEmbedding(RotaryEmbedding): - """RotaryEmbedding extended with fixed and mixed NTK scaling. - https://kexue.fm/archives/9706 """ - - def __init__(self, - head_size: int, - rotary_dim: int, - max_position_embeddings: int, - base: float, - is_neox_style: bool, - scaling_factor: float, - dtype: torch.dtype, - mixed_b: Optional[float] = None) -> None: - self.scaling_factor = scaling_factor - self.mixed_b = mixed_b - super().__init__(head_size, rotary_dim, max_position_embeddings, base, - is_neox_style, dtype) - - def _compute_inv_freq(self, base: float) -> torch.Tensor: - base = self.base * (self.scaling_factor if self.mixed_b is None else 1) - inv_freq = super()._compute_inv_freq(base) - - if self.mixed_b is None: - inv_freq = inv_freq / self.scaling_factor**(2 / self.rotary_dim) - else: - a = torch.tensor(self.scaling_factor).log() / (self.rotary_dim / - 2)**self.mixed_b - lambda_1_m = (a * torch.arange( - 1, self.rotary_dim // 2 + 1).float()**self.mixed_b).exp() - inv_freq = inv_freq / lambda_1_m - - return inv_freq - - -class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding): - """RotaryEmbedding extended with Dynamic NTK scaling. 
- - Credits to the Reddit users /u/bloc97 and /u/emozilla - """ - - def __init__( - self, - head_size: int, - rotary_dim: int, - max_position_embeddings: int, - base: float, - is_neox_style: bool, - scaling_factor: float, - dtype: torch.dtype, - ) -> None: - self.scaling_factor = scaling_factor - super().__init__(head_size, rotary_dim, max_position_embeddings, base, - is_neox_style, dtype) - - def _compute_cos_sin_cache(self) -> torch.Tensor: - # NOTE(woosuk): self.max_position_embeddings is the original - # maximum length before applying the rope scaling. - # Thus, the maximum length after applying the rope scaling is - # self.max_position_embeddings * self.scaling_factor. - max_len = self.max_position_embeddings * self.scaling_factor - base = self.base * ( - (self.scaling_factor * max_len / self.max_position_embeddings) - - (self.scaling_factor - 1))**(self.rotary_dim / - (self.rotary_dim - 2)) - inv_freq = self._compute_inv_freq(base) - t = torch.arange(max_len, dtype=torch.float) - - freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = freqs.cos() - sin = freqs.sin() - cache = torch.cat((cos, sin), dim=-1) - return cache - - -class DynamicNTKAlphaRotaryEmbedding(RotaryEmbedding): - """RotaryEmbedding extended with Dynamic NTK alpha. - - Based on the original RotaryEmbedding implementation. 
- """ - - def __init__( - self, - head_size: int, - rotary_dim: int, - max_position_embeddings: int, - base: float, - is_neox_style: bool, - scaling_alpha: float, - dtype: torch.dtype, - ) -> None: - self.scaling_alpha = scaling_alpha - super().__init__(head_size, rotary_dim, max_position_embeddings, base, - is_neox_style, dtype) - - def _compute_cos_sin_cache(self) -> torch.Tensor: - # For Hunyuan DynamicNTKAlphaRotaryEmbedding - max_len = self.max_position_embeddings - base = self.base * self.scaling_alpha**(self.rotary_dim / - (self.rotary_dim - 2)) - inv_freq = self._compute_inv_freq(base) - t = torch.arange(max_len, dtype=torch.float) - - freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = freqs.cos() - sin = freqs.sin() - cache = torch.cat((cos, sin), dim=-1) - return cache - - -# Inverse dim formula to find dim based on number of rotations -def _yarn_find_correction_dim(num_rotations: int, - dim: int, - base: float = 10000, - max_position_embeddings: int = 2048) -> float: - return (dim * math.log(max_position_embeddings / - (num_rotations * 2 * math.pi))) / (2 * - math.log(base)) - - -# Find dim range bounds based on rotations -def _yarn_find_correction_range( - low_rot: int, - high_rot: int, - dim: int, - base: float = 10000, - max_position_embeddings: int = 2048) -> tuple[int, int]: - low = math.floor( - _yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)) - high = math.ceil( - _yarn_find_correction_dim(high_rot, dim, base, - max_position_embeddings)) - return max(low, 0), min(high, dim - 1) # Clamp values just in case - - -def _yarn_linear_ramp_mask(low: float, high: float, dim: int, - dtype: torch.dtype) -> torch.Tensor: - if low == high: - high += 0.001 # Prevent singularity - - linear_func = (torch.arange(dim, dtype=dtype) - low) / (high - low) - ramp_func = torch.clamp(linear_func, 0, 1) - return ramp_func - - -def _yarn_get_mscale(scale: float = 1) -> float: - if scale <= 1: - return 1.0 - return 0.1 * math.log(scale) + 1.0 - - 
-class YaRNScalingRotaryEmbedding(RotaryEmbedding): - """RotaryEmbedding extended with YaRN method. - - Credits to Peng et al. github.com/jquesnelle/yarn - """ - - def __init__( - self, - head_size: int, - rotary_dim: int, - max_position_embeddings: int, - base: float, - is_neox_style: bool, - scaling_factor: float, - dtype: torch.dtype, - *, - extrapolation_factor: float = 1, - attn_factor: float = 1, - beta_fast: int = 32, - beta_slow: int = 1, - ) -> None: - self.scaling_factor = scaling_factor - self.extrapolation_factor = extrapolation_factor - self.attn_factor = attn_factor - self.beta_fast = beta_fast - self.beta_slow = beta_slow - # Get n-d magnitude scaling corrected for interpolation - self.mscale = float( - _yarn_get_mscale(self.scaling_factor) * attn_factor) - super().__init__(head_size, rotary_dim, max_position_embeddings, base, - is_neox_style, dtype) - - def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor: - pos_freqs = self.base**( - torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / - self.rotary_dim) - inv_freq_extrapolation = 1.0 / pos_freqs - inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs) - - low, high = _yarn_find_correction_range(self.beta_fast, self.beta_slow, - self.rotary_dim, self.base, - self.max_position_embeddings) - # Get n-d rotational scaling corrected for extrapolation - inv_freq_mask = (1 - _yarn_linear_ramp_mask( - low, high, self.rotary_dim // 2, - dtype=torch.float)) * self.extrapolation_factor - inv_freq = inv_freq_interpolation * ( - 1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask - return inv_freq - - def _compute_cos_sin_cache(self) -> torch.Tensor: - inv_freq = self._compute_inv_freq(self.scaling_factor) - t = torch.arange(self.max_position_embeddings * self.scaling_factor, - dtype=torch.float32) - freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = (freqs.cos() * self.mscale) - sin = (freqs.sin() * self.mscale) - cache = torch.cat((cos, sin), dim=-1) - return cache - - 
-class Phi3LongRoPEScaledRotaryEmbedding(nn.Module): - """Phi3 family of models scaled rotary embedding. - - Based on the original RotaryEmbedding implementation. - """ - - def __init__( - self, - head_size: int, - rotary_dim: int, - max_position_embeddings: int, - original_max_position_embeddings: int, - base: float, - is_neox_style: bool, - dtype: torch.dtype, - short_factor: list[float], - long_factor: list[float], - short_mscale: Optional[float] = None, - long_mscale: Optional[float] = None, - ): - super().__init__() - - if is_neox_style is False: - raise ValueError( - "`Phi3LongRoPEScaledRotaryEmbedding` only supports neox_style." - ) - - self.rotary_dim = rotary_dim - self.head_size = head_size - self.max_position_embeddings = max_position_embeddings - self.original_max_position_embeddings = original_max_position_embeddings - self.base = base - self.short_factor = short_factor - self.long_factor = long_factor - - scale = self.max_position_embeddings / \ - self.original_max_position_embeddings - if scale <= 1.0: - scaling_factor = 1.0 - else: - scaling_factor = math.sqrt( - 1 + math.log(scale) / - math.log(self.original_max_position_embeddings)) - if short_mscale is None: - short_mscale = scaling_factor - if long_mscale is None: - long_mscale = scaling_factor - - self.short_mscale = short_mscale - self.long_mscale = long_mscale - - short_cache = self._compute_cos_sin_cache( - original_max_position_embeddings, short_factor, short_mscale) - short_cache = short_cache.to(dtype) - - long_cache = self._compute_cos_sin_cache(max_position_embeddings, - long_factor, long_mscale) - long_cache = long_cache.to(dtype) - - long_short_cache = torch.cat([short_cache, long_cache], dim=0) - self.register_buffer("long_short_cos_sin_cache", - long_short_cache, - persistent=False) - - def _compute_inv_freq(self, rescale_factors: list[float]) -> torch.Tensor: - rescale_factors = torch.tensor(rescale_factors, dtype=torch.float32) - inv_freq = 1.0 / (rescale_factors * 
(self.base**(torch.arange( - 0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim))) - return inv_freq - - def _compute_cos_sin_cache( - self, - max_position_embeddings: int, - rescale_factors: list[float], - mscale: float, - ) -> torch.Tensor: - inv_freq = self._compute_inv_freq(rescale_factors) - t = torch.arange(max_position_embeddings, dtype=torch.float) - freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = freqs.cos() * mscale - sin = freqs.sin() * mscale - cache = torch.cat((cos, sin), dim=-1) - return cache - - def forward( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - assert key is not None - query = query.view(*query.shape[:-1], -1, self.head_size) - key = key.view(*key.shape[:-1], -1, self.head_size) - - k = self.original_max_position_embeddings - long_prompt_offset = (torch.any(positions > k).float() * - torch.full_like(positions, k)).long() - idx = (torch.add(positions, long_prompt_offset) - if long_prompt_offset is not None else positions) - idx = torch.add(idx, offsets) if offsets is not None else idx - cos_sin = torch.index_select(self.long_short_cos_sin_cache, 0, idx) - - cos, sin = cos_sin.chunk(2, dim=-1) - cos = cos.repeat(1, 2).unsqueeze(-2) - sin = sin.repeat(1, 2).unsqueeze(-2) - - query_rot = query[..., :self.rotary_dim] - query_pass = query[..., self.rotary_dim:] - query_rot = query_rot * cos + _rotate_neox(query_rot) * sin - query = torch.cat((query_rot, query_pass), dim=-1) - - key_rot = key[..., :self.rotary_dim] - key_pass = key[..., self.rotary_dim:] - key_rot = key_rot * cos + _rotate_neox(key_rot) * sin - key = torch.cat((key_rot, key_pass), dim=-1) - - return query.flatten(-2), key.flatten(-2) - - -def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: - if scale <= 1: - return 1.0 - return 0.1 * mscale * math.log(scale) + 1.0 - - -class 
DeepseekScalingRotaryEmbedding(RotaryEmbedding): - """RotaryEmbedding extended with YaRN method. - - Credits to Peng et al. github.com/jquesnelle/yarn - """ - - def __init__( - self, - head_size: int, - rotary_dim: int, - max_position_embeddings: int, - base: float, - is_neox_style: bool, - scaling_factor: float, - dtype: torch.dtype, - *, - extrapolation_factor: float = 1, - attn_factor: float = 1, - beta_fast: int = 32, - beta_slow: int = 1, - mscale: float = 1, - mscale_all_dim: float = 0, - ) -> None: - self.scaling_factor = scaling_factor - self.extrapolation_factor = extrapolation_factor - self.attn_factor = attn_factor - self.beta_fast = beta_fast - self.beta_slow = beta_slow - # Get n-d magnitude scaling corrected for interpolation. - self.mscale = float( - yarn_get_mscale(self.scaling_factor, float(mscale)) / - yarn_get_mscale(self.scaling_factor, float(mscale_all_dim)) * - attn_factor) - super().__init__(head_size, rotary_dim, max_position_embeddings, base, - is_neox_style, dtype) - - def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor: - pos_freqs = self.base**( - torch.arange(0, - self.rotary_dim, - 2, - dtype=torch.float, - device=current_platform.device_type) / - self.rotary_dim) - inv_freq_extrapolation = 1.0 / pos_freqs - inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs) - - low, high = _yarn_find_correction_range(self.beta_fast, self.beta_slow, - self.rotary_dim, self.base, - self.max_position_embeddings) - # Get n-d rotational scaling corrected for extrapolation - inv_freq_mask = (1 - _yarn_linear_ramp_mask( - low, high, self.rotary_dim // 2, - dtype=torch.float)) * self.extrapolation_factor - inv_freq = inv_freq_interpolation * ( - 1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask - return inv_freq - - def _compute_cos_sin_cache(self) -> torch.Tensor: - inv_freq = self._compute_inv_freq(self.scaling_factor) - t = torch.arange(self.max_position_embeddings * self.scaling_factor, - 
device=current_platform.device_type, - dtype=torch.float32) - freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = (freqs.cos() * self.mscale) - sin = (freqs.sin() * self.mscale) - cache = torch.cat((cos, sin), dim=-1) - return cache - - def forward( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: Optional[torch.Tensor] = None, - offsets: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - """PyTorch-native implementation equivalent to forward().""" - assert key is not None - query_rot = query[..., :self.rotary_dim] - key_rot = key[..., :self.rotary_dim] - if self.rotary_dim < self.head_size: - query_pass = query[..., self.rotary_dim:] - key_pass = key[..., self.rotary_dim:] - - if self.cos_sin_cache.device != positions.device: - self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to( - positions.device) - cos_sin = self.cos_sin_cache[torch.add(positions, offsets) - if offsets is not None else positions] - cos, sin = cos_sin.chunk(2, dim=-1) - if self.is_neox_style: - # NOTE(woosuk): Here we assume that the positions tensor has the - # shape [batch_size, seq_len]. 
- cos = cos.repeat(1, 1, 2).unsqueeze(-2) - sin = sin.repeat(1, 1, 2).unsqueeze(-2) - else: - cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2) - sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2) - - rotate_fn = _rotate_neox if self.is_neox_style else _rotate_gptj - query_rot = query_rot * cos + rotate_fn(query_rot) * sin - key_rot = key_rot * cos + rotate_fn(key_rot) * sin - - if self.rotary_dim < self.head_size: - query = torch.cat((query_rot, query_pass), dim=-1) - key = torch.cat((key_rot, key_pass), dim=-1) - else: - query = query_rot - key = key_rot - return query, key - - -class Llama3RotaryEmbedding(RotaryEmbedding): - - def __init__( - self, - head_size: int, - rotary_dim: int, - max_position_embeddings: int, - base: float, - is_neox_style: bool, - dtype: torch.dtype, - scaling_factor: float, - low_freq_factor: float, - high_freq_factor: float, - orig_max_position: int, - ) -> None: - self.scaling_factor = scaling_factor - self.low_freq_factor = low_freq_factor - self.high_freq_factor = high_freq_factor - self.orig_max_position = orig_max_position - super().__init__(head_size, rotary_dim, max_position_embeddings, base, - is_neox_style, dtype) - - def _compute_inv_freq(self, base: float) -> torch.Tensor: - inv_freqs = super()._compute_inv_freq(base) - low_freq_wavelen = self.orig_max_position / self.low_freq_factor - high_freq_wavelen = self.orig_max_position / self.high_freq_factor - - wave_len = 2 * math.pi / inv_freqs - if self.low_freq_factor != self.high_freq_factor: - smooth = (self.orig_max_position / wave_len - self.low_freq_factor - ) / (self.high_freq_factor - self.low_freq_factor) - else: - smooth = 0 - new_freqs = torch.where( - wave_len < high_freq_wavelen, - inv_freqs, - torch.where( - wave_len > low_freq_wavelen, - inv_freqs / self.scaling_factor, - (1 - smooth) * inv_freqs / self.scaling_factor + - smooth * inv_freqs, - ), - ) - return new_freqs - - -class Llama4VisionRotaryEmbedding(RotaryEmbedding): - - def __init__( - self, - 
head_size: int, - rotary_dim: int, - max_position_embeddings: int, - base: float, - is_neox_style: bool, - dtype: torch.dtype, - ): - super().__init__(head_size, rotary_dim, max_position_embeddings, base, - is_neox_style, dtype) - - def _compute_inv_freq(self, base: float) -> torch.Tensor: - inv_freqs = super()._compute_inv_freq(base) - inv_freqs = inv_freqs[:(self.rotary_dim // 2)] - return inv_freqs - - def _compute_cos_sin_cache(self) -> torch.Tensor: - inv_freq = self._compute_inv_freq(self.base) - - # self.max_position_embeddings here is number of image patches - # i.e. (image_size // patch_size) ** 2 - num_patches = self.max_position_embeddings - img_idx = torch.arange(num_patches, - dtype=torch.int32) \ - .reshape(num_patches, 1) - img_idx = torch.cat([img_idx, img_idx[:1]], dim=0) - img_idx[-1, -1] = -2 # set to ID_CLS_TOKEN - num_patches_single_dim = int(math.sqrt(num_patches)) - frequencies_x = img_idx % num_patches_single_dim - frequencies_y = img_idx // num_patches_single_dim - freqs_x = ((frequencies_x + 1)[..., None] * - inv_freq[None, None, :]).repeat_interleave(2, dim=-1) - freqs_y = ((frequencies_y + 1)[..., None] * - inv_freq[None, None, :]).repeat_interleave(2, dim=-1) - freqs = torch.cat([freqs_x, freqs_y], - dim=-1).float().contiguous()[..., ::2] - freqs = freqs.masked_fill(img_idx.reshape(-1, 1, 1) < 0, 0) - cache = torch.view_as_complex( - torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1)) - return cache - - def forward( - self, - query: torch.Tensor, - key: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - assert key is not None - self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(query.device) - query_ = torch.view_as_complex(query.float().reshape( - *query.shape[:-1], -1, 2)) - key_ = torch.view_as_complex(key.float().reshape( - *key.shape[:-1], -1, 2)) - broadcast_shape = [ - d if i == 1 or i == (query_.ndim - 1) else 1 - for i, d in enumerate(query_.shape) - ] - freqs_ci = 
self.cos_sin_cache.view(*broadcast_shape) - query_out = torch.view_as_real(query_ * freqs_ci).flatten(3) - key_out = torch.view_as_real(key_ * freqs_ci).flatten(3) - return query_out.type_as(query), key_out.type_as(key) - - -class MRotaryEmbedding(RotaryEmbedding): - """Rotary Embedding with Multimodal Sections.""" - - def __init__( - self, - head_size: int, - rotary_dim: int, - max_position_embeddings: int, - base: float, - is_neox_style: bool, - dtype: torch.dtype, - mrope_section: Optional[list[int]] = None, - ) -> None: - # In Qwen2.5-VL, the maximum index value is related to the duration of - # the input video. We enlarge max_position_embeddings to 4 times to get - # a larger the cos and sin cache. - self.cache_max_position_num = max_position_embeddings * 4 - super().__init__(head_size, rotary_dim, self.cache_max_position_num, - base, is_neox_style, dtype) - - self.mrope_section = mrope_section - if self.mrope_section: - assert sum(self.mrope_section) == rotary_dim // 2 - - def forward( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - """PyTorch-native implementation equivalent to forward(). 
- - Args: - positions: - [num_tokens,] (text only) or - [3, num_tokens] (T/H/W positions with multimodal inputs) - query: [num_tokens, num_heads * head_size] - key: [num_tokens, num_kv_heads * head_size] - """ - assert positions.ndim == 1 or positions.ndim == 2 - assert key is not None - - num_tokens = positions.shape[-1] - cos_sin = self.cos_sin_cache[positions] - cos, sin = cos_sin.chunk(2, dim=-1) - if positions.ndim == 2: - assert self.mrope_section - - cos = torch.cat([ - m[i] - for i, m in enumerate(cos.split(self.mrope_section, dim=-1)) - ], - dim=-1) - sin = torch.cat([ - m[i] - for i, m in enumerate(sin.split(self.mrope_section, dim=-1)) - ], - dim=-1) - - query_shape = query.shape - query = query.view(num_tokens, -1, self.head_size) - query_rot = query[..., :self.rotary_dim] - query_pass = query[..., self.rotary_dim:] - query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style) - query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) - - key_shape = key.shape - key = key.view(num_tokens, -1, self.head_size) - key_rot = key[..., :self.rotary_dim] - key_pass = key[..., self.rotary_dim:] - key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style) - key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) - return query, key - - @classmethod - def get_input_positions( - cls, - input_tokens: list[int], - hf_config: PretrainedConfig, - image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]], - video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]], - second_per_grid_ts: Optional[list[float]], - context_len: int = 0, - seq_len: Optional[int] = None, - audio_feature_lengths: Optional[torch.Tensor] = None, - use_audio_in_video: bool = False, - ) -> tuple[list[list[int]], int]: - """Get mrope input positions and delta value.""" - - image_grid_thw = [] if image_grid_thw is None else image_grid_thw - video_grid_thw = [] if video_grid_thw is None else video_grid_thw - second_per_grid_ts = [] if 
second_per_grid_ts is None else \ - second_per_grid_ts - - llm_positions, mrope_position_delta = \ - cls.get_input_positions_tensor( - input_tokens=input_tokens, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - context_len=context_len, - seq_len=seq_len, - audio_feature_lengths=audio_feature_lengths, - use_audio_in_video=use_audio_in_video, - ) - - return llm_positions.tolist(), mrope_position_delta - - @classmethod - def get_input_positions_tensor( - cls, - input_tokens: list[int], - hf_config: PretrainedConfig, - image_grid_thw: Union[list[list[int]], torch.Tensor], - video_grid_thw: Union[list[list[int]], torch.Tensor], - second_per_grid_ts: list[float], - context_len: int = 0, - seq_len: Optional[int] = None, - audio_feature_lengths: Optional[torch.Tensor] = None, - use_audio_in_video: bool = False, - ) -> tuple[torch.Tensor, int]: - from vllm.transformers_utils.config import thinker_uses_mrope - if thinker_uses_mrope(hf_config): - return cls._omni_get_input_positions_tensor( - input_tokens=input_tokens, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - context_len=context_len, - seq_len=seq_len, - audio_feature_lengths=audio_feature_lengths, - use_audio_in_video=use_audio_in_video, - ) - elif hf_config.model_type in ["glm4v", "glm4v_moe"]: - return cls._glm4v_get_input_positions_tensor( - input_tokens=input_tokens, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - context_len=context_len, - seq_len=seq_len, - ) - else: - return cls._vl_get_input_positions_tensor( - input_tokens=input_tokens, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - context_len=context_len, - seq_len=seq_len, - ) - - @classmethod - def _glm4v_get_input_positions_tensor( - cls, - input_tokens: list[int], 
- hf_config: PretrainedConfig, - image_grid_thw: Union[list[list[int]], torch.Tensor], - video_grid_thw: Union[list[list[int]], torch.Tensor], - context_len: int = 0, - seq_len: Optional[int] = None, - ) -> tuple[torch.Tensor, int]: - """Get mrope input positions and delta value for GLM4V.""" - - image_token_id = hf_config.image_token_id - video_start_token_id = hf_config.video_start_token_id - video_end_token_id = hf_config.video_end_token_id - spatial_merge_size = hf_config.vision_config.spatial_merge_size - llm_pos_ids_list: list = [] - - if not (image_grid_thw is None and video_grid_thw is None): - if isinstance(image_grid_thw, torch.Tensor): - image_grid_thw = image_grid_thw.tolist() - - input_token_type: list[str] = [] - video_check_flg = False - for token in input_tokens: - if token == video_start_token_id: - video_check_flg = True - elif token == video_end_token_id: - video_check_flg = False - - if (token == image_token_id) and (video_check_flg is False): - input_token_type.append("image") - elif (token == image_token_id) and (video_check_flg is True): - input_token_type.append("video") - else: - input_token_type.append("text") - - input_type_group: list[tuple[str, int, int]] = [] - for key, group_iter in itertools.groupby( - enumerate(input_token_type), lambda x: x[1]): - group_list = list(group_iter) - start_index = group_list[0][0] - end_index = group_list[-1][0] + 1 - input_type_group.append((key, start_index, end_index)) - - video_frame_num = 1 - mm_data_idx = 0 - for modality_type, start_idx, end_idx in input_type_group: - st_idx = llm_pos_ids_list[-1].max() + 1 if len( - llm_pos_ids_list) > 0 else 0 - if modality_type == "image": - t, h, w = ( - image_grid_thw[mm_data_idx][0], - image_grid_thw[mm_data_idx][1], - image_grid_thw[mm_data_idx][2], - ) - llm_grid_t, llm_grid_h, llm_grid_w = \ - t, h // spatial_merge_size, w // spatial_merge_size - - t_index = torch.arange(llm_grid_t).view(-1, 1).expand( - -1, llm_grid_h * llm_grid_w).flatten() - h_index = 
torch.arange(llm_grid_h).view(1, -1, 1).expand( - llm_grid_t, -1, llm_grid_w).flatten() - w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand( - llm_grid_t, llm_grid_h, -1).flatten() - llm_pos_ids_list.append( - torch.stack([t_index, h_index, w_index]) + st_idx) - mm_data_idx += 1 - - elif modality_type == "video": - t, h, w = ( - video_frame_num, - image_grid_thw[mm_data_idx][1], - image_grid_thw[mm_data_idx][2], - ) - llm_grid_t, llm_grid_h, llm_grid_w = \ - t, h // spatial_merge_size, w // spatial_merge_size - - for t_idx in range(llm_grid_t): - t_index = torch.tensor(t_idx).view(-1, 1).expand( - -1, llm_grid_h * llm_grid_w).flatten() - h_index = torch.arange(llm_grid_h).view( - 1, -1, 1).expand(1, -1, llm_grid_w).flatten() - w_index = torch.arange(llm_grid_w).view( - 1, 1, -1).expand(1, llm_grid_h, -1).flatten() - llm_pos_ids_list.append( - torch.stack([t_index, h_index, w_index]) + st_idx) - - mm_data_idx += 1 - video_frame_num += 1 - - else: - text_len = end_idx - start_idx - llm_pos_ids_list.append( - torch.arange(text_len).view(1, -1).expand(3, -1) + - st_idx) - video_frame_num = 1 - - else: - text_len = len(input_tokens) - llm_pos_ids_list.append( - torch.arange(text_len).view(1, -1).expand(3, -1)) - - llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) - llm_positions = llm_positions[:, context_len:seq_len] - mrope_position_delta = (llm_positions.max() + 1 - - len(input_tokens)).item() - return llm_positions, mrope_position_delta - - @classmethod - def _vl_get_input_positions_tensor( - cls, - input_tokens: list[int], - hf_config: PretrainedConfig, - image_grid_thw: Union[list[list[int]], torch.Tensor], - video_grid_thw: Union[list[list[int]], torch.Tensor], - second_per_grid_ts: list[float], - context_len: int = 0, - seq_len: Optional[int] = None, - ) -> tuple[torch.Tensor, int]: - """Get mrope input positions and delta value.""" - - image_token_id = hf_config.image_token_id - video_token_id = hf_config.video_token_id - 
vision_start_token_id = hf_config.vision_start_token_id - spatial_merge_size = hf_config.vision_config.spatial_merge_size - tokens_per_second = getattr(hf_config.vision_config, - "tokens_per_second", 1.0) - - input_tokens_tensor = torch.tensor(input_tokens) - vision_start_indices = torch.argwhere( - input_tokens_tensor == vision_start_token_id).squeeze(1) - vision_tokens = input_tokens_tensor[vision_start_indices + 1] - image_nums = (vision_tokens == image_token_id).sum() - video_nums = (vision_tokens == video_token_id).sum() - llm_pos_ids_list: list = [] - - st = 0 - remain_images, remain_videos = image_nums, video_nums - - image_index, video_index = 0, 0 - for _ in range(image_nums + video_nums): - video_second_per_grid_t = 0.0 - if image_token_id in input_tokens and remain_images > 0: - ed_image = input_tokens.index(image_token_id, st) - else: - ed_image = len(input_tokens) + 1 - if video_token_id in input_tokens and remain_videos > 0: - ed_video = input_tokens.index(video_token_id, st) - else: - ed_video = len(input_tokens) + 1 - if ed_image < ed_video: - t, h, w = ( - image_grid_thw[image_index][0], - image_grid_thw[image_index][1], - image_grid_thw[image_index][2], - ) - image_index += 1 - remain_images -= 1 - ed = ed_image - else: - t, h, w = ( - video_grid_thw[video_index][0], - video_grid_thw[video_index][1], - video_grid_thw[video_index][2], - ) - video_second_per_grid_t = 1.0 - if second_per_grid_ts: - video_second_per_grid_t = second_per_grid_ts[video_index] - video_index += 1 - remain_videos -= 1 - ed = ed_video - - llm_grid_t, llm_grid_h, llm_grid_w = \ - t, h // spatial_merge_size, w // spatial_merge_size - text_len = ed - st - - st_idx = llm_pos_ids_list[-1].max() + 1 if len( - llm_pos_ids_list) > 0 else 0 - llm_pos_ids_list.append( - torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) - - t_index = (torch.arange(llm_grid_t).view(-1, 1).expand( - -1, llm_grid_h * llm_grid_w) * video_second_per_grid_t * - tokens_per_second).long().flatten() - 
- h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand( - llm_grid_t, -1, llm_grid_w).flatten() - w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand( - llm_grid_t, llm_grid_h, -1).flatten() - llm_pos_ids_list.append( - torch.stack([t_index, h_index, w_index]) + text_len + st_idx) - st = ed + llm_grid_t * llm_grid_h * llm_grid_w - - if st < len(input_tokens): - st_idx = llm_pos_ids_list[-1].max() + 1 if len( - llm_pos_ids_list) > 0 else 0 - text_len = len(input_tokens) - st - llm_pos_ids_list.append( - torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) - - llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) - mrope_position_delta = (llm_positions.max() + 1 - - len(input_tokens)).item() - llm_positions = llm_positions[:, context_len:seq_len] - - return llm_positions, mrope_position_delta - - @classmethod - def _omni_get_input_positions_tensor( - cls, - input_tokens: list[int], - hf_config: PretrainedConfig, - image_grid_thw: Union[list[list[int]], torch.Tensor], - video_grid_thw: Union[list[list[int]], torch.Tensor], - second_per_grid_ts: Optional[list[float]] = None, - context_len: int = 0, - seq_len: Optional[int] = None, - audio_feature_lengths: Optional[torch.Tensor] = None, - use_audio_in_video: bool = False, - ) -> tuple[torch.Tensor, int]: - """Get mrope input positions and delta value (Qwen2.5-Omni version). - - Differences from MRotaryEmbedding: - 1. Add audio support (and related `audio_feature_lengths`). - 2. Add `use_audio_in_video` option to read audio from video inputs. - In this case, audio and vision position ids will be split into - chunks and interleaved. - - Example: - - (V_i are vision position ids, A_i are audio position ids) - - |V_1 ... V_n|A_1 ... A_n|V_n+1 ... V_2n|A_n+1 ... A_2n|... - |vision chunk 1|audio chunk 1|vision chunk 2|audio chunk 2 |... - """ - - # TODO(fyabc): refactor and share more code with - # _vl_get_input_positions_tensor. 
- - thinker_config = hf_config.thinker_config - audio_token_id = thinker_config.audio_token_index - image_token_id = thinker_config.image_token_index - video_token_id = thinker_config.video_token_index - audio_start_token_id = thinker_config.audio_start_token_id - audio_end_token_id = thinker_config.audio_end_token_id - vision_start_token_id = thinker_config.vision_start_token_id - vision_end_token_id = thinker_config.vision_end_token_id - seconds_per_chunk = thinker_config.seconds_per_chunk - spatial_merge_size = thinker_config.vision_config.spatial_merge_size - tokens_per_second = getattr(thinker_config.vision_config, - "tokens_per_second", 25) - - if isinstance(image_grid_thw, list): - image_grid_thw = torch.tensor(image_grid_thw) - if isinstance(video_grid_thw, list): - video_grid_thw = torch.tensor(video_grid_thw) - - src_item = input_tokens - audio_seqlens = audio_feature_lengths - if not second_per_grid_ts: - second_per_grid_ts = [1] * video_grid_thw.shape[0] - audio_idx = 0 - video_idx = 0 - image_idx = 0 - new_src_item: list[int] = [] - llm_pos_ids_list: list[torch.Tensor] = [] - - idx = 0 - while idx < len(src_item): - new_src_item_len = len(new_src_item) - start_idx = llm_pos_ids_list[-1].max() + 1 if len( - llm_pos_ids_list) > 0 else 0 - if src_item[idx] not in [ - audio_token_id, video_token_id, image_token_id - ]: - if use_audio_in_video and idx > 0: - if src_item[idx] == vision_end_token_id and \ - src_item[idx - 1] == audio_end_token_id: - # processing the <|audio_eos|> before <|vision_eos|> - start_idx -= 1 - elif src_item[idx] == audio_start_token_id and \ - src_item[idx - 1] == vision_start_token_id: - # processing the <|audio_bos|> after <|vision_eos|> - start_idx -= 1 - new_src_item.append(src_item[idx]) - llm_pos_ids = torch.tensor([start_idx], - dtype=torch.long).expand(3, -1) - llm_pos_ids_list.append(llm_pos_ids) - elif src_item[idx] == audio_token_id: - assert audio_seqlens is not None - audio_seqlen = audio_seqlens[audio_idx] - place_num 
= (((audio_seqlen - 1) // 2 + 1 - 2) // 2 + 1) - new_src_item.extend([audio_token_id] * place_num) - llm_pos_ids = torch.arange(place_num).expand(3, -1) + start_idx - llm_pos_ids_list.append(llm_pos_ids) - audio_idx += 1 - elif src_item[idx] == image_token_id: - grid_t = image_grid_thw[image_idx][0] - grid_hs = image_grid_thw[:, 1] - grid_ws = image_grid_thw[:, 2] - t_index = (torch.arange(grid_t) * 1 * tokens_per_second).long() - llm_pos_ids = cls._get_llm_pos_ids_for_vision( - start_idx, image_idx, spatial_merge_size, t_index, grid_hs, - grid_ws) - llm_pos_ids_list.append(llm_pos_ids) - vision_seqlen = image_grid_thw[image_idx].prod() // ( - spatial_merge_size**2) - new_src_item.extend([image_token_id] * vision_seqlen) - image_idx += 1 - elif src_item[idx] == video_token_id and not use_audio_in_video: - grid_t = video_grid_thw[video_idx][0] - grid_hs = video_grid_thw[:, 1] - grid_ws = video_grid_thw[:, 2] - t_index = (torch.arange(grid_t) * - second_per_grid_ts[video_idx] * - tokens_per_second).long() - llm_pos_ids = cls._get_llm_pos_ids_for_vision( - start_idx, video_idx, spatial_merge_size, t_index, grid_hs, - grid_ws) - llm_pos_ids_list.append(llm_pos_ids) - vision_seqlen = video_grid_thw[video_idx].prod() // ( - spatial_merge_size**2) - new_src_item.extend([video_token_id] * vision_seqlen) - video_idx += 1 - else: - # read audio from video - assert audio_seqlens is not None - audio_seqlen = audio_seqlens[audio_idx] - vision_seqlen = video_grid_thw[video_idx].prod() // ( - spatial_merge_size**2) - grid_t = video_grid_thw[video_idx][0] - grid_h = video_grid_thw[video_idx][1] - grid_w = video_grid_thw[video_idx][2] - grid_hs = video_grid_thw[:, 1] - grid_ws = video_grid_thw[:, 2] - t_ntoken_per_chunk = int(tokens_per_second * seconds_per_chunk) - t_index = (torch.arange(grid_t) * - second_per_grid_ts[video_idx] * - tokens_per_second).long() - t_index_split_chunk = cls._split_list_into_ranges( - t_index, t_ntoken_per_chunk) - place_num = (((audio_seqlen - 1) // 2 
+ 1 - 2) // 2 + 1) + 2 - pure_audio_len = place_num - 2 - added_audio_len = 0 - audio_llm_pos_ids_list: list[torch.Tensor] = [] - for t_chunk in t_index_split_chunk: - vision_ntoken_per_chunk = len( - t_chunk) * grid_h * grid_w // (spatial_merge_size**2) - new_src_item.extend([video_token_id] * - vision_ntoken_per_chunk) - vision_llm_pos_ids_list = cls._get_llm_pos_ids_for_vision( - start_idx, video_idx, spatial_merge_size, t_chunk, - grid_hs, grid_ws).split(1, dim=1) - llm_pos_ids_list.extend(vision_llm_pos_ids_list) - new_src_item.extend( - min(t_ntoken_per_chunk, pure_audio_len - - added_audio_len) * [audio_token_id]) - audio_start_idx = start_idx if len( - audio_llm_pos_ids_list - ) == 0 else audio_llm_pos_ids_list[-1][0].item() + 1 - if min(t_ntoken_per_chunk, - pure_audio_len - added_audio_len) > 0: - audio_llm_pos_ids_list = (torch.arange( - min(t_ntoken_per_chunk, pure_audio_len - - added_audio_len)).expand(3, -1) + - audio_start_idx).split(1, - dim=1) - else: - audio_llm_pos_ids_list = [] - added_audio_len += min(t_ntoken_per_chunk, - pure_audio_len - added_audio_len) - llm_pos_ids_list.extend(audio_llm_pos_ids_list) - if added_audio_len < pure_audio_len: - new_src_item.extend( - (pure_audio_len - added_audio_len) * [audio_token_id]) - audio_llm_pos_ids_list = ( - torch.arange(pure_audio_len - added_audio_len).expand( - 3, -1) + llm_pos_ids_list[-1].max() + 1).split( - 1, dim=1) - llm_pos_ids_list.extend(audio_llm_pos_ids_list) - audio_idx += 1 - video_idx += 1 - # move to the next token - idx += len(new_src_item) - new_src_item_len - - llm_positions = torch.cat(llm_pos_ids_list, dim=1) - mrope_position_delta = torch.cat(llm_pos_ids_list, - dim=1).max() + 1 - len(src_item) - llm_positions = llm_positions[:, context_len:seq_len] - - return llm_positions, mrope_position_delta - - @staticmethod - def _get_llm_pos_ids_for_vision( - start_idx: int, - vision_idx: int, - spatial_merge_size: int, - t_index: list[int], - grid_hs: torch.Tensor, - grid_ws: 
torch.Tensor, - ) -> torch.Tensor: - llm_pos_ids_list = [] - llm_grid_h = grid_hs[vision_idx] // spatial_merge_size - llm_grid_w = grid_ws[vision_idx] // spatial_merge_size - h_index = (torch.arange(llm_grid_h).view(1, -1, 1).expand( - len(t_index), -1, llm_grid_w).flatten()) - w_index = (torch.arange(llm_grid_w).view(1, 1, -1).expand( - len(t_index), llm_grid_h, -1).flatten()) - t_index_tensor = torch.Tensor(t_index).to(llm_grid_h.device).view( - -1, 1).expand(-1, llm_grid_h * llm_grid_w).long().flatten() - _llm_pos_ids = torch.stack([t_index_tensor, h_index, w_index]) - llm_pos_ids_list.append(_llm_pos_ids + start_idx) - llm_pos_ids = torch.cat(llm_pos_ids_list, dim=1) - return llm_pos_ids - - @staticmethod - def _split_list_into_ranges(lst: torch.Tensor, - interval: int) -> list[list[int]]: - ranges: list[list[int]] = [[] - for _ in range((max(lst) // interval) + 1)] - for num in lst: - index = num // interval - ranges[index].append(num) - return ranges - - @staticmethod - def get_next_input_positions( - mrope_position_delta: int, - context_len: int, - seq_len: int, - ) -> list[list[int]]: - return [ - list( - range(context_len + mrope_position_delta, - seq_len + mrope_position_delta)) for _ in range(3) - ] - - @staticmethod - def get_next_input_positions_tensor(out: np.ndarray, out_offset: int, - mrope_position_delta: int, - context_len: int, num_new_tokens: int): - - values = np.arange(mrope_position_delta + context_len, - mrope_position_delta + context_len + num_new_tokens, - dtype=out.dtype) - out[:, out_offset:out_offset + num_new_tokens] = values - - @classmethod - def omni_get_updates_use_audio_in_video( - cls, - thinker_config: PretrainedConfig, - audio_len: int, - video_grid_thw: Union[list[int], torch.Tensor], - video_second_per_grid_t: float, - ) -> list[int]: - """Get video prompt updates when `use_audio_in_video` is True. 
- - In this case, audio and vision update ids will be split into - chunks and interleaved (details in `_omni_get_input_positions_tensor`). - - <|video_bos|><|VIDEO|><|video_eos|> => - <|video_bos|><|audio_bos|>(... chunks ...)<|audio_eos|><|video_eos|> - """ - - audio_token_id = thinker_config.audio_token_index - video_token_id = thinker_config.video_token_index - audio_start_token_id = thinker_config.audio_start_token_id - audio_end_token_id = thinker_config.audio_end_token_id - seconds_per_chunk = thinker_config.seconds_per_chunk - spatial_merge_size = thinker_config.vision_config.spatial_merge_size - tokens_per_second = getattr(thinker_config.vision_config, - "tokens_per_second", 25) - - grid_t = video_grid_thw[0] - grid_h = video_grid_thw[1] - grid_w = video_grid_thw[2] - t_ntoken_per_chunk = int(tokens_per_second * seconds_per_chunk) - t_index = (torch.arange(grid_t) * video_second_per_grid_t * - tokens_per_second).long() - t_index_split_chunk = cls._split_list_into_ranges( - t_index, t_ntoken_per_chunk) - - updates = [audio_start_token_id] - added_audio_len = 0 - for t_chunk in t_index_split_chunk: - vision_ntoken_per_chunk = len(t_chunk) * grid_h * grid_w // ( - spatial_merge_size**2) - updates.extend([video_token_id] * vision_ntoken_per_chunk) - - audio_chunk_size = min(t_ntoken_per_chunk, - audio_len - added_audio_len) - updates.extend(audio_chunk_size * [audio_token_id]) - added_audio_len += audio_chunk_size - if added_audio_len < audio_len: - updates.extend((audio_len - added_audio_len) * [audio_token_id]) - updates.extend([audio_end_token_id]) - - return updates - - -@CustomOp.register("dual_chunk_rotary_embedding") -class DualChunkRotaryEmbedding(CustomOp): - """Rotary positional embedding for Dual Chunk Attention.""" - - def __init__( - self, - head_size: int, - rotary_dim: int, - max_position_embeddings: int, - base: float, - is_neox_style: bool, - dtype: torch.dtype, - chunk_size: int, - local_size: int, - ) -> None: - super().__init__() - 
self.head_size = head_size - self.rotary_dim = rotary_dim - self.max_position_embeddings = max_position_embeddings - self.base = base - self.is_neox_style = is_neox_style - self.chunk_size = chunk_size - self.local_size = local_size - self.dtype = dtype - self.device = torch.device(f"cuda:{torch.cuda.current_device()}") - (q_cache, qc_cache, k_cache, qc_no_clamp_cache, - q_inter_cache) = self._compute_cos_sin_cache() - - self.register_buffer("cos_sin_q_cache", q_cache, persistent=False) - self.register_buffer("cos_sin_qc_cache", qc_cache, persistent=False) - self.register_buffer("cos_sin_k_cache", k_cache, persistent=False) - self.register_buffer("cos_sin_qc_no_clamp_cache", - qc_no_clamp_cache, - persistent=False) - self.register_buffer("cos_sin_q_inter_cache", - q_inter_cache, - persistent=False) - - def _compute_inv_freq(self, base: float) -> torch.Tensor: - """Compute the inverse frequency.""" - # NOTE(woosuk): The HF implementation uses `torch.arange(...).float()`. - # However, we use `torch.arange(..., dtype=torch.float)` instead to - # avoid numerical issues with large base values (e.g., 10000000). - # This may cause a slight numerical difference between the HF - # implementation and ours. - # NOTE(woosuk): To exactly match the HF implementation, we need to - # use CPU to compute the cache and then move it to GPU. However, we - # create the cache on GPU for faster initialization. This may cause - # a slight numerical difference between the HF implementation and ours. 
- inv_freq = 1.0 / (base**(torch.arange( - 0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim)) - return inv_freq - - def _compute_cos_sin_cache(self) -> torch.Tensor: - """Compute the cos and sin cache.""" - inv_freq = self._compute_inv_freq(self.base) - chunk_len = self.chunk_size - self.local_size - q_t = torch.arange(chunk_len, dtype=torch.float) - qc_t = (torch.arange(chunk_len, dtype=torch.float) + - chunk_len).clamp(max=self.chunk_size) - k_t = torch.arange(self.max_position_embeddings, - dtype=torch.float) % chunk_len - - # count from chunk_len, no clamp(self.chunk_size) restriction - qc_no_clamp_t = torch.arange(chunk_len, dtype=torch.float) + chunk_len - # count from self.chunk_size for q_inter's rope - q_inter_t = torch.arange(chunk_len, - dtype=torch.float) + self.chunk_size - - q_freqs = torch.outer(q_t, inv_freq) - qc_freqs = torch.outer(qc_t, inv_freq) - k_freqs = torch.outer(k_t, inv_freq) - qc_no_clamp_freqs = torch.outer(qc_no_clamp_t, inv_freq) - q_inter_freqs = torch.outer(q_inter_t, inv_freq) - - q_cos = q_freqs.cos() - q_sin = q_freqs.sin() - qc_cos = qc_freqs.cos() - qc_sin = qc_freqs.sin() - k_cos = k_freqs.cos() - k_sin = k_freqs.sin() - - qc_no_clamp_cos = qc_no_clamp_freqs.cos() - qc_no_clamp_sin = qc_no_clamp_freqs.sin() - q_inter_cos = q_inter_freqs.cos() - q_inter_sin = q_inter_freqs.sin() - - q_cache = torch.cat((q_cos, q_sin), dim=-1).to(dtype=self.dtype, - device=self.device) - qc_cache = torch.cat((qc_cos, qc_sin), dim=-1).to(dtype=self.dtype, - device=self.device) - k_cache = torch.cat((k_cos, k_sin), dim=-1).to(dtype=self.dtype, - device=self.device) - qc_no_clamp_cache = torch.cat((qc_no_clamp_cos, qc_no_clamp_sin), - dim=-1).to(dtype=self.dtype, - device=self.device) - q_inter_cache = torch.cat((q_inter_cos, q_inter_sin), - dim=-1).to(dtype=self.dtype, - device=self.device) - return q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache - - def forward( - self, - positions: torch.Tensor, - query: torch.Tensor, - 
key: torch.Tensor, - offsets: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, torch.Tensor]: - query = query.view(*query.shape[:-1], -1, self.head_size) - key = key.view(*key.shape[:-1], -1, self.head_size) - query_rot = query[..., :self.rotary_dim] - key_rot = key[..., :self.rotary_dim] - if self.rotary_dim < self.head_size: - query_pass = query[..., self.rotary_dim:] - key_pass = key[..., self.rotary_dim:] - else: - query_pass = None - key_pass = None - - positions_with_offsets = (torch.add(positions, offsets) - if offsets is not None else positions) - key = self._apply_rotary_embedding( - self.cos_sin_k_cache[positions_with_offsets], key_rot, key_pass) - chunk_len = self.chunk_size - self.local_size - query = self._apply_rotary_embedding( - self.cos_sin_q_cache[positions_with_offsets % chunk_len], - query_rot, query_pass) - query_succ = self._apply_rotary_embedding( - self.cos_sin_qc_cache[positions_with_offsets % chunk_len], - query_rot, query_pass) - query_inter = self._apply_rotary_embedding( - self.cos_sin_qc_cache[chunk_len - 1].repeat(positions.shape[0], 1), - query_rot, query_pass) - query_succ_critical = self._apply_rotary_embedding( - self.cos_sin_qc_no_clamp_cache[positions_with_offsets % chunk_len], - query_rot, query_pass) - query_inter_critical = self._apply_rotary_embedding( - self.cos_sin_q_inter_cache[positions_with_offsets % chunk_len], - query_rot, query_pass) - - # merge query into one tensor to simplify the interfaces - query = torch.cat(( - query, - query_succ, - query_inter, - query_succ_critical, - query_inter_critical, - ), - dim=-1) - return query, key - - def _apply_rotary_embedding(self, cos_sin, hidden_rot, hidden_pass): - cos, sin = cos_sin.chunk(2, dim=-1) - if self.is_neox_style: - # NOTE(woosuk): Here we assume that the positions tensor has the - # shape [batch_size, seq_len]. 
- cos = cos.repeat(1, 1, 2).unsqueeze(-2) - sin = sin.repeat(1, 1, 2).unsqueeze(-2) - else: - cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2) - sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2) - rotate_fn = _rotate_neox if self.is_neox_style else _rotate_gptj - hidden_rot = hidden_rot * cos + rotate_fn(hidden_rot) * sin - - if self.rotary_dim < self.head_size: - hidden = torch.cat((hidden_rot, hidden_pass), dim=-1) - else: - hidden = hidden_rot - return hidden.flatten(-2).squeeze(0) - - def extra_repr(self) -> str: - s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}" - s += f", max_position_embeddings={self.max_position_embeddings}" - s += f", base={self.base}, is_neox_style={self.is_neox_style}" - s += f", chunk_size={self.chunk_size}, local_size={self.local_size}" - return s - - -_ROPE_DICT: dict[tuple, RotaryEmbedding] = {} - - -def get_rope( - head_size: int, - rotary_dim: int, - max_position: int, - base: float, - is_neox_style: bool = True, - rope_scaling: Optional[dict[str, Any]] = None, - dtype: Optional[torch.dtype] = None, - partial_rotary_factor: float = 1.0, - dual_chunk_attention_config: Optional[dict[str, Any]] = None, -) -> RotaryEmbedding: - if dtype is None: - dtype = torch.get_default_dtype() - if rope_scaling is not None: - # Transforms every value that is a list into a tuple for caching calls - rope_scaling_tuple = { - k: tuple(v) if isinstance(v, list) else v - for k, v in rope_scaling.items() - } - rope_scaling_args = tuple(rope_scaling_tuple.items()) - else: - rope_scaling_args = None - - if dual_chunk_attention_config is not None: - dual_chunk_attention_tuple = { - k: tuple(v) if isinstance(v, list) else v - for k, v in dual_chunk_attention_config.items() - if k != "sparse_attention_config" - } - dual_chunk_attention_args = tuple(dual_chunk_attention_tuple.items()) - else: - dual_chunk_attention_args = None - - if partial_rotary_factor < 1.0: - rotary_dim = int(rotary_dim * partial_rotary_factor) - key = (head_size, 
rotary_dim, max_position, base, is_neox_style, - rope_scaling_args, dual_chunk_attention_args, dtype) - if key in _ROPE_DICT: - return _ROPE_DICT[key] - - if dual_chunk_attention_config is not None: - extra_kwargs = { - k: v - for k, v in dual_chunk_attention_config.items() - if k in ("chunk_size", "local_size") - } - rotary_emb = DualChunkRotaryEmbedding(head_size, rotary_dim, - max_position, base, - is_neox_style, dtype, - **extra_kwargs) - elif not rope_scaling: - rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base, - is_neox_style, dtype) - else: - scaling_type = rope_scaling["rope_type"] - - if scaling_type == "llama3": - scaling_factor = rope_scaling["factor"] - low_freq_factor = rope_scaling["low_freq_factor"] - high_freq_factor = rope_scaling["high_freq_factor"] - original_max_position = rope_scaling[ - "original_max_position_embeddings"] - rotary_emb = Llama3RotaryEmbedding(head_size, rotary_dim, - max_position, base, - is_neox_style, dtype, - scaling_factor, low_freq_factor, - high_freq_factor, - original_max_position) - elif scaling_type == "mllama4": - rotary_emb = Llama4VisionRotaryEmbedding(head_size, rotary_dim, - max_position, base, - is_neox_style, dtype) - elif scaling_type == "default": - if "mrope_section" in rope_scaling: - rotary_emb = MRotaryEmbedding( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - dtype, - mrope_section=rope_scaling["mrope_section"], - ) - else: - rotary_emb = RotaryEmbedding( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - dtype, - ) - elif scaling_type == "linear": - scaling_factor = rope_scaling["factor"] - rotary_emb = LinearScalingRotaryEmbedding(head_size, rotary_dim, - max_position, base, - is_neox_style, - scaling_factor, dtype) - elif scaling_type == "ntk": - scaling_factor = rope_scaling["factor"] - mixed_b = rope_scaling.get('mixed_b', None) - rotary_emb = NTKScalingRotaryEmbedding(head_size, rotary_dim, - max_position, base, - is_neox_style, - 
scaling_factor, dtype, - mixed_b) - elif scaling_type == "dynamic": - if "alpha" in rope_scaling: - scaling_alpha = rope_scaling["alpha"] - rotary_emb = DynamicNTKAlphaRotaryEmbedding( - head_size, rotary_dim, max_position, base, is_neox_style, - scaling_alpha, dtype) - elif "factor" in rope_scaling: - scaling_factor = rope_scaling["factor"] - rotary_emb = DynamicNTKScalingRotaryEmbedding( - head_size, rotary_dim, max_position, base, is_neox_style, - scaling_factor, dtype) - else: - raise ValueError("Dynamic rope scaling must contain either " - "'alpha' or 'factor' field") - elif scaling_type == "yarn": - scaling_factor = rope_scaling["factor"] - original_max_position = rope_scaling[ - "original_max_position_embeddings"] - extra_kwargs = { - k: v - for k, v in rope_scaling.items() - if k in ("extrapolation_factor", "attn_factor", "beta_fast", - "beta_slow") - } - rotary_emb = YaRNScalingRotaryEmbedding(head_size, rotary_dim, - original_max_position, - base, is_neox_style, - scaling_factor, dtype, - **extra_kwargs) - elif scaling_type == "deepseek_yarn": - scaling_factor = rope_scaling["factor"] - original_max_position = rope_scaling[ - "original_max_position_embeddings"] - # assert max_position == original_max_position * scaling_factor - extra_kwargs = { - k: v - for k, v in rope_scaling.items() - if k in ("extrapolation_factor", "attn_factor", "beta_fast", - "beta_slow", "mscale", "mscale_all_dim") - } - rotary_emb = DeepseekScalingRotaryEmbedding( - head_size, rotary_dim, original_max_position, base, - is_neox_style, scaling_factor, dtype, **extra_kwargs) - elif scaling_type == "longrope": - short_factor = rope_scaling["short_factor"] - long_factor = rope_scaling["long_factor"] - original_max_position = rope_scaling[ - "original_max_position_embeddings"] - extra_kwargs = { - k: v - for k, v in rope_scaling.items() - if k in ("short_mscale", "long_mscale") - } - rotary_emb = Phi3LongRoPEScaledRotaryEmbedding( - head_size, rotary_dim, max_position, 
original_max_position, - base, is_neox_style, dtype, short_factor, long_factor, - **extra_kwargs) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") - _ROPE_DICT[key] = rotary_emb - return rotary_emb diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py new file mode 100644 index 0000000000000..564f9a5c00750 --- /dev/null +++ b/vllm/model_executor/layers/rotary_embedding/__init__.py @@ -0,0 +1,190 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Rotary Positional Embeddings.""" +from typing import Any, Optional + +import torch + +from .base import RotaryEmbedding +from .deepseek_scaling_rope import DeepseekScalingRotaryEmbedding +from .dual_chunk_rope import DualChunkRotaryEmbedding +from .dynamic_ntk_alpha_rope import DynamicNTKAlphaRotaryEmbedding +from .dynamic_ntk_scaling_rope import DynamicNTKScalingRotaryEmbedding +from .linear_scaling_rope import LinearScalingRotaryEmbedding +from .llama3_rope import Llama3RotaryEmbedding +from .llama4_vision_rope import Llama4VisionRotaryEmbedding +from .mrope import MRotaryEmbedding +from .ntk_scaling_rope import NTKScalingRotaryEmbedding +from .phi3_long_rope_scaled_rope import Phi3LongRoPEScaledRotaryEmbedding +from .yarn_scaling_rope import YaRNScalingRotaryEmbedding + +_ROPE_DICT: dict[tuple, RotaryEmbedding] = {} + + +def get_rope( + head_size: int, + rotary_dim: int, + max_position: int, + base: float, + is_neox_style: bool = True, + rope_scaling: Optional[dict[str, Any]] = None, + dtype: Optional[torch.dtype] = None, + partial_rotary_factor: float = 1.0, + dual_chunk_attention_config: Optional[dict[str, Any]] = None, +) -> RotaryEmbedding: + if dtype is None: + dtype = torch.get_default_dtype() + if rope_scaling is not None: + # Transforms every value that is a list into a tuple for caching calls + rope_scaling_tuple = { + k: tuple(v) if isinstance(v, list) 
else v + for k, v in rope_scaling.items() + } + rope_scaling_args = tuple(rope_scaling_tuple.items()) + else: + rope_scaling_args = None + + if dual_chunk_attention_config is not None: + dual_chunk_attention_tuple = { + k: tuple(v) if isinstance(v, list) else v + for k, v in dual_chunk_attention_config.items() + if k != "sparse_attention_config" + } + dual_chunk_attention_args = tuple(dual_chunk_attention_tuple.items()) + else: + dual_chunk_attention_args = None + + if partial_rotary_factor < 1.0: + rotary_dim = int(rotary_dim * partial_rotary_factor) + key = (head_size, rotary_dim, max_position, base, is_neox_style, + rope_scaling_args, dual_chunk_attention_args, dtype) + if key in _ROPE_DICT: + return _ROPE_DICT[key] + + if dual_chunk_attention_config is not None: + extra_kwargs = { + k: v + for k, v in dual_chunk_attention_config.items() + if k in ("chunk_size", "local_size") + } + rotary_emb = DualChunkRotaryEmbedding(head_size, rotary_dim, + max_position, base, + is_neox_style, dtype, + **extra_kwargs) + elif not rope_scaling: + rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base, + is_neox_style, dtype) + else: + scaling_type = rope_scaling["rope_type"] + + if scaling_type == "llama3": + scaling_factor = rope_scaling["factor"] + low_freq_factor = rope_scaling["low_freq_factor"] + high_freq_factor = rope_scaling["high_freq_factor"] + original_max_position = rope_scaling[ + "original_max_position_embeddings"] + rotary_emb = Llama3RotaryEmbedding(head_size, rotary_dim, + max_position, base, + is_neox_style, dtype, + scaling_factor, low_freq_factor, + high_freq_factor, + original_max_position) + elif scaling_type == "mllama4": + rotary_emb = Llama4VisionRotaryEmbedding(head_size, rotary_dim, + max_position, base, + is_neox_style, dtype) + elif scaling_type == "default": + if "mrope_section" in rope_scaling: + rotary_emb = MRotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + dtype, + 
mrope_section=rope_scaling["mrope_section"], + ) + else: + rotary_emb = RotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + dtype, + ) + elif scaling_type == "linear": + scaling_factor = rope_scaling["factor"] + rotary_emb = LinearScalingRotaryEmbedding(head_size, rotary_dim, + max_position, base, + is_neox_style, + scaling_factor, dtype) + elif scaling_type == "ntk": + scaling_factor = rope_scaling["factor"] + mixed_b = rope_scaling.get('mixed_b', None) + rotary_emb = NTKScalingRotaryEmbedding(head_size, rotary_dim, + max_position, base, + is_neox_style, + scaling_factor, dtype, + mixed_b) + elif scaling_type == "dynamic": + if "alpha" in rope_scaling: + scaling_alpha = rope_scaling["alpha"] + rotary_emb = DynamicNTKAlphaRotaryEmbedding( + head_size, rotary_dim, max_position, base, is_neox_style, + scaling_alpha, dtype) + elif "factor" in rope_scaling: + scaling_factor = rope_scaling["factor"] + rotary_emb = DynamicNTKScalingRotaryEmbedding( + head_size, rotary_dim, max_position, base, is_neox_style, + scaling_factor, dtype) + else: + raise ValueError("Dynamic rope scaling must contain either " + "'alpha' or 'factor' field") + elif scaling_type == "yarn": + scaling_factor = rope_scaling["factor"] + original_max_position = rope_scaling[ + "original_max_position_embeddings"] + extra_kwargs = { + k: v + for k, v in rope_scaling.items() + if k in ("extrapolation_factor", "attn_factor", "beta_fast", + "beta_slow") + } + rotary_emb = YaRNScalingRotaryEmbedding(head_size, rotary_dim, + original_max_position, + base, is_neox_style, + scaling_factor, dtype, + **extra_kwargs) + elif scaling_type == "deepseek_yarn": + scaling_factor = rope_scaling["factor"] + original_max_position = rope_scaling[ + "original_max_position_embeddings"] + # assert max_position == original_max_position * scaling_factor + extra_kwargs = { + k: v + for k, v in rope_scaling.items() + if k in ("extrapolation_factor", "attn_factor", "beta_fast", + "beta_slow", 
"mscale", "mscale_all_dim") + } + rotary_emb = DeepseekScalingRotaryEmbedding( + head_size, rotary_dim, original_max_position, base, + is_neox_style, scaling_factor, dtype, **extra_kwargs) + elif scaling_type == "longrope": + short_factor = rope_scaling["short_factor"] + long_factor = rope_scaling["long_factor"] + original_max_position = rope_scaling[ + "original_max_position_embeddings"] + extra_kwargs = { + k: v + for k, v in rope_scaling.items() + if k in ("short_mscale", "long_mscale") + } + rotary_emb = Phi3LongRoPEScaledRotaryEmbedding( + head_size, rotary_dim, max_position, original_max_position, + base, is_neox_style, dtype, short_factor, long_factor, + **extra_kwargs) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + _ROPE_DICT[key] = rotary_emb + return rotary_emb diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py new file mode 100644 index 0000000000000..10fce857a8ae2 --- /dev/null +++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -0,0 +1,237 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Rotary Positional Embeddings Base Class.""" +from typing import Optional + +import torch + +from vllm.model_executor.custom_op import CustomOp + +from .common import apply_rotary_emb_dispatch, apply_rotary_emb_torch + + +@CustomOp.register("rotary_embedding") +class RotaryEmbedding(CustomOp): + """Original rotary positional embedding.""" + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + dtype: torch.dtype, + ) -> None: + super().__init__() + self.head_size = head_size + self.rotary_dim = rotary_dim + self.max_position_embeddings = max_position_embeddings + self.base = base + self.is_neox_style = is_neox_style + self.dtype = dtype + + cache = self._compute_cos_sin_cache() + cache = cache.to(dtype) + self.cos_sin_cache: 
torch.Tensor + self.register_buffer("cos_sin_cache", cache, persistent=False) + + def _compute_inv_freq(self, base: float) -> torch.Tensor: + """Compute the inverse frequency.""" + # NOTE(woosuk): To exactly match the HF implementation, we need to + # use CPU to compute the cache and then move it to GPU. However, we + # create the cache on GPU for faster initialization. This may cause + # a slight numerical difference between the HF implementation and ours. + inv_freq = 1.0 / (base**(torch.arange( + 0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim)) + return inv_freq + + def _compute_cos_sin_cache(self) -> torch.Tensor: + """Compute the cos and sin cache.""" + inv_freq = self._compute_inv_freq(self.base) + t = torch.arange(self.max_position_embeddings, dtype=torch.float) + + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = freqs.cos() + sin = freqs.sin() + cache = torch.cat((cos, sin), dim=-1) + return cache + + def forward_native( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """A PyTorch-native implementation of forward().""" + if offsets is not None: + positions = positions + offsets + positions = positions.flatten() + num_tokens = positions.shape[0] + cos_sin = self.cos_sin_cache.index_select(0, positions) + cos, sin = cos_sin.chunk(2, dim=-1) + + query_shape = query.shape + query = query.view(num_tokens, -1, self.head_size) + query_rot = query[..., :self.rotary_dim] + query_pass = query[..., self.rotary_dim:] + query_rot = apply_rotary_emb_torch(query_rot, cos, sin, + self.is_neox_style) + query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + + # key may be None in some cases, e.g. 
cross-layer KV sharing + if key is not None: + key_shape = key.shape + key = key.view(num_tokens, -1, self.head_size) + key_rot = key[..., :self.rotary_dim] + key_pass = key[..., self.rotary_dim:] + key_rot = apply_rotary_emb_torch(key_rot, cos, sin, + self.is_neox_style) + key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + return query, key + + def forward_cuda( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + from vllm import _custom_ops as ops + + # __setattr__ in nn.Module (called by `self.cos_sin_cache = ...`) + # is expensive, so avoid calling it if possible + if self.cos_sin_cache.device != query.device or \ + self.cos_sin_cache.dtype != query.dtype: + self.cos_sin_cache = self.cos_sin_cache.to(query.device, + dtype=query.dtype) + + # ops.rotary_embedding()/batched_rotary_embedding() + # are in-place operations that update the query and key tensors. + if offsets is not None: + ops.batched_rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, + self.is_neox_style, self.rotary_dim, + offsets) + else: + ops.rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, self.is_neox_style) + return query, key + + def forward_xpu( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + from vllm._ipex_ops import ipex_ops as ops + + self.cos_sin_cache = self.cos_sin_cache.to(positions.device, + dtype=query.dtype) + # ops.rotary_embedding()/batched_rotary_embedding() + # are in-place operations that update the query and key tensors. 
+ if key is None: + # XPU kernel doesn't support key=None so fall back to native impl + # TODO(sarckk): add support for optional key in + # ipex.llm.functional.rotary_embedding_batched + return self.forward_native(positions, query, key, offsets) + else: + if offsets is not None: + ops.batched_rotary_embedding(positions, query, key, + self.head_size, + self.cos_sin_cache, + self.is_neox_style, + self.rotary_dim, offsets) + else: + ops.rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, self.is_neox_style) + return query, key + + def forward_neuron( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + + def _apply_rotary_emb_neuron( + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + is_neox_style: bool, + ) -> torch.Tensor: + cos = cos.unsqueeze(-2).to(x.dtype) + sin = sin.unsqueeze(-2).to(x.dtype) + if is_neox_style: + x1, x2 = torch.chunk(x, 2, dim=-1) + else: + # x1 = x[..., ::2] + + # x2 = x[..., 1::2] + d = x.shape[-1] // 2 + x_reshaped = x.view(-1, x.shape[-1]) + x1 = x_reshaped[:, ::2].view(*x.shape[:-1], d) + x2 = x_reshaped[:, 1::2].view(*x.shape[:-1], d) + o1 = x1 * cos - x2 * sin + o2 = x2 * cos + x1 * sin + if is_neox_style: + return torch.cat((o1, o2), dim=-1) + else: + return torch.stack((o1, o2), dim=-1).flatten(-2) + + if offsets is not None: + positions = positions + offsets + + self.cos_sin_cache = self.cos_sin_cache.to(query.device, + dtype=query.dtype) + + positions = positions.flatten() + num_tokens = positions.shape[0] + cos_sin = self.cos_sin_cache.index_select(0, positions) + cos, sin = cos_sin.chunk(2, dim=-1) + + query_shape = query.shape + query = query.view(num_tokens, -1, self.head_size) + if key is not None: + key_shape = key.shape + key = key.view(num_tokens, -1, self.head_size) + + if self.rotary_dim == self.head_size: + query = apply_rotary_emb_dispatch(query, 
cos, sin, + self.is_neox_style) + query = query.reshape(query_shape) + if key is not None: + key = apply_rotary_emb_dispatch(key, cos, sin, + self.is_neox_style) + key = key.reshape(key_shape) + else: + head_size = query.shape[-1] + query_reshaped = query.view(-1, head_size) + query_pass = query_reshaped[:, self.rotary_dim:].view( + *query.shape[:-1], head_size - self.rotary_dim) + query_rot = query_reshaped[:, :self.rotary_dim].view( + *query.shape[:-1], self.rotary_dim) + query_rot = _apply_rotary_emb_neuron(query_rot, cos, sin, + self.is_neox_style) + query = torch.cat((query_rot, query_pass), + dim=-1).reshape(query_shape) + + if key is not None: + key_reshaped = key.view(-1, head_size) + key_pass = key_reshaped[:, self.rotary_dim:].view( + *key.shape[:-1], head_size - self.rotary_dim) + key_rot = key_reshaped[:, :self.rotary_dim].view( + *key.shape[:-1], self.rotary_dim) + key_rot = _apply_rotary_emb_neuron(key_rot, cos, sin, + self.is_neox_style) + key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + return query, key + + def extra_repr(self) -> str: + s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}" + s += f", max_position_embeddings={self.max_position_embeddings}" + s += f", base={self.base}, is_neox_style={self.is_neox_style}" + return s diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py new file mode 100644 index 0000000000000..8d821bea19e3e --- /dev/null +++ b/vllm/model_executor/layers/rotary_embedding/common.py @@ -0,0 +1,105 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import math + +import torch + +from vllm.platforms import current_platform + +if current_platform.is_cuda(): + from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb + + +# common functions +def rotate_neox(x: torch.Tensor) -> torch.Tensor: + x1 = x[..., :x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + 
return torch.cat((-x2, x1), dim=-1) + + +def rotate_gptj(x: torch.Tensor) -> torch.Tensor: + x1 = x[..., ::2] + x2 = x[..., 1::2] + x = torch.stack((-x2, x1), dim=-1) + return x.flatten(-2) + + +def apply_rotary_emb_torch( + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + is_neox_style: bool, +) -> torch.Tensor: + cos = cos.unsqueeze(-2).to(x.dtype) + sin = sin.unsqueeze(-2).to(x.dtype) + if is_neox_style: + x1, x2 = torch.chunk(x, 2, dim=-1) + else: + x1 = x[..., ::2] + x2 = x[..., 1::2] + o1 = x1 * cos - x2 * sin + o2 = x2 * cos + x1 * sin + if is_neox_style: + return torch.cat((o1, o2), dim=-1) + else: + return torch.stack((o1, o2), dim=-1).flatten(-2) + + +def apply_rotary_emb_dispatch(x: torch.Tensor, cos: torch.Tensor, + sin: torch.Tensor, + is_neox_style: bool) -> torch.Tensor: + """ + Args: + x: [num_tokens, num_heads, head_size] + cos: [num_tokens, head_size // 2] + sin: [num_tokens, head_size // 2] + is_neox_style: Whether to use the Neox-style or GPT-J-style rotary + positional embeddings. 
+ """ + if current_platform.is_cuda(): + return apply_rotary_emb(x.unsqueeze(0), cos, sin, + not is_neox_style).squeeze(0) + else: + return apply_rotary_emb_torch(x, cos, sin, is_neox_style) + + +# yarn functions +# Inverse dim formula to find dim based on number of rotations +def yarn_find_correction_dim(num_rotations: int, + dim: int, + base: float = 10000, + max_position_embeddings: int = 2048) -> float: + return (dim * math.log(max_position_embeddings / + (num_rotations * 2 * math.pi))) / (2 * + math.log(base)) + + +# Find dim range bounds based on rotations +def yarn_find_correction_range( + low_rot: int, + high_rot: int, + dim: int, + base: float = 10000, + max_position_embeddings: int = 2048) -> tuple[int, int]: + low = math.floor( + yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)) + high = math.ceil( + yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)) + return max(low, 0), min(high, dim - 1) # Clamp values just in case + + +def yarn_linear_ramp_mask(low: float, high: float, dim: int, + dtype: torch.dtype) -> torch.Tensor: + if low == high: + high += 0.001 # Prevent singularity + + linear_func = (torch.arange(dim, dtype=dtype) - low) / (high - low) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + +def yarn_get_mscale(scale: float = 1) -> float: + if scale <= 1: + return 1.0 + return 0.1 * math.log(scale) + 1.0 diff --git a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py new file mode 100644 index 0000000000000..cd888b733426b --- /dev/null +++ b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py @@ -0,0 +1,131 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import math +from typing import Optional + +import torch + +from vllm.platforms import current_platform + +from .base import RotaryEmbedding +from .common import 
(rotate_gptj, rotate_neox, yarn_find_correction_range, + yarn_linear_ramp_mask) + + +def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 + + +class DeepseekScalingRotaryEmbedding(RotaryEmbedding): + """RotaryEmbedding extended with YaRN method. + + Credits to Peng et al. github.com/jquesnelle/yarn + """ + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + scaling_factor: float, + dtype: torch.dtype, + *, + extrapolation_factor: float = 1, + attn_factor: float = 1, + beta_fast: int = 32, + beta_slow: int = 1, + mscale: float = 1, + mscale_all_dim: float = 0, + ) -> None: + self.scaling_factor = scaling_factor + self.extrapolation_factor = extrapolation_factor + self.attn_factor = attn_factor + self.beta_fast = beta_fast + self.beta_slow = beta_slow + # Get n-d magnitude scaling corrected for interpolation. + self.mscale = float( + yarn_get_mscale(self.scaling_factor, float(mscale)) / + yarn_get_mscale(self.scaling_factor, float(mscale_all_dim)) * + attn_factor) + super().__init__(head_size, rotary_dim, max_position_embeddings, base, + is_neox_style, dtype) + + def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor: + pos_freqs = self.base**( + torch.arange(0, + self.rotary_dim, + 2, + dtype=torch.float, + device=current_platform.device_type) / + self.rotary_dim) + inv_freq_extrapolation = 1.0 / pos_freqs + inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs) + + low, high = yarn_find_correction_range(self.beta_fast, self.beta_slow, + self.rotary_dim, self.base, + self.max_position_embeddings) + # Get n-d rotational scaling corrected for extrapolation + inv_freq_mask = (1 - yarn_linear_ramp_mask( + low, high, self.rotary_dim // 2, + dtype=torch.float)) * self.extrapolation_factor + inv_freq = inv_freq_interpolation * ( + 1 - inv_freq_mask) + inv_freq_extrapolation * 
inv_freq_mask + return inv_freq + + def _compute_cos_sin_cache(self) -> torch.Tensor: + inv_freq = self._compute_inv_freq(self.scaling_factor) + t = torch.arange(self.max_position_embeddings * self.scaling_factor, + device=current_platform.device_type, + dtype=torch.float32) + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = (freqs.cos() * self.mscale) + sin = (freqs.sin() * self.mscale) + cache = torch.cat((cos, sin), dim=-1) + return cache + + def forward( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """PyTorch-native implementation equivalent to forward().""" + assert key is not None + query_rot = query[..., :self.rotary_dim] + key_rot = key[..., :self.rotary_dim] + if self.rotary_dim < self.head_size: + query_pass = query[..., self.rotary_dim:] + key_pass = key[..., self.rotary_dim:] + + if self.cos_sin_cache.device != positions.device: + self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to( + positions.device) + cos_sin = self.cos_sin_cache[torch.add(positions, offsets) + if offsets is not None else positions] + cos, sin = cos_sin.chunk(2, dim=-1) + if self.is_neox_style: + # NOTE(woosuk): Here we assume that the positions tensor has the + # shape [batch_size, seq_len]. 
+ cos = cos.repeat(1, 1, 2).unsqueeze(-2) + sin = sin.repeat(1, 1, 2).unsqueeze(-2) + else: + cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2) + sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2) + + rotate_fn = rotate_neox if self.is_neox_style else rotate_gptj + query_rot = query_rot * cos + rotate_fn(query_rot) * sin + key_rot = key_rot * cos + rotate_fn(key_rot) * sin + + if self.rotary_dim < self.head_size: + query = torch.cat((query_rot, query_pass), dim=-1) + key = torch.cat((key_rot, key_pass), dim=-1) + else: + query = query_rot + key = key_rot + return query, key diff --git a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py new file mode 100644 index 0000000000000..3d8da0fa9d8f5 --- /dev/null +++ b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py @@ -0,0 +1,188 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Optional + +import torch + +from vllm.model_executor.custom_op import CustomOp + +from .common import rotate_gptj, rotate_neox + + +@CustomOp.register("dual_chunk_rotary_embedding") +class DualChunkRotaryEmbedding(CustomOp): + """Rotary positional embedding for Dual Chunk Attention.""" + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + dtype: torch.dtype, + chunk_size: int, + local_size: int, + ) -> None: + super().__init__() + self.head_size = head_size + self.rotary_dim = rotary_dim + self.max_position_embeddings = max_position_embeddings + self.base = base + self.is_neox_style = is_neox_style + self.chunk_size = chunk_size + self.local_size = local_size + self.dtype = dtype + self.device = torch.device(f"cuda:{torch.cuda.current_device()}") + (q_cache, qc_cache, k_cache, qc_no_clamp_cache, + q_inter_cache) = self._compute_cos_sin_cache() + + self.register_buffer("cos_sin_q_cache", 
q_cache, persistent=False) + self.register_buffer("cos_sin_qc_cache", qc_cache, persistent=False) + self.register_buffer("cos_sin_k_cache", k_cache, persistent=False) + self.register_buffer("cos_sin_qc_no_clamp_cache", + qc_no_clamp_cache, + persistent=False) + self.register_buffer("cos_sin_q_inter_cache", + q_inter_cache, + persistent=False) + + def _compute_inv_freq(self, base: float) -> torch.Tensor: + """Compute the inverse frequency.""" + # NOTE(woosuk): The HF implementation uses `torch.arange(...).float()`. + # However, we use `torch.arange(..., dtype=torch.float)` instead to + # avoid numerical issues with large base values (e.g., 10000000). + # This may cause a slight numerical difference between the HF + # implementation and ours. + # NOTE(woosuk): To exactly match the HF implementation, we need to + # use CPU to compute the cache and then move it to GPU. However, we + # create the cache on GPU for faster initialization. This may cause + # a slight numerical difference between the HF implementation and ours. 
+ inv_freq = 1.0 / (base**(torch.arange( + 0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim)) + return inv_freq + + def _compute_cos_sin_cache(self) -> torch.Tensor: + """Compute the cos and sin cache.""" + inv_freq = self._compute_inv_freq(self.base) + chunk_len = self.chunk_size - self.local_size + q_t = torch.arange(chunk_len, dtype=torch.float) + qc_t = (torch.arange(chunk_len, dtype=torch.float) + + chunk_len).clamp(max=self.chunk_size) + k_t = torch.arange(self.max_position_embeddings, + dtype=torch.float) % chunk_len + + # count from chunk_len, no clamp(self.chunk_size) restriction + qc_no_clamp_t = torch.arange(chunk_len, dtype=torch.float) + chunk_len + # count from self.chunk_size for q_inter's rope + q_inter_t = torch.arange(chunk_len, + dtype=torch.float) + self.chunk_size + + q_freqs = torch.outer(q_t, inv_freq) + qc_freqs = torch.outer(qc_t, inv_freq) + k_freqs = torch.outer(k_t, inv_freq) + qc_no_clamp_freqs = torch.outer(qc_no_clamp_t, inv_freq) + q_inter_freqs = torch.outer(q_inter_t, inv_freq) + + q_cos = q_freqs.cos() + q_sin = q_freqs.sin() + qc_cos = qc_freqs.cos() + qc_sin = qc_freqs.sin() + k_cos = k_freqs.cos() + k_sin = k_freqs.sin() + + qc_no_clamp_cos = qc_no_clamp_freqs.cos() + qc_no_clamp_sin = qc_no_clamp_freqs.sin() + q_inter_cos = q_inter_freqs.cos() + q_inter_sin = q_inter_freqs.sin() + + q_cache = torch.cat((q_cos, q_sin), dim=-1).to(dtype=self.dtype, + device=self.device) + qc_cache = torch.cat((qc_cos, qc_sin), dim=-1).to(dtype=self.dtype, + device=self.device) + k_cache = torch.cat((k_cos, k_sin), dim=-1).to(dtype=self.dtype, + device=self.device) + qc_no_clamp_cache = torch.cat((qc_no_clamp_cos, qc_no_clamp_sin), + dim=-1).to(dtype=self.dtype, + device=self.device) + q_inter_cache = torch.cat((q_inter_cos, q_inter_sin), + dim=-1).to(dtype=self.dtype, + device=self.device) + return q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache + + def forward( + self, + positions: torch.Tensor, + query: torch.Tensor, + 
key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + query = query.view(*query.shape[:-1], -1, self.head_size) + key = key.view(*key.shape[:-1], -1, self.head_size) + query_rot = query[..., :self.rotary_dim] + key_rot = key[..., :self.rotary_dim] + if self.rotary_dim < self.head_size: + query_pass = query[..., self.rotary_dim:] + key_pass = key[..., self.rotary_dim:] + else: + query_pass = None + key_pass = None + + positions_with_offsets = (torch.add(positions, offsets) + if offsets is not None else positions) + key = self._apply_rotary_embedding( + self.cos_sin_k_cache[positions_with_offsets], key_rot, key_pass) + chunk_len = self.chunk_size - self.local_size + query = self._apply_rotary_embedding( + self.cos_sin_q_cache[positions_with_offsets % chunk_len], + query_rot, query_pass) + query_succ = self._apply_rotary_embedding( + self.cos_sin_qc_cache[positions_with_offsets % chunk_len], + query_rot, query_pass) + query_inter = self._apply_rotary_embedding( + self.cos_sin_qc_cache[chunk_len - 1].repeat(positions.shape[0], 1), + query_rot, query_pass) + query_succ_critical = self._apply_rotary_embedding( + self.cos_sin_qc_no_clamp_cache[positions_with_offsets % chunk_len], + query_rot, query_pass) + query_inter_critical = self._apply_rotary_embedding( + self.cos_sin_q_inter_cache[positions_with_offsets % chunk_len], + query_rot, query_pass) + + # merge query into one tensor to simplify the interfaces + query = torch.cat(( + query, + query_succ, + query_inter, + query_succ_critical, + query_inter_critical, + ), + dim=-1) + return query, key + + def _apply_rotary_embedding(self, cos_sin, hidden_rot, hidden_pass): + cos, sin = cos_sin.chunk(2, dim=-1) + if self.is_neox_style: + # NOTE(woosuk): Here we assume that the positions tensor has the + # shape [batch_size, seq_len]. 
+ cos = cos.repeat(1, 1, 2).unsqueeze(-2) + sin = sin.repeat(1, 1, 2).unsqueeze(-2) + else: + cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2) + sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2) + rotate_fn = rotate_neox if self.is_neox_style else rotate_gptj + hidden_rot = hidden_rot * cos + rotate_fn(hidden_rot) * sin + + if self.rotary_dim < self.head_size: + hidden = torch.cat((hidden_rot, hidden_pass), dim=-1) + else: + hidden = hidden_rot + return hidden.flatten(-2).squeeze(0) + + def extra_repr(self) -> str: + s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}" + s += f", max_position_embeddings={self.max_position_embeddings}" + s += f", base={self.base}, is_neox_style={self.is_neox_style}" + s += f", chunk_size={self.chunk_size}, local_size={self.local_size}" + return s diff --git a/vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py b/vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py new file mode 100644 index 0000000000000..1da39bbd303bd --- /dev/null +++ b/vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +from .base import RotaryEmbedding + + +class DynamicNTKAlphaRotaryEmbedding(RotaryEmbedding): + """RotaryEmbedding extended with Dynamic NTK alpha. + + Based on the original RotaryEmbedding implementation. 
+ """ + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + scaling_alpha: float, + dtype: torch.dtype, + ) -> None: + self.scaling_alpha = scaling_alpha + super().__init__(head_size, rotary_dim, max_position_embeddings, base, + is_neox_style, dtype) + + def _compute_cos_sin_cache(self) -> torch.Tensor: + # For Hunyuan DynamicNTKAlphaRotaryEmbedding + max_len = self.max_position_embeddings + base = self.base * self.scaling_alpha**(self.rotary_dim / + (self.rotary_dim - 2)) + inv_freq = self._compute_inv_freq(base) + t = torch.arange(max_len, dtype=torch.float) + + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = freqs.cos() + sin = freqs.sin() + cache = torch.cat((cos, sin), dim=-1) + return cache diff --git a/vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py new file mode 100644 index 0000000000000..ec2008b90cfb8 --- /dev/null +++ b/vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch + +from .base import RotaryEmbedding + + +class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding): + """RotaryEmbedding extended with Dynamic NTK scaling. + + Credits to the Reddit users /u/bloc97 and /u/emozilla + """ + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + scaling_factor: float, + dtype: torch.dtype, + ) -> None: + self.scaling_factor = scaling_factor + super().__init__(head_size, rotary_dim, max_position_embeddings, base, + is_neox_style, dtype) + + def _compute_cos_sin_cache(self) -> torch.Tensor: + # NOTE(woosuk): self.max_position_embeddings is the original + # maximum length before applying the rope scaling. + # Thus, the maximum length after applying the rope scaling is + # self.max_position_embeddings * self.scaling_factor. 
+ max_len = self.max_position_embeddings * self.scaling_factor + base = self.base * ( + (self.scaling_factor * max_len / self.max_position_embeddings) - + (self.scaling_factor - 1))**(self.rotary_dim / + (self.rotary_dim - 2)) + inv_freq = self._compute_inv_freq(base) + t = torch.arange(max_len, dtype=torch.float) + + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = freqs.cos() + sin = freqs.sin() + cache = torch.cat((cos, sin), dim=-1) + return cache diff --git a/vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py new file mode 100644 index 0000000000000..6e920991882d4 --- /dev/null +++ b/vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py @@ -0,0 +1,115 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Union + +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class LinearScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with linear scaling.

    It supports multiple scaling factors. Since multiple LoRA adapters may have
    different scaling factors, we need multiple cos/sin caches. In this way,
    instead of running rotary embedding kernel per lora, we can run multiple
    lora in a batched way.

    Caches are built only for the factors passed in ``scaling_factors``;
    include ``1.0`` there if the unscaled cache is needed as well.

    Exemplary for three scaling factors x=1, y and z with embeddings
    [[x11, x12, ... x1m], ..., [xn1, xn2, ..., xnm]] and
    [[y11, y12, ... y1o], ..., [yn1, yn2, ..., yno]], and
    [[z11, z12, ... z1p], ..., [zn1, zn2, ..., znp]],

    the per-factor tables are concatenated along dim 0 (one table after the
    other) into a single cos/sin cache.

    We then use offsets to index into the cos/sin cache for
    the respective scaling factors.

    The offset to cache can be accessed via `scaling_factor_to_offset` API.

    Credits to the Reddit user /u/kaiokendev
    """

    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: float,
        is_neox_style: bool,
        scaling_factors: Union[list[float], float],
        dtype: torch.dtype,
    ) -> None:
        # Wrap a bare number of either numeric type. Checking only ``float``
        # left an integer factor (e.g. ``4`` read from a JSON config)
        # unwrapped, which then failed when iterated below.
        if isinstance(scaling_factors, (int, float)):
            scaling_factors = [scaling_factors]
        self.scaling_factors: list[float] = scaling_factors  # noqa
        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
                         is_neox_style, dtype)
        # Lazy initialized (assigned inside _compute_cos_sin_cache).
        self._scaling_factor_to_offset: dict[float, int]

    def _compute_cos_sin_cache(self) -> torch.Tensor:
        """Build one cos/sin table per scaling factor and stack them.

        Side effect: records, for every scaling factor, the row at which its
        table starts inside the returned tensor.

        Returns:
            The per-factor ``[cos | sin]`` tables concatenated along dim 0.
        """
        inv_freq = self._compute_inv_freq(self.base)
        cache_list: list[torch.Tensor] = []
        # offsets[i] is the first row of the table for scaling_factors[i].
        offsets: list[int] = []
        next_offset = 0
        for scaling_factor in self.scaling_factors:
            # NOTE(woosuk): self.max_position_embeddings is the original
            # maximum length before applying the rope scaling.
            # Thus, the maximum length after applying the rope scaling is
            # self.max_position_embeddings * scaling_factor.
            max_len = self.max_position_embeddings * scaling_factor
            # Linear scaling: positions are compressed by the factor.
            t = torch.arange(max_len, dtype=torch.float) / scaling_factor

            freqs = torch.einsum("i,j -> ij", t, inv_freq)
            cache = torch.cat((freqs.cos(), freqs.sin()), dim=-1)

            offsets.append(next_offset)
            cache_list.append(cache)
            next_offset += cache.shape[0]

        self._scaling_factor_to_offset = {
            float(scaling_factor): offset
            for scaling_factor, offset in zip(self.scaling_factors, offsets)
        }
        return torch.cat(cache_list, dim=0)

    @property
    def scaling_factor_to_offset(self) -> dict[float, int]:
        """Mapping from scaling factor to its first row in the cache."""
        return self._scaling_factor_to_offset
high_freq_factor + self.orig_max_position = orig_max_position + super().__init__(head_size, rotary_dim, max_position_embeddings, base, + is_neox_style, dtype) + + def _compute_inv_freq(self, base: float) -> torch.Tensor: + inv_freqs = super()._compute_inv_freq(base) + low_freq_wavelen = self.orig_max_position / self.low_freq_factor + high_freq_wavelen = self.orig_max_position / self.high_freq_factor + + wave_len = 2 * math.pi / inv_freqs + if self.low_freq_factor != self.high_freq_factor: + smooth = (self.orig_max_position / wave_len - self.low_freq_factor + ) / (self.high_freq_factor - self.low_freq_factor) + else: + smooth = 0 + new_freqs = torch.where( + wave_len < high_freq_wavelen, + inv_freqs, + torch.where( + wave_len > low_freq_wavelen, + inv_freqs / self.scaling_factor, + (1 - smooth) * inv_freqs / self.scaling_factor + + smooth * inv_freqs, + ), + ) + return new_freqs diff --git a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py new file mode 100644 index 0000000000000..415a85ab698bc --- /dev/null +++ b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import math +from typing import Optional + +import torch + +from .base import RotaryEmbedding + + +class Llama4VisionRotaryEmbedding(RotaryEmbedding): + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + dtype: torch.dtype, + ): + super().__init__(head_size, rotary_dim, max_position_embeddings, base, + is_neox_style, dtype) + + def _compute_inv_freq(self, base: float) -> torch.Tensor: + inv_freqs = super()._compute_inv_freq(base) + inv_freqs = inv_freqs[:(self.rotary_dim // 2)] + return inv_freqs + + def _compute_cos_sin_cache(self) -> torch.Tensor: + inv_freq = self._compute_inv_freq(self.base) + + # 
self.max_position_embeddings here is number of image patches + # i.e. (image_size // patch_size) ** 2 + num_patches = self.max_position_embeddings + img_idx = torch.arange(num_patches, + dtype=torch.int32) \ + .reshape(num_patches, 1) + img_idx = torch.cat([img_idx, img_idx[:1]], dim=0) + img_idx[-1, -1] = -2 # set to ID_CLS_TOKEN + num_patches_single_dim = int(math.sqrt(num_patches)) + frequencies_x = img_idx % num_patches_single_dim + frequencies_y = img_idx // num_patches_single_dim + freqs_x = ((frequencies_x + 1)[..., None] * + inv_freq[None, None, :]).repeat_interleave(2, dim=-1) + freqs_y = ((frequencies_y + 1)[..., None] * + inv_freq[None, None, :]).repeat_interleave(2, dim=-1) + freqs = torch.cat([freqs_x, freqs_y], + dim=-1).float().contiguous()[..., ::2] + freqs = freqs.masked_fill(img_idx.reshape(-1, 1, 1) < 0, 0) + cache = torch.view_as_complex( + torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1)) + return cache + + def forward( + self, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + assert key is not None + self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(query.device) + query_ = torch.view_as_complex(query.float().reshape( + *query.shape[:-1], -1, 2)) + key_ = torch.view_as_complex(key.float().reshape( + *key.shape[:-1], -1, 2)) + broadcast_shape = [ + d if i == 1 or i == (query_.ndim - 1) else 1 + for i, d in enumerate(query_.shape) + ] + freqs_ci = self.cos_sin_cache.view(*broadcast_shape) + query_out = torch.view_as_real(query_ * freqs_ci).flatten(3) + key_out = torch.view_as_real(key_ * freqs_ci).flatten(3) + return query_out.type_as(query), key_out.type_as(key) diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py new file mode 100644 index 0000000000000..a75b9e5eb435c --- /dev/null +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -0,0 +1,670 @@ +# SPDX-License-Identifier: Apache-2.0 
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import itertools +from typing import Optional, Union + +import numpy as np +import torch +from transformers import PretrainedConfig + +from .base import RotaryEmbedding +from .common import apply_rotary_emb_dispatch + + +class MRotaryEmbedding(RotaryEmbedding): + """Rotary Embedding with Multimodal Sections.""" + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + dtype: torch.dtype, + mrope_section: Optional[list[int]] = None, + ) -> None: + # In Qwen2.5-VL, the maximum index value is related to the duration of + # the input video. We enlarge max_position_embeddings to 4 times to get + # a larger the cos and sin cache. + self.cache_max_position_num = max_position_embeddings * 4 + super().__init__(head_size, rotary_dim, self.cache_max_position_num, + base, is_neox_style, dtype) + + self.mrope_section = mrope_section + if self.mrope_section: + assert sum(self.mrope_section) == rotary_dim // 2 + + def forward( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """PyTorch-native implementation equivalent to forward(). 
+ + Args: + positions: + [num_tokens,] (text only) or + [3, num_tokens] (T/H/W positions with multimodal inputs) + query: [num_tokens, num_heads * head_size] + key: [num_tokens, num_kv_heads * head_size] + """ + assert positions.ndim == 1 or positions.ndim == 2 + assert key is not None + + num_tokens = positions.shape[-1] + cos_sin = self.cos_sin_cache[positions] + cos, sin = cos_sin.chunk(2, dim=-1) + if positions.ndim == 2: + assert self.mrope_section + + cos = torch.cat([ + m[i] + for i, m in enumerate(cos.split(self.mrope_section, dim=-1)) + ], + dim=-1) + sin = torch.cat([ + m[i] + for i, m in enumerate(sin.split(self.mrope_section, dim=-1)) + ], + dim=-1) + + query_shape = query.shape + query = query.view(num_tokens, -1, self.head_size) + query_rot = query[..., :self.rotary_dim] + query_pass = query[..., self.rotary_dim:] + query_rot = apply_rotary_emb_dispatch(query_rot, cos, sin, + self.is_neox_style) + query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + + key_shape = key.shape + key = key.view(num_tokens, -1, self.head_size) + key_rot = key[..., :self.rotary_dim] + key_pass = key[..., self.rotary_dim:] + key_rot = apply_rotary_emb_dispatch(key_rot, cos, sin, + self.is_neox_style) + key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + return query, key + + @classmethod + def get_input_positions( + cls, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]], + video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]], + second_per_grid_ts: Optional[list[float]], + context_len: int = 0, + seq_len: Optional[int] = None, + audio_feature_lengths: Optional[torch.Tensor] = None, + use_audio_in_video: bool = False, + ) -> tuple[list[list[int]], int]: + """Get mrope input positions and delta value.""" + + image_grid_thw = [] if image_grid_thw is None else image_grid_thw + video_grid_thw = [] if video_grid_thw is None else video_grid_thw + second_per_grid_ts = 
[] if second_per_grid_ts is None else \ + second_per_grid_ts + + llm_positions, mrope_position_delta = \ + cls.get_input_positions_tensor( + input_tokens=input_tokens, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + context_len=context_len, + seq_len=seq_len, + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, + ) + + return llm_positions.tolist(), mrope_position_delta + + @classmethod + def get_input_positions_tensor( + cls, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: Union[list[list[int]], torch.Tensor], + video_grid_thw: Union[list[list[int]], torch.Tensor], + second_per_grid_ts: list[float], + context_len: int = 0, + seq_len: Optional[int] = None, + audio_feature_lengths: Optional[torch.Tensor] = None, + use_audio_in_video: bool = False, + ) -> tuple[torch.Tensor, int]: + from vllm.transformers_utils.config import thinker_uses_mrope + if thinker_uses_mrope(hf_config): + return cls._omni_get_input_positions_tensor( + input_tokens=input_tokens, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + context_len=context_len, + seq_len=seq_len, + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, + ) + elif hf_config.model_type in ["glm4v", "glm4v_moe"]: + return cls._glm4v_get_input_positions_tensor( + input_tokens=input_tokens, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + context_len=context_len, + seq_len=seq_len, + ) + else: + return cls._vl_get_input_positions_tensor( + input_tokens=input_tokens, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + context_len=context_len, + seq_len=seq_len, + ) + + @classmethod + def _glm4v_get_input_positions_tensor( + cls, + input_tokens: 
list[int], + hf_config: PretrainedConfig, + image_grid_thw: Union[list[list[int]], torch.Tensor], + video_grid_thw: Union[list[list[int]], torch.Tensor], + context_len: int = 0, + seq_len: Optional[int] = None, + ) -> tuple[torch.Tensor, int]: + """Get mrope input positions and delta value for GLM4V.""" + + image_token_id = hf_config.image_token_id + video_start_token_id = hf_config.video_start_token_id + video_end_token_id = hf_config.video_end_token_id + spatial_merge_size = hf_config.vision_config.spatial_merge_size + llm_pos_ids_list: list = [] + + if not (image_grid_thw is None and video_grid_thw is None): + if isinstance(image_grid_thw, torch.Tensor): + image_grid_thw = image_grid_thw.tolist() + + input_token_type: list[str] = [] + video_check_flg = False + for token in input_tokens: + if token == video_start_token_id: + video_check_flg = True + elif token == video_end_token_id: + video_check_flg = False + + if (token == image_token_id) and (video_check_flg is False): + input_token_type.append("image") + elif (token == image_token_id) and (video_check_flg is True): + input_token_type.append("video") + else: + input_token_type.append("text") + + input_type_group: list[tuple[str, int, int]] = [] + for key, group_iter in itertools.groupby( + enumerate(input_token_type), lambda x: x[1]): + group_list = list(group_iter) + start_index = group_list[0][0] + end_index = group_list[-1][0] + 1 + input_type_group.append((key, start_index, end_index)) + + video_frame_num = 1 + mm_data_idx = 0 + for modality_type, start_idx, end_idx in input_type_group: + st_idx = llm_pos_ids_list[-1].max() + 1 if len( + llm_pos_ids_list) > 0 else 0 + if modality_type == "image": + t, h, w = ( + image_grid_thw[mm_data_idx][0], + image_grid_thw[mm_data_idx][1], + image_grid_thw[mm_data_idx][2], + ) + llm_grid_t, llm_grid_h, llm_grid_w = \ + t, h // spatial_merge_size, w // spatial_merge_size + + t_index = torch.arange(llm_grid_t).view(-1, 1).expand( + -1, llm_grid_h * llm_grid_w).flatten() 
+ h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand( + llm_grid_t, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand( + llm_grid_t, llm_grid_h, -1).flatten() + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + st_idx) + mm_data_idx += 1 + + elif modality_type == "video": + t, h, w = ( + video_frame_num, + image_grid_thw[mm_data_idx][1], + image_grid_thw[mm_data_idx][2], + ) + llm_grid_t, llm_grid_h, llm_grid_w = \ + t, h // spatial_merge_size, w // spatial_merge_size + + for t_idx in range(llm_grid_t): + t_index = torch.tensor(t_idx).view(-1, 1).expand( + -1, llm_grid_h * llm_grid_w).flatten() + h_index = torch.arange(llm_grid_h).view( + 1, -1, 1).expand(1, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view( + 1, 1, -1).expand(1, llm_grid_h, -1).flatten() + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + st_idx) + + mm_data_idx += 1 + video_frame_num += 1 + + else: + text_len = end_idx - start_idx + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + + st_idx) + video_frame_num = 1 + + else: + text_len = len(input_tokens) + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1)) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + llm_positions = llm_positions[:, context_len:seq_len] + mrope_position_delta = (llm_positions.max() + 1 - + len(input_tokens)).item() + return llm_positions, mrope_position_delta + + @classmethod + def _vl_get_input_positions_tensor( + cls, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: Union[list[list[int]], torch.Tensor], + video_grid_thw: Union[list[list[int]], torch.Tensor], + second_per_grid_ts: list[float], + context_len: int = 0, + seq_len: Optional[int] = None, + ) -> tuple[torch.Tensor, int]: + """Get mrope input positions and delta value.""" + + image_token_id = hf_config.image_token_id + video_token_id = hf_config.video_token_id + 
vision_start_token_id = hf_config.vision_start_token_id + spatial_merge_size = hf_config.vision_config.spatial_merge_size + tokens_per_second = getattr(hf_config.vision_config, + "tokens_per_second", 1.0) + + input_tokens_tensor = torch.tensor(input_tokens) + vision_start_indices = torch.argwhere( + input_tokens_tensor == vision_start_token_id).squeeze(1) + vision_tokens = input_tokens_tensor[vision_start_indices + 1] + image_nums = (vision_tokens == image_token_id).sum() + video_nums = (vision_tokens == video_token_id).sum() + llm_pos_ids_list: list = [] + + st = 0 + remain_images, remain_videos = image_nums, video_nums + + image_index, video_index = 0, 0 + for _ in range(image_nums + video_nums): + video_second_per_grid_t = 0.0 + if image_token_id in input_tokens and remain_images > 0: + ed_image = input_tokens.index(image_token_id, st) + else: + ed_image = len(input_tokens) + 1 + if video_token_id in input_tokens and remain_videos > 0: + ed_video = input_tokens.index(video_token_id, st) + else: + ed_video = len(input_tokens) + 1 + if ed_image < ed_video: + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + image_index += 1 + remain_images -= 1 + ed = ed_image + else: + t, h, w = ( + video_grid_thw[video_index][0], + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + video_second_per_grid_t = 1.0 + if second_per_grid_ts: + video_second_per_grid_t = second_per_grid_ts[video_index] + video_index += 1 + remain_videos -= 1 + ed = ed_video + + llm_grid_t, llm_grid_h, llm_grid_w = \ + t, h // spatial_merge_size, w // spatial_merge_size + text_len = ed - st + + st_idx = llm_pos_ids_list[-1].max() + 1 if len( + llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + t_index = (torch.arange(llm_grid_t).view(-1, 1).expand( + -1, llm_grid_h * llm_grid_w) * video_second_per_grid_t * + tokens_per_second).long().flatten() + 
    @classmethod
    def _omni_get_input_positions_tensor(
        cls,
        input_tokens: list[int],
        hf_config: PretrainedConfig,
        image_grid_thw: Union[list[list[int]], torch.Tensor],
        video_grid_thw: Union[list[list[int]], torch.Tensor],
        second_per_grid_ts: Optional[list[float]] = None,
        context_len: int = 0,
        seq_len: Optional[int] = None,
        audio_feature_lengths: Optional[torch.Tensor] = None,
        use_audio_in_video: bool = False,
    ) -> tuple[torch.Tensor, int]:
        """Get mrope input positions and delta value (Qwen2.5-Omni version).

        Differences from MRotaryEmbedding:
            1. Add audio support (and related `audio_feature_lengths`).
            2. Add `use_audio_in_video` option to read audio from video inputs.
                In this case, audio and vision position ids will be split into
                chunks and interleaved.

        Example:

            (V_i are vision position ids, A_i are audio position ids)

            |V_1 ...    V_n|A_1 ...   A_n|V_n+1 ... V_2n|A_n+1 ... A_2n|...
            |vision chunk 1|audio chunk 1|vision chunk 2|audio chunk 2 |...

        Returns:
            ``(positions, delta)``: the position tensor sliced to
            ``[context_len:seq_len]`` along dim 1, and
            ``max_position + 1 - len(input_tokens)`` computed before the
            slice. NOTE(review): the delta is returned as produced by tensor
            arithmetic (there is no ``.item()`` call here), so it is a 0-dim
            tensor rather than a Python int -- confirm callers accept that.
        """

        # TODO(fyabc): refactor and share more code with
        # _vl_get_input_positions_tensor.

        # All token ids and vision settings come from the thinker sub-config.
        thinker_config = hf_config.thinker_config
        audio_token_id = thinker_config.audio_token_index
        image_token_id = thinker_config.image_token_index
        video_token_id = thinker_config.video_token_index
        audio_start_token_id = thinker_config.audio_start_token_id
        audio_end_token_id = thinker_config.audio_end_token_id
        vision_start_token_id = thinker_config.vision_start_token_id
        vision_end_token_id = thinker_config.vision_end_token_id
        seconds_per_chunk = thinker_config.seconds_per_chunk
        spatial_merge_size = thinker_config.vision_config.spatial_merge_size
        tokens_per_second = getattr(thinker_config.vision_config,
                                    "tokens_per_second", 25)

        # Grids are indexed with tensor operations below ([:, 1], .prod()).
        if isinstance(image_grid_thw, list):
            image_grid_thw = torch.tensor(image_grid_thw)
        if isinstance(video_grid_thw, list):
            video_grid_thw = torch.tensor(video_grid_thw)

        src_item = input_tokens
        audio_seqlens = audio_feature_lengths
        # Default to one second per temporal grid step for every video.
        if not second_per_grid_ts:
            second_per_grid_ts = [1] * video_grid_thw.shape[0]
        audio_idx = 0
        video_idx = 0
        image_idx = 0
        # Token sequence rebuilt alongside the positions; its growth per
        # iteration decides how far `idx` advances (see end of the loop).
        new_src_item: list[int] = []
        llm_pos_ids_list: list[torch.Tensor] = []

        idx = 0
        while idx < len(src_item):
            new_src_item_len = len(new_src_item)
            # Next free position: one past the largest position emitted so far.
            start_idx = llm_pos_ids_list[-1].max() + 1 if len(
                llm_pos_ids_list) > 0 else 0
            if src_item[idx] not in [
                    audio_token_id, video_token_id, image_token_id
            ]:
                # Plain token: occupies a single position on all three axes.
                if use_audio_in_video and idx > 0:
                    if src_item[idx] == vision_end_token_id and \
                        src_item[idx - 1] == audio_end_token_id:
                        # Vision end token directly after the audio end
                        # token: reuse the previous position.
                        start_idx -= 1
                    elif src_item[idx] == audio_start_token_id and \
                        src_item[idx - 1] == vision_start_token_id:
                        # Audio start token directly after the vision start
                        # token: reuse the previous position.
                        start_idx -= 1
                new_src_item.append(src_item[idx])
                llm_pos_ids = torch.tensor([start_idx],
                                           dtype=torch.long).expand(3, -1)
                llm_pos_ids_list.append(llm_pos_ids)
            elif src_item[idx] == audio_token_id:
                assert audio_seqlens is not None
                audio_seqlen = audio_seqlens[audio_idx]
                # Number of audio placeholder positions derived from the
                # feature length (presumably mirrors the audio encoder's
                # downsampling -- TODO confirm).
                place_num = (((audio_seqlen - 1) // 2 + 1 - 2) // 2 + 1)
                new_src_item.extend([audio_token_id] * place_num)
                llm_pos_ids = torch.arange(place_num).expand(3, -1) + start_idx
                llm_pos_ids_list.append(llm_pos_ids)
                audio_idx += 1
            elif src_item[idx] == image_token_id:
                grid_t = image_grid_thw[image_idx][0]
                grid_hs = image_grid_thw[:, 1]
                grid_ws = image_grid_thw[:, 2]
                # Images use a fixed factor of 1 where videos use
                # second_per_grid_ts.
                t_index = (torch.arange(grid_t) * 1 * tokens_per_second).long()
                llm_pos_ids = cls._get_llm_pos_ids_for_vision(
                    start_idx, image_idx, spatial_merge_size, t_index, grid_hs,
                    grid_ws)
                llm_pos_ids_list.append(llm_pos_ids)
                vision_seqlen = image_grid_thw[image_idx].prod() // (
                    spatial_merge_size**2)
                new_src_item.extend([image_token_id] * vision_seqlen)
                image_idx += 1
            elif src_item[idx] == video_token_id and not use_audio_in_video:
                grid_t = video_grid_thw[video_idx][0]
                grid_hs = video_grid_thw[:, 1]
                grid_ws = video_grid_thw[:, 2]
                # Temporal index scaled by seconds-per-grid and
                # tokens-per-second, truncated to integers.
                t_index = (torch.arange(grid_t) *
                           second_per_grid_ts[video_idx] *
                           tokens_per_second).long()
                llm_pos_ids = cls._get_llm_pos_ids_for_vision(
                    start_idx, video_idx, spatial_merge_size, t_index, grid_hs,
                    grid_ws)
                llm_pos_ids_list.append(llm_pos_ids)
                vision_seqlen = video_grid_thw[video_idx].prod() // (
                    spatial_merge_size**2)
                new_src_item.extend([video_token_id] * vision_seqlen)
                video_idx += 1
            else:
                # read audio from video
                assert audio_seqlens is not None
                audio_seqlen = audio_seqlens[audio_idx]
                # NOTE(review): vision_seqlen is not used in this branch.
                vision_seqlen = video_grid_thw[video_idx].prod() // (
                    spatial_merge_size**2)
                grid_t = video_grid_thw[video_idx][0]
                grid_h = video_grid_thw[video_idx][1]
                grid_w = video_grid_thw[video_idx][2]
                grid_hs = video_grid_thw[:, 1]
                grid_ws = video_grid_thw[:, 2]
                # Width of one chunk measured in temporal position units.
                t_ntoken_per_chunk = int(tokens_per_second * seconds_per_chunk)
                t_index = (torch.arange(grid_t) *
                           second_per_grid_ts[video_idx] *
                           tokens_per_second).long()
                # Bucket the temporal indices by chunk.
                t_index_split_chunk = cls._split_list_into_ranges(
                    t_index, t_ntoken_per_chunk)
                place_num = (((audio_seqlen - 1) // 2 + 1 - 2) // 2 + 1) + 2
                pure_audio_len = place_num - 2
                added_audio_len = 0
                audio_llm_pos_ids_list: list[torch.Tensor] = []
                for t_chunk in t_index_split_chunk:
                    # Vision tokens of this chunk first ...
                    vision_ntoken_per_chunk = len(
                        t_chunk) * grid_h * grid_w // (spatial_merge_size**2)
                    new_src_item.extend([video_token_id] *
                                        vision_ntoken_per_chunk)
                    vision_llm_pos_ids_list = cls._get_llm_pos_ids_for_vision(
                        start_idx, video_idx, spatial_merge_size, t_chunk,
                        grid_hs, grid_ws).split(1, dim=1)
                    llm_pos_ids_list.extend(vision_llm_pos_ids_list)
                    # ... then up to one chunk worth of the remaining audio.
                    new_src_item.extend(
                        min(t_ntoken_per_chunk, pure_audio_len -
                            added_audio_len) * [audio_token_id])
                    # Audio positions continue from the previous audio chunk
                    # (or from start_idx for the first one).
                    audio_start_idx = start_idx if len(
                        audio_llm_pos_ids_list
                    ) == 0 else audio_llm_pos_ids_list[-1][0].item() + 1
                    if min(t_ntoken_per_chunk,
                           pure_audio_len - added_audio_len) > 0:
                        audio_llm_pos_ids_list = (torch.arange(
                            min(t_ntoken_per_chunk, pure_audio_len -
                                added_audio_len)).expand(3, -1) +
                                                  audio_start_idx).split(1,
                                                                         dim=1)
                    else:
                        audio_llm_pos_ids_list = []
                    added_audio_len += min(t_ntoken_per_chunk,
                                           pure_audio_len - added_audio_len)
                    llm_pos_ids_list.extend(audio_llm_pos_ids_list)
                # Audio left over after the last vision chunk is appended at
                # the end, continuing from the largest position so far.
                if added_audio_len < pure_audio_len:
                    new_src_item.extend(
                        (pure_audio_len - added_audio_len) * [audio_token_id])
                    audio_llm_pos_ids_list = (
                        torch.arange(pure_audio_len - added_audio_len).expand(
                            3, -1) + llm_pos_ids_list[-1].max() + 1).split(
                                1, dim=1)
                    llm_pos_ids_list.extend(audio_llm_pos_ids_list)
                audio_idx += 1
                video_idx += 1
            # move to the next token: skip as many source tokens as were
            # appended to new_src_item in this iteration (assumes the prompt
            # already contains one placeholder per emitted position --
            # TODO confirm against the processor).
            idx += len(new_src_item) - new_src_item_len

        llm_positions = torch.cat(llm_pos_ids_list, dim=1)
        # Delta is taken over the full prompt, before slicing.
        mrope_position_delta = torch.cat(llm_pos_ids_list,
                                         dim=1).max() + 1 - len(src_item)
        llm_positions = llm_positions[:, context_len:seq_len]

        return llm_positions, mrope_position_delta
torch.Tensor, + ) -> torch.Tensor: + llm_pos_ids_list = [] + llm_grid_h = grid_hs[vision_idx] // spatial_merge_size + llm_grid_w = grid_ws[vision_idx] // spatial_merge_size + h_index = (torch.arange(llm_grid_h).view(1, -1, 1).expand( + len(t_index), -1, llm_grid_w).flatten()) + w_index = (torch.arange(llm_grid_w).view(1, 1, -1).expand( + len(t_index), llm_grid_h, -1).flatten()) + t_index_tensor = torch.Tensor(t_index).to(llm_grid_h.device).view( + -1, 1).expand(-1, llm_grid_h * llm_grid_w).long().flatten() + _llm_pos_ids = torch.stack([t_index_tensor, h_index, w_index]) + llm_pos_ids_list.append(_llm_pos_ids + start_idx) + llm_pos_ids = torch.cat(llm_pos_ids_list, dim=1) + return llm_pos_ids + + @staticmethod + def _split_list_into_ranges(lst: torch.Tensor, + interval: int) -> list[list[int]]: + ranges: list[list[int]] = [[] + for _ in range((max(lst) // interval) + 1)] + for num in lst: + index = num // interval + ranges[index].append(num) + return ranges + + @staticmethod + def get_next_input_positions( + mrope_position_delta: int, + context_len: int, + seq_len: int, + ) -> list[list[int]]: + return [ + list( + range(context_len + mrope_position_delta, + seq_len + mrope_position_delta)) for _ in range(3) + ] + + @staticmethod + def get_next_input_positions_tensor(out: np.ndarray, out_offset: int, + mrope_position_delta: int, + context_len: int, num_new_tokens: int): + + values = np.arange(mrope_position_delta + context_len, + mrope_position_delta + context_len + num_new_tokens, + dtype=out.dtype) + out[:, out_offset:out_offset + num_new_tokens] = values + + @classmethod + def omni_get_updates_use_audio_in_video( + cls, + thinker_config: PretrainedConfig, + audio_len: int, + video_grid_thw: Union[list[int], torch.Tensor], + video_second_per_grid_t: float, + ) -> list[int]: + """Get video prompt updates when `use_audio_in_video` is True. 
+ + In this case, audio and vision update ids will be split into + chunks and interleaved (details in `_omni_get_input_positions_tensor`). + + <|video_bos|><|VIDEO|><|video_eos|> => + <|video_bos|><|audio_bos|>(... chunks ...)<|audio_eos|><|video_eos|> + """ + + audio_token_id = thinker_config.audio_token_index + video_token_id = thinker_config.video_token_index + audio_start_token_id = thinker_config.audio_start_token_id + audio_end_token_id = thinker_config.audio_end_token_id + seconds_per_chunk = thinker_config.seconds_per_chunk + spatial_merge_size = thinker_config.vision_config.spatial_merge_size + tokens_per_second = getattr(thinker_config.vision_config, + "tokens_per_second", 25) + + grid_t = video_grid_thw[0] + grid_h = video_grid_thw[1] + grid_w = video_grid_thw[2] + t_ntoken_per_chunk = int(tokens_per_second * seconds_per_chunk) + t_index = (torch.arange(grid_t) * video_second_per_grid_t * + tokens_per_second).long() + t_index_split_chunk = cls._split_list_into_ranges( + t_index, t_ntoken_per_chunk) + + updates = [audio_start_token_id] + added_audio_len = 0 + for t_chunk in t_index_split_chunk: + vision_ntoken_per_chunk = len(t_chunk) * grid_h * grid_w // ( + spatial_merge_size**2) + updates.extend([video_token_id] * vision_ntoken_per_chunk) + + audio_chunk_size = min(t_ntoken_per_chunk, + audio_len - added_audio_len) + updates.extend(audio_chunk_size * [audio_token_id]) + added_audio_len += audio_chunk_size + if added_audio_len < audio_len: + updates.extend((audio_len - added_audio_len) * [audio_token_id]) + updates.extend([audio_end_token_id]) + + return updates diff --git a/vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py new file mode 100644 index 0000000000000..42926bad22ef6 --- /dev/null +++ b/vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the 
class NTKScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with fixed and mixed NTK scaling.
    https://kexue.fm/archives/9706 """

    def __init__(self,
                 head_size: int,
                 rotary_dim: int,
                 max_position_embeddings: int,
                 base: float,
                 is_neox_style: bool,
                 scaling_factor: float,
                 dtype: torch.dtype,
                 mixed_b: Optional[float] = None) -> None:
        # Both attributes are read by _compute_inv_freq, so they are set
        # before delegating to the parent constructor.
        self.scaling_factor = scaling_factor
        self.mixed_b = mixed_b
        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
                         is_neox_style, dtype)

    def _compute_inv_freq(self, base: float) -> torch.Tensor:
        """Return the NTK-scaled inverse frequencies for ``base``.

        With ``mixed_b`` unset (fixed NTK) the base is multiplied by
        ``scaling_factor`` and the result divided by
        ``scaling_factor ** (2 / rotary_dim)``. With ``mixed_b`` set (mixed
        NTK) the base is left as is and frequency ``m`` (1-based) is divided
        by ``exp(a * m ** mixed_b)`` where
        ``a = log(scaling_factor) / (rotary_dim / 2) ** mixed_b``.
        """
        # Honour the ``base`` argument; it was previously ignored in favour
        # of ``self.base``, which silently discarded any other value passed.
        base = base * (self.scaling_factor if self.mixed_b is None else 1)
        inv_freq = super()._compute_inv_freq(base)

        if self.mixed_b is None:
            inv_freq = inv_freq / self.scaling_factor**(2 / self.rotary_dim)
        else:
            a = torch.tensor(self.scaling_factor).log() / (self.rotary_dim /
                                                           2)**self.mixed_b
            lambda_1_m = (a * torch.arange(
                1, self.rotary_dim // 2 + 1).float()**self.mixed_b).exp()
            inv_freq = inv_freq / lambda_1_m

        return inv_freq
+ """ + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + original_max_position_embeddings: int, + base: float, + is_neox_style: bool, + dtype: torch.dtype, + short_factor: list[float], + long_factor: list[float], + short_mscale: Optional[float] = None, + long_mscale: Optional[float] = None, + ): + super().__init__() + + if is_neox_style is False: + raise ValueError( + "`Phi3LongRoPEScaledRotaryEmbedding` only supports neox_style." + ) + + self.rotary_dim = rotary_dim + self.head_size = head_size + self.max_position_embeddings = max_position_embeddings + self.original_max_position_embeddings = original_max_position_embeddings + self.base = base + self.short_factor = short_factor + self.long_factor = long_factor + + scale = self.max_position_embeddings / \ + self.original_max_position_embeddings + if scale <= 1.0: + scaling_factor = 1.0 + else: + scaling_factor = math.sqrt( + 1 + math.log(scale) / + math.log(self.original_max_position_embeddings)) + if short_mscale is None: + short_mscale = scaling_factor + if long_mscale is None: + long_mscale = scaling_factor + + self.short_mscale = short_mscale + self.long_mscale = long_mscale + + short_cache = self._compute_cos_sin_cache( + original_max_position_embeddings, short_factor, short_mscale) + short_cache = short_cache.to(dtype) + + long_cache = self._compute_cos_sin_cache(max_position_embeddings, + long_factor, long_mscale) + long_cache = long_cache.to(dtype) + + long_short_cache = torch.cat([short_cache, long_cache], dim=0) + self.register_buffer("long_short_cos_sin_cache", + long_short_cache, + persistent=False) + + def _compute_inv_freq(self, rescale_factors: list[float]) -> torch.Tensor: + rescale_factors = torch.tensor(rescale_factors, dtype=torch.float32) + inv_freq = 1.0 / (rescale_factors * (self.base**(torch.arange( + 0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim))) + return inv_freq + + def _compute_cos_sin_cache( + self, + max_position_embeddings: 
int, + rescale_factors: list[float], + mscale: float, + ) -> torch.Tensor: + inv_freq = self._compute_inv_freq(rescale_factors) + t = torch.arange(max_position_embeddings, dtype=torch.float) + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = freqs.cos() * mscale + sin = freqs.sin() * mscale + cache = torch.cat((cos, sin), dim=-1) + return cache + + def forward( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + offsets: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + assert key is not None + query = query.view(*query.shape[:-1], -1, self.head_size) + key = key.view(*key.shape[:-1], -1, self.head_size) + + k = self.original_max_position_embeddings + long_prompt_offset = (torch.any(positions > k).float() * + torch.full_like(positions, k)).long() + idx = (torch.add(positions, long_prompt_offset) + if long_prompt_offset is not None else positions) + idx = torch.add(idx, offsets) if offsets is not None else idx + cos_sin = torch.index_select(self.long_short_cos_sin_cache, 0, idx) + + cos, sin = cos_sin.chunk(2, dim=-1) + cos = cos.repeat(1, 2).unsqueeze(-2) + sin = sin.repeat(1, 2).unsqueeze(-2) + + query_rot = query[..., :self.rotary_dim] + query_pass = query[..., self.rotary_dim:] + query_rot = query_rot * cos + rotate_neox(query_rot) * sin + query = torch.cat((query_rot, query_pass), dim=-1) + + key_rot = key[..., :self.rotary_dim] + key_pass = key[..., self.rotary_dim:] + key_rot = key_rot * cos + rotate_neox(key_rot) * sin + key = torch.cat((key_rot, key_pass), dim=-1) + + return query.flatten(-2), key.flatten(-2) diff --git a/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py new file mode 100644 index 0000000000000..851565c5667a4 --- /dev/null +++ b/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project + +import torch + +from .base import RotaryEmbedding +from .common import (yarn_find_correction_range, yarn_get_mscale, + yarn_linear_ramp_mask) + + +class YaRNScalingRotaryEmbedding(RotaryEmbedding): + """RotaryEmbedding extended with YaRN method. + + Credits to Peng et al. github.com/jquesnelle/yarn + """ + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + scaling_factor: float, + dtype: torch.dtype, + *, + extrapolation_factor: float = 1, + attn_factor: float = 1, + beta_fast: int = 32, + beta_slow: int = 1, + ) -> None: + self.scaling_factor = scaling_factor + self.extrapolation_factor = extrapolation_factor + self.attn_factor = attn_factor + self.beta_fast = beta_fast + self.beta_slow = beta_slow + # Get n-d magnitude scaling corrected for interpolation + self.mscale = float(yarn_get_mscale(self.scaling_factor) * attn_factor) + super().__init__(head_size, rotary_dim, max_position_embeddings, base, + is_neox_style, dtype) + + def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor: + pos_freqs = self.base**( + torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / + self.rotary_dim) + inv_freq_extrapolation = 1.0 / pos_freqs + inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs) + + low, high = yarn_find_correction_range(self.beta_fast, self.beta_slow, + self.rotary_dim, self.base, + self.max_position_embeddings) + # Get n-d rotational scaling corrected for extrapolation + inv_freq_mask = (1 - yarn_linear_ramp_mask( + low, high, self.rotary_dim // 2, + dtype=torch.float)) * self.extrapolation_factor + inv_freq = inv_freq_interpolation * ( + 1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask + return inv_freq + + def _compute_cos_sin_cache(self) -> torch.Tensor: + inv_freq = self._compute_inv_freq(self.scaling_factor) + t = torch.arange(self.max_position_embeddings * self.scaling_factor, + dtype=torch.float32) 
+ freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = (freqs.cos() * self.mscale) + sin = (freqs.sin() * self.mscale) + cache = torch.cat((cos, sin), dim=-1) + return cache From 031ca762d7bdb566917c8aa39a0294fea89c55ed Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Mon, 4 Aug 2025 22:12:28 -0400 Subject: [PATCH 204/224] [ROCm][Bugfix] Compilation passes fix (#22202) Signed-off-by: Gregory Shtrasberg --- vllm/compilation/pass_manager.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 54f00d5415216..e07e52be9fdf6 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -7,11 +7,13 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform -if current_platform.is_cuda(): +if current_platform.is_cuda_alike(): from .fusion import FusionPass - from .collective_fusion import AllReduceFusionPass, AsyncTPPass from .fusion_attn import AttnFusionPass +if current_platform.is_cuda(): + from .collective_fusion import AllReduceFusionPass, AsyncTPPass + from .activation_quant_fusion import ActivationQuantFusionPass from .fix_functionalization import FixFunctionalizationPass from .inductor_pass import CustomGraphPass, InductorPass, get_pass_context From 6fa41e0c32f3f1b3d4f146c7f6a9872dcf9d0968 Mon Sep 17 00:00:00 2001 From: Yuxuan Zhang <2448370773@qq.com> Date: Tue, 5 Aug 2025 10:12:38 +0800 Subject: [PATCH 205/224] self.gate dtype update for GLM-4.5 (#22203) Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> --- docs/models/supported_models.md | 2 +- tests/models/registry.py | 2 +- vllm/model_executor/models/glm4_moe.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index be3d51a025edf..017a339ffca0c 100644 --- a/docs/models/supported_models.md +++ 
b/docs/models/supported_models.md @@ -606,7 +606,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `GLM4VForCausalLM`^ | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4MoeForCausalLM` | GLM-4.5 | T + IE+ + VE+ | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4v_moeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V-Air`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Glm4v_moeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. 
| ✅︎ | | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index d86bd20fb0e34..47057d32e9cd7 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -383,7 +383,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True, hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501 "Glm4vForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.1V-9B-Thinking"), # noqa: E501 - "Glm4v_moeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V-Air", + "Glm4v_moeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V", is_available_online=False), # noqa: E501 "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m", trust_remote_code=True, diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index c702684c6caa1..bd3e27662ee7c 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -123,6 +123,7 @@ class Glm4MoE(nn.Module): config.n_routed_experts, bias=False, quant_config=None, + params_dtype=torch.float32, prefix=f"{prefix}.gate") self.gate.e_score_correction_bias = nn.Parameter( @@ -180,7 +181,7 @@ class Glm4MoE(nn.Module): if self.n_shared_experts is not None: shared_output = self.shared_experts(hidden_states) - router_logits, _ = self.gate(hidden_states) + router_logits, _ = self.gate(hidden_states.to(dtype=torch.float32)) final_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits) * self.routed_scaling_factor From d7b28f34153a5116174383d97e41a1279b51e5cb Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 4 Aug 2025 22:13:19 -0400 Subject: [PATCH 206/224] [Log] DeepGEMM Update Log for Unaligned Problem Size (#22208) Signed-off-by: yewentao256 --- .../layers/fused_moe/deep_gemm_moe.py | 21 +++++++++++++++++-- .../layers/fused_moe/fused_moe.py | 6 ++---- .../layers/fused_moe/triton_deep_gemm_moe.py | 4 ++-- 3 files changed, 23 insertions(+), 
8 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index bd3605378b6dc..ba7105c83a92f 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -33,7 +33,7 @@ def deep_gemm_block_shape() -> list[int]: return [block, block] -def _valid_deep_gemm_shape(M: int, N: int, K: int): +def _valid_deep_gemm_shape(M: int, N: int, K: int) -> bool: align = deep_gemm_block_shape()[0] return align <= M and N % align == 0 and K % align == 0 @@ -51,9 +51,26 @@ def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor, M = hidden_states.size(0) _, K, N = w2.size() + + align = deep_gemm_block_shape()[0] + if not _valid_deep_gemm_shape(M, N, K): logger.debug_once( - "DeepGemm disabled: unaligned problem size. M: %s, N: %s, K: %s", + "DeepGemm disabled due to unaligned problem size. " + "M: %s, N: %s, K: %s. M should >= align size " + "and N and K must be multiples of %s." + "This is not an error and we will fall back to triton.", + M, + N, + K, + align, + ) + return False + elif N <= 512: + logger.debug_once( + "DeepGemm disabled for N <= 512. M: %s, N: %s, K: %s. " + "This means we will fallback to triton " + "for this specific shape for further speed up.", M, N, K, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 56d1dfe135b3b..597af08c3c9fa 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1360,10 +1360,8 @@ def fused_experts( # E8M0 scale, which means we requantize the weight and input to the specific # scale. Fallen back to cutlass or triton for some cases would cause # accuracy issue. 
- N = w1.size(1) - should_use_deep_gemm = ((N > 512 - and _valid_deep_gemm(hidden_states, w1, w2)) - or is_blackwell_deep_gemm_used()) + should_use_deep_gemm = is_blackwell_deep_gemm_used() or _valid_deep_gemm( + hidden_states, w1, w2) if (allow_deep_gemm and use_fp8_w8a8 and should_use_deep_gemm): assert apply_router_weight_on_input is False assert is_act_and_mul, ( diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py index 1b31368c79cd5..c67f7e808301a 100644 --- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py @@ -107,8 +107,8 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): # Note: the deep gemm workspaces are strictly larger than the triton # workspaces so we can be pessimistic here and allocate for DeepGemm # even if we fall back to triton later, e.g. if expert maps are set. - if self.allow_deep_gemm and (_valid_deep_gemm_shape(M, N, K) - or is_blackwell_deep_gemm_used()): + if self.allow_deep_gemm and (is_blackwell_deep_gemm_used() + or _valid_deep_gemm_shape(M, N, K)): assert self.deep_gemm_expert is not None return self.deep_gemm_expert.workspace_shapes( a, aq, M, N, K, topk, global_num_experts, local_num_experts, From 8a6e108e76aed89ea23c345bd8fc46d904911e7c Mon Sep 17 00:00:00 2001 From: tlipoca9 <160737620+tlipoca9@users.noreply.github.com> Date: Tue, 5 Aug 2025 10:15:31 +0800 Subject: [PATCH 207/224] fix: kimi_k2 return empty tool call list (#22149) Signed-off-by: tlipoca9 --- .../openai/tool_parsers/kimi_k2_tool_parser.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py index b0df442dd8644..834b33052b45d 100644 --- a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +++ 
b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py @@ -38,15 +38,15 @@ class KimiK2ToolParser(ToolParser): self.tool_call_end_token: str = "<|tool_call_end|>" self.tool_call_regex = re.compile( - r"<\|tool_call_begin\|>\s*(?P[\w\.]+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P.*?)\s*<\|tool_call_end\|>" + r"<\|tool_call_begin\|>\s*(?P.+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P.*?)\s*<\|tool_call_end\|>" ) self.stream_tool_call_portion_regex = re.compile( - r"(?P[\w\.]+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P.*)" + r"(?P.+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P.*)" ) self.stream_tool_call_name_regex = re.compile( - r"(?P[\w\.]+:\d+)\s*") + r"(?P.+:\d+)\s*") if not self.model_tokenizer: raise ValueError( @@ -374,4 +374,4 @@ class KimiK2ToolParser(ToolParser): except Exception: logger.exception("Error trying to handle streaming tool call.") - return None # do not stream a delta. skip this token ID. \ No newline at end of file + return None # do not stream a delta. skip this token ID. 
From 7b455cf1c036d12470374d716800d0fd09290a5a Mon Sep 17 00:00:00 2001 From: elvischenv <219235043+elvischenv@users.noreply.github.com> Date: Tue, 5 Aug 2025 10:17:18 +0800 Subject: [PATCH 208/224] [Misc] Remove pass_config from CompilationConfig dump_json excluded (#21911) Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com> --- vllm/config.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index dd59526471782..1100e1077401c 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4374,12 +4374,20 @@ class CompilationConfig: "disabled_custom_ops": True, "compilation_time": True, "bs_to_padded_graph_size": True, - "pass_config": True, "traced_files": True, "inductor_compile_config": { "post_grad_custom_post_pass": True, }, } + + # exclude default attr in pass_config + pass_config_exclude = {} + for attr, default_val in vars(PassConfig()).items(): + if getattr(self.pass_config, attr) == default_val: + pass_config_exclude[attr] = True + if pass_config_exclude: + exclude["pass_config"] = pass_config_exclude + # The cast to string is necessary because Pydantic is mocked in docs # builds and sphinx-argparse doesn't know the return type of decode() return str( From 29b97c09950fb6970756f5e2cfd4a3d7c1f4d72e Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Tue, 5 Aug 2025 10:36:20 +0800 Subject: [PATCH 209/224] [Doc] add backend to doc string of initialize_model_parallel (#22142) Signed-off-by: Andy Xie --- vllm/distributed/parallel_state.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index f31e4766bfdad..48a82d30193e3 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1013,6 +1013,7 @@ def initialize_model_parallel( parallelism. pipeline_model_parallel_size: number of GPUs used for pipeline model parallelism. + backend: name of torch distributed communication backend. 
Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize From bd3db7f46965bfc979734a6d4b50cf96184c10d8 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Tue, 5 Aug 2025 10:36:55 +0800 Subject: [PATCH 210/224] [Misc] log more detailed message for ensure_model_parallel_initialized (#22144) Signed-off-by: Andy Xie --- vllm/distributed/parallel_state.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 48a82d30193e3..470c1355d2a91 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1125,14 +1125,14 @@ def ensure_model_parallel_initialized( assert ( get_tensor_model_parallel_world_size() == tensor_model_parallel_size - ), ("tensor parallel group already initialized, but of unexpected size: " - f"{get_tensor_model_parallel_world_size()=} vs. " - f"{tensor_model_parallel_size=}") + ), ("tensor parallel group already initialized, but of unexpected size. " + f"got: {get_tensor_model_parallel_world_size()=} vs. " + f"wanted: {tensor_model_parallel_size=}") pp_world_size = get_pp_group().world_size assert (pp_world_size == pipeline_model_parallel_size), ( - "pipeline parallel group already initialized, but of unexpected size: " - f"{pp_world_size=} vs. " - f"{pipeline_model_parallel_size=}") + "pipeline parallel group already initialized, but of unexpected size. " + f"got: {pp_world_size=} vs. 
" + f"wanted: {pipeline_model_parallel_size=}") def prepare_communication_buffer_for_model(model: torch.nn.Module): From 4b3e4474d73ae9cf0d6c8315570fdffd71037d08 Mon Sep 17 00:00:00 2001 From: "ZiTian.Zhao" Date: Tue, 5 Aug 2025 12:43:24 +0800 Subject: [PATCH 211/224] Optimize configuration access with LRU cache in custom ops (#22204) Signed-off-by: zitian zhao --- vllm/config.py | 10 +++++++++- vllm/model_executor/custom_op.py | 8 ++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 1100e1077401c..34952279c9d19 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -15,7 +15,7 @@ from collections.abc import Mapping from contextlib import contextmanager from dataclasses import (MISSING, Field, asdict, field, fields, is_dataclass, replace) -from functools import cached_property +from functools import cached_property, lru_cache from importlib.util import find_spec from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Literal, Optional, Protocol, TypeVar, Union, cast, get_args) @@ -5123,6 +5123,14 @@ def set_current_vllm_config(vllm_config: VllmConfig, finally: _current_vllm_config = old_vllm_config _current_prefix = old_prefix + # Clear the compilation config cache when context changes + get_cached_compilation_config.cache_clear() + + +@lru_cache(maxsize=1) +def get_cached_compilation_config(): + """Cache config to avoid repeated calls to get_current_vllm_config()""" + return get_current_vllm_config().compilation_config def get_current_vllm_config() -> VllmConfig: diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index f6e79cd676f8c..6b5a107396c92 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -5,7 +5,7 @@ from typing import Optional import torch.nn as nn -from vllm.config import get_current_vllm_config +from vllm.config import get_cached_compilation_config from vllm.logger import init_logger from vllm.platforms import current_platform 
@@ -86,7 +86,7 @@ class CustomOp(nn.Module): def dispatch_forward(self): # NOTE(woosuk): Here we assume that vLLM was built for only one # specific backend. Currently, we do not support dynamic dispatching. - compilation_config = get_current_vllm_config().compilation_config + compilation_config = get_cached_compilation_config() enabled = self.enabled() if enabled: compilation_config.enabled_custom_ops.update([self.__class__.name]) @@ -115,7 +115,7 @@ class CustomOp(nn.Module): @classmethod def enabled(cls) -> bool: # if no name, then it was not registered - compilation_config = get_current_vllm_config().compilation_config + compilation_config = get_cached_compilation_config() custom_ops = compilation_config.custom_ops if not hasattr(cls, "name"): logger.warning_once( @@ -138,7 +138,7 @@ class CustomOp(nn.Module): Specifying 'all' or 'none' in custom_op takes precedence. """ from vllm.config import CompilationLevel - compilation_config = get_current_vllm_config().compilation_config + compilation_config = get_cached_compilation_config() default_on = (compilation_config.level < CompilationLevel.PIECEWISE or not compilation_config.use_inductor) count_none = compilation_config.custom_ops.count("none") From cdfd6871a5c4f125c9b3707ec5c1260db54f4b03 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 5 Aug 2025 13:40:09 +0800 Subject: [PATCH 212/224] [Bugfix] Misaligned params in TreeAttentionImpl (#22226) Signed-off-by: DarkLight1337 --- vllm/v1/attention/backends/tree_attn.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py index 4fb7483284053..a071f0921df94 100644 --- a/vllm/v1/attention/backends/tree_attn.py +++ b/vllm/v1/attention/backends/tree_attn.py @@ -4,7 +4,7 @@ import ast from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Optional import torch @@ -313,15 +313,11 @@ class 
TreeAttentionImpl(AttentionImpl): alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, ) -> None: - if blocksparse_params is not None: - raise ValueError( - "TreeAttention does not support block-sparse attention.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) From e79a12fc3afb33171b06af3f1b74a42b29d1c6c2 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 5 Aug 2025 02:54:52 -0400 Subject: [PATCH 213/224] [UX] Fail if an invalid attention backend is specified (#22217) Signed-off-by: mgoin --- .../attention/test_attention_selector.py | 20 +++++-------------- vllm/attention/selector.py | 4 ++++ 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 93bf20da4adba..bfeafaa9e27e6 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -278,23 +278,13 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch): @pytest.mark.parametrize("use_v1", [True, False]) def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch): - + """Test that invalid attention backend names raise ValueError.""" with monkeypatch.context() as m, patch( "vllm.attention.selector.current_platform", CudaPlatform()): m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) - # Test with head size 32 - backend = get_attn_backend(32, torch.float16, None, 16, False) - EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN" - assert backend.get_name() == EXPECTED - - # when block size == 16, backend will fall back to XFORMERS - # this behavior is not yet supported on V1. 
- if use_v1: - # TODO: support fallback on V1! - # https://github.com/vllm-project/vllm/issues/14524 - pass - else: - backend = get_attn_backend(16, torch.float16, None, 16, False) - assert backend.get_name() == "XFORMERS" + # Should raise ValueError for invalid backend + with pytest.raises(ValueError) as exc_info: + get_attn_backend(32, torch.float16, None, 16, False) + assert "Invalid attention backend: 'INVALID'" in str(exc_info.value) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 2e3c8638125f7..596c556e54f06 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -193,6 +193,10 @@ def _cached_get_attn_backend( backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND if backend_by_env_var is not None: selected_backend = backend_name_to_enum(backend_by_env_var) + if selected_backend is None: + raise ValueError( + f"Invalid attention backend: '{backend_by_env_var}'. " + f"Valid backends are: {list(_Backend.__members__.keys())}") # get device-specific attn_backend attention_cls = current_platform.get_attn_backend_cls( From 811ac13d039648f2d78a636ce4366e70449380c8 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 5 Aug 2025 14:54:55 +0800 Subject: [PATCH 214/224] [Core] Factor out common logic for MM budget calculation (#22228) Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 217 ++++++++++++++--------------- vllm/v1/worker/tpu_model_runner.py | 189 +++++++++++-------------- vllm/v1/worker/utils.py | 109 +++++++++++++++ 3 files changed, 299 insertions(+), 216 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 041687ae28b20..85976fc1c825b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -36,7 +36,8 @@ from vllm.model_executor.models.interfaces import (is_mixture_of_experts, from vllm.model_executor.models.interfaces_base import ( VllmModelForPooling, is_pooling_model, is_text_generation_model) 
from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange +from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargs, + PlaceholderRange) from vllm.multimodal.utils import group_mm_inputs_by_modality from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingType @@ -51,7 +52,6 @@ from vllm.v1.attention.backends.utils import ( make_kv_sharing_fast_prefill_attention_metadata, make_local_attention_virtual_batches, reorder_batch_to_split_decodes_and_prefills) -from vllm.v1.core.encoder_cache_manager import compute_encoder_budget from vllm.v1.kv_cache_interface import (AttentionSpec, ChunkedLocalAttentionSpec, FullAttentionSpec, KVCacheConfig, @@ -73,7 +73,7 @@ from vllm.v1.worker.kv_connector_model_runner_mixin import ( from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin from ..sample.logits_processor import LogitsProcessorManager -from .utils import (bind_kv_cache, gather_mm_placeholders, +from .utils import (MultiModalBudget, bind_kv_cache, gather_mm_placeholders, initialize_kv_cache_for_kv_sharing, sanity_check_mm_encoder_outputs, scatter_mm_placeholders) @@ -148,14 +148,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope - encoder_compute_budget, encoder_cache_size = compute_encoder_budget( - model_config=model_config, - scheduler_config=scheduler_config, - mm_registry=self.mm_registry, - ) - self.max_num_encoder_input_tokens = encoder_compute_budget - self.encoder_cache_size = encoder_cache_size - # Sampler self.sampler = Sampler(logprobs_mode=self.model_config.logprobs_mode) @@ -330,6 +322,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.kv_sharing_fast_prefill_logits_indices = torch.zeros( self.max_num_tokens, dtype=torch.int32, device=self.device) + self.mm_budget = (MultiModalBudget( + 
self.model_config, + self.scheduler_config, + self.mm_registry, + max_model_len=self.max_model_len, + max_num_reqs=self.max_num_reqs, + ) if self.is_multimodal_model else None) + self.reorder_batch_threshold: Optional[int] = None def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None: @@ -578,37 +578,33 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Refresh batch metadata with any pending updates. self.input_batch.refresh_metadata() - def _init_model_kwargs_for_multimodal_model( + def _extract_mm_kwargs( self, - scheduler_output: Optional["SchedulerOutput"] = None, - num_reqs: int = -1, - ) -> dict[str, Any]: - - model_kwargs: dict[str, Any] = {} - if self.is_multimodal_raw_input_supported: - # This model requires the raw multimodal data in input. + scheduler_output: "SchedulerOutput", + ) -> BatchedTensorInputs: + if self.is_multimodal_raw_input_supported: # noqa: SIM102 if scheduler_output: - multi_modal_kwargs_list = [] + multi_modal_kwargs_list = list[MultiModalKwargs]() for req in scheduler_output.scheduled_new_reqs: req_mm_inputs = req.mm_inputs if not isinstance(req_mm_inputs, list): req_mm_inputs = list(req_mm_inputs) multi_modal_kwargs_list.extend(req_mm_inputs) - multi_modal_kwargs = MultiModalKwargs.batch( - multi_modal_kwargs_list) - else: - # The only case where SchedulerOutput is None is for - # a dummy run let's get some dummy data. 
- dummy_data = [ - self.mm_registry.get_decoder_dummy_data( - model_config=self.model_config, - seq_len=1).multi_modal_data for i in range(num_reqs) - ] - multi_modal_kwargs = MultiModalKwargs.batch(dummy_data) - model_kwargs.update(multi_modal_kwargs) + return MultiModalKwargs.batch(multi_modal_kwargs_list) - return model_kwargs + return {} + + def _dummy_mm_kwargs(self, num_seqs: int) -> BatchedTensorInputs: + if self.is_multimodal_raw_input_supported: + mm_budget = self.mm_budget + assert mm_budget is not None + + dummy_modality, _ = mm_budget.get_modality_with_max_tokens() + + return self._get_mm_dummy_batch(dummy_modality, num_seqs) + + return {} def _get_cumsum_and_arange( self, @@ -1517,19 +1513,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. - input_ids = self.input_ids[:num_scheduled_tokens] - - model_kwargs = self._init_model_kwargs_for_multimodal_model( - scheduler_output=scheduler_output) - inputs_embeds = self.model.get_input_embeddings( - input_ids=input_ids, + inputs_embeds_scheduled = self.model.get_input_embeddings( + input_ids=self.input_ids[:num_scheduled_tokens], multimodal_embeddings=mm_embeds or None, ) # TODO(woosuk): Avoid the copy. Optimize. - self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds) - inputs_embeds = self.inputs_embeds[:num_input_tokens] + self.inputs_embeds[:num_scheduled_tokens].copy_( + inputs_embeds_scheduled) + input_ids = None + inputs_embeds = self.inputs_embeds[:num_input_tokens] + model_mm_kwargs = self._extract_mm_kwargs(scheduler_output) else: # For text-only models, we use token ids as input. 
# While it is possible to use embeddings as input just like the @@ -1537,7 +1532,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # then the embedding layer is not included in the CUDA graph. input_ids = self.input_ids[:num_input_tokens] inputs_embeds = None - model_kwargs = {} + model_mm_kwargs = {} if self.uses_mrope: positions = self.mrope_positions[:, :num_input_tokens] else: @@ -1571,7 +1566,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds, **MultiModalKwargs.as_kwargs( - model_kwargs, + model_mm_kwargs, device=self.device, ), ) @@ -2149,6 +2144,30 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): yield input_ids.fill_(0) + def _get_mm_dummy_batch( + self, + modality: str, + max_items_per_batch: int, + ) -> BatchedTensorInputs: + """Dummy data for profiling and precompiling multimodal models.""" + dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( + model_config=self.model_config, + seq_len=self.max_num_tokens, + mm_counts={modality: 1}, + ) + dummy_mm_data = dummy_decoder_data.multi_modal_data + + # Result in the maximum GPU consumption of the model + dummy_mm_item = dummy_mm_data.get_item(modality=modality, item_index=0) + dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item]) + + batched_dummy_mm_inputs = MultiModalKwargs.batch([dummy_mm_kwargs] * + max_items_per_batch) + return MultiModalKwargs.as_kwargs( + batched_dummy_mm_inputs, + device=self.device, + ) + @torch.inference_mode() def _dummy_run( self, @@ -2213,16 +2232,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): with self.maybe_dummy_run_with_lora(self.lora_config, num_scheduled_tokens): - model = self.model if self.is_multimodal_model: - model_kwargs = self._init_model_kwargs_for_multimodal_model( - num_reqs=num_reqs) input_ids = None inputs_embeds = self.inputs_embeds[:num_tokens] + 
model_mm_kwargs = self._dummy_mm_kwargs(num_reqs) else: input_ids = self.input_ids[:num_tokens] inputs_embeds = None - model_kwargs = {} + model_mm_kwargs = {} if self.uses_mrope: positions = self.mrope_positions[:, :num_tokens] @@ -2247,13 +2264,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.vllm_config, num_tokens=num_tokens, num_tokens_across_dp=num_tokens_across_dp): - outputs = model( + outputs = self.model( input_ids=input_ids, positions=positions, intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds, **MultiModalKwargs.as_kwargs( - model_kwargs, + model_mm_kwargs, device=self.device, ), ) @@ -2423,75 +2440,51 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def profile_run(self) -> None: # Profile with multimodal encoder & encoder cache. - # TODO: handle encoder-decoder models once we support them. - if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0 - and self.encoder_cache_size > 0): + if self.is_multimodal_model: + mm_budget = self.mm_budget + assert mm_budget is not None - # NOTE: Currently model is profiled with a single non-text - # modality with the max possible input tokens even when - # it supports multiple. - max_tokens_by_modality_dict = self.mm_registry \ - .get_max_tokens_per_item_by_nonzero_modality(self.model_config) - dummy_data_modality, max_tokens_per_mm_item = max( - max_tokens_by_modality_dict.items(), key=lambda item: item[1]) + # TODO: handle encoder-decoder models once we support them. + if (encoder_budget := mm_budget.get_encoder_budget()) > 0: + # NOTE: Currently model is profiled with a single non-text + # modality with the max possible input tokens even when + # it supports multiple. 
+ ( + dummy_modality, + max_tokens, + ) = mm_budget.get_modality_with_max_tokens() + ( + max_mm_items_per_prompt, + max_mm_items_per_batch, + ) = mm_budget.get_max_items(dummy_modality, max_tokens) - # Check how many items of this modality can be supported by - # the encoder budget. - encoder_budget = min(self.max_num_encoder_input_tokens, - self.encoder_cache_size) + logger.info( + "Encoder cache will be initialized with a budget of " + "%s tokens, and profiled with %s %s items of the maximum " + "feature size.", + encoder_budget, + max_mm_items_per_batch, + dummy_modality, + ) - max_num_mm_items_encoder_budget = encoder_budget // \ - max_tokens_per_mm_item + # Create dummy batch of multimodal inputs. + batched_dummy_mm_inputs = self._get_mm_dummy_batch( + dummy_modality, + max_mm_items_per_batch, + ) - # Check how many items of this modality can be supported by - # the decoder budget. - max_mm_items_per_req = self.mm_registry.get_mm_limits_per_prompt( - self.model_config)[dummy_data_modality] + # Run multimodal encoder. + dummy_encoder_outputs = self.model.get_multimodal_embeddings( + **batched_dummy_mm_inputs) - # NOTE: We do not consider max_num_batched_tokens on purpose - # because the multimodal embeddings can be generated in advance - # and chunked prefilled. - max_num_mm_items_decoder_budget = self.max_num_reqs * \ - max_mm_items_per_req + sanity_check_mm_encoder_outputs( + dummy_encoder_outputs, + expected_num_items=max_mm_items_per_batch, + ) - max_num_mm_items = max( - 1, - min(max_num_mm_items_encoder_budget, - max_num_mm_items_decoder_budget)) - - logger.info( - "Encoder cache will be initialized with a budget of %s tokens," - " and profiled with %s %s items of the maximum feature size.", - encoder_budget, max_num_mm_items, dummy_data_modality) - - # Create dummy batch of multimodal inputs. 
- dummy_mm_kwargs = self.mm_registry.get_decoder_dummy_data( - model_config=self.model_config, - seq_len=max_tokens_per_mm_item, - mm_counts={ - dummy_data_modality: 1 - }, - ).multi_modal_data - - batched_dummy_mm_inputs = MultiModalKwargs.batch( - [dummy_mm_kwargs] * max_num_mm_items, - pin_memory=self.pin_memory) - batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs( - batched_dummy_mm_inputs, - device=self.device, - ) - - # Run multimodal encoder. - dummy_encoder_outputs = self.model.get_multimodal_embeddings( - **batched_dummy_mm_inputs) - - sanity_check_mm_encoder_outputs( - dummy_encoder_outputs, - expected_num_items=max_num_mm_items, - ) - - # Cache the dummy encoder outputs. - self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs)) + # Cache the dummy encoder outputs. + self.encoder_cache["tmp"] = dict( + enumerate(dummy_encoder_outputs)) # Add `is_profile` here to pre-allocate communication buffers hidden_states, last_hidden_states \ diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 67cb2f9dd810e..5f3188efdb244 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -42,7 +42,6 @@ from vllm.v1.attention.backends.pallas import (TPU_STR_DTYPE_TO_TORCH_DTYPE, PallasAttentionBackend, PallasMetadata, get_page_size_bytes) -from vllm.v1.core.encoder_cache_manager import compute_encoder_budget from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec, KVCacheConfig, KVCacheSpec, SlidingWindowSpec) @@ -55,7 +54,8 @@ from vllm.v1.worker.kv_connector_model_runner_mixin import ( from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin from vllm.v1.worker.tpu_input_batch import CachedRequestState, InputBatch -from .utils import (bind_kv_cache, initialize_kv_cache_for_kv_sharing, +from .utils import (MultiModalBudget, bind_kv_cache, + initialize_kv_cache_for_kv_sharing, sanity_check_mm_encoder_outputs) if TYPE_CHECKING: @@ -195,14 +195,6 @@ class 
TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # TODO: Support M-RoPE (e.g, Qwen2-VL) assert not self.uses_mrope, "TPU does not support M-RoPE yet." - encoder_compute_budget, encoder_cache_size = compute_encoder_budget( - model_config=model_config, - scheduler_config=scheduler_config, - mm_registry=self.mm_registry, - ) - self.max_num_encoder_input_tokens = encoder_compute_budget - self.encoder_cache_size = encoder_cache_size - self._num_slices_per_kv_cache_update_block = \ _get_num_slices_per_kv_cache_update_block(get_page_size_bytes( block_size=self.block_size, @@ -294,36 +286,13 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.structured_decode_arange = torch.arange( 0, 32, device="cpu", pin_memory=self.pin_memory) - # Get maximum number of mm items per modality (batch size). - self.max_num_mm_items_by_modality = dict() - if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0 - and self.encoder_cache_size > 0): - max_tokens_by_modality_dict = ( - MULTIMODAL_REGISTRY. - get_max_tokens_per_item_by_nonzero_modality(self.model_config)) - for modality, max_tokens in max_tokens_by_modality_dict.items(): - # Check how many items of this modality can be supported by - # the encoder budget. - encoder_budget = min(self.max_num_encoder_input_tokens, - self.encoder_cache_size) - - max_num_mm_items_encoder_budget = cdiv(encoder_budget, - max_tokens) - - # Check how many items of this modality can be supported by - # the decoder budget. - max_mm_items_per_req = self.mm_registry.\ - get_mm_limits_per_prompt(self.model_config)[modality] - - # NOTE: We do not consider max_num_batched_tokens on purpose - # because the multimodal embeddings can be generated in advance - # and chunked prefilled. 
- max_num_mm_items_decoder_budget = self.max_num_reqs * \ - max_mm_items_per_req - - max_num_mm_items = min(max_num_mm_items_encoder_budget, - max_num_mm_items_decoder_budget) - self.max_num_mm_items_by_modality[modality] = max_num_mm_items + self.mm_budget = (MultiModalBudget( + self.model_config, + self.scheduler_config, + self.mm_registry, + max_model_len=self.max_model_len, + max_num_reqs=self.max_num_reqs, + ) if self.is_multimodal_model else None) if not self.use_spmd: self.sample_from_logits_func = torch.compile( @@ -1335,23 +1304,33 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): xm.mark_step() # Captures metadata updates def _precompile_mm_encoder(self) -> None: + if not self.is_multimodal_model: + return + # Pre-compile MM encoder for all supported data modalities. hf_config = self.vllm_config.model_config.hf_config - for mode, max_items_by_mode in \ - self.max_num_mm_items_by_modality.items(): + + mm_budget = self.mm_budget + assert mm_budget is not None + + max_items_per_seq_by_modality = mm_budget.max_items_per_batch_by_modality # noqa: E501 + + for mode, max_items_per_seq in max_items_per_seq_by_modality.items(): logger.info( "Compiling Multimodal %s Encoder with different input" " shapes.", mode) start = time.perf_counter() # No padding for MM encoder just yet. - for num_items in range(1, max_items_by_mode + 1): + for num_items in range(1, max_items_per_seq + 1): logger.info(" -- mode: %s items: %d", mode, num_items) batched_dummy_mm_inputs = self._get_mm_dummy_batch( - mode, num_items) + mode, + num_items, + ) # Run multimodal encoder. 
xm.mark_step() - mm_embeds = self.model.\ - get_multimodal_embeddings(**batched_dummy_mm_inputs) + mm_embeds = self.model.get_multimodal_embeddings( + **batched_dummy_mm_inputs) xm.mark_step() num_patches = mm_embeds[0].shape[0] items_size = num_patches * num_items @@ -1547,51 +1526,61 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_tokens: int, ) -> None: # Profile with multimodal encoder & encoder cache. - # TODO: handle encoder-decoder models once we support them. - if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0 - and self.encoder_cache_size > 0): + if self.is_multimodal_model: + mm_budget = self.mm_budget + assert mm_budget is not None - # NOTE: Currently model is profiled with a single non-text - # modality with the max possible input tokens even when - # it supports multiple. - dummy_data_modality, max_num_mm_items = max( - self.max_num_mm_items_by_modality.items(), key=lambda t: t[1]) + # TODO: handle encoder-decoder models once we support them. + if (encoder_budget := mm_budget.get_encoder_budget()) > 0: + # NOTE: Currently model is profiled with a single non-text + # modality with the max possible input tokens even when + # it supports multiple. + ( + dummy_modality, + max_tokens, + ) = mm_budget.get_modality_with_max_tokens() + ( + max_mm_items_per_prompt, + max_mm_items_per_batch, + ) = mm_budget.get_max_items(dummy_modality, max_tokens) - encoder_budget = min(self.max_num_encoder_input_tokens, - self.encoder_cache_size) + logger.info( + "Encoder cache will be initialized with a budget of " + "%s tokens, and profiled with %s %s items of the maximum " + "feature size.", + encoder_budget, + max_mm_items_per_batch, + dummy_modality, + ) - logger.info( - "Encoder cache will be initialized with a budget of %d tokens," - " and profiled with %s %s items of the maximum feature size.", - encoder_budget, max_num_mm_items, dummy_data_modality) + # Create dummy batch of multimodal inputs. 
+ batched_dummy_mm_inputs = self._get_mm_dummy_batch( + dummy_modality, + max_mm_items_per_batch, + ) - # Create dummy batch of multimodal inputs. - batched_dummy_mm_inputs = self._get_mm_dummy_batch( - dummy_data_modality, max_num_mm_items) + # Run multimodal encoder. + # Isolate encoder graph from post-processing to minimize + # impact of recompilation until it's fixed. + start = time.perf_counter() + xm.mark_step() + dummy_encoder_outputs = self.model.get_multimodal_embeddings( + **batched_dummy_mm_inputs) + xm.mark_step() + xm.wait_device_ops() + end = time.perf_counter() + logger.info( + "Multimodal Encoder profiling finished in in %.2f [secs].", + end - start) - # Run multimodal encoder. - # Isolate encoder graph from post-processing to minimize - # impact of recompilation until it's fixed. - start = time.perf_counter() - xm.mark_step() - dummy_encoder_outputs = self.model.get_multimodal_embeddings( - **batched_dummy_mm_inputs) - xm.mark_step() - xm.wait_device_ops() - end = time.perf_counter() - logger.info( - "Multimodal Encoder profiling finished in in %.2f [secs].", - end - start) + sanity_check_mm_encoder_outputs( + dummy_encoder_outputs, + expected_num_items=max_mm_items_per_batch, + ) - assert len(dummy_encoder_outputs) == max_num_mm_items, ( - "Expected dimension 0 of encoder outputs to match the number " - f"of multimodal data items: {max_num_mm_items}, got " - f"{len(dummy_encoder_outputs)=} instead. This is most likely " - "due to the 'get_multimodal_embeddings' method of the model " - "not implemented correctly.") - - # Cache the dummy encoder outputs. - self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs)) + # Cache the dummy encoder outputs. + self.encoder_cache["tmp"] = dict( + enumerate(dummy_encoder_outputs)) # Trigger compilation for general shape. 
self._dummy_run(num_tokens, self.num_reqs_max_model_len, @@ -1809,33 +1798,25 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.grammar_bitmask_cpu[:num_reqs].to(logits.device), \ self.structured_decode_arange.to(logits.device) - def _get_mm_dummy_batch(self, modality: str, - batch_size: int) -> BatchedTensorInputs: - # Dummy data for pre-compiling multimodal models. - dummy_request_data = self.mm_registry.get_decoder_dummy_data( + def _get_mm_dummy_batch( + self, + modality: str, + max_items_per_batch: int, + ) -> BatchedTensorInputs: + """Dummy data for profiling and precompiling multimodal models.""" + dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, + mm_counts={modality: 1}, ) - dummy_mm_data = dummy_request_data.multi_modal_data + dummy_mm_data = dummy_decoder_data.multi_modal_data - # Dummy data definition in V0 may contain multiple multimodal items - # (e.g, multiple images) for a single request, therefore here we - # always replicate first item by max_num_mm_items times since in V1 - # they are scheduled to be processed separately. - assert isinstance(dummy_mm_data, MultiModalKwargs), ( - "Expected dummy multimodal data to be of type " - f"MultiModalKwargs, got {type(dummy_mm_data)=} instead. " - "This is most likely due to the model not having a merged " - "processor.") - - # When models have a merged processor, their dummy data is - # already batched `MultiModalKwargs`, therefore we take the first - # `MultiModalKwargsItem` from the desired modality to profile on. 
+ # Result in the maximum GPU consumption of the model dummy_mm_item = dummy_mm_data.get_item(modality=modality, item_index=0) dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item]) batched_dummy_mm_inputs = MultiModalKwargs.batch([dummy_mm_kwargs] * - batch_size) + max_items_per_batch) return MultiModalKwargs.as_kwargs( batched_dummy_mm_inputs, device=self.device, diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 3ecb1d7dd6560..6761b3c5e41db 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -5,14 +5,123 @@ from typing import TYPE_CHECKING, Optional import torch +from vllm.config import ModelConfig, SchedulerConfig from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index +from vllm.multimodal.registry import MultiModalRegistry +from vllm.v1.core.encoder_cache_manager import compute_encoder_budget from vllm.v1.kv_cache_interface import KVCacheGroupSpec if TYPE_CHECKING: from vllm.attention.layer import Attention +class MultiModalBudget: + """Helper class to calculate budget information for multi-modal models.""" + + def __init__( + self, + model_config: ModelConfig, + scheduler_config: SchedulerConfig, + mm_registry: MultiModalRegistry, + *, + max_model_len: int, + max_num_reqs: int, + ) -> None: + super().__init__() + + self.model_config = model_config + self.scheduler_config = scheduler_config + self.mm_registry = mm_registry + + encoder_compute_budget, encoder_cache_size = compute_encoder_budget( + model_config=model_config, + scheduler_config=scheduler_config, + mm_registry=mm_registry, + ) + + self.max_num_encoder_input_tokens = encoder_compute_budget + self.encoder_cache_size = encoder_cache_size + self.max_model_len = max_model_len + self.max_num_reqs = max_num_reqs + + self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config) + + max_items_per_prompt_by_modality = dict[str, int]() + max_items_per_batch_by_modality = 
dict[str, int]() + + max_tokens_by_modality = mm_registry \ + .get_max_tokens_per_item_by_nonzero_modality(model_config) + + for modality, max_tokens in max_tokens_by_modality.items(): + ( + max_items_per_prompt, + max_items_per_batch, + ) = self.get_max_items(modality, max_tokens) + + max_items_per_prompt_by_modality[modality] = max_items_per_prompt + max_items_per_batch_by_modality[modality] = max_items_per_batch + + self.max_tokens_by_modality = max_tokens_by_modality + self.max_items_per_prompt_by_modality = max_items_per_prompt_by_modality + self.max_items_per_batch_by_modality = max_items_per_batch_by_modality + + def get_modality_with_max_tokens(self) -> tuple[str, int]: + max_tokens_by_modality = self.max_tokens_by_modality + modality, max_tokens = max(max_tokens_by_modality.items(), + key=lambda item: item[1]) + + return modality, max_tokens + + def get_encoder_budget(self) -> int: + return min(self.max_num_encoder_input_tokens, self.encoder_cache_size) + + def get_max_items( + self, + modality: str, + max_tokens_per_item: int, + ) -> tuple[int, int]: + if max_tokens_per_item == 0: + return 0, 0 + + # Check how many items of this modality can be supported by + # the encoder budget. + encoder_budget = self.get_encoder_budget() + + # TODO: handle encoder-decoder models once we support them. + if encoder_budget == 0: + return 0, 0 + + max_encoder_items_per_batch = encoder_budget // max_tokens_per_item + + # Check how many items of this modality can be supported by + # the decoder budget. 
+ mm_limit = self.mm_limits[modality] + + max_items_per_prompt = max( + 1, + min(mm_limit, self.max_model_len // max_tokens_per_item), + ) + + scheduler_config = self.scheduler_config + max_num_reqs = self.max_num_reqs + + if not scheduler_config.enable_chunked_prefill: + max_num_reqs = min( + max_num_reqs, + scheduler_config.max_num_batched_tokens // max_tokens_per_item, + ) + + max_decoder_items_per_batch = max_num_reqs * max_items_per_prompt + + max_items_per_batch = max( + 1, + min(max_encoder_items_per_batch, max_decoder_items_per_batch), + ) + + return max_items_per_prompt, max_items_per_batch + + def sanity_check_mm_encoder_outputs( mm_embeddings: MultiModalEmbeddings, expected_num_items: int, From 586f286789a09f5616be74ee8bedde0a9f698a72 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 5 Aug 2025 15:37:00 +0800 Subject: [PATCH 215/224] [Model] Pooling model activation supports per request control by PoolingParams (#20538) Signed-off-by: wang.yuqi --- tests/entrypoints/llm/test_classify.py | 67 ++++++ tests/entrypoints/llm/test_embedding.py | 56 +++++ tests/entrypoints/llm/test_reward.py | 66 ++++++ tests/entrypoints/llm/test_score.py | 69 ++++++ .../entrypoints/openai/test_classification.py | 31 +++ tests/entrypoints/openai/test_embedding.py | 34 +++ tests/entrypoints/openai/test_rerank.py | 38 +++ tests/entrypoints/openai/test_score.py | 41 ++++ .../pooling/test_override_pooler_config.py | 127 ++++++++++ tests/models/language/pooling/test_reward.py | 4 +- tests/models/utils.py | 7 + tests/test_pooling_params.py | 106 +++++++++ vllm/config.py | 30 +-- vllm/entrypoints/llm.py | 22 +- vllm/entrypoints/openai/protocol.py | 20 +- vllm/model_executor/layers/pooler.py | 222 +++++++++--------- vllm/model_executor/models/config.py | 32 +++ vllm/model_executor/models/jamba.py | 2 - vllm/model_executor/models/jina_vl.py | 5 +- vllm/model_executor/models/qwen2_rm.py | 3 - vllm/pooling_params.py | 139 +++++++++-- 21 files changed, 948 insertions(+), 173 
deletions(-) create mode 100644 tests/entrypoints/llm/test_classify.py create mode 100644 tests/entrypoints/llm/test_embedding.py create mode 100644 tests/entrypoints/llm/test_reward.py create mode 100644 tests/entrypoints/llm/test_score.py create mode 100644 tests/models/language/pooling/test_override_pooler_config.py create mode 100644 tests/test_pooling_params.py diff --git a/tests/entrypoints/llm/test_classify.py b/tests/entrypoints/llm/test_classify.py new file mode 100644 index 0000000000000..abdce8935ea58 --- /dev/null +++ b/tests/entrypoints/llm/test_classify.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import weakref + +import pytest +import torch + +from vllm import LLM, PoolingParams +from vllm.distributed import cleanup_dist_env_and_memory + +from ...models.utils import softmax + +MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach" + +prompts = ["The chef prepared a delicious meal."] + + +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + +@pytest.fixture(scope="module") +def llm(): + # pytest caches the fixture so we use weakref.proxy to + # enable garbage collection + llm = LLM(model=MODEL_NAME, + max_num_batched_tokens=32768, + tensor_parallel_size=1, + gpu_memory_utilization=0.75, + enforce_eager=True, + seed=0) + + with llm.deprecate_legacy_api(): + yield weakref.proxy(llm) + + del llm + + cleanup_dist_env_and_memory() + + +@pytest.mark.skip_global_cleanup +def test_pooling_params(llm: LLM): + + def get_outputs(activation): + outputs = llm.classify( + prompts, + pooling_params=PoolingParams(activation=activation), + use_tqdm=False) + return torch.tensor([x.outputs.probs for x in outputs]) + + default = get_outputs(activation=None) + w_activation = get_outputs(activation=True) + wo_activation = 
get_outputs(activation=False) + + assert torch.allclose(default, w_activation, + atol=1e-2), "Default should use activation." + assert not torch.allclose( + w_activation, wo_activation, + atol=1e-2), "wo_activation should not use activation." + assert torch.allclose( + softmax(wo_activation), w_activation, atol=1e-2 + ), "w_activation should be close to activation(wo_activation)." diff --git a/tests/entrypoints/llm/test_embedding.py b/tests/entrypoints/llm/test_embedding.py new file mode 100644 index 0000000000000..ba20d7b9548ef --- /dev/null +++ b/tests/entrypoints/llm/test_embedding.py @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import weakref + +import pytest +import torch +import torch.nn.functional as F + +from vllm import LLM, PoolingParams +from vllm.distributed import cleanup_dist_env_and_memory + +MODEL_NAME = "intfloat/multilingual-e5-small" + +prompts = ["The chef prepared a delicious meal."] + + +@pytest.fixture(scope="module") +def llm(): + # pytest caches the fixture so we use weakref.proxy to + # enable garbage collection + llm = LLM(model=MODEL_NAME, + max_num_batched_tokens=32768, + tensor_parallel_size=1, + gpu_memory_utilization=0.75, + enforce_eager=True, + seed=0) + + with llm.deprecate_legacy_api(): + yield weakref.proxy(llm) + + del llm + + cleanup_dist_env_and_memory() + + +@pytest.mark.skip_global_cleanup +def test_pooling_params(llm: LLM): + + def get_outputs(normalize): + outputs = llm.embed(prompts, + pooling_params=PoolingParams(normalize=normalize), + use_tqdm=False) + return torch.tensor([x.outputs.embedding for x in outputs]) + + default = get_outputs(normalize=None) + w_normal = get_outputs(normalize=True) + wo_normal = get_outputs(normalize=False) + + assert torch.allclose(default, w_normal, + atol=1e-2), "Default should use normal." + assert not torch.allclose(w_normal, wo_normal, + atol=1e-2), "wo_normal should not use normal." 
+ assert torch.allclose( + w_normal, F.normalize(wo_normal, p=2, dim=-1), + atol=1e-2), "w_normal should be close to normal(wo_normal)." diff --git a/tests/entrypoints/llm/test_reward.py b/tests/entrypoints/llm/test_reward.py new file mode 100644 index 0000000000000..361e2d0e1047f --- /dev/null +++ b/tests/entrypoints/llm/test_reward.py @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import weakref + +import pytest +import torch + +from vllm import LLM, PoolingParams +from vllm.distributed import cleanup_dist_env_and_memory + +from ...models.utils import softmax + +MODEL_NAME = "internlm/internlm2-1_8b-reward" + +prompts = ["The chef prepared a delicious meal."] + + +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + +@pytest.fixture(scope="module") +def llm(): + # pytest caches the fixture so we use weakref.proxy to + # enable garbage collection + llm = LLM(model=MODEL_NAME, + max_num_batched_tokens=32768, + tensor_parallel_size=1, + gpu_memory_utilization=0.75, + enforce_eager=True, + trust_remote_code=True, + seed=0) + + with llm.deprecate_legacy_api(): + yield weakref.proxy(llm) + + del llm + + cleanup_dist_env_and_memory() + + +@pytest.mark.skip_global_cleanup +def test_pooling_params(llm: LLM): + + def get_outputs(softmax): + outputs = llm.reward(prompts, + pooling_params=PoolingParams(softmax=softmax), + use_tqdm=False) + return torch.cat([x.outputs.data for x in outputs]) + + default = get_outputs(softmax=None) + w_softmax = get_outputs(softmax=True) + wo_softmax = get_outputs(softmax=False) + + assert torch.allclose(default, w_softmax, + atol=1e-2), "Default should use softmax." + assert not torch.allclose(w_softmax, wo_softmax, + atol=1e-2), "wo_softmax should not use softmax." 
+ assert torch.allclose( + softmax(wo_softmax), w_softmax, + atol=1e-2), "w_softmax should be close to softmax(wo_softmax)." diff --git a/tests/entrypoints/llm/test_score.py b/tests/entrypoints/llm/test_score.py new file mode 100644 index 0000000000000..dd4eae0ccc06e --- /dev/null +++ b/tests/entrypoints/llm/test_score.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import weakref + +import pytest +import torch + +from vllm import LLM, PoolingParams +from vllm.distributed import cleanup_dist_env_and_memory + +from ...models.utils import softmax + +MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls" + + +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + +@pytest.fixture(scope="module") +def llm(): + # pytest caches the fixture so we use weakref.proxy to + # enable garbage collection + llm = LLM(model=MODEL_NAME, + max_num_batched_tokens=32768, + tensor_parallel_size=1, + gpu_memory_utilization=0.75, + enforce_eager=True, + seed=0) + + with llm.deprecate_legacy_api(): + yield weakref.proxy(llm) + + del llm + + cleanup_dist_env_and_memory() + + +@pytest.mark.skip_global_cleanup +def test_pooling_params(llm: LLM): + + def get_outputs(activation): + text_1 = "What is the capital of France?" + text_2 = "The capital of France is Paris." + + outputs = llm.score( + text_1, + text_2, + pooling_params=PoolingParams(activation=activation), + use_tqdm=False) + return torch.tensor([x.outputs.score for x in outputs]) + + default = get_outputs(activation=None) + w_activation = get_outputs(activation=True) + wo_activation = get_outputs(activation=False) + + assert torch.allclose(default, w_activation, + atol=1e-2), "Default should use activation." 
+ assert not torch.allclose( + w_activation, wo_activation, + atol=1e-2), "wo_activation should not use activation." + assert torch.allclose( + softmax(wo_activation), w_activation, atol=1e-2 + ), "w_activation should be close to activation(wo_activation)." diff --git a/tests/entrypoints/openai/test_classification.py b/tests/entrypoints/openai/test_classification.py index b2472658ca81c..bcf127307f730 100644 --- a/tests/entrypoints/openai/test_classification.py +++ b/tests/entrypoints/openai/test_classification.py @@ -3,6 +3,8 @@ import pytest import requests +import torch +import torch.nn.functional as F from vllm.entrypoints.openai.protocol import ClassificationResponse @@ -181,3 +183,32 @@ async def test_invocations(server: RemoteOpenAIServer): assert classification_data.keys() == invocation_data.keys() assert classification_data["probs"] == pytest.approx( invocation_data["probs"], rel=0.01) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_activation(server: RemoteOpenAIServer, model_name: str): + input_text = ["This product was excellent and exceeded my expectations"] + + async def get_outputs(activation): + response = requests.post(server.url_for("classify"), + json={ + "model": model_name, + "input": input_text, + "activation": activation + }) + outputs = response.json() + return torch.tensor([x['probs'] for x in outputs["data"]]) + + default = await get_outputs(activation=None) + w_activation = await get_outputs(activation=True) + wo_activation = await get_outputs(activation=False) + + assert torch.allclose(default, w_activation, + atol=1e-2), "Default should use activation." + assert not torch.allclose( + w_activation, wo_activation, + atol=1e-2), "wo_activation should not use activation." + assert torch.allclose( + F.softmax(wo_activation, dim=-1), w_activation, atol=1e-2 + ), "w_activation should be close to activation(wo_activation)." 
diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index a7203befcc402..cf2442a569388 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -8,6 +8,8 @@ import openai import pytest import pytest_asyncio import requests +import torch +import torch.nn.functional as F from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.transformers_utils.tokenizer import get_tokenizer @@ -369,3 +371,35 @@ async def test_invocations_conversation(server: RemoteOpenAIServer): embeddings_1_lst=[invocation_data["embedding"]], name_0="chat", name_1="invocation") + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_normalize(server: RemoteOpenAIServer, model_name: str): + input_text = ["The chef prepared a delicious meal."] + + async def get_outputs(normalize): + request_args = { + "model": MODEL_NAME, + "input": input_text, + "encoding_format": "float", + "normalize": normalize + } + + response = requests.post(server.url_for("v1/embeddings"), + json=request_args) + outputs = response.json() + + return torch.tensor([x['embedding'] for x in outputs["data"]]) + + default = await get_outputs(normalize=None) + w_normal = await get_outputs(normalize=True) + wo_normal = await get_outputs(normalize=False) + + assert torch.allclose(default, w_normal, + atol=1e-2), "Default should use normal." + assert not torch.allclose(w_normal, wo_normal, + atol=1e-2), "wo_normal should not use normal." + assert torch.allclose( + w_normal, F.normalize(wo_normal, p=2, dim=-1), + atol=1e-2), "w_normal should be close to normal(wo_normal)." 
diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py index 4da97fe13691b..f121693e329fa 100644 --- a/tests/entrypoints/openai/test_rerank.py +++ b/tests/entrypoints/openai/test_rerank.py @@ -3,6 +3,8 @@ import pytest import requests +import torch +import torch.nn.functional as F from vllm.entrypoints.openai.protocol import RerankResponse @@ -125,3 +127,39 @@ def test_invocations(server: RemoteOpenAIServer): assert rerank_result.keys() == invocations_result.keys() assert rerank_result["relevance_score"] == pytest.approx( invocations_result["relevance_score"], rel=0.01) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_activation(server: RemoteOpenAIServer, model_name: str): + + async def get_outputs(activation): + query = "What is the capital of France?" + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris." + ] + + response = requests.post(server.url_for("rerank"), + json={ + "model": model_name, + "query": query, + "documents": documents, + "activation": activation + }) + outputs = response.json() + + return torch.tensor([x['relevance_score'] for x in outputs["results"]]) + + default = await get_outputs(activation=None) + w_activation = await get_outputs(activation=True) + wo_activation = await get_outputs(activation=False) + + assert torch.allclose(default, w_activation, + atol=1e-2), "Default should use activation." + assert not torch.allclose( + w_activation, wo_activation, + atol=1e-2), "wo_activation should not use activation." + assert torch.allclose( + F.sigmoid(wo_activation), w_activation, atol=1e-2 + ), "w_activation should be close to activation(wo_activation)." 
diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index 187542b7bafc9..1a5df1d2dbd2d 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -4,6 +4,7 @@ from typing import Any import pytest import requests +import torch import torch.nn.functional as F from torch import tensor @@ -220,3 +221,43 @@ class TestModel: assert score_data.keys() == invocation_data.keys() assert score_data["score"] == pytest.approx( invocation_data["score"], rel=0.01) + + def test_activation(self, server: RemoteOpenAIServer, model: dict[str, + Any]): + + def get_outputs(activation): + text_1 = "What is the capital of France?" + text_2 = "The capital of France is Paris." + response = requests.post(server.url_for("score"), + json={ + "model": model["name"], + "text_1": text_1, + "text_2": text_2, + "activation": activation + }) + if response.status_code != 200: + return response + + outputs = response.json() + return torch.tensor([x['score'] for x in outputs["data"]]) + + if model["is_cross_encoder"]: + + default = get_outputs(activation=None) + w_activation = get_outputs(activation=True) + wo_activation = get_outputs(activation=False) + + assert torch.allclose(default, w_activation, + atol=1e-2), "Default should use activation." + assert not torch.allclose( + w_activation, wo_activation, + atol=1e-2), "wo_activation should not use activation." + assert torch.allclose( + F.sigmoid(wo_activation), w_activation, atol=1e-2 + ), "w_activation should be close to activation(wo_activation)." 
+ else: + get_outputs(activation=None) + + # The activation parameter only works for the is_cross_encoder model + response = get_outputs(activation=True) + assert response.status_code == 400 diff --git a/tests/models/language/pooling/test_override_pooler_config.py b/tests/models/language/pooling/test_override_pooler_config.py new file mode 100644 index 0000000000000..2b1c74652e76f --- /dev/null +++ b/tests/models/language/pooling/test_override_pooler_config.py @@ -0,0 +1,127 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch +import torch.nn.functional as F + +from tests.models.utils import softmax +from vllm.config import PoolerConfig + + +@pytest.mark.parametrize( + "model", + [ + "jason9693/Qwen2.5-1.5B-apeach", + "papluca/xlm-roberta-base-language-detection" + ], +) +@pytest.mark.parametrize("dtype", ["half"]) +def test_classify_models_using_activation( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, +) -> None: + + with vllm_runner(model, + max_model_len=512, + dtype=dtype, + override_pooler_config=PoolerConfig( + activation=False)) as vllm_model: + wo_activation_out = vllm_model.classify(example_prompts) + + with vllm_runner(model, + max_model_len=512, + dtype=dtype, + override_pooler_config=PoolerConfig( + activation=True)) as vllm_model: + w_activation_out = vllm_model.classify(example_prompts) + + for wo_activation, w_activation in zip(wo_activation_out, + w_activation_out): + wo_activation = torch.tensor(wo_activation) + w_activation = torch.tensor(w_activation) + + assert not torch.allclose( + wo_activation, w_activation, + atol=1e-2), "override_pooler_config is not working" + assert torch.allclose(softmax(wo_activation), w_activation, + 1e-3 if dtype == "float" else 1e-2) + + +@pytest.mark.parametrize( + "model", + [ + "intfloat/multilingual-e5-small", + ], +) +@pytest.mark.parametrize("dtype", ["half"]) +def 
test_embed_models_using_normalize( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, +) -> None: + + with vllm_runner(model, + max_model_len=512, + dtype=dtype, + override_pooler_config=PoolerConfig( + normalize=False)) as vllm_model: + wo_normalize = torch.tensor(vllm_model.embed(example_prompts)) + + with vllm_runner( + model, + max_model_len=512, + dtype=dtype, + override_pooler_config=PoolerConfig(normalize=True)) as vllm_model: + w_normalize = torch.tensor(vllm_model.embed(example_prompts)) + + assert not torch.allclose( + wo_normalize, w_normalize, + atol=1e-2), "override_pooler_config normalize is not working" + assert torch.allclose( + F.normalize(wo_normalize, p=2, dim=-1), w_normalize, + atol=1e-2), "w_normal should be close to normal(wo_normal)." + + +@pytest.mark.parametrize( + "model", + [ + "internlm/internlm2-1_8b-reward", + ], +) +@pytest.mark.parametrize("dtype", ["half"]) +def test_reward_models_using_softmax( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, +) -> None: + + with vllm_runner( + model, + max_model_len=1024, + dtype=dtype, + override_pooler_config=PoolerConfig(softmax=False)) as vllm_model: + wo_softmax = vllm_model.encode(example_prompts) + + with vllm_runner( + model, + max_model_len=1024, + dtype=dtype, + override_pooler_config=PoolerConfig(softmax=True)) as vllm_model: + w_softmax = vllm_model.encode(example_prompts) + + for wo, w in zip(wo_softmax, w_softmax): + wo = torch.tensor(wo) + w = torch.tensor(w) + + assert not torch.allclose( + wo, w, atol=1e-2), "override_pooler_config softmax is not working" + assert torch.allclose( + softmax(wo), w, + atol=1e-2), "w_softmax should be close to softmax(wo_softmax)." 
diff --git a/tests/models/language/pooling/test_reward.py b/tests/models/language/pooling/test_reward.py index a5f7dca76d822..7add1d975c634 100644 --- a/tests/models/language/pooling/test_reward.py +++ b/tests/models/language/pooling/test_reward.py @@ -103,7 +103,7 @@ def test_prm_models( # check logits difference for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): - hf_output = torch.tensor(hf_output) - vllm_output = torch.tensor(vllm_output) + hf_output = torch.tensor(hf_output).float() + vllm_output = torch.tensor(vllm_output).float() assert torch.allclose(hf_output, vllm_output, 1.5e-2) diff --git a/tests/models/utils.py b/tests/models/utils.py index 3cd0721be1b65..bda7ea3e3ad51 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -330,6 +330,13 @@ def matryoshka_fy(tensor: torch.Tensor, dimensions: int): return tensor +def softmax(data): + if data.shape[-1] == 1: + return F.sigmoid(data) + else: + return F.softmax(data, dim=-1) + + class EmbedModelInfo(NamedTuple): name: str is_matryoshka: bool = False diff --git a/tests/test_pooling_params.py b/tests/test_pooling_params.py new file mode 100644 index 0000000000000..52c03015483c9 --- /dev/null +++ b/tests/test_pooling_params.py @@ -0,0 +1,106 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + +from tests.models.utils import EmbedModelInfo +from vllm import PoolingParams +from vllm.config import ModelConfig + +EMBEDDING_MODELS = [ + EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False), + EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5", + is_matryoshka=True, + matryoshka_dimensions=[256]), +] + + +def test_task(): + pooling_params = PoolingParams() + pooling_params.verify(task="score") + + pooling_params = PoolingParams(task="score") + pooling_params.verify(task="score") + + with pytest.raises(ValueError): + pooling_params.verify(task="encode") + + +def test_embed(): + task = "embed" + 
pooling_params = PoolingParams(normalize=None) + pooling_params.verify(task=task) + + pooling_params = PoolingParams(normalize=True) + pooling_params.verify(task=task) + + pooling_params = PoolingParams(normalize=False) + pooling_params.verify(task=task) + + invalid_parameters = ["activation", "softmax"] + for p in invalid_parameters: + with pytest.raises(ValueError): + pooling_params = PoolingParams(**{p: True}) + pooling_params.verify(task=task) + + +@pytest.mark.parametrize("model_info", EMBEDDING_MODELS) +def test_embed_dimensions(model_info: EmbedModelInfo): + task = "embed" + model_config = ModelConfig( + model_info.name, + task="auto", + tokenizer=model_info.name, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + ) + + pooling_params = PoolingParams(dimensions=None) + pooling_params.verify(task=task, model_config=model_config) + + with pytest.raises(ValueError): + pooling_params = PoolingParams(dimensions=1) + pooling_params.verify(task=task, model_config=model_config) + + if model_info.is_matryoshka: + assert model_info.matryoshka_dimensions is not None + pooling_params = PoolingParams( + dimensions=model_info.matryoshka_dimensions[0]) + pooling_params.verify(task=task, model_config=model_config) + + +@pytest.mark.parametrize("task", ["score", "classify"]) +def test_classify(task): + pooling_params = PoolingParams(activation=None) + pooling_params.verify(task=task) + + pooling_params = PoolingParams(activation=True) + pooling_params.verify(task=task) + + pooling_params = PoolingParams(activation=False) + pooling_params.verify(task=task) + + invalid_parameters = ["dimensions", "normalize", "softmax"] + for p in invalid_parameters: + with pytest.raises(ValueError): + pooling_params = PoolingParams(**{p: True}) + pooling_params.verify(task=task) + + +def test_encode(): + task = "encode" + pooling_params = PoolingParams(softmax=None) + pooling_params.verify(task=task) + + pooling_params = PoolingParams(softmax=True) + 
pooling_params.verify(task=task) + + pooling_params = PoolingParams(softmax=False) + pooling_params.verify(task=task) + + invalid_parameters = ["dimensions", "normalize", "activation"] + for p in invalid_parameters: + with pytest.raises(ValueError): + pooling_params = PoolingParams(**{p: True}) + pooling_params.verify(task=task) diff --git a/vllm/config.py b/vllm/config.py index 34952279c9d19..899862bf541e7 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -913,15 +913,6 @@ class ModelConfig: if getattr(pooler_config, k) is None: setattr(pooler_config, k, v) - if self.is_matryoshka: - if pooler_config.normalize is None: - pooler_config.normalize = True - elif not pooler_config.normalize: - raise ValueError( - "`normalize` must be enabled (set to True) " - "for models that are compatible with " - "Matryoshka Representation.") - return pooler_config return None @@ -3438,25 +3429,34 @@ class PoolerConfig: [`vllm.model_executor.layers.pooler.PoolingType`][]. """ + ## for embeddings models normalize: Optional[bool] = None """ - Whether to normalize the pooled outputs. Usually, this should be set to - ``True`` for embedding outputs. + Whether to normalize the embeddings outputs. + """ + dimensions: Optional[int] = None + """ + Reduce the dimensions of embeddings if model + support matryoshka representation. """ + ## for classification models + activation: Optional[bool] = None + """ + Whether to apply activation function to the classification outputs. + """ + + ## for reward models softmax: Optional[bool] = None """ - Whether to apply softmax to the pooled outputs. Usually, this should be set - to ``True`` for classification outputs. + Whether to apply softmax to the reward outputs. """ - step_tag_id: Optional[int] = None """ If set, only the score corresponding to the ``step_tag_id`` in the generated sentence should be returned. Otherwise, the scores for all tokens are returned. 
""" - returned_token_ids: Optional[list[int]] = None """ A list of indices for the vocabulary dimensions to be extracted, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 842a22ccebaa4..ca24b0c32b73b 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1189,6 +1189,8 @@ class LLM: /, *, use_tqdm: Union[bool, Callable[..., tqdm]] = True, + pooling_params: Optional[Union[PoolingParams, + Sequence[PoolingParams]]] = None, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, ) -> list[ClassificationRequestOutput]: """ @@ -1207,7 +1209,8 @@ class LLM: it is used to create the progress bar. If `False`, no progress bar is created. lora_request: LoRA request to use for generation, if any. - + pooling_params: The pooling parameters for pooling. If None, we + use the default pooling parameters. Returns: A list of `ClassificationRequestOutput` objects containing the embedding vectors in the same order as the input prompts. @@ -1220,6 +1223,7 @@ class LLM: items = self.encode( prompts, use_tqdm=use_tqdm, + pooling_params=pooling_params, lora_request=lora_request, pooling_task="classify", ) @@ -1272,6 +1276,7 @@ class LLM: text_2: list[Union[str, TextPrompt, TokensPrompt]], truncate_prompt_tokens: Optional[int] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, + pooling_params: Optional[PoolingParams] = None, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, ) -> list[ScoringRequestOutput]: @@ -1280,6 +1285,7 @@ class LLM: truncate_prompt_tokens=truncate_prompt_tokens, use_tqdm=use_tqdm, lora_request=lora_request, + pooling_params=pooling_params, pooling_task="embed", ) @@ -1306,6 +1312,7 @@ class LLM: data_2: Union[list[str], list[ScoreContentPartParam]], truncate_prompt_tokens: Optional[int] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, + pooling_params: Optional[PoolingParams] = None, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, ) -> 
list[ScoringRequestOutput]: model_config = self.llm_engine.model_config @@ -1317,7 +1324,12 @@ class LLM: if len(data_1) == 1: data_1 = data_1 * len(data_2) - pooling_params = PoolingParams(task="score") + if pooling_params is None: + pooling_params = PoolingParams(task="score") + + model_config = self.llm_engine.model_config + pooling_params.verify("score", model_config) + tokenization_kwargs: dict[str, Any] = {} _validate_truncation_size(model_config.max_model_len, @@ -1379,6 +1391,7 @@ class LLM: *, truncate_prompt_tokens: Optional[int] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, + pooling_params: Optional[PoolingParams] = None, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, ) -> list[ScoringRequestOutput]: """Generate similarity scores for all pairs `` or @@ -1410,7 +1423,8 @@ class LLM: it is used to create the progress bar. If `False`, no progress bar is created. lora_request: LoRA request to use for generation, if any. - + pooling_params: The pooling parameters for pooling. If None, we + use the default pooling parameters. Returns: A list of `ScoringRequestOutput` objects containing the generated scores in the same order as the input prompts. @@ -1494,6 +1508,7 @@ class LLM: data_2, # type: ignore[arg-type] truncate_prompt_tokens, use_tqdm, + pooling_params, lora_request) else: return self._embedding_score( @@ -1502,6 +1517,7 @@ class LLM: data_2, # type: ignore[arg-type] truncate_prompt_tokens, use_tqdm, + pooling_params, lora_request) def start_profile(self) -> None: diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index d77aee345843c..64f2beb14021a 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1274,11 +1274,13 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): "not set it, a random_uuid will be generated. 
This id is used " "through out the inference process and return in response."), ) + normalize: Optional[bool] = None # --8<-- [end:embedding-extra-params] def to_pooling_params(self): - return PoolingParams(dimensions=self.dimensions) + return PoolingParams(dimensions=self.dimensions, + normalize=self.normalize) class EmbeddingChatRequest(OpenAIBaseModel): @@ -1332,6 +1334,7 @@ class EmbeddingChatRequest(OpenAIBaseModel): "not set it, a random_uuid will be generated. This id is used " "through out the inference process and return in response."), ) + normalize: Optional[bool] = None # --8<-- [end:chat-embedding-extra-params] @model_validator(mode="before") @@ -1344,7 +1347,8 @@ class EmbeddingChatRequest(OpenAIBaseModel): return data def to_pooling_params(self): - return PoolingParams(dimensions=self.dimensions) + return PoolingParams(dimensions=self.dimensions, + normalize=self.normalize) EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest] @@ -1375,10 +1379,12 @@ class ScoreRequest(OpenAIBaseModel): "if the served model does not use priority scheduling."), ) + activation: Optional[bool] = None + # --8<-- [end:score-extra-params] def to_pooling_params(self): - return PoolingParams() + return PoolingParams(activation=self.activation) class RerankRequest(OpenAIBaseModel): @@ -1403,10 +1409,12 @@ class RerankRequest(OpenAIBaseModel): "if the served model does not use priority scheduling."), ) + activation: Optional[bool] = None + # --8<-- [end:rerank-extra-params] def to_pooling_params(self): - return PoolingParams() + return PoolingParams(activation=self.activation) class RerankDocument(BaseModel): @@ -1553,10 +1561,12 @@ class ClassificationRequest(OpenAIBaseModel): "if the served model does not use priority scheduling."), ) + activation: Optional[bool] = None + # --8<-- [end:classification-extra-params] def to_pooling_params(self): - return PoolingParams() + return PoolingParams(activation=self.activation) class 
ClassificationData(OpenAIBaseModel): diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 5bfd4aaccc17c..0f2e58eb9b5d9 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -41,35 +41,18 @@ class PoolingType(IntEnum): @dataclass(frozen=True) class ResolvedPoolingConfig: pooling_type: PoolingType - - normalize: bool - softmax: bool - step_tag_id: Optional[int] - returned_token_ids: Optional[list[int]] + task: PoolingTask @classmethod def from_config_with_defaults( cls, + task: PoolingTask, pooler_config: PoolerConfig, pooling_type: PoolingType, - normalize: bool, - softmax: bool, - step_tag_id: Optional[int] = None, - returned_token_ids: Optional[list[int]] = None, ) -> "ResolvedPoolingConfig": - return cls( - pooling_type=PoolingType[pooler_config.pooling_type] - if pooler_config.pooling_type is not None else pooling_type, - normalize=pooler_config.normalize - if pooler_config.normalize is not None else normalize, - softmax=pooler_config.softmax - if pooler_config.softmax is not None else softmax, - step_tag_id=pooler_config.step_tag_id - if pooler_config.step_tag_id is not None else step_tag_id, - returned_token_ids=pooler_config.returned_token_ids - if pooler_config.returned_token_ids is not None else - returned_token_ids, - ) + return cls(task=task, + pooling_type=PoolingType[pooler_config.pooling_type] + if pooler_config.pooling_type is not None else pooling_type) @dataclass(frozen=True) @@ -89,22 +72,15 @@ class Pooler(nn.Module, ABC): pooler_config: PoolerConfig, *, default_pooling_type: PoolingType = PoolingType.ALL, - default_normalize: bool = False, - default_softmax: bool = False, - default_step_tag_id: Optional[int] = None, - default_returned_token_ids: Optional[list[int]] = None, ): resolved_config = ResolvedPoolingConfig.from_config_with_defaults( + task="encode", pooler_config=pooler_config, pooling_type=default_pooling_type, - normalize=default_normalize, - 
softmax=default_softmax, - step_tag_id=default_step_tag_id, - returned_token_ids=default_returned_token_ids, ) if resolved_config.pooling_type == PoolingType.STEP: - return StepPooler.from_config(resolved_config) + return StepPooler() return SimplePooler.from_config(resolved_config) @@ -113,14 +89,11 @@ class Pooler(nn.Module, ABC): pooler_config: PoolerConfig, *, default_pooling_type: PoolingType = PoolingType.LAST, - default_normalize: bool = True, - default_softmax: bool = False, ): resolved_config = ResolvedPoolingConfig.from_config_with_defaults( + task="embed", pooler_config=pooler_config, pooling_type=default_pooling_type, - normalize=default_normalize, - softmax=default_softmax, ) return SimplePooler.from_config(resolved_config) @@ -131,23 +104,18 @@ class Pooler(nn.Module, ABC): classifier: Optional[ClassifierFn], *, default_pooling_type: PoolingType = PoolingType.LAST, - default_normalize: bool = False, - default_softmax: bool = True, ): resolved_config = ResolvedPoolingConfig.from_config_with_defaults( + task="classify", pooler_config=pooler_config, pooling_type=default_pooling_type, - normalize=default_normalize, - softmax=default_softmax, ) - base_pooler = SimplePooler.from_config(resolved_config) - if classifier is None: - return base_pooler + + pooling = PoolingMethod.from_pooling_type(resolved_config.pooling_type) return ClassifierPooler( - pooling=base_pooler.pooling, + pooling=pooling, classifier=classifier, - act_fn=base_pooler.head.activation, ) @abstractmethod @@ -198,11 +166,17 @@ def get_prompt_token_ids( ] -def get_tasks(pooling_metadata: PoolingMetadata) -> list[PoolingTask]: +def get_pooling_params( + pooling_metadata: PoolingMetadata) -> list[PoolingParams]: if isinstance(pooling_metadata, V0PoolingMetadata): pooling_params = [p for _, p in pooling_metadata.seq_groups] else: pooling_params = pooling_metadata.pooling_params + return pooling_params + + +def get_tasks(pooling_metadata: PoolingMetadata) -> list[PoolingTask]: + pooling_params 
= get_pooling_params(pooling_metadata) tasks: list[PoolingTask] = [ task for pooling_param in pooling_params @@ -484,49 +458,30 @@ class LambdaPoolerActivation(PoolerActivation): class PoolerHead(nn.Module): - @classmethod - def from_config(cls, pooler_config: ResolvedPoolingConfig) -> "PoolerHead": - if pooler_config.normalize and pooler_config.softmax: - raise ValueError("`normalize=True` and `softmax=True` should not " - "be set together") - - activation: PoolerActivation - if pooler_config.normalize: - activation = PoolerNormalize() - elif pooler_config.softmax: - activation = PoolerClassify() - else: - activation = PoolerIdentity() - - return cls(activation) - def __init__(self, activation: PoolerActivation) -> None: super().__init__() - self.activation = activation def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor], pooling_metadata: PoolingMetadata): - # Using float32 in PoolerHead - if isinstance(pooled_data, list): - for i in range(len(pooled_data)): - pooled_data[i] = pooled_data[i].to(torch.float32) - else: - pooled_data = pooled_data.to(torch.float32) + return self.activation(pooled_data) + + +class EmbeddingPoolerHead(PoolerHead): + + def __init__(self) -> None: + super().__init__(activation=PoolerNormalize()) + + def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor], + pooling_metadata: PoolingMetadata): + + pooling_params = get_pooling_params(pooling_metadata) # for matryoshka representation - if isinstance(pooling_metadata, V0PoolingMetadata): - dimensions_list = [ - pooling_param.dimensions - for _, pooling_param in pooling_metadata.seq_groups - ] - else: - assert isinstance(pooled_data, list) - dimensions_list = [ - pooling_param.dimensions - for pooling_param in pooling_metadata.pooling_params - ] + dimensions_list = [ + pooling_param.dimensions for pooling_param in pooling_params + ] if any(d is not None for d in dimensions_list): # change the output dimension assert len(pooled_data) == len(dimensions_list) 
@@ -541,7 +496,41 @@ class PoolerHead(nn.Module): for vecs, d in zip(pooled_data, dimensions_list) ] - return self.activation(pooled_data) + # for normalize + flags = [p.normalize for p in pooling_params] + if len(set(flags)) == 1: + if flags[0]: + pooled_data = self.activation(pooled_data) + else: + pooled_data = [ + self.activation(vecs) if f else vecs + for vecs, f in zip(pooled_data, flags) + ] + + return pooled_data + + +class RewardPoolerHead(PoolerHead): + + def __init__(self) -> None: + super().__init__(activation=PoolerClassify()) + + def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor], + pooling_metadata: PoolingMetadata): + pooling_params = get_pooling_params(pooling_metadata) + + # for softmax + flags = [p.softmax for p in pooling_params] + if len(set(flags)) == 1: + if flags[0]: + pooled_data = self.activation(pooled_data) + else: + pooled_data = [ + self.activation(vecs) if f else vecs + for vecs, f in zip(pooled_data, flags) + ] + + return pooled_data class SimplePooler(Pooler): @@ -559,8 +548,12 @@ class SimplePooler(Pooler): pooler_config: ResolvedPoolingConfig, ) -> "SimplePooler": pooling = PoolingMethod.from_pooling_type(pooler_config.pooling_type) - head = PoolerHead.from_config(pooler_config) - + if pooler_config.task == "embed": + head = EmbeddingPoolerHead() + elif pooler_config.task == "encode": + head = RewardPoolerHead() + else: + raise NotImplementedError(f"Unknown task: {pooler_config.task}") return cls(pooling, head) def __init__(self, pooling: PoolingMethod, head: PoolerHead) -> None: @@ -587,29 +580,11 @@ class SimplePooler(Pooler): class StepPooler(Pooler): - @classmethod - def from_config(cls, pooler_config: ResolvedPoolingConfig) -> "StepPooler": - assert pooler_config.pooling_type == PoolingType.STEP - - return cls( - PoolerHead.from_config(pooler_config), - step_tag_id=pooler_config.step_tag_id, - returned_token_ids=pooler_config.returned_token_ids, - ) - - def __init__( - self, - head: PoolerHead, - *, - 
step_tag_id: Optional[int] = None, - returned_token_ids: Optional[list[int]] = None, - ) -> None: + def __init__(self, ) -> None: super().__init__() self.pooling = AllPool() - self.head = head - self.step_tag_id = step_tag_id - self.returned_token_ids = returned_token_ids + self.head = RewardPoolerHead() def extract_states( self, @@ -620,10 +595,15 @@ class StepPooler(Pooler): prompt_token_ids = get_prompt_token_ids(pooling_metadata) pooled_data = list[torch.Tensor]() - returned_token_ids = self.returned_token_ids - step_tag_id = self.step_tag_id - for data, token_id in zip(pooled_data_lst, prompt_token_ids): + pooling_params = get_pooling_params(pooling_metadata) + + for data, token_id, pooling_param in zip(pooled_data_lst, + prompt_token_ids, + pooling_params): + step_tag_id = pooling_param.step_tag_id + returned_token_ids = pooling_param.returned_token_ids + if returned_token_ids is not None and len(returned_token_ids) > 0: data = data[:, returned_token_ids] @@ -669,14 +649,14 @@ class ClassifierPooler(Pooler): def __init__( self, pooling: PoolingFn, - classifier: ClassifierFn, - act_fn: PoolerActivation, + classifier: Optional[ClassifierFn], + act_fn: Optional[PoolerActivation] = None, ) -> None: super().__init__() self.pooling = pooling self.classifier = classifier - self.act_fn = act_fn + self.act_fn = act_fn or PoolerClassify() def get_supported_tasks(self) -> Set[PoolingTask]: return {"classify", "score"} @@ -688,15 +668,25 @@ class ClassifierPooler(Pooler): ) -> PoolerOutput: pooled_data = self.pooling(hidden_states, pooling_metadata) - # apply classifier once on the full batch if possible - if isinstance(pooled_data, torch.Tensor): - pooled_output = self.classifier(pooled_data) - elif len({data.shape for data in pooled_data}) <= 1: - pooled_output = self.classifier(torch.stack(pooled_data)) - else: - pooled_output = [self.classifier(data) for data in pooled_data] + if self.classifier is not None: + # apply classifier once on the full batch if possible + 
if isinstance(pooled_data, torch.Tensor): + pooled_data = self.classifier(pooled_data) + elif len({data.shape for data in pooled_data}) <= 1: + pooled_data = self.classifier(torch.stack(pooled_data)) + else: + pooled_data = [self.classifier(data) for data in pooled_data] - scores = self.act_fn(pooled_output) + pooling_params = get_pooling_params(pooling_metadata) + flags = [p.activation for p in pooling_params] + + if len(set(flags)) == 1: + scores = self.act_fn(pooled_data) if flags[0] else pooled_data + else: + scores = [ + self.act_fn(vecs) if f else vecs + for vecs, f in zip(pooled_data, flags) + ] return build_output(scores) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 9030ff307bee3..6f09be7a59410 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -44,6 +44,15 @@ class GteNewModelConfig(VerifyAndUpdateConfig): } +class JambaForSequenceClassificationConfig(VerifyAndUpdateConfig): + + @staticmethod + def verify_and_update_config(vllm_config: "VllmConfig") -> None: + pooler_config = vllm_config.model_config.pooler_config + if pooler_config.activation is None: + pooler_config.activation = False + + class JinaRobertaModelConfig(VerifyAndUpdateConfig): @staticmethod @@ -155,6 +164,26 @@ class NomicBertModelConfig(VerifyAndUpdateConfig): vllm_config.recalculate_max_model_len(max_model_len) +class Qwen2ForProcessRewardModelConfig(VerifyAndUpdateConfig): + + @staticmethod + def verify_and_update_config(vllm_config: "VllmConfig") -> None: + pooler_config = vllm_config.model_config.pooler_config + + if pooler_config.step_tag_id is None: + pooler_config.step_tag_id = 151651 + + +class Qwen2ForRewardModelConfig(VerifyAndUpdateConfig): + + @staticmethod + def verify_and_update_config(vllm_config: "VllmConfig") -> None: + pooler_config = vllm_config.model_config.pooler_config + + if pooler_config.softmax is None: + pooler_config.softmax = False + + class 
Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig): @staticmethod @@ -309,8 +338,11 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { "GteModel": SnowflakeGteNewModelConfig, "GteNewModel": GteNewModelConfig, "NomicBertModel": NomicBertModelConfig, + "Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig, + "Qwen2ForRewardModel": Qwen2ForRewardModelConfig, "Qwen3ForSequenceClassification": Qwen3ForSequenceClassificationConfig, "XLMRobertaModel": JinaRobertaModelConfig, "JinaVLForRanking": JinaVLForSequenceClassificationConfig, + "JambaForSequenceClassification": JambaForSequenceClassificationConfig, "GraniteMoeHybridForCausalLM": GraniteMoeHybridModelConfig, } diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 263f4c8379cf2..ab21b7ce2c5f5 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -593,7 +593,5 @@ class JambaForSequenceClassification(JambaForCausalLM): pooler_config, classifier=self.score, default_pooling_type=PoolingType.LAST, - default_normalize=False, - default_softmax=False, ), }) diff --git a/vllm/model_executor/models/jina_vl.py b/vllm/model_executor/models/jina_vl.py index 0c4284f7daaac..8c64f636c6a0f 100644 --- a/vllm/model_executor/models/jina_vl.py +++ b/vllm/model_executor/models/jina_vl.py @@ -90,15 +90,12 @@ class JinaVLForSequenceClassification(Qwen2VLForConditionalGeneration, prefix=maybe_prefix(prefix, "qwen2_vl")) config = vllm_config.model_config.hf_config pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None # logit bias for sigmoid normalization self.LOGIT_BIAS = 2.65 self.score = JinaVLScorer(config) - - pooler_config = vllm_config.model_config.pooler_config - assert pooler_config is not None - self.pooler = DispatchPooler({ "encode": Pooler.for_encode(pooler_config), diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 
f12e9a041a944..9b6b70c75c341 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -117,8 +117,5 @@ class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel): Pooler.for_encode( pooler_config, default_pooling_type=PoolingType.STEP, - default_normalize=False, - default_softmax=True, - default_step_tag_id=151651, ) }) diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 23eb775f2dc69..7077f68353fc5 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from copy import deepcopy from typing import TYPE_CHECKING, Optional import msgspec @@ -19,13 +20,25 @@ class PoolingParams( """API parameters for pooling models. Attributes: + normalize: Whether to normalize the embeddings outputs. dimensions: Reduce the dimensions of embeddings if model support matryoshka representation. + activation: Whether to apply activation function to + the classification outputs. + softmax: Whether to apply softmax to the reward outputs. 
""" + ## for embeddings models dimensions: Optional[int] = None + normalize: Optional[bool] = None - output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY + ## for classification models + activation: Optional[bool] = None + + ## for reward models + softmax: Optional[bool] = None + step_tag_id: Optional[int] = None + returned_token_ids: Optional[list[int]] = None task: Optional[PoolingTask] = None """Internal use only.""" @@ -33,15 +46,32 @@ class PoolingParams( requires_token_ids: bool = False """Internal use only.""" + output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY + + @property + def all_parameters(self) -> list[str]: + return [ + "dimensions", "normalize", "activation", "softmax", "step_tag_id", + "returned_token_ids" + ] + + @property + def valid_parameters(self): + return { + "embed": ["dimensions", "normalize"], + "classify": ["activation"], + "score": ["activation"], + "encode": ["softmax", "step_tag_id", "returned_token_ids"], + } + def clone(self) -> "PoolingParams": """Returns a deep copy of the PoolingParams instance.""" - return PoolingParams( - dimensions=self.dimensions, - task=self.task, - requires_token_ids=self.requires_token_ids, - ) + return deepcopy(self) + + def verify(self, + task: PoolingTask, + model_config: Optional["ModelConfig"] = None) -> None: - def verify(self, task: PoolingTask, model_config: "ModelConfig") -> None: if self.task is None: self.task = task elif self.task != task: @@ -52,28 +82,91 @@ class PoolingParams( # which is not available in model config. 
So, it's not included # in this method - if self.dimensions is not None: - if not model_config.is_matryoshka: - raise ValueError( - f'Model "{model_config.served_model_name}" does not ' - f'support matryoshka representation, ' - f'changing output dimensions will lead to poor results.') + self._merge_default_parameters(model_config) + self._set_default_parameters(model_config) + self._verify_valid_parameters() - mds = model_config.matryoshka_dimensions - if mds is not None: - if self.dimensions not in mds: + def _merge_default_parameters(self, + model_config: Optional["ModelConfig"] = None + ) -> None: + + if model_config is None: + return + + pooler_config = model_config.pooler_config + if pooler_config is None: + return + + assert self.task is not None, "task must be set" + valid_parameters = self.valid_parameters[self.task] + + for k in valid_parameters: + if getattr(pooler_config, k, None) is None: + continue + + if getattr(self, k, None) is None: + setattr(self, k, getattr(pooler_config, k)) + + def _set_default_parameters(self, model_config: Optional["ModelConfig"]): + if self.task == "embed": + if self.normalize is None: + self.normalize = True + + if self.dimensions is not None and model_config is not None: + if not model_config.is_matryoshka: raise ValueError( - f'Model "{model_config.served_model_name}" ' - f'only supports {str(mds)} matryoshka dimensions, ' - f'use other output dimensions will ' - f'lead to poor results.') - elif self.dimensions < 1: - raise ValueError("Dimensions must be greater than 0") + f'Model "{model_config.served_model_name}" does not ' + f'support matryoshka representation, ' + f'changing output dimensions will lead to poor results.' 
+ ) + + mds = model_config.matryoshka_dimensions + if mds is not None: + if self.dimensions not in mds: + raise ValueError( + f'Model "{model_config.served_model_name}" ' + f'only supports {str(mds)} matryoshka dimensions, ' + f'use other output dimensions will ' + f'lead to poor results.') + elif self.dimensions < 1: + raise ValueError("Dimensions must be greater than 0") + + elif self.task in ["classify", "score"]: + if self.activation is None: + self.activation = True + + elif self.task == "encode": + if self.softmax is None: + self.softmax = True + else: + raise ValueError(f"Unknown pooling task: {self.task}") + + def _verify_valid_parameters(self): + assert self.task is not None, "task must be set" + valid_parameters = self.valid_parameters[self.task] + invalid_parameters = [] + for k in self.all_parameters: + if k in valid_parameters: + continue + + if getattr(self, k, None) is not None: + invalid_parameters.append(k) + + if invalid_parameters: + raise ValueError( + f"Task {self.task} only supports {valid_parameters} " + f"parameters, does not support " + f"{invalid_parameters} parameters") def __repr__(self) -> str: return (f"PoolingParams(" - f"dimensions={self.dimensions}, " f"task={self.task}, " + f"normalize={self.normalize}, " + f"dimensions={self.dimensions}, " + f"activation={self.activation}, " + f"softmax={self.softmax}, " + f"step_tag_id={self.step_tag_id}, " + f"returned_token_ids={self.returned_token_ids}, " f"requires_token_ids={self.requires_token_ids})") def __post_init__(self) -> None: From d1bf1b97111df876737e3af3d9249c7ccc545f15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Tue, 5 Aug 2025 11:33:46 +0200 Subject: [PATCH 216/224] [Docs][TPU] Highlight TPU Software version selection (#22242) Signed-off-by: NickLucche --- docs/getting_started/installation/google_tpu.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/getting_started/installation/google_tpu.md 
b/docs/getting_started/installation/google_tpu.md index 55d69d11fa401..6f09babb3aba0 100644 --- a/docs/getting_started/installation/google_tpu.md +++ b/docs/getting_started/installation/google_tpu.md @@ -85,7 +85,7 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ | PROJECT_ID | Your Google Cloud project | | ZONE | The GCP zone where you want to create your Cloud TPU. The value you use depends on the version of TPUs you are using. For more information, see [TPU regions and zones] | | ACCELERATOR_TYPE | The TPU version you want to use. Specify the TPU version, for example `v5litepod-4` specifies a v5e TPU with 4 cores, `v6e-1` specifies a v6e TPU with 1 core. For more information, see [TPU versions]. | -| RUNTIME_VERSION | The TPU VM runtime version to use. For example, use `v2-alpha-tpuv6e` for a VM loaded with one or more v6e TPU(s). For more information see [TPU VM images]. | +| RUNTIME_VERSION | The TPU VM runtime version to use. For example, use `v2-alpha-tpuv6e` for a VM loaded with one or more v6e TPU(s). | | SERVICE_ACCOUNT | The email address for your service account. You can find it in the IAM Cloud Console under *Service Accounts*. For example: `tpu-service-account@.iam.gserviceaccount.com` | Connect to your TPU VM using SSH: @@ -94,6 +94,9 @@ Connect to your TPU VM using SSH: gcloud compute tpus tpu-vm ssh TPU_NAME --project PROJECT_ID --zone ZONE ``` +!!! note + When configuring `RUNTIME_VERSION` ("TPU software version") on GCP, ensure it matches the TPU generation you've selected by referencing the [TPU VM images] compatibility matrix. Using an incompatible version may prevent vLLM from running correctly. 
+ [TPU versions]: https://cloud.google.com/tpu/docs/runtimes [TPU VM images]: https://cloud.google.com/tpu/docs/runtimes [TPU regions and zones]: https://cloud.google.com/tpu/docs/regions-zones From 05fae021750be1049927299ea2317d742c03718a Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Tue, 5 Aug 2025 02:36:18 -0700 Subject: [PATCH 217/224] Migrate KimiVLImagePixelInputs to TensorSchema (#21769) Signed-off-by: Benji Beck Co-authored-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/kimi_vl.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index 9c0a6ba92389b..1c7ddd7df7f82 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -46,7 +46,7 @@ import copy import math from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass -from typing import Any, Literal, Optional, TypedDict, Union +from typing import Annotated, Any, Literal, Optional, Union import torch from torch import nn @@ -79,6 +79,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .utils import is_pp_missing_parameter, maybe_prefix @@ -118,15 +119,22 @@ class KimiVLMultiModalProjector(nn.Module): return hidden_states -class KimiVLImagePixelInputs(TypedDict): - type: Literal["pixel_values"] - pixel_values: Union[torch.Tensor, list[torch.Tensor]] +class KimiVLImagePixelInputs(TensorSchema): """ - Shape:`(num_patches, num_channels, patch_size, patch_size)` + Dimensions: + - nc: Number of channels + - np: Number of patches + - ps: Patch size + - ni: Number of images """ + type: Literal["pixel_values"] = "pixel_values" - 
image_grid_hws: torch.Tensor - """Shape:`(num_images, 2)`""" + pixel_values: Annotated[ + Union[torch.Tensor, list[torch.Tensor]], + TensorShape("np", 3, "ps", "ps"), + ] + + image_grid_hws: Annotated[torch.Tensor, TensorShape("ni", 2)] # TODO: support embeds too @@ -348,8 +356,6 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal): pixel_values = pixel_values.reshape(-1, num_channels, patch_size, patch_size) pixel_values = pixel_values.to(self.vision_tower.dtype) - # image_grid_hws.shape = (N, 2) - assert image_grid_hws.ndim == 2, f"unexpected shape for image_grid_hws: {image_grid_hws.shape}" return KimiVLImagePixelInputs( type="pixel_values", From 4771df7b2bd1ed06fbdc564c98e1b86efaff69b3 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 5 Aug 2025 05:36:43 -0400 Subject: [PATCH 218/224] [Feature] Non-contiguous Support for FP8 Quantization (#21961) Signed-off-by: yewentao256 Co-authored-by: mgoin --- csrc/quantization/fp8/common.cu | 259 ++++++++++++++++++++----------- csrc/quantization/fp8/common.cuh | 107 ------------- tests/quantization/test_fp8.py | 33 ++++ vllm/_custom_ops.py | 9 +- 4 files changed, 207 insertions(+), 201 deletions(-) diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu index 0e1eab66f0b98..5fe5dd04bd891 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/fp8/common.cu @@ -1,7 +1,8 @@ #include "common.cuh" #include "dispatch_utils.h" - +#include "../vectorization_utils.cuh" #include +#include #ifndef USE_ROCM #include @@ -12,74 +13,127 @@ namespace vllm { template -__global__ void scaled_fp8_quant_kernel(fp8_type* __restrict__ out, - const scalar_t* __restrict__ input, - const float* __restrict__ scale, - int64_t num_elems) { - int tid = blockDim.x * blockIdx.x + threadIdx.x; +__global__ void scaled_fp8_quant_kernel_strided( + fp8_type* __restrict__ out, const scalar_t* __restrict__ input, + const float* __restrict__ scale, int 
hidden_size, int64_t in_row_stride, + int64_t out_row_stride) { + const int64_t token_idx = blockIdx.x; // one token per block + const int tid = threadIdx.x; - // Invert the scale so that we can use multiplications to avoid expensive - // division. - const float inverted_scale = 1.0f / (*scale); - scaled_fp8_conversion_vec( - out, input, inverted_scale, num_elems, tid, blockDim.x * gridDim.x); + const scalar_t* token_in = input + token_idx * in_row_stride; + fp8_type* token_out = out + token_idx * out_row_stride; + + const float inv_scale = 1.0f / (*scale); + + vectorize_with_alignment<16>( + token_in, token_out, hidden_size, tid, blockDim.x, + [=] __device__(fp8_type & dst, const scalar_t& src) { + dst = scaled_fp8_conversion(static_cast(src), + inv_scale); + }); } template -__global__ void dynamic_per_token_scaled_fp8_quant_kernel( - fp8_type* __restrict__ out, float* __restrict__ scale, - scalar_t const* __restrict__ input, float const* __restrict__ scale_ub, - const int hidden_size) { - int const tid = threadIdx.x; - int const token_idx = blockIdx.x; +__global__ void segmented_max_reduction_strided( + float* __restrict__ scale, const scalar_t* __restrict__ input, + int hidden_size, int64_t in_row_stride, int64_t num_tokens) { + __shared__ float cache[256]; + const int tid = threadIdx.x; + int64_t token_idx = blockIdx.x; - // Use int64 to avoid overflowing an int32 when calculating this offset - int64_t offset = static_cast(token_idx) * hidden_size; - scalar_t const* __restrict__ token_input = &input[offset]; - fp8_type* __restrict__ token_output = &out[offset]; - - // For vectorization, token_input and token_output pointers need to be - // aligned at 32-byte and 16-byte addresses respectively. 
- bool const can_vectorize = hidden_size % 16 == 0; - - float absmax_val = 0.0f; - if (can_vectorize) { - absmax_val = thread_max_vec(token_input, hidden_size, tid, blockDim.x); - } else { - for (int i = tid; i < hidden_size; i += blockDim.x) { - float const x = static_cast(token_input[i]); - absmax_val = fmaxf(absmax_val, fabsf(x)); - } + // one block per token. Guard in case gridDim.x > num_tokens. + if (token_idx >= num_tokens) { + return; } + const scalar_t* row_ptr = input + token_idx * in_row_stride; + + // each thread scans elements of the row in a strided fashion. + float thread_max = 0.0f; + for (int e = tid; e < hidden_size; e += blockDim.x) { + float v = fabsf(static_cast(row_ptr[e])); + thread_max = fmaxf(thread_max, v); + } + + cache[tid] = thread_max; + __syncthreads(); + + // parallel reduction to find row max. + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { + if (tid < offset) { + cache[tid] = fmaxf(cache[tid], cache[tid + offset]); + } + __syncthreads(); + } + + // thread 0 updates global scale (per-tensor) atomically. 
+ if (tid == 0) { + atomicMaxFloat(scale, cache[0] / quant_type_max_v); + } +} + +template +__global__ void scaled_fp8_quant_kernel_strided_dynamic( + fp8_type* __restrict__ out, const scalar_t* __restrict__ input, + const float* __restrict__ scale, int hidden_size, int64_t in_row_stride, + int64_t out_row_stride) { + const int64_t token_idx = blockIdx.x; + const int tid = threadIdx.x; + + const scalar_t* token_in = input + token_idx * in_row_stride; + fp8_type* token_out = out + token_idx * out_row_stride; + + const float reciprocal_scale = 1.0f / (*scale); + vectorize_with_alignment<16>( + token_in, token_out, hidden_size, tid, blockDim.x, + [=] __device__(fp8_type & dst, const scalar_t& src) { + dst = scaled_fp8_conversion(static_cast(src), + reciprocal_scale); + }); +} + +template +__global__ void dynamic_per_token_scaled_fp8_quant_kernel_strided( + fp8_type* __restrict__ out, float* __restrict__ scale, + const scalar_t* __restrict__ input, const float* __restrict__ scale_ub, + int hidden_size, int64_t in_row_stride, int64_t out_row_stride) { + const int64_t token_idx = blockIdx.x; + const int tid = threadIdx.x; + + // Use int64 to avoid overflowing an int32 when calculating this offset + int64_t in_offset = static_cast(token_idx) * in_row_stride; + int64_t out_offset = static_cast(token_idx) * out_row_stride; + const scalar_t* token_in = input + in_offset; + fp8_type* token_out = out + out_offset; + + // 1) per-token absmax + float absmax_val = 0.f; + vectorize_read_with_alignment<16>( + token_in, hidden_size, tid, blockDim.x, [&] __device__(scalar_t v) { + absmax_val = fmaxf(absmax_val, fabsf(static_cast(v))); + }); + using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage reduceStorage; - float const block_absmax_val_maybe = - BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x); + __shared__ typename BlockReduce::TempStorage tmp; + const float block_max = + BlockReduce(tmp).Reduce(absmax_val, cub::Max{}, 
blockDim.x); + __shared__ float token_scale; if (tid == 0) { - if (scale_ub) { - token_scale = fminf(block_absmax_val_maybe, *scale_ub); - } else { - token_scale = block_absmax_val_maybe; - } - // token scale computation + token_scale = scale_ub ? fminf(block_max, *scale_ub) : block_max; token_scale = fmaxf(token_scale / quant_type_max_v, min_scaling_factor::val()); scale[token_idx] = token_scale; } __syncthreads(); - // Note that we don't use inverted scales so we can match FBGemm impl. - if (can_vectorize) { - scaled_fp8_conversion_vec( - token_output, token_input, token_scale, hidden_size, tid, blockDim.x); - } else { - for (int i = tid; i < hidden_size; i += blockDim.x) { - token_output[i] = scaled_fp8_conversion( - static_cast(token_input[i]), token_scale); - } - } + // 2) quantize + vectorize_with_alignment<16>( + token_in, token_out, hidden_size, tid, blockDim.x, + [=] __device__(fp8_type & dst, const scalar_t& src) { + dst = scaled_fp8_conversion(static_cast(src), + token_scale); + }); } } // namespace vllm @@ -88,23 +142,31 @@ void static_scaled_fp8_quant(torch::Tensor& out, // [..., d] torch::Tensor const& input, // [..., d] torch::Tensor const& scale) // [1] { - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); - int const block_size = 256; - int const num_tokens = input.numel() / input.size(-1); - int const num_elems = input.numel(); - dim3 const grid(num_tokens); - dim3 const block(block_size); + TORCH_CHECK(input.stride(-1) == 1, + "last dimension of input must be contiguous"); + TORCH_CHECK(out.stride(-1) == 1, + "last dimension of output must be contiguous"); + + const int hidden_size = input.size(-1); + const int num_tokens = input.numel() / hidden_size; + const int block_size = 256; + dim3 grid(num_tokens); + dim3 block(block_size); + + const int64_t in_row_stride = input.stride(-2); + const int64_t out_row_stride = out.stride(-2); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = 
at::cuda::getCurrentCUDAStream(); VLLM_DISPATCH_FLOATING_TYPES( input.scalar_type(), "scaled_fp8_quant_kernel_scalar_type", [&] { VLLM_DISPATCH_FP8_TYPES( out.scalar_type(), "scaled_fp8_quant_kernel_fp8_type", [&] { - vllm::scaled_fp8_quant_kernel + vllm::scaled_fp8_quant_kernel_strided <<>>( out.data_ptr(), input.data_ptr(), - scale.data_ptr(), num_elems); + scale.data_ptr(), hidden_size, in_row_stride, + out_row_stride); }); }); } @@ -113,27 +175,42 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, // [..., d] torch::Tensor const& input, // [..., d] torch::Tensor& scale) // [1] { - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); - int const block_size = 256; - int const num_tokens = input.numel() / input.size(-1); - int const num_elems = input.numel(); - dim3 const grid(num_tokens); - dim3 const block(block_size); + TORCH_CHECK(input.stride(-1) == 1, + "last dimension of input must be contiguous"); + TORCH_CHECK(out.stride(-1) == 1, + "last dimension of output must be contiguous"); + + const int hidden_size = input.size(-1); + const int num_tokens = input.numel() / hidden_size; + const int block_size = 256; + dim3 grid(num_tokens); + dim3 block(block_size); + + const int64_t in_row_stride = input.stride(-2); + const int64_t out_row_stride = out.stride(-2); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + // scale tensor should be initialised to <=0 before reduction + AT_CUDA_CHECK( + cudaMemsetAsync(scale.data_ptr(), 0, sizeof(float), stream)); + VLLM_DISPATCH_FLOATING_TYPES( input.scalar_type(), "scaled_fp8_quant_kernel_scalar_type", [&] { VLLM_DISPATCH_FP8_TYPES( out.scalar_type(), "scaled_fp8_quant_kernel_fp8_type", [&] { - vllm::segmented_max_reduction - <<>>(scale.data_ptr(), - input.data_ptr(), - num_elems); - vllm::scaled_fp8_quant_kernel + vllm::segmented_max_reduction_strided + <<>>( + scale.data_ptr(), input.data_ptr(), + hidden_size, 
in_row_stride, + static_cast(num_tokens)); + + vllm::scaled_fp8_quant_kernel_strided_dynamic <<>>( out.data_ptr(), input.data_ptr(), - scale.data_ptr(), num_elems); + scale.data_ptr(), hidden_size, in_row_stride, + out_row_stride); }); }); } @@ -142,14 +219,19 @@ void dynamic_per_token_scaled_fp8_quant( torch::Tensor& out, // [..., d] torch::Tensor const& input, // [..., d] torch::Tensor& scales, std::optional const& scale_ub) { - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK(input.stride(-1) == 1, + "last dimension of input must be contiguous"); + TORCH_CHECK(out.stride(-1) == 1, + "last dimension of output must be contiguous"); - int const hidden_size = input.size(-1); - int const num_tokens = input.numel() / hidden_size; - int const block_size = 256; - dim3 const grid(num_tokens); - dim3 const block(std::min(hidden_size, block_size)); + const int hidden_size = input.size(-1); + const int num_tokens = input.numel() / hidden_size; + const int block_size = 256; + dim3 grid(num_tokens); + dim3 block(std::min(hidden_size, block_size)); + + const int64_t in_row_stride = input.stride(-2); + const int64_t out_row_stride = out.stride(-2); const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); @@ -159,13 +241,12 @@ void dynamic_per_token_scaled_fp8_quant( VLLM_DISPATCH_FP8_TYPES( out.scalar_type(), "dynamic_per_token_scaled_fp8_quant_kernel_fp8_type", [&] { - vllm::dynamic_per_token_scaled_fp8_quant_kernel - <<>>( - out.data_ptr(), scales.data_ptr(), - input.data_ptr(), - scale_ub.has_value() ? scale_ub->data_ptr() - : nullptr, - hidden_size); + vllm::dynamic_per_token_scaled_fp8_quant_kernel_strided< + scalar_t, fp8_t><<>>( + out.data_ptr(), scales.data_ptr(), + input.data_ptr(), + scale_ub.has_value() ? 
scale_ub->data_ptr() : nullptr, + hidden_size, in_row_stride, out_row_stride); }); }); } diff --git a/csrc/quantization/fp8/common.cuh b/csrc/quantization/fp8/common.cuh index d36f94a8f10d6..1aad6330c44b8 100644 --- a/csrc/quantization/fp8/common.cuh +++ b/csrc/quantization/fp8/common.cuh @@ -55,111 +55,4 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val, #endif } -// Compute the absolute maximum m of the input tensor and store -// m / float8_e4m3::max() in *scale. Each thread block performs a -// reduction tree and the memory in scale is atomically updated. -// So to get the right answer, *scale needs to be initialized to -// a value <= 0.0 and we need to wait for all thread blocks to -// finish before consuming *scale. -template -__global__ void segmented_max_reduction(float* __restrict__ scale, - const scalar_t* __restrict__ input, - int64_t num_elems) { - __shared__ float cache[256]; - int64_t i = blockDim.x * blockIdx.x + threadIdx.x; - - // First store maximum for all values processes by - // the current thread in cache[threadIdx.x] - scalar_t tmp = 0.0; - while (i < num_elems) { - float x = static_cast(input[i]); - tmp = fmaxf(tmp, fabsf(x)); - i += blockDim.x * gridDim.x; - } - cache[threadIdx.x] = tmp; - - __syncthreads(); - - // Now perform parallel reduction within the thread block - int ib = blockDim.x / 2; - while (ib != 0) { - if (threadIdx.x < ib && cache[threadIdx.x + ib] > cache[threadIdx.x]) { - cache[threadIdx.x] = cache[threadIdx.x + ib]; - } - __syncthreads(); - ib /= 2; - } - // Finally, since cache[0] contains the maximum for this thread block, - // atomically write the max to the target location - if (threadIdx.x == 0) { - atomicMaxFloat(scale, cache[0] / quant_type_max_v); - } -} - -template -__device__ float thread_max_vec(scalar_t const* __restrict__ input, - int64_t const num_elems, int const tid, - int const step) { - constexpr size_t VEC_SIZE = 16; - using scalarxN_t = vec_n_t; - // Vectorized input/output to 
better utilize memory bandwidth. - auto const* vectorized_in = reinterpret_cast(input); - - // num_elems / VEC_SIZE (which is 16) - int64_t const num_vec_elems = num_elems >> 4; - float absmax_val = 0.0f; - -#pragma unroll - for (int64_t i = tid; i < num_vec_elems; i += step) { - scalarxN_t in_vec = vectorized_in[i]; -#pragma unroll - for (int j = 0; j < VEC_SIZE; ++j) { - absmax_val = fmaxf(absmax_val, fabsf(in_vec.val[j])); - } - } - - // Handle the remaining elements if num_elems is not divisible by VEC_SIZE - for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) { - absmax_val = fmaxf(absmax_val, fabsf(input[i])); - } - - return absmax_val; -} - -template -__device__ void scaled_fp8_conversion_vec(fp8_type* __restrict__ out, - scalar_t const* __restrict__ input, - float const scale, - int64_t const num_elems, - int const tid, int const step) { - constexpr size_t VEC_SIZE = 16; - using scalarxN_t = vec_n_t; - using float8xN_t = q8_n_t; - // Vectorized input/output to better utilize memory bandwidth. 
- auto const* vectorized_in = reinterpret_cast(input); - auto* vectorized_out = reinterpret_cast(out); - - // num_elems / VEC_SIZE (which is 16) - int64_t const num_vec_elems = num_elems >> 4; - -#pragma unroll - for (int64_t i = tid; i < num_vec_elems; i += step) { - scalarxN_t in_vec = vectorized_in[i]; - float8xN_t out_vec; - -#pragma unroll - for (int j = 0; j < VEC_SIZE; ++j) { - out_vec.val[j] = scaled_fp8_conversion( - static_cast(in_vec.val[j]), scale); - } - vectorized_out[i] = out_vec; - } - - // Handle the remaining elements if num_elems is not divisible by VEC_SIZE - for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) { - out[i] = scaled_fp8_conversion( - static_cast(input[i]), scale); - } -} - } // namespace vllm diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index e5ab7b3dd3cfb..0b37c83c92c2a 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -194,3 +194,36 @@ def test_scaled_fp8_quant(dtype) -> None: ref_y, per_tensor_dequantize(torch.narrow(y, 0, 0, x.shape[0]), inv_scale, dtype)) + + # non-contiguous input with padding + m, n, padded_stride = 975, 512, 576 + padded_tensor = (torch.randn(size=(m, padded_stride), device="cuda") * + 13).to(dtype) + x_nc = padded_tensor[:, :n] # shape (m, n) with stride (padded_stride, 1) + + assert not x_nc.is_contiguous() + assert x_nc.stride(0) == padded_stride + + # dynamic quantization + ref_y_nc, inv_scale_nc = ops.scaled_fp8_quant(x_nc, None) + ref_y_nc = per_tensor_dequantize(ref_y_nc, inv_scale_nc, dtype) + + # reference dynamic quantization + y_nc = quantize_ref(x_nc, inv_scale_nc) + torch.testing.assert_close( + ref_y_nc, per_tensor_dequantize(y_nc, inv_scale_nc, dtype)) + + # static quantization + y_nc, _ = ops.scaled_fp8_quant(x_nc, inv_scale_nc) + torch.testing.assert_close( + ref_y_nc, per_tensor_dequantize(y_nc, inv_scale_nc, dtype)) + + # padding after non-contiguous input quantization + y_nc_pad, _ = 
ops.scaled_fp8_quant(x_nc, + inv_scale_nc, + num_token_padding=m + 10) + assert y_nc_pad.shape[0] == m + 10 + torch.testing.assert_close( + ref_y_nc, + per_tensor_dequantize(torch.narrow(y_nc_pad, 0, 0, x_nc.shape[0]), + inv_scale_nc, dtype)) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 35345b1be01c2..e6f69e2344efa 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1279,14 +1279,13 @@ def scaled_fp8_quant( device=input.device, dtype=torch.float32) torch.ops._C.dynamic_per_token_scaled_fp8_quant( - output, input.contiguous(), scale, scale_ub) + output, input, scale, scale_ub) else: - scale = torch.zeros(1, device=input.device, dtype=torch.float32) - torch.ops._C.dynamic_scaled_fp8_quant(output, input.contiguous(), - scale) + scale = torch.empty(1, device=input.device, dtype=torch.float32) + torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale) else: assert scale.numel() == 1, f"{scale.shape}" - torch.ops._C.static_scaled_fp8_quant(output, input.contiguous(), scale) + torch.ops._C.static_scaled_fp8_quant(output, input, scale) return output, scale From 83156c7b89fb880744216f3475c99f698d67a4dc Mon Sep 17 00:00:00 2001 From: elvischenv <219235043+elvischenv@users.noreply.github.com> Date: Tue, 5 Aug 2025 17:45:34 +0800 Subject: [PATCH 219/224] [NVIDIA] Support Flashinfer TRT-LLM Prefill Attention Kernel (#22095) Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 2 +- ...y => benchmark_trtllm_decode_attention.py} | 1 - .../benchmark_trtllm_prefill_attention.py | 250 +++++++++++++++ .../test_flashinfer_trtllm_attention.py | 293 ++++++++++++++++++ ...test_flashinfer_trtllm_decode_attention.py | 138 --------- vllm/attention/backends/flashinfer.py | 4 +- vllm/envs.py | 6 +- vllm/utils/flashinfer.py | 17 +- vllm/v1/attention/backends/flashinfer.py | 223 ++++++++----- 9 files changed, 700 insertions(+), 234 deletions(-) rename benchmarks/kernels/{benchmark_trtllm_attention.py => 
benchmark_trtllm_decode_attention.py} (99%) create mode 100644 benchmarks/kernels/benchmark_trtllm_prefill_attention.py create mode 100644 tests/kernels/attention/test_flashinfer_trtllm_attention.py delete mode 100644 tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b7a2ca6ca9b24..e139c6b30586e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -664,7 +664,7 @@ steps: # Attention # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' - - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py + - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - pytest -v -s tests/kernels/test_cutlass_mla_decode.py # Quantization - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' diff --git a/benchmarks/kernels/benchmark_trtllm_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py similarity index 99% rename from benchmarks/kernels/benchmark_trtllm_attention.py rename to benchmarks/kernels/benchmark_trtllm_decode_attention.py index 68c48858e61cc..77136edca45b5 100644 --- a/benchmarks/kernels/benchmark_trtllm_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py @@ -41,7 +41,6 @@ def benchmark_decode( device = "cuda" torch.manual_seed(0) - # Currently only HEAD_GRP_SIZE == 8 is supported HEAD_GRP_SIZE = 8 MAX_SEQ_LEN = max_seq_len diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py new file mode 100644 index 0000000000000..67bd9aebbcca9 --- /dev/null +++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py @@ -0,0 +1,250 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import csv +import os +import 
random +from datetime import datetime + +import flashinfer +import torch + +FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 + +# KV Cache Layout for TRT-LLM +# kv_cache_shape = (num_blocks, 2, num_kv_heads, page_size, head_dim) + + +def to_float8(x, dtype=torch.float8_e4m3fn): + finfo = torch.finfo(dtype) + min_val, max_val = x.aminmax() + amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12) + scale = finfo.max / amax * 0.1 + x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max) + return x_scl_sat.to(dtype), scale.float().reciprocal() + + +@torch.no_grad() +def benchmark_prefill( + num_seqs, + max_seq_len, + page_size=16, + dtype=torch.bfloat16, + kv_layout="HND", + num_kv_heads=8, + kv_cache_dtype="auto", + head_dim=128, + warmup=10, + trials=20, +): + torch.set_default_device("cuda") + torch.manual_seed(0) + + HEAD_GRP_SIZE = 8 + MAX_SEQ_LEN = max_seq_len + + # large number to reduce kv_cache reuse + NUM_BLOCKS = int(256000 / page_size) + + workspace_buffer = torch.empty(1024 * 1024 * 1024, dtype=torch.int8) + + num_qo_heads = num_kv_heads * HEAD_GRP_SIZE + sm_scale = float(1.0 / (head_dim**0.5)) + + q_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)] + q_lens[-1] = MAX_SEQ_LEN + max_q_len = max(q_lens) + q_indptr = torch.cat( + [ + torch.tensor([0], dtype=torch.int32), + torch.cumsum( + torch.tensor(q_lens, dtype=torch.int32), dim=0, dtype=torch.int32 + ), + ] + ) + q = torch.randn(sum(q_lens), num_qo_heads, head_dim, dtype=dtype) + + kv_lens = [random.randint(0, MAX_SEQ_LEN) for _ in range(num_seqs)] + kv_lens[-1] = MAX_SEQ_LEN + + seq_lens = [q_len + kv_len for q_len, kv_len in zip(q_lens, kv_lens)] + max_seq_len = max(seq_lens) + seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int32) + + max_num_blocks_per_seq = (max_seq_len + page_size - 1) // page_size + block_tables = torch.randint( + 0, NUM_BLOCKS, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32 + ) + + kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, 
page_size, head_dim) + kv_cache = torch.randn(size=kv_cache_shape, dtype=dtype) + k_scale = v_scale = 1.0 + + if kv_cache_dtype.startswith("fp8"): + kv_cache, _ = to_float8(kv_cache) + + output_trtllm = torch.empty(q.shape, dtype=dtype) + + kv_indptr = [0] + kv_indices = [] + kv_last_page_lens = [] + for i in range(num_seqs): + seq_len = seq_lens[i] + assert seq_len > 0 + num_blocks = (seq_len + page_size - 1) // page_size + kv_indices.extend(block_tables[i, :num_blocks]) + kv_indptr.append(kv_indptr[-1] + num_blocks) + kv_last_page_len = seq_len % page_size + if kv_last_page_len == 0: + kv_last_page_len = page_size + kv_last_page_lens.append(kv_last_page_len) + + kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32) + kv_indices = torch.tensor(kv_indices, dtype=torch.int32) + kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) + + output_baseline = torch.empty(q.shape, dtype=dtype) + + wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( + workspace_buffer, kv_layout + ) + wrapper.plan( + q_indptr, + kv_indptr, + kv_indices, + kv_last_page_lens, + num_qo_heads, + num_kv_heads, + head_dim, + page_size, + causal=True, + sm_scale=sm_scale, + q_data_type=dtype, + kv_data_type=kv_cache.dtype, + ) + + def time_fn(fn, warmup=10, trials=20): + torch.cuda.synchronize() + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + times = [] + for i in range(warmup): + fn() + for i in range(trials): + start.record() + fn() + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) # ms + return sum(times) / len(times), torch.std(torch.tensor(times)) + + def baseline_prefill(): + return wrapper.run( + q, kv_cache, k_scale=k_scale, v_scale=v_scale, out=output_baseline + ) + + def trt_prefill(): + return flashinfer.prefill.trtllm_batch_context_with_kv_cache( + query=q, + kv_cache=kv_cache, + workspace_buffer=workspace_buffer, + block_tables=block_tables, + seq_lens=seq_lens_tensor, + 
max_q_len=max_q_len, + max_kv_len=max_seq_len, + bmm1_scale=k_scale * sm_scale, + bmm2_scale=v_scale, + batch_size=num_seqs, + cum_seq_lens_q=q_indptr, + cum_seq_lens_kv=kv_indptr, + out=output_trtllm, + ) + + trt_mean, trt_std = time_fn(trt_prefill) + baseline_mean, baseline_std = time_fn(baseline_prefill) + + # Calculate percentage speedup (positive means TRT is faster) + speedup_percent = (baseline_mean - trt_mean) / baseline_mean + + print( + f"\t{num_seqs}\t{max_seq_len}\t{trt_mean:.5f}\t{trt_std.item():.5f}" + f"\t{baseline_mean:.5f}\t{baseline_std.item():.5f}\t{speedup_percent:.5f}" + ) + + # Return results for CSV writing + return { + "num_seqs": num_seqs, + "trt_mean": trt_mean, + "trt_std": trt_std.item(), + "baseline_mean": baseline_mean, + "baseline_std": baseline_std.item(), + "speedup_percent": speedup_percent, + "q_dtype": str(dtype), + "kv_cache_dtype": kv_cache_dtype, + "page_size": page_size, + "num_kv_heads": num_kv_heads, + "head_dim": head_dim, + "max_seq_len": max_seq_len, + } + + +def write_results_to_csv(results, filename=None): + """Write benchmark results to CSV file.""" + if filename is None: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"flashinfer_trtllm_benchmark_{timestamp}.csv" + + fieldnames = [ + "num_seqs", + "trt_mean", + "trt_std", + "baseline_mean", + "baseline_std", + "speedup_percent", + "q_dtype", + "kv_cache_dtype", + "page_size", + "num_kv_heads", + "head_dim", + "max_seq_len", + ] + + file_exists = os.path.exists(filename) + + with open(filename, "a", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + + if not file_exists: + writer.writeheader() + + for result in results: + writer.writerow(result) + + print(f"Results written to {filename}") + + +if __name__ == "__main__": + num_seqs = [1, 4, 8, 16, 32, 64, 128, 256] + max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072] + all_results = [] + + print( + "Running benchmark for q_dtype = bfloat16, 
kv_cache_dtype: bfloat16, " + "output_dtype: bfloat16" + ) + print( + "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t" + "baseline_std\tspeedup_percent" + ) + for max_seq_len in max_seq_lens: + for bs in num_seqs: + result = benchmark_prefill( + bs, + max_seq_len, + dtype=torch.bfloat16, + kv_cache_dtype="auto", + ) + all_results.append(result) + + # Write all results to CSV + write_results_to_csv(all_results) diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py new file mode 100644 index 0000000000000..e87ce520bc66b --- /dev/null +++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py @@ -0,0 +1,293 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + +import flashinfer +import pytest +import torch + +from vllm.platforms import current_platform + +if not current_platform.is_device_capability(100): + pytest.skip("This TRTLLM kernel requires NVIDIA Blackwell.", + allow_module_level=True) + +FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 + +# KV Cache Layout for TRT-LLM +# kv_cache_shape = (num_blocks, 2, num_kv_heads, page_size, head_dim) + +MAX_Q_LEN = 1024 +MAX_KV_LEN = 4096 +BATCH_SIZES = [4, 12] +NUM_HEADS = [(64, 8), (16, 16), (40, 8), (32, 8)] +HEAD_SIZES = [128] +BLOCK_SIZES = [16, 32] +KV_LAYOUTS = ["HND"] +DTYPES = [torch.float16, torch.bfloat16] +KV_CACHE_DTYPES = [None, current_platform.fp8_dtype()] +NUM_BLOCKS = 32768 # Large enough to test overflow in index calculation. 
+SOFT_CAPS = [None, 50.0] + + +def to_float8(x, dtype=torch.float8_e4m3fn): + finfo = torch.finfo(dtype) + min_val, max_val = x.aminmax() + amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12) + scale = finfo.max / amax * 0.1 + x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max) + return x_scl_sat.to(dtype), scale.float().reciprocal() + + +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("block_size", BLOCK_SIZES) +@pytest.mark.parametrize("kv_layout", KV_LAYOUTS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES) +@pytest.mark.parametrize("soft_cap", SOFT_CAPS) +@torch.inference_mode +def test_flashinfer_trtllm_decode_with_baseline( + batch_size: int, + num_heads: tuple[int, int], + head_size: int, + block_size: int, + kv_layout: str, + dtype: torch.dtype, + kv_cache_dtype: Optional[torch.dtype], + soft_cap: Optional[float], +) -> None: + kv_cache_dtype = dtype if kv_cache_dtype is None else kv_cache_dtype + + torch.set_default_device("cuda") + current_platform.seed_everything(0) + + kv_lens = torch.randint(1, MAX_KV_LEN, (batch_size, ), dtype=torch.int32) + kv_lens[-1] = MAX_KV_LEN + max_kv_len = torch.max(kv_lens).item() + num_seqs = len(kv_lens) + + num_query_heads = num_heads[0] + num_kv_heads = num_heads[1] + assert num_query_heads % num_kv_heads == 0 + + scale = head_size**-0.5 + + query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype) + + kv_cache_shape = None + if kv_layout == "NHD": + kv_cache_shape = (NUM_BLOCKS, 2, block_size, num_kv_heads, head_size) + elif kv_layout == "HND": + kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, block_size, head_size) + else: + raise ValueError(f"Invalid kv_layout: {kv_layout}") + key_value_cache = torch.randn(kv_cache_shape, dtype=dtype) + kv_scale = 1.0 + if kv_cache_dtype is 
current_platform.fp8_dtype(): + key_value_cache, kv_scale = to_float8(key_value_cache, + current_platform.fp8_dtype()) + + max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size + block_tables = torch.randint(0, + NUM_BLOCKS, + (num_seqs, max_num_blocks_per_seq), + dtype=torch.int32) + k_scale = v_scale = kv_scale + kv_indptr = [0] + kv_indices = [] + kv_last_page_lens = [] + for i in range(num_seqs): + seq_len = kv_lens[i] + assert seq_len > 0 + num_blocks = (seq_len + block_size - 1) // block_size + kv_indices.extend(block_tables[i, :num_blocks]) + kv_indptr.append(kv_indptr[-1] + num_blocks) + kv_last_page_len = seq_len % block_size + if kv_last_page_len == 0: + kv_last_page_len = block_size + kv_last_page_lens.append(kv_last_page_len) + + kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32) + kv_indices = torch.tensor(kv_indices, dtype=torch.int32) + kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) + + workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) + wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper( + workspace_buffer, + kv_layout, + use_tensor_cores=((num_query_heads // num_kv_heads) > 4)) + wrapper.plan(kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + head_size, + block_size, + "NONE", + sm_scale=scale, + q_data_type=dtype, + kv_data_type=kv_cache_dtype, + logits_soft_cap=soft_cap) + + output = torch.empty(query.shape, dtype=dtype) + wrapper.run(query, + key_value_cache, + k_scale=k_scale, + v_scale=v_scale, + out=output) + + # TRTLLM Decode + kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32) + output_trtllm = torch.empty(query.shape, dtype=dtype) + flashinfer.decode.trtllm_batch_decode_with_kv_cache( + query=query.contiguous(), + kv_cache=key_value_cache, + workspace_buffer=workspace_buffer, + block_tables=block_tables, + seq_lens=kv_lens_tensor, + max_seq_len=max_kv_len, + bmm1_scale=k_scale * scale, + bmm2_scale=v_scale, + out=output_trtllm, + ) + + 
torch.testing.assert_close(output, output_trtllm, atol=1e-2, rtol=1e-2), \ + f"{torch.max(torch.abs(output - output_trtllm))}" + + +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("block_size", BLOCK_SIZES) +@pytest.mark.parametrize("kv_layout", KV_LAYOUTS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES) +@pytest.mark.parametrize("soft_cap", [None]) +@torch.inference_mode +def test_flashinfer_trtllm_prefill_with_baseline( + batch_size: int, + num_heads: tuple[int, int], + head_size: int, + block_size: int, + kv_layout: str, + dtype: torch.dtype, + kv_cache_dtype: Optional[torch.dtype], + soft_cap: Optional[float], +) -> None: + kv_cache_dtype = dtype if kv_cache_dtype is None else kv_cache_dtype + if dtype != kv_cache_dtype: + pytest.skip(f"Not supported dtype({dtype}) with " + "kv_cache_dtype({kv_cache_dtype})") + + torch.set_default_device("cuda") + current_platform.seed_everything(0) + + q_lens = torch.randint(1, MAX_Q_LEN, (batch_size, ), dtype=torch.int32) + q_lens[-1] = MAX_Q_LEN + max_q_len = torch.max(q_lens).item() + q_indptr = torch.cat([ + torch.tensor([0], dtype=torch.int32), + torch.cumsum(q_lens, dim=0, dtype=torch.int32), + ]) + + kv_lens = torch.randint(0, MAX_KV_LEN, (batch_size, ), dtype=torch.int32) + kv_lens[-1] = MAX_KV_LEN + + seq_lens = kv_lens + q_lens + max_seq_len = torch.max(seq_lens).item() + num_seqs = len(seq_lens) + + num_query_heads = num_heads[0] + num_kv_heads = num_heads[1] + assert num_query_heads % num_kv_heads == 0 + + scale = head_size**-0.5 + + query = torch.randn(torch.sum(q_lens).item(), + num_query_heads, + head_size, + dtype=dtype) + + kv_cache_shape = None + if kv_layout == "NHD": + kv_cache_shape = (NUM_BLOCKS, 2, block_size, num_kv_heads, head_size) + elif kv_layout == "HND": + kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, block_size, 
head_size) + else: + raise ValueError(f"Invalid kv_layout: {kv_layout}") + key_value_cache = torch.randn(kv_cache_shape, dtype=dtype) + kv_scale = 1.0 + if kv_cache_dtype is current_platform.fp8_dtype(): + key_value_cache, kv_scale = to_float8(key_value_cache, + current_platform.fp8_dtype()) + + max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size + block_tables = torch.randint(0, + NUM_BLOCKS, + (num_seqs, max_num_blocks_per_seq), + dtype=torch.int32) + k_scale = v_scale = kv_scale + kv_indptr = [0] + kv_indices = [] + kv_last_page_lens = [] + for i in range(num_seqs): + seq_len = seq_lens[i] + assert seq_len > 0 + num_blocks = (seq_len + block_size - 1) // block_size + kv_indices.extend(block_tables[i, :num_blocks]) + kv_indptr.append(kv_indptr[-1] + num_blocks) + kv_last_page_len = seq_len % block_size + if kv_last_page_len == 0: + kv_last_page_len = block_size + kv_last_page_lens.append(kv_last_page_len) + + kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32) + kv_indices = torch.tensor(kv_indices, dtype=torch.int32) + kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) + + workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) + wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( + workspace_buffer, kv_layout) + wrapper.plan(q_indptr, + kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + head_size, + block_size, + causal=True, + sm_scale=scale, + q_data_type=dtype, + kv_data_type=kv_cache_dtype, + logits_soft_cap=soft_cap) + + output = torch.empty(query.shape, dtype=dtype) + wrapper.run(query, + key_value_cache, + k_scale=k_scale, + v_scale=v_scale, + out=output) + + # TRTLLM Decode + output_trtllm = torch.empty(query.shape, dtype=dtype) + flashinfer.prefill.trtllm_batch_context_with_kv_cache( + query=query.contiguous(), + kv_cache=key_value_cache, + workspace_buffer=workspace_buffer, + block_tables=block_tables, + seq_lens=seq_lens, + max_q_len=max_q_len, + 
max_kv_len=max_seq_len, + bmm1_scale=k_scale * scale, + bmm2_scale=v_scale, + batch_size=num_seqs, + cum_seq_lens_q=q_indptr, + cum_seq_lens_kv=kv_indptr, + out=output_trtllm, + ) + + torch.testing.assert_close(output, output_trtllm, atol=1e-2, rtol=1e-2), \ + f"{torch.max(torch.abs(output - output_trtllm))}" diff --git a/tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py deleted file mode 100644 index 2e2130fab6a21..0000000000000 --- a/tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py +++ /dev/null @@ -1,138 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional - -import flashinfer -import pytest -import torch - -from vllm.platforms import current_platform - -if not current_platform.is_device_capability(100): - pytest.skip("This TRTLLM kernel requires NVIDIA Blackwell.", - allow_module_level=True) - -FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 - -# KV Cache Layout for TRT-LLM -# kv_cache_shape = (num_blocks, 2, num_kv_heads, page_size, head_dim) - -NUM_HEADS = [(64, 8), (16, 16), (40, 8), (32, 8)] -HEAD_SIZES = [128] -BLOCK_SIZES = [16, 32] -DTYPES = [torch.float16, torch.bfloat16] -NUM_BLOCKS = 32768 # Large enough to test overflow in index calculation. 
-SOFT_CAPS = [None, 30.0, 50.0] - - -def to_float8(x, dtype=torch.float8_e4m3fn): - finfo = torch.finfo(dtype) - min_val, max_val = x.aminmax() - amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12) - scale = finfo.max / amax * 0.1 - x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max) - return x_scl_sat.to(dtype), scale.float().reciprocal() - - -@pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]]) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("block_size", BLOCK_SIZES) -@pytest.mark.parametrize("kv_layout", ["HND"]) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("soft_cap", SOFT_CAPS) -@torch.inference_mode -def test_flashinfer_trtllm_decode_with_baseline( - kv_lens: list[int], - num_heads: tuple[int, int], - head_size: int, - dtype: torch.dtype, - block_size: int, - soft_cap: Optional[float], - kv_layout: str, -) -> None: - torch.set_default_device("cuda") - current_platform.seed_everything(0) - num_seqs = len(kv_lens) - num_query_heads = num_heads[0] - num_kv_heads = num_heads[1] - - assert num_query_heads % num_kv_heads == 0 - max_kv_len = max(kv_lens) - scale = head_size**-0.5 - - query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype) - kv_cache_shape = None - if kv_layout == "NHD": - kv_cache_shape = (NUM_BLOCKS, 2, block_size, num_kv_heads, head_size) - elif kv_layout == "HND": - kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, block_size, head_size) - else: - raise ValueError(f"Invalid kv_layout: {kv_layout}") - key_value_cache = torch.randn(kv_cache_shape, dtype=dtype) - - max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size - block_tables = torch.randint(0, - NUM_BLOCKS, - (num_seqs, max_num_blocks_per_seq), - dtype=torch.int32) - k_scale = v_scale = 1.0 - kv_indptr = [0] - kv_indices = [] - kv_last_page_lens = [] - for i in range(num_seqs): - seq_len = kv_lens[i] - assert seq_len > 
0 - num_blocks = (seq_len + block_size - 1) // block_size - kv_indices.extend(block_tables[i, :num_blocks]) - kv_indptr.append(kv_indptr[-1] + num_blocks) - kv_last_page_len = seq_len % block_size - if kv_last_page_len == 0: - kv_last_page_len = block_size - kv_last_page_lens.append(kv_last_page_len) - - kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32) - kv_indices = torch.tensor(kv_indices, dtype=torch.int32) - kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) - - workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) - wrapper = flashinfer.\ - BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, kv_layout, - use_tensor_cores=( - (num_query_heads//num_kv_heads) > 4) - ) - wrapper.plan(kv_indptr, - kv_indices, - kv_last_page_lens, - num_query_heads, - num_kv_heads, - head_size, - block_size, - "NONE", - q_data_type=dtype, - kv_data_type=dtype, - logits_soft_cap=soft_cap) - - output = torch.empty(query.shape, dtype=dtype) - wrapper.run(query, key_value_cache, scale, out=output) - - # TRTLLM Decode - max_kv_len = max(kv_lens) - kv_lens_tensor = torch.tensor(kv_lens, - dtype=torch.int, - device=query.device) - output_trtllm = torch.empty(query.shape, dtype=dtype) - flashinfer.decode.trtllm_batch_decode_with_kv_cache( - query.contiguous(), - key_value_cache, - workspace_buffer, - block_tables, - kv_lens_tensor, - max_kv_len, - bmm1_scale=k_scale * scale, - bmm2_scale=v_scale, - out=output_trtllm, - ) - - torch.testing.assert_close(output, output_trtllm, atol=1e-2, rtol=1e-2), \ - f"{torch.max(torch.abs(output - output_trtllm))}" diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index b3372ce2eca8c..78d8a67e37f8f 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -46,7 +46,7 @@ from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype, 
make_tensor_with_pad) -from vllm.utils.flashinfer import use_trtllm_decode_attention +from vllm.utils.flashinfer import use_trtllm_attention logger = init_logger(__name__) @@ -1114,7 +1114,7 @@ class FlashInferImpl(AttentionImpl): assert decode_meta.decode_wrapper._sm_scale == softmax_scale # TODO: @pavanimajety Remove this once the switch happens # inside flashinfer. - if not use_trtllm_decode_attention( + if not use_trtllm_attention( num_decode_tokens, attn_metadata.max_decode_seq_len, kv_cache_dtype, attn_metadata.num_qo_heads, attn_metadata.num_kv_heads, attn_metadata.head_dim): diff --git a/vllm/envs.py b/vllm/envs.py index 78f955f78a987..9bce5c6d2e0bb 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1027,9 +1027,9 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_CUDNN_PREFILL": lambda: bool(int(os.getenv("VLLM_USE_CUDNN_PREFILL", "0"))), - # If set to 1, use the TRTLLM Decode Attention backend in flashinfer. - "VLLM_USE_TRTLLM_DECODE_ATTENTION": - lambda: os.getenv("VLLM_USE_TRTLLM_DECODE_ATTENTION", None), + # If set to 1, use the TRTLLM Attention backend in flashinfer. + "VLLM_USE_TRTLLM_ATTENTION": + lambda: os.getenv("VLLM_USE_TRTLLM_ATTENTION", None), # Controls garbage collection during CUDA graph capture. # If set to 0 (default), enables GC freezing to speed up capture time. diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 29967bc516715..cce1aefaf9b02 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -124,7 +124,7 @@ def has_flashinfer_cutlass_fused_moe() -> bool: @functools.cache def has_nvidia_artifactory() -> bool: """Return ``True`` if NVIDIA's artifactory is accessible. - + This checks connectivity to the kernel inference library artifactory which is required for downloading certain cubin kernels like TRTLLM FHMA. 
""" @@ -144,7 +144,7 @@ def has_nvidia_artifactory() -> bool: return False -def use_trtllm_decode_attention( +def use_trtllm_attention( num_tokens: int, max_seq_len: int, kv_cache_dtype: str, @@ -159,29 +159,26 @@ def use_trtllm_decode_attention( # Check if the dimensions are supported by TRTLLM decode attention if (attn_head_size is None or num_qo_heads is None or num_kv_heads is None - or num_qo_heads // num_kv_heads > 8 or num_qo_heads % num_kv_heads != 0 or attn_head_size != 128): return False - env_value = envs.VLLM_USE_TRTLLM_DECODE_ATTENTION + env_value = envs.VLLM_USE_TRTLLM_ATTENTION if env_value is not None: - logger.info_once("VLLM_USE_TRTLLM_DECODE_ATTENTION is set to %s", - env_value) + logger.info_once("VLLM_USE_TRTLLM_ATTENTION is set to %s", env_value) # Environment variable is set - respect it # Making the conditional check for zero because # the path is automatically enabled if the batch size condition # is satisfied. no_use_trtllm = (env_value == "0") if not no_use_trtllm: - logger.info_once("Using TRTLLM decode attention.") + logger.info_once("Using TRTLLM attention.") return not no_use_trtllm else: # Environment variable not set - use auto-detection use_trtllm = (num_tokens <= 256 and max_seq_len < 131072 and kv_cache_dtype == "auto") if use_trtllm: - logger.warning_once( - "Using TRTLLM decode attention (auto-detected).") + logger.warning_once("Using TRTLLM attention (auto-detected).") return use_trtllm @@ -195,5 +192,5 @@ __all__ = [ "has_flashinfer_moe", "has_flashinfer_cutlass_fused_moe", "has_nvidia_artifactory", - "use_trtllm_decode_attention", + "use_trtllm_attention", ] diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 3697cb9387a92..8592d1b26dfa8 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -12,6 +12,7 @@ from flashinfer import (BatchDecodeWithPagedKVCacheWrapper, MultiLevelCascadeAttentionWrapper) from flashinfer.decode 
import (_get_range_buf, get_seq_lens, trtllm_batch_decode_with_kv_cache) +from flashinfer.prefill import trtllm_batch_context_with_kv_cache import vllm.envs as envs from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, @@ -19,7 +20,7 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.utils import cdiv, is_pin_memory_available -from vllm.utils.flashinfer import use_trtllm_decode_attention +from vllm.utils.flashinfer import use_trtllm_attention from vllm.v1.attention.backends.flash_attn import use_cascade_attention # yapf conflicts with isort for this block # yapf: disable @@ -149,9 +150,12 @@ class FlashInferMetadata: slot_mapping: torch.Tensor # For flashinfer trtllm batch decode + max_q_len: int max_seq_len: int seq_lens: torch.Tensor block_table_tensor: torch.Tensor + prefill_use_trtllm: bool + decode_use_trtllm: bool # For handling prefill decode split num_decodes: int @@ -170,6 +174,9 @@ class FlashInferMetadata: decode_wrapper: Optional[BatchDecodeWithPagedKVCacheWrapper] = None cascade_wrapper: Optional[MultiLevelCascadeAttentionWrapper] = None + qo_indptr_gpu: Optional[torch.Tensor] = None + paged_kv_indptr_gpu: Optional[torch.Tensor] = None + def __post_init__(self): if self.head_dim is not None: FlashInferBackend.validate_head_size(self.head_dim) @@ -305,8 +312,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): 2, self._get_workspace_buffer(), get_kv_cache_layout()) return self._cascade_wrapper - def _plan(self, num_prefills: int, num_decodes: int, - attn_metadata: FlashInferMetadata): + def _plan(self, attn_metadata: FlashInferMetadata): if attn_metadata.use_cascade: attn_metadata.cascade_wrapper = self._get_cascade_wrapper() attn_metadata.cascade_wrapper.plan( @@ -341,6 +347,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): # Regular attention (common 
case). # Decodes are at the front and prefills are at the back, # according to reorder_batch() + num_prefills = attn_metadata.num_prefills + num_decodes = attn_metadata.num_decodes if num_prefills > 0: # Decodes are first so prefills start after the last decode prefill_start = num_decodes @@ -356,23 +364,31 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): # to be relative to the start of the prefill queries. qo_indptr_cpu = attn_metadata.qo_indptr_cpu[ prefill_start:] - attn_metadata.qo_indptr_cpu[prefill_start] - attn_metadata.prefill_wrapper.plan( - qo_indptr_cpu, - attn_metadata.paged_kv_indptr_cpu[prefill_start:], - attn_metadata.paged_kv_indices, - attn_metadata.paged_kv_last_page_len_cpu[prefill_start:], - attn_metadata.num_qo_heads, - attn_metadata.num_kv_heads, - attn_metadata.head_dim, - attn_metadata.page_size, - causal=True, - sm_scale=self.global_hyperparameters.sm_scale, - window_left=self.global_hyperparameters.window_left, - logits_soft_cap=self.global_hyperparameters. - logits_soft_cap, - q_data_type=attn_metadata.q_data_type, - kv_data_type=attn_metadata.kv_data_type, - ) + paged_kv_indptr_cpu = attn_metadata.paged_kv_indptr_cpu[ + prefill_start:] + if not attn_metadata.prefill_use_trtllm: + attn_metadata.prefill_wrapper.plan( + qo_indptr_cpu, + paged_kv_indptr_cpu, + attn_metadata.paged_kv_indices, + attn_metadata. + paged_kv_last_page_len_cpu[prefill_start:], + attn_metadata.num_qo_heads, + attn_metadata.num_kv_heads, + attn_metadata.head_dim, + attn_metadata.page_size, + causal=True, + sm_scale=self.global_hyperparameters.sm_scale, + window_left=self.global_hyperparameters.window_left, + logits_soft_cap=self.global_hyperparameters. 
+ logits_soft_cap, + q_data_type=attn_metadata.q_data_type, + kv_data_type=attn_metadata.kv_data_type, + ) + else: + attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to(self.device) + attn_metadata.paged_kv_indptr_gpu = paged_kv_indptr_cpu.to( + self.device) if num_decodes > 0: pure_decode = num_prefills == 0 @@ -400,11 +416,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): attn_metadata.decode_wrapper = self._get_decode_wrapper( num_input_tokens, use_cudagraph) - if not use_trtllm_decode_attention( - num_decodes, attn_metadata.max_seq_len, - self.cache_config.cache_dtype, - attn_metadata.num_qo_heads, attn_metadata.num_kv_heads, - attn_metadata.head_dim): + if not attn_metadata.decode_use_trtllm: # Use the persistent buffer with padding length, # instead of the same address but chunked version # in atten_metadata when using cudagraph. @@ -437,6 +449,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): split_decodes_and_prefills(common_attn_metadata) page_size = self.kv_cache_spec.block_size + max_q_len = common_attn_metadata.max_query_len max_seq_len = common_attn_metadata.seq_lens_cpu.max() seq_lens = common_attn_metadata.seq_lens seq_lens_cpu = common_attn_metadata.seq_lens_cpu @@ -503,6 +516,24 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): cache_dtype) else: kv_cache_dtype = self.kv_cache_spec.dtype + + num_qo_heads = self.vllm_config.model_config.get_num_attention_heads( + self.vllm_config.parallel_config) + num_kv_heads = self.kv_cache_spec.num_kv_heads + head_dim = self.kv_cache_spec.head_size + + # currently prefill trtllm attention does not support fp8 kv cache + # trtllm may not support sliding window + prefill_use_trtllm = (self.global_hyperparameters.window_left == -1 + and not cache_dtype.startswith("fp8") + and use_trtllm_attention( + num_prefill_tokens, max_seq_len, cache_dtype, + num_qo_heads, num_kv_heads, head_dim)) + decode_use_trtllm = 
(self.global_hyperparameters.window_left == -1 + and use_trtllm_attention( + num_decode_tokens, max_seq_len, cache_dtype, + num_qo_heads, num_kv_heads, head_dim)) + attn_metadata = FlashInferMetadata( num_actual_tokens=num_actual_tokens, qo_indptr_cpu=common_attn_metadata.query_start_loc_cpu, @@ -510,14 +541,19 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): paged_kv_indices=paged_kv_indices, paged_kv_last_page_len_cpu=self. paged_kv_last_page_len_cpu[:num_reqs], - num_qo_heads=self.vllm_config.model_config.get_num_attention_heads( - self.vllm_config.parallel_config), - num_kv_heads=self.kv_cache_spec.num_kv_heads, - head_dim=self.kv_cache_spec.head_size, + num_qo_heads=num_qo_heads, + num_kv_heads=num_kv_heads, + head_dim=head_dim, page_size=page_size, kv_data_type=kv_cache_dtype, q_data_type=self.vllm_config.model_config.dtype, slot_mapping=common_attn_metadata.slot_mapping, + max_q_len=max_q_len, + max_seq_len=max_seq_len, + seq_lens=seq_lens, + block_table_tensor=block_table_tensor, + prefill_use_trtllm=prefill_use_trtllm, + decode_use_trtllm=decode_use_trtllm, num_decodes=num_decodes, num_decode_tokens=num_decode_tokens, num_prefills=num_prefills, @@ -527,12 +563,9 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): shared_kv_page_indptr_cpu=shared_kv_page_indptr_cpu, shared_kv_page_indices_cpu=shared_kv_page_indices_cpu, shared_kv_last_page_len_cpu=shared_kv_last_page_len_cpu, - max_seq_len=max_seq_len, - seq_lens=seq_lens, - block_table_tensor=block_table_tensor, ) - self._plan(num_prefills, num_decodes, attn_metadata) + self._plan(attn_metadata) return attn_metadata @@ -698,30 +731,64 @@ class FlashInferImpl(AttentionImpl): # Regular attention (common case). 
# Decodes are at the front and prefills are at the back, # according to reorder_batch() - if prefill_wrapper := attn_metadata.prefill_wrapper: + if num_prefill_tokens > 0: + prefill_wrapper = attn_metadata.prefill_wrapper prefill_query = query[num_decode_tokens:] assert prefill_query.shape[0] == num_prefill_tokens assert prefill_wrapper is not None - assert prefill_wrapper._causal - assert prefill_wrapper._window_left == window_left - assert prefill_wrapper._logits_soft_cap == (self.logits_soft_cap - or 0.0) - assert prefill_wrapper._sm_scale == self.scale - prefill_wrapper.run( - prefill_query, - kv_cache_permute, - k_scale=layer._k_scale_float, - v_scale=layer._v_scale_float, - out=output[num_decode_tokens:], - ) - if decode_wrapper := attn_metadata.decode_wrapper: + + if not attn_metadata.prefill_use_trtllm: + assert prefill_wrapper._causal + assert prefill_wrapper._window_left == window_left + assert prefill_wrapper._logits_soft_cap == ( + self.logits_soft_cap or 0.0) + assert prefill_wrapper._sm_scale == self.scale + prefill_wrapper.run( + prefill_query, + kv_cache_permute, + k_scale=layer._k_scale_float, + v_scale=layer._v_scale_float, + out=output[num_decode_tokens:], + ) + else: + # prefill_query may be non-contiguous + prefill_query = prefill_query.contiguous() + workspace_buffer = prefill_wrapper._float_workspace_buffer + block_tables_prefill = attn_metadata.block_table_tensor[ + num_decode_tokens:] + seq_lens_prefill = attn_metadata.seq_lens[num_decode_tokens:] + + # This path needs to be enabled with VLLM_KV_CACHE_LAYOUT = HND + assert get_kv_cache_layout() == "HND" + assert prefill_query.is_contiguous() + assert kv_cache_permute.is_contiguous() + assert workspace_buffer.is_contiguous() + assert block_tables_prefill.is_contiguous() + assert seq_lens_prefill.is_contiguous() + + trtllm_batch_context_with_kv_cache( + query=prefill_query, + kv_cache=kv_cache_permute, + workspace_buffer=workspace_buffer, + block_tables=block_tables_prefill, + 
seq_lens=seq_lens_prefill, + max_q_len=attn_metadata.max_q_len, + max_kv_len=attn_metadata.max_seq_len, + bmm1_scale=layer._k_scale_float * self.scale, + bmm2_scale=layer._v_scale_float, + batch_size=attn_metadata.num_prefills, + cum_seq_lens_q=attn_metadata.qo_indptr_gpu, + cum_seq_lens_kv=attn_metadata.paged_kv_indptr_gpu, + out=output[num_decode_tokens:], + ) + + if num_decode_tokens > 0: + decode_wrapper = attn_metadata.decode_wrapper decode_query = query[:num_decode_tokens] assert decode_query.shape[0] == num_decode_tokens assert decode_wrapper is not None - if not use_trtllm_decode_attention( - attn_metadata.num_decodes, attn_metadata.max_seq_len, - self.kv_cache_dtype, attn_metadata.num_qo_heads, - attn_metadata.num_kv_heads, attn_metadata.head_dim): + + if not attn_metadata.decode_use_trtllm: assert decode_wrapper._window_left == window_left assert decode_wrapper._logits_soft_cap == (self.logits_soft_cap or 0.0) @@ -734,34 +801,32 @@ class FlashInferImpl(AttentionImpl): out=output[:num_decode_tokens], ) else: + # decode_query may be non-contiguous + decode_query = decode_query.contiguous() + workspace_buffer = decode_wrapper._float_workspace_buffer + block_tables_decode = attn_metadata.block_table_tensor[: + num_decode_tokens] + seq_lens_decode = attn_metadata.seq_lens[:num_decode_tokens] + # This path needs to be enabled with VLLM_KV_CACHE_LAYOUT = HND - if num_decode_tokens > 0: - # decode_query may be non-contiguous - decode_query = decode_query.contiguous() - block_tables_decode = attn_metadata.block_table_tensor[: - num_decode_tokens] - seq_lens_decode = attn_metadata.seq_lens[: - num_decode_tokens] - workspace_buffer = decode_wrapper._float_workspace_buffer + assert get_kv_cache_layout() == "HND" + assert decode_query.is_contiguous() + assert kv_cache_permute.is_contiguous() + assert workspace_buffer.is_contiguous() + assert block_tables_decode.is_contiguous() + assert seq_lens_decode.is_contiguous() - assert get_kv_cache_layout() == "HND" - assert 
decode_query.is_contiguous() - assert kv_cache_permute.is_contiguous() - assert block_tables_decode.is_contiguous() - assert seq_lens_decode.is_contiguous() - assert workspace_buffer.is_contiguous() - - trtllm_batch_decode_with_kv_cache( - query=decode_query, - kv_cache=kv_cache_permute, - workspace_buffer=workspace_buffer, - block_tables=block_tables_decode, - seq_lens=seq_lens_decode, - max_seq_len=attn_metadata.max_seq_len, - bmm1_scale=layer._k_scale_float * self.scale, - bmm2_scale=layer._v_scale_float, - out=output[:num_decode_tokens], - ) + trtllm_batch_decode_with_kv_cache( + query=decode_query, + kv_cache=kv_cache_permute, + workspace_buffer=workspace_buffer, + block_tables=block_tables_decode, + seq_lens=seq_lens_decode, + max_seq_len=attn_metadata.max_seq_len, + bmm1_scale=layer._k_scale_float * self.scale, + bmm2_scale=layer._v_scale_float, + out=output[:num_decode_tokens], + ) return output_padded @@ -786,8 +851,8 @@ def fast_plan_decode( non_blocking: bool = True, ) -> None: """ - A faster version of BatchDecodeWithPagedKVCacheWrapper::plan used for - cudagraph capture/replay, while the no cudagraph version turns back + A faster version of BatchDecodeWithPagedKVCacheWrapper::plan used for + cudagraph capture/replay, while the no cudagraph version turns back to the original plan. 
using original plan after passing host-side buffers: - only host-to-device copy of indptr and last_page_len buffers From 74333ae2f6c3c4aa4b55301e5ed7aba03a5b09f8 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Tue, 5 Aug 2025 18:17:46 +0800 Subject: [PATCH 220/224] [Misc] correct static type check for GroupCoordinator (#21946) Signed-off-by: Andy Xie --- .../device_communicators/ray_communicator.py | 1 + vllm/distributed/eplb/eplb_state.py | 3 ++ vllm/distributed/parallel_state.py | 29 ++++++++++++++++--- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/device_communicators/ray_communicator.py b/vllm/distributed/device_communicators/ray_communicator.py index e5ba297ebcc1b..46cc1c2f52d67 100644 --- a/vllm/distributed/device_communicators/ray_communicator.py +++ b/vllm/distributed/device_communicators/ray_communicator.py @@ -70,6 +70,7 @@ class RayPPCommunicator(Communicator): assert ray.get_gpu_ids(), "RayPPCommunicator has no GPUs assigned" self._comm = get_pp_group().device_communicator + assert self._comm is not None # Since we wrap around the vLLM _PP communicator, we use # the rank from the vLLM communicator, and ignore the rank diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index af64620849688..f64b516b0d042 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -251,6 +251,7 @@ class EplbState: if global_expert_load is not None: ep_group = get_ep_group().device_group + assert ep_group is not None assert global_expert_load.shape == (model.num_moe_layers, model.num_logical_experts) assert global_expert_load.dtype == torch.int64 @@ -357,6 +358,7 @@ class EplbState: # Collect load metrics from all ranks ep_group = get_ep_group().device_group + assert ep_group is not None num_tokens_list = [ torch.empty_like(num_tokens) for _ in range(ep_group.size()) ] @@ -412,6 +414,7 @@ class EplbState: """ ep_group = get_ep_group().device_group + assert ep_group is not 
None ep_rank = ep_group.rank() time_start = None diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 470c1355d2a91..6c25cdcfb7b8c 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -196,10 +196,11 @@ class GroupCoordinator: # 3 | 1 | 3 | 1 | 3 local_rank: int # local rank used to assign devices rank_in_group: int # rank inside the group - cpu_group: ProcessGroup # group for CPU communication - device_group: ProcessGroup # group for device communication + cpu_group: Optional[ProcessGroup] # group for CPU communication + device_group: Optional[ProcessGroup] # group for device communication use_device_communicator: bool # whether to use device communicator - device_communicator: DeviceCommunicatorBase # device communicator + device_communicator: Optional[ + DeviceCommunicatorBase] # device communicator mq_broadcaster: Optional[Any] # shared memory broadcaster def __init__( @@ -250,7 +251,7 @@ class GroupCoordinator: self.use_device_communicator = use_device_communicator - self.device_communicator: DeviceCommunicatorBase = None # type: ignore + self.device_communicator = None if use_device_communicator and self.world_size > 1: device_comm_cls = resolve_obj_by_qualname( current_platform.get_device_communicator_cls()) @@ -364,6 +365,8 @@ class GroupCoordinator: return self._all_reduce_out_place(input_) def _all_reduce_out_place(self, input_: torch.Tensor) -> torch.Tensor: + if self.device_communicator is None: + raise ValueError("No device communicator found") return self.device_communicator.all_reduce(input_) def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: @@ -384,12 +387,16 @@ class GroupCoordinator: def _all_gather_out_place(self, input_: torch.Tensor, dim: int) -> torch.Tensor: + if self.device_communicator is None: + raise ValueError("No device communicator found") return self.device_communicator.all_gather(input_, dim) def all_gatherv(self, input_: 
Union[torch.Tensor, list[torch.Tensor]], dim: int = 0, sizes: Optional[list[int]] = None): + if self.device_communicator is None: + raise ValueError("No device communicator found") return self.device_communicator.all_gatherv(input_, dim, sizes) def reduce_scatter(self, @@ -414,10 +421,14 @@ class GroupCoordinator: input_: torch.Tensor, dim: int = -1, sizes: Optional[list[int]] = None) -> torch.Tensor: + if self.device_communicator is None: + raise ValueError("No device communicator found") return self.device_communicator.reduce_scatterv(input_, dim, sizes) def _reduce_scatter_out_place(self, input_: torch.Tensor, dim: int) -> torch.Tensor: + if self.device_communicator is None: + raise ValueError("No device communicator found") return self.device_communicator.reduce_scatter(input_, dim) def gather(self, @@ -433,6 +444,8 @@ class GroupCoordinator: # Bypass the function if we are using only 1 GPU. if world_size == 1: return input_ + if self.device_communicator is None: + raise ValueError("No device communicator found") return self.device_communicator.gather(input_, dst, dim) def broadcast(self, input_: torch.Tensor, src: int = 0): @@ -667,6 +680,8 @@ class GroupCoordinator: assert dst < self.world_size, f"Invalid dst rank ({dst})" if self.use_cpu_custom_send_recv: + if self.device_communicator is None: + raise ValueError("No device communicator found") self.device_communicator.send_tensor_dict( # type: ignore tensor_dict, dst) return None @@ -727,6 +742,8 @@ class GroupCoordinator: assert src < self.world_size, f"Invalid src rank ({src})" if self.use_cpu_custom_send_recv: + if self.device_communicator is None: + raise ValueError("No device communicator found") return self.device_communicator.recv_tensor_dict( # type: ignore src) @@ -784,6 +801,8 @@ class GroupCoordinator: def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None: """Sends a tensor to the destination rank in a blocking way""" """NOTE: `dst` is the local rank of the destination rank.""" + 
if self.device_communicator is None: + raise ValueError("No device communicator found") self.device_communicator.send(tensor, dst) def recv(self, @@ -792,6 +811,8 @@ class GroupCoordinator: src: Optional[int] = None) -> torch.Tensor: """Receives a tensor from the source rank.""" """NOTE: `src` is the local rank of the source rank.""" + if self.device_communicator is None: + raise ValueError("No device communicator found") return self.device_communicator.recv(size, dtype, src) def destroy(self): From 0c275ad5ad1af35636581bffaafc9e694a270378 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Tue, 5 Aug 2025 15:53:23 +0200 Subject: [PATCH 221/224] [V0 Deprecation][TPU] Remove V1 flag check from tests (#22248) Signed-off-by: NickLucche --- tests/v1/tpu/test_mha_attn.py | 7 ------- tests/v1/tpu/test_multimodal.py | 7 ------- tests/v1/tpu/test_sampler.py | 8 +------- 3 files changed, 1 insertion(+), 21 deletions(-) diff --git a/tests/v1/tpu/test_mha_attn.py b/tests/v1/tpu/test_mha_attn.py index 55fee4ee1ad43..9d690851b70eb 100644 --- a/tests/v1/tpu/test_mha_attn.py +++ b/tests/v1/tpu/test_mha_attn.py @@ -12,17 +12,10 @@ import torch_xla import torch_xla.core import torch_xla.core.xla_model -from vllm import envs from vllm.attention.layer import MultiHeadAttention from vllm.attention.selector import _cached_get_attn_backend from vllm.platforms import current_platform -if not envs.VLLM_USE_V1: - pytest.skip( - "Skipping V1 tests. 
Rerun with `VLLM_USE_V1=1` to test.", - allow_module_level=True, - ) - @pytest.fixture(autouse=True) def clear_cache(): diff --git a/tests/v1/tpu/test_multimodal.py b/tests/v1/tpu/test_multimodal.py index a61773a4f611b..bcc2993028dd6 100644 --- a/tests/v1/tpu/test_multimodal.py +++ b/tests/v1/tpu/test_multimodal.py @@ -4,19 +4,12 @@ import openai import pytest -from vllm import envs from vllm.multimodal.utils import encode_image_base64, fetch_image from vllm.platforms import current_platform from ...entrypoints.openai.test_vision import TEST_IMAGE_URLS from ...utils import RemoteOpenAIServer -if not envs.VLLM_USE_V1: - pytest.skip( - "Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.", - allow_module_level=True, - ) - @pytest.fixture(scope="session") def base64_encoded_image() -> dict[str, str]: diff --git a/tests/v1/tpu/test_sampler.py b/tests/v1/tpu/test_sampler.py index 198bb1e16ed9f..fa950e5f7f85b 100644 --- a/tests/v1/tpu/test_sampler.py +++ b/tests/v1/tpu/test_sampler.py @@ -4,16 +4,10 @@ import random import pytest -from vllm import LLM, envs +from vllm import LLM from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams -if not envs.VLLM_USE_V1: - pytest.skip( - "Skipping V1 tests. 
Rerun with `VLLM_USE_V1=1` to test.", - allow_module_level=True, - ) - @pytest.mark.parametrize("model_name", ["Qwen/Qwen2.5-1.5B-Instruct"]) @pytest.mark.skipif(not current_platform.is_tpu(), From c494f96fbcf5e9f19f59e3dea6c2780aeb6c567f Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 5 Aug 2025 09:57:10 -0400 Subject: [PATCH 222/224] Use UV_LINK_MODE=copy in Dockerfile to avoid hardlink fail (#22128) Signed-off-by: mgoin --- docker/Dockerfile | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index 0d6afca74e867..c529d22e63191 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -119,6 +119,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 # as it was causing spam when compiling the CUTLASS kernels @@ -181,6 +183,8 @@ COPY requirements/build.txt requirements/build.txt # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system -r requirements/build.txt \ @@ -272,6 +276,8 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy COPY requirements/lint.txt requirements/lint.txt COPY requirements/test.txt requirements/test.txt @@ -341,6 +347,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 ENV 
UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy # Workaround for https://github.com/openai/triton/issues/2507 and # https://github.com/pytorch/pytorch/issues/107960 -- hopefully @@ -472,6 +480,8 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" +# Use copy mode to avoid hardlink failures with Docker cache mounts +ENV UV_LINK_MODE=copy # install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ From a7cb6101ca7bd3d3ee94a5fe37caab8ebca32d80 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 5 Aug 2025 12:39:38 -0400 Subject: [PATCH 223/224] [CI/Build] Update flashinfer to 0.2.9 (#22233) Signed-off-by: mgoin --- docker/Dockerfile | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index c529d22e63191..d444087a3eff7 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -392,7 +392,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" # Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt # We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel. -ARG FLASHINFER_GIT_REF="v0.2.9rc2" +ARG FLASHINFER_GIT_REF="v0.2.9" RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' . 
/etc/environment git clone --depth 1 --recursive --shallow-submodules \ diff --git a/setup.py b/setup.py index 64cfbb8db962b..c6f4985c5930e 100644 --- a/setup.py +++ b/setup.py @@ -665,7 +665,7 @@ setup( "mistral_common[audio]"], # Required for audio processing "video": [], # Kept for backwards compatibility # FlashInfer should be updated together with the Dockerfile - "flashinfer": ["flashinfer-python==0.2.9rc2"], + "flashinfer": ["flashinfer-python==0.2.9"], }, cmdclass=cmdclass, package_data=package_data, From ae87ddd040b793fd9f4f05cb660a4728c81d7670 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 5 Aug 2025 12:40:23 -0400 Subject: [PATCH 224/224] [Refactor] Remove Unused Environment Variable `VLLM_NO_DEPRECATION_WARNING` (#22199) Signed-off-by: yewentao256 --- vllm/envs.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 9bce5c6d2e0bb..e28e9658e5b53 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -70,7 +70,6 @@ if TYPE_CHECKING: NVCC_THREADS: Optional[str] = None VLLM_USE_PRECOMPILED: bool = False VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False - VLLM_NO_DEPRECATION_WARNING: bool = False VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False CMAKE_BUILD_TYPE: Optional[str] = None VERBOSE: bool = False @@ -582,10 +581,6 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: bool( int(os.getenv("VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING", "1"))), - # If set, vllm will skip the deprecation warnings. - "VLLM_NO_DEPRECATION_WARNING": - lambda: bool(int(os.getenv("VLLM_NO_DEPRECATION_WARNING", "0"))), - # If set, the OpenAI API server will stay alive even after the underlying # AsyncLLMEngine errors and stops serving requests "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH":

Y7^^7 zi`>@G97fy3+7ev~O-@umc-o9E+mZU+l|wP_D?{(QJzJgUKI>fE(&r!1xp3}snJgU0 zK1k61n>ymcS`Xr$PASNFmN!OhNV0~gAhgf4=cN{4A`P+2GP6>aHnP3tVBI}XMb%V96DFf8 zf>MUYnUnrF^_iNSZHd}4;dGkEZBu0OT(fSBDB|os=8dvsckHL$Ds{$$Tz!Bg0l8HD zE?reA6}z1BT?vl6w;z8;mgjTbPu&hYuDlh&LJ~w3LTP8eP7#a_^9^^&04mz@PA)*a z@zVLEBQ0n3N7=WDsAzIY&2Gb{9ZAhp40 zJ^EZ^7>7m$!J@>~>qRw^`n~3-XCIE#1rkhjXI|YrOXB@TU0Z0a%>9W6_E@XO^_2E=^}bAgA%7?+Mcjg|1Q(|wt3t2hWK@pjv`LKvRGV;;$_jH zH7WJEmDygj1+VjI7b(ysVqL52$fqBIhBoulsTHX?>zdVWrn<&StkYcs^)?niHPggP zO^)M{FF2cbQKS-tvh~vF;_$TvK0DKyP1kv2>eLZAwtV^nVULCBGaHe8e*lSiTSxqE z3(3hKOC%%qh^HHFB?D4nL?$I;RbDS0Pp(gM-n?_-<8pjCy;54Y`|T??)}+%;nxCKR z+kP>53y{bURj%Cn)H^wS>ZRj2@j%l<)>}N`+PtAW>8f;M!K?Jd=f%Rhh?uRHnerw` zaf=r}6%NGn>}V2Rb;bY_6js(JpXRfYbAMIanxl{I-+6)02~#0O)=W#0ZO!aFw<$-A zoAwF%D|SNVf+CsS#xfUzuU;uUyg)jdtR zS1an$E}sZx5^%2I$V>{-8S1Qqds*BYgh&fi%gIx1R|s2qd>Y8wFN``Xf!sr`NVpR% z-32v$paQRELxiqZeVMK@dzf)7&gaZ>X!xqWW|>ZUxv{&0uU7Y932jdsD& zT{0jqWzvb*ZC#QLh@9UQV8%HQFFc#Peupg8c5+yxGozbJI35^-h}YVZ=F~%UT9r_& zk?Bmf^ftwD-GcK(yI?E6-n9u(C^tPpr)Zq>NjkaMlz0_^oYBjzc^1Zxt~m9Q-%snx zGEUZaBnKCOcFywAYx$sE{W7wescG`=IhywC&HE4l+Gn89u0TO0WW_Cu>8>MC*Pb`4 z22@5|Sqtd{{>Y1{=QKuZpM=)dd;`&>v_JWcDW%_|a0rb^a!^?3Q2qzB8bGP~@4UV8e@urF2Hi?vGXTEQ!d zfF`|5$~5*RIkC6N%SL2r<7w4nmGI65+DE-n2Xf32rDnBJHBOv}d_bMo22zHCibZ=S z6=5c_)3NA$s}=Jo#>qU){FcC`flTEQ5-+Yh=uxyB_~u`p2nkSW(HyVFY<2$RdMZ-4_nJ?%BM+evm&Y0}kY0tsk*9duRaGn&iuhE|W9#DjPfJ(&2 zPp!72#i!_?9be#LP#EuGXJ-1~;C3}j$40qPqy57dcAEFN)NQ8}3eK-vcube~vun9! 
z6Vbo3NaVO!^TnMni8g=noe;t`B>O(l8|g>oRu@E;#{5=ux8#zX3A(rah%`CLb~@zz zuqo6lbPyt24jwBP z|M;Goo42W7IfD*jV23HZ`uP2#viwXQ8jBp7Tp#lSF~G2~Rab_ZQ%6D;*5^X&uakBY zpxcx7mh7PlQ~2ZPWNs9jJ+tdp76?)Ev||Y%@w^lXpn%+y9OTH*M>b-o3JE@Kmi}0a zP)VW}xfQtnDGKuGa{$CeJ@x@bpXa+AhFFAhJ0*qB7qclbqyk<*&Q&OHc3VNkyd4$5 zzCM+Sk%mU0jk}KEx^ZUCUWVH87|AG)Dy&ZFxN)8^dy2kbR&n$)2u(4az4bJl$(Z+Y z+M~|*H(a^LgArV8iyH&J_;;pWJ?oImGQV{5X2HenvUVm?w-YzJ$!`j$HmEQ0n=1q0 z7)}!b5H-t%5zUe!RzMe>xW*5*+cR43xmV^-bG91Jo0xxb4cB?0pirM}bxVKm8q_Qt zCB^8Y)xmt1P^g7edr5qLD;JN8dWV~D1zA?=B%Nr?41XW}TEvwhtnr`G!;%~AUuPhV-sz(1YIn%S#`2D(M&I59lS*Q~q%GH!DM70#PafIs z%|3ea2cKOE^WM3Td`H>jg_Ez;#*?9|P(9WuV_9+z<5_10^5S@Nko1N2JNBO1d*%i| zo9JU#?{cpB5#frU>wt)Sl^VD=W_LhhMTzn)@0P8YL zMR7dw!0PdZHjG7GHj@B#uMpF0F&EeZKFyi-sT#o2@dd&KFKpgxh3&}olQL=*kFp~y zjMUfb2VecPveGvBih$Gd?b^c%>h-7tcFqm(5t0QK5sf8~X zPsbTEyL8d7&+*RHsADDb&9PU`AzjjT7&>Vd4Dbl%J(os@Dwok4z(H3V01mLs1!5n? znLa^cz|4?)03F8^V62?m-d-CFWj9mFHeLXleR8wPLtBR=mSJnTLyr0OjGCD^iR(Jq zr{-9Ksrk+2Nl(U#EtVQ~TpFdyH_@y@rMFjS)G1_QmfZ^&bjnd7H-`dKWF8JrXIHFl zuIjjLJ4I!Muy&^MU^7=mk?1l8H^Iiy=I;s#(HAue{27 zR|4_bU$rI(l@)Uo>j<+4ky&$f0k*(JopNUYGHYHP3nDZK=p~Eq6cuu=I2wIyND+@U zx<@e11-cPr#oEiy1+`>`5M@DUaDm6$c|r+JS`diJMCHX?Ypw;nUwG_3!9u>vP1^~b zL!{AJ2C87YnI6omca|L{lA1SHrbDmTf4*+b>$bzI?Yd$3=$cDKq~%m+s(hY_JcV== zM#c6jL06U*L1Lg`Q>6MF@*OS-qA%LlZqJ+_aQ!m-@$wF^Kn|114-i}$d0a(El@Jq! 
z#?tY_?h;10tN7mCI(wJJg%{mX-uR|<;WM31=asvS7w?{W=N`y1VCj9g=E7ZDZ+F=X zB!qI_TKs;hchA(jucnp}fOg&7xlUcxxsrP6x$Y~^IrOr#gndZME=?t_qvc{a zM+d8P5`#wD0pE=ld)Xj1G~_k_^$bPb10cn0%I@;WKdf_O=gXXYiu45%&4pQ;l!`~Z zj#$8Q;sIuxe3E!5f@yowi=9Em{t|n1Z*&qQ#n_3pVHpiob*d3m2ER7?dLNInvm};$_05EiHkjk20p*CxadA6q`+u^LQw~y$`)Vr#MhaY)G z1M$6HT8|)A)U7$jo=UkK+Hhxc^6AyvAYRk<6d8g~z4wgiW$IP4ZoF81o1R^cuSd7c zH@x$aC(O#$+t?&2sPJWbnD4+9x7}?{7K4VHD`Wn_Z3#l&R?}VdNpkVT%f^d1%ZkIi z66KT|KbK`<24G>Bc0FCT+-^T^&(g zRe1!upN5>eQiS&0m9SN+A;wstxUQQI-O8bQ`$b9vf?dz-=0@X%XdRY074wju!mDID z)p&9*D_lh{-mEW09wfMf%vFA3VkyObTX;DQ{R_gJMA7OCT=&de^4=KxHM23=Iji<` zXEA;WqCw`4D^Rn46NpC2E^9p$CsB70C&D^U!6H5X(R3~RMHt~#r&ZyS+p`|oS}*iY z#;}=s00sA=H0uH`(C^}7F|K~pEt7h{us)TZEm#^yDRpl7^9KSF=lQA==Vjur5m5*S z;Co$G%+$!wL3=$?_Zl>cReop6%9%HWbwY&IdEIJy*Rg@q+x82)bBn|j#`LXsY3^=) zo+B4!rNKF2m^+uikCKq;rJfkrC*4zY5tgP^PBXy7lDoEHFh8G)7QIA|#W5Ap#udR8G;SceQthQd^OrZ>Nd99F&s5cr>M#bc~!CZyRLyif6AEWR__jZ>#*`uCk`snUTqnd@9f|w%p;IsSM z@<|a=UKHM$Mpf|mvG+9vO&AyNcdEA&sZk@!+4;7L$0}FfjTU?vMcqL_-P^SimYvjG zR>FU#rN2m%E9zXGPg>`5Z-FVmIWn;*;o~?0@MoJYV3tdcYbsINV6>^=x<3;DSjw|1 zZjDteVg+xb*Su^GA5VQM8jm}(v+X2+R) zpj6xNofMYEsu?(@e2Lfm;Y42{qk|e6turOqBt~k{bEz@>a+t_{CUKgma4O=@ftaj= zxnVp?{4ZKsJubarHF|?`>Gqb)6j%3V$y~Y6+Zgr_Xp?WExzM@^;ClgB&qeEbj0!!K zjTH=0a}GxIBeQ*U)}lA>6t~SNO1H-1K;b}fyc|Dy{S06rI*gnZ_PCdHy1NeJPO~+{ zBth~`m^_QJB-`VHry1ORALB^hn=+*$Z+MogADZWVyzdnVt-7uMS|58C6|c+>Y!| zepE!;n1DCReP1zEsAWZp%Kdig=fOU8zX?jGww2u2&eyV(3V0vivsLOVB+WM%-Mq~z z5o@!pc!~!6N@$8LBHO>CeVlE>IJePc@=BN#ceqwKn_SbnT;o#MBTcbaQFEE~C1Qpm@XCVcv|8vE!}ZYt|SC2+dkHikK>z&9?6h z)Ri1)HDke7nk4%nxM{dy$g`?4S@t2?+&OypU^7>v?W_i5*W@!VU3Zxm6e3mSOQjvd zsh2`k@9M0t@zR`QyxTvABV4#sE`?KQv3b@4-`n|mV-R;x!bplE`~7)srIhQIsQB#1 zcf<5>;#C+tRkhttc$UYWPb|9qc1m~Ye*SrGM>LjlQk@&buR3@gHuAh^L%1Zwxksr; zO1<&H1c_>st#0FJE&W|PIS3cD-`opN9rf=Q>P*)lrjjG9pD|3Yt!G}QNh<^mz5*Ql z*Pn}9r{@EHExoN75%FlA%$DjeQud4CRaN5b=&ZODZ+1S8-l& z48&ks+~i&4&gy$vd!pOPf9>Uvr?o0Ef49Z5$g@Jg%S4~ZJkK*O5Fe4L*43+(Xa2rj zLFe9s&LOf`Z?I_dBW`+SG{q!9DLKK4Uc0-s6bugRcox-&wHYTJ$ID$azxw%uSu*pB 
zOr=!w3N7Rn+tG$lWVF#CGO!5)uSv)>ry32XTF#@>fktm{=pZRQ?THY*->KV(!JHq4 zpfnSLq5k})dqktY-s;SIYMt`Zl_$BIxY=tsnVN-YD)$_U?^DRm_U3yI4iwprpD>kP zb@EM>j|6r-17wCQPGozvJ8b$UmBX_hvNnO0&}_3w3h>MbOHvSPL_ic|=5boXO^fy} zqqi}Jp=2kJ(NHb@DDXQTiz8^iw5qz8D#YG{&bNi(tP@HuLWhf9=KaR?)5AalPP8nQ zPeGl!8W&$*eA??xL#CH!@ER+eTAutR2~D>+n|1J;cs{;$a9ZNbq%|?8S9t;oV2V^{ zIIlo>@l0|A+oSI#MH4RP192_-4tE_fFGAkexDdQqig(lzI4NchgXzSxB3tW^FoXiV z=*S!u#7^Ttu9Fpos_QH0pm=n%LE=+&;|uQ$(O+jvbHSR@l+u~WoZo|3O#PVc6C zvavL$WoqaYiE^3TIJ_jT!Hx=H1b?-E4o}mnPl!ws?pYK3;HcV~R!|oTe0^ zi*AhMm>_&z6nZ034&665%f={;e*7h8ZH12j2{A-@()RV>a8t*XhyKqlSV=s`Kp0Pd zlr=40zL4)}R?}h3d%g5xh8>oxr6t;IO>~Q{Y;E6!?K^ri0Vl=7K9{rfU~lrh zi4J0}Mj8Q}s!{oN>~Rtkebt1-GkqLc{tLF}M%u(a_Yv^!)Qq5GknIc{zu)O7jlzI+ zGa!{OgCCN_fbeP9%T0dB<-sr?okIN^{oxzBgG8Y-ML6hcoFSd5JEkW}cinf>3n9-p zlDfvvSr0Ry5v^mF-ft`0%bDe>?~)N zulttqJo5xC5O76QWzc!`;-cHom}!l>e2wpcAyUUS`Ym~QGzB~S3UCy8a;7_xvo%$GueyI$@)AHt)g z9MXT2PP_Dctv{X^AcX{8-8H~@jacIGI)sM;D~iX!WAfTu*;?GnNX71s*<}}0N5Nsg zvHT0=;cQf?BhvD=pOuGY#==jnzFod3C*a{Oe8O|m5^z4c9#ntBkhu)aQo)lD~8}>MXhl)<&=?%BX&z(Pf)<6lMp1qa2 z5G**{f1>>ICk5o%h}<@m%W5ypLZ@By&3V_g$ee4gP>q>YdaPFFXfr50>Cnbw#_{OH z#C=03|E1N;``Q)f>{6>brDN8zwUjPzLU?8i&R|j|`4`2zC|HM=T$*3Pc%WDBF!l1` zl~eR3_zkXsR{#yCoh{;1PYFr_Hs9sei5Fjz(8?Uk&+^PU(~Z8rLVjC|Q}okC^kYD% z2uboqrFW=nazyrWI$aJBMoU#J(8G0}Z>z5o>Vx^WL!2ErO_k3F4n z0zC<$J~p%F7^CTlk4xWf(mpZtY{OAMoO)V&h}CgMoXi<3(QbpZj^mkt(0w}1!V|$< zytHdG{Xvu1bx5>#p9w}?vB2V4B{soCz)o%f+;n=F5@39e+)lv&z|GnO#ukxqJL@&b z=tYx1t!}E~M#Ot}$aC}K%%o!E)IEKCZqtHiSXxPFuf5j%9-t9l>{Dk0UE1kcabil% z-RX{Jwp_<}v@?z~(6De0Ki?dkhVTt zlt{sT!&@eyWC)`;HHhSf*#zwX@*M%0t?fPW9ycq-R-M}IL>n}&b{kr@<1vi zR|+h1!X>XFp^0V7X0~N%>Y#3nkyXKH3HV}~$|oKfR- z+;CQ%&f8>s5qO}BKL)=N#zMjNNeQLoOb ze&GNFq^%qKcAg)*D;6gOxaZPZulguInrj*C}des`5xvkfXWm zKR*)DSQnhSZXq~KlV=4XN+FS3dp?YNiu!(4k3@5*iD}Qg>%gnUv8GHMa%_sy9K)th zOBy_{acGhP$Q$!yu+8f%HpKj<7_exRNX`=Ta^0ix>z1#A_Jbw|>1+~3u)fUO7U}9- zCuDhL#bUcjjHI9sCw$i0R2xI?Y`v#`tHu-Os<=uTDXnfW3C7ac8iQKI$a@ws{cG7b 
zWaA0)ix_Kbq+~H<;AeNpF6#&9>Y3I}_d|FVOLlA%At?&u>|{32{DwRyr}0E&mI&sb z<`rs~w~AZZoo$R-?^tDjB;d;F#v4Q{<4Z%_z*v%bZ=HEkL)k0uwc@mZf_Jww<^5(( z5$g^fU*72LrijINvM7%&A!&g6LB@GkZ0t$j`ob61VXkRH_9$dt7Hmi2W4h1RH!Dh8Od(SGTlMc+H^fj9LK0#dXwnWg{|mX z+G{vi+1q$4^j<2HixI@vWfuenfk5hz|EiYPmP2RE$Qo_eMa$ClbCWCT5$NPF7Uj^ zvLLTjrq7rD$*@+ldGQ-qCqpDE$WvKviM#nrznOQwwEVfhPLgO-4dt}lI$>`GdaMSA zXt%4-Ld)xs9M$5)InF0ucSER%1*$q<;Ezd(KV;OH3VTb?rLG@E+Qg4XE1eTS!+4r?5S`XjpM-Qt*Efg2hQG@B73N0z2VG7Y5j^^OWLuwg+;Jg20GF-f}qT;M(aTO0g5$zLQ_DDc^ zu6c45*b7;^JHRaw%UHhtiifxO3^9YL4OdG zC|@LoU)A!VIMtCbqpK=+&%5(I;C+Qq$ys6|@fw5$FxPZ+>$nAWYdgKAPt_;EV}Abh zn6>k59pargYza4tY`jRp&k*%$>EtlQYSf)N>n0^wF!rnT0%94 zWfi=g+=U}fVX3dus4sKf842Wc)|R45%{6+9@4uCceDy(ZAqbS2OQl1iO@gUVFPm{8 zAL4@QjG#xQ0eDN1Q!e+e;VTkDwmzgY@Qaj0f4L%9j8mC+eB}^FhrU)bh zibV zF%bYKZb~ucl8_nT36Rh11VXl(vAEGF>p+;7%m0(>Tb{?`4YA!kOnsinEjA>og=n92 zYoYsFs`%E_CGpeLpW#Qbfi6IvqR^D!shn6W2fA^q=ZR3ZJD2^ZJTV3D%X-oRrXNxt zP~*lGqQv0oG>Z5u4llOzAb8S9SN*7ejj@n~YwG6Il^V^a8~LPuphW zVUz>A+3ji4IaI3D4p5JpKZY(oLC348o!j#*W#>|ba{S6UjfIJnlBF@X=9?~&p^Cnw z1mhRp0wt?BN>-2$X!Wt>mfC&(@TFx@=~iDp`e|I+%cqm7dMLT@6+FA+FhyGWPM$vb zRHrt0H!t7V=ZrG-+bNZ5U^k5hIsd=x&X&4>?* zPpxNzMc#(0c4{+-?NXlkboXAZJu9BZc2V{Cy!-YY^JoayR&PFByluJjdPhL`!N~!zn}iMZX2#U`x!DH6I=B z3Pyy={n4a39N#Cm;=AfyL+Eg~jU)T$9GSrN&FN}ysFYYk1 z_DkJ3^OW!G*-VMk__gQ8aWECUuW{G-RuxFhysu8>*FR0DY;aq_yKYcvD+<_KH19&= zCa&~0`XUl3u}7564PdjnFor`W!=4%|%+2pB! z5idR5%_<#mH&sP~9KA-=)?$6E9)Vh+3?y^0f& z)Qm28jed3rUrc1<4lSvkRHR=#rJw*rcs#f)wWJ`xmT!1^WA5yk<8Bb1 zI;#8Lixd)Jn#e{eg9ec0)1ts8k$x2da8M|^Kg|Q@?oAik1j1p*X8LpRRa{=_K8tR3FYof6+gv6-4VKB&!pzOxpEObd96;f-o{&l=Vj0d z_=H@uD-4M2Tx2R(B#BBEm?Su9e4gElxuzTNj?**My4K(gMMRfxVq;e41@5Hy2ua*? 
zsU3x8SGMv-k;6e1) z??va}S^D}-YD!@WLHr~mMq83mBRa|21oON4HngG<>l|VbBRitre+<*^kA`cna0DO};Xn3nYLTof!UtIwr+cNrhzTRpyquvgs ztzA$j%R~(w+d}QV3Z4iQPDbmf*|f7Xb0J~M402Vc=rXzAy#1<7Pn-x!X}W3n7ulBV zD8YhXR04+Pyu3TDRbH@2o)8?9wg!|aBuQ5v$LGn7o|9KE)z+8t0pY21^Zc?Twzv0U zFEunL>w1yLbhA84JKabg?V&FoKM&zi!k4$Gz9|V=ctw^Vq*qUrDinV~=o(Y3(PVpw z5s1w-C8qk`g2Do4>c|ZO8a_0VYXL78(KA?Q43Gv{g81bJ4HZZ>QY7F!y*qNhh@R1vYi~$6j3dnwF$d%oAC*?{f^RWv0#( zn%2B+LEgpCzjA6ynm%AiN1mrlT=1i=O8Id3K&1COWnt-!hdw=*kZ}FxMN@?kyo67` z^|JTv9cPy>YMrYK;mOdbWxJ>X?&Z*Ij}w-uPLo?so+Ldfa_!+6gNQ`x@iFrpAy+&Z zdZa38IXyo`0!NgM&=9&gzxubCW_nq4-8nHzn~w#Z0U~VnnmV`H-5<%8UNC&3F^~RA zs-orH^(_=@q5S4Kh>Zx#q}Bzhw=pHY2`sNKj{qB2=pBh^5F1E;f&~WM&hv$NN6Rvm(f&fnTUl#nZiO5xL}hXU+y{Yj%?szXl*qSc)#2Y;@J^U{TrR}GY&V>yg10#biG2m^WIdG8rOlkd)S zdCpB2fw@Eh_D(`m^oo;;KK+2=q-iYSV{h~ci{yZ_6K!phwCNgRi%OK@1zS#^>9Vwo z0=fdP)k5<7G8b}AE9U3a5|e^mBe^rMgiML==lUAUzDB1-3gaP2BB2Qj;Pd5vq$rVV z`=tvdlxLZ1*!lczh4H?G6CNEc0W3aCI$PUxTTVNV#vWErv=^VkIwOIHpQ^9KPhrt+ zXqDV0ugq-D&j{?^^Iu1PizV`wn6}KnYMB#aoNe4Pq>@^##F3&BdyXz!@D+6VM#5uHQY7 z&TXeeqpJ(x9A(SCB8xLT;(qT-T3t4r6@SK~ z;^{U&V@z%$8>=5PgeRV3>y^UQ_Yy?SeIHz1m~F2Yy?u-x0w`d{v9|=%m@{XpC6zw_ z3$eN7xY(H>rV{5omuR)1@a(dwStid)xKf7RxdmZ_#t@!3tBa}CE*e5x`QkU^YmG~@ zX7qvAH!?tV=eJw|C*Sy03f(l7h`}uilXF`O^=pA6sHr1#lNMB_kC62@$FH~P@$s$! 
zflJ<(47|MWymN~Zhq&0@&H6>H+6x1ClPucTIY3(?Lt7mNT-K|#t9%RU7Oz4}m710}|2$TCI$ zo#hG4S3VC$adBznPL0e^fW^z@)eQnd)VG_oyJGc?{9>}rE&zHFuF4ZphRc_$WecuP z3L1uL@Gmi z$@ECCvkc4Jl^F!cZCnvrg?xn3UX$p(TF!J)NS!gO3TUKGv19%)t13?*Qvu@svKKgL_eE{#YoI5mp>P9k)eGX$yij(&?8+%zsWzc-Dr>K(3v_Y3a^f_~X55b=86(kweOS11fCE3A zl;-qI9FNje|4VBbUgHMxrnSuFY9-<)okdjW+L{gL@1WtMr6@CNO=b%rXL_bbd&|G; z8JArgkyxOA;mx`Y>{F{=rX6vsIl)RBC8PPWog8c=7kfARe$WflFyAX3PU2JZyQZZ# z6aqYF1ul+c*D^UYFS1wNqKm=8ejrD{UN6O1;iqzT+Gp@BO8TOD-=%nW(LT^Pqrt{R zz&XB%afK8#*^1#(X?%h&%*lPG>(V-;dIAyXO|SSE3`bI#5*e|4Ih);U9?Lslf0t|X zo?PB{Sx7;CHSe5|VG78a7}&ZwAjgguLAg+a{Y=DDLt!EPe4|Q;8>xp!g|`jpx=f9v zzGiJpiZ?YRily#L85x+1!+yJz#;7T7wYsLQx|AH8;>y`O>ze;i)s_bEDBO9t0vfJ$ zWB&9|fXnSO>V3d=o>V@yB{(&28zG3J_W#)W>!7&0cU~rtRk( zUuv?D>r=ZlSv9_TJ*>|F?y~R!TSKC>pCW?p+qj&y9K`+cGa+;(A}VN6Rm0$c7m zbClDui41X3E~AgOl{cZVueb8OS>^k_iz!aQ?-(%ZwUYn~i}kx}44HSPHIu?fiB4@{ z0ZGlrHogvE>!w7N$zBMggdf}ppCto#O)q|5Pth!=O_$1UmufuR6I&j!Y1WQNDAKyK1$$5pFA8-M>Mg z_X1M(&O88t-jjeyuL0@mygHOE=sUfuHq+0?$VUJOq(&m2uIAHU&((dnh+A5>IMa5t z#s{<&GpUyNVu<*hf0Pho*oDlxq`b$0?o9tC7Z>@~gEKF*c(Qs3>&F`T@ra>58T;+m zaKFJMVn_sLI$mvCBby-iJUi3Ryp&bUWnOv^h5tqm;_U(v;8m#C<%9gWx1@&&rv8cFV;dd^3 zpVv*YbslT(lM9?^&K!AkCsiGZYCHjqg zUg6^p2SzhP4pcYW!wMqLR8aJloBE&Vz0RXhI#vZUa=s)cX=atdOo`fw65z1ip_qh< z93l%*7VNDnM-hYzqP10Y4u1c8{EEI1r&?~VD{i|SNR7-)l})B{wfsCtAHGx>A<9pv zb(<*R<|71uK^=?1-ye?<+r1=Ux9~t9Rgs$ZFH|C85aP}%@b>hbt<>x$CnJz80KgHM zxLg{a*v$6QQ(f~>G1~2c;%ESBY2#Ez2%e*$cGgcAy?`RS0mo^)nZF$e>mw~8FvdS` zR+SaP1Tmb}fL@hE^I@nESN2Vyo@$W_B18LV#aWBTcH!W&&APnQ^B2Qy{fh|5J|_64 z-^L2bZLV_VSh~X^kzaky(dTn)*(%js#ekyG55;l)CCSRqZ=lF89jFlZ}PXDJIEc-digj`7JoaW10g zTuv zlf_h3CQUlklyaOl`>|n& zr49f!={+w69srS<-CB8fBxUeu(XXpLzLaMdr=8#EKZ~m;W+Dp|(%3a&D=1}8Uh^%rR(U_S%3>^5v(f2&jj{Ho zq>h?u^65~QsQxNAn49}S0>(eo;C@)>`@jby0<*t)l^H%<9cT5)apT8zsKrE?x}>*G%c=oTlvD%%3__k` zQg5xY(X>*s_BI%qJR=QRk;i4$uNv-J4#!ry)lp)kT#azqPe8_S{?5)+_mYax?;~@K zSDj~xIrH}ke}&>IjeK*MEBBt`bI*ZY--@JMoqgCG3e$V>=BLqAg z=7j))1l&&fUQUb58Qr)fj#9Kc+zpG}wg{C{9B35nCl(aO<&iGBwl 
z$Rg{w@5eF^LP7uXzWqhMfxUN?>Ja?M%tiYh?>oIjLcevvPQYS*{F_|RnuCK_y(+Ko zP_K;$@va;U-gQcVyTP|zSFB?V$Dkg%Jw@-FeG0OkNH)D9gtWbD2yL7 ze;+fj>jK+zoPSBf0U#4Yyo&k8RY*Vs(1q0Hh2xU{qaB_75cyW(y;~`1(+F+s@oiws zzAJBht#QR!FqW(_mQSfQ4EIGrwP?z0t#R{U_zi4Y|877NreXZu=tTbTyUvLi_eYcZ zvpM{YKU}}^(G2`?0Fd^iCL`-0AW>jkkrDldz+a(w6T?CJKN24fDcUN%ktEi|H$MMA zgr|+pp2Cw{3Ag3^KwX?kkOZ#Dd+Cw8jmp*CAmGc3^>W5k(lY|>G(qYGU6p+qkDEk} zEwEx&IzLkdFaXf^1q8B!<-ZULELgR}nCrlQsSnj?{;8VxkOT++Hdp`abtpdi)(>e= zr~d$D`{6eN6^6kNOVu!feT*tgx+>*a0fo?EX8QD3K`5eE;?y;fr8_*CdxWaGl*8xF zMtC~%;|yazg~jxTG={Omz-*C1v94O{ou%V=n|g#)8c1dj(+|_*AMW8l{9>RO;F=%C z)euuxiFj+^W#V9q1u$eD3mIG^=b4V7zNSMU<-=B_X=v)9JWT^boYe_lUox)^ZaGS# zp$$_?+MIG+qbx_h9T=Rc?hC0HeDc#Ff(u9nbl_RV1=@s#jtP|`ez zP_+c^(!bMawxRtP;r9ICz1ihtl8Hq0$do;a-Rh_0!te_%RLpzl?bp$S*;KAL4d{Z# zTbbE+|7yx^4^z$m_8AF6;%E3GcOET5F6+b3d|<)=*Q(lw0G}(KlM&eY&H&{f_4jcf zT0CHX>-R~rrhO~t7F%&-M2hDf>bv7r6zeLEqeq3ifbYJXFx*%%5|JXaUsx-n?qU%) zxV;uUY$-}I?&E`z1NL7L9dH*(5ism_Uw%(u*dN{)7cCV3uVI&XUjvxV0SWLKrA@j} zUiOe7R51r;(@o{iFcZ8^;u1YyYz?IRd3cp}0h$-Si7>b5SibK{#Nr@MZy2j8Jof@# zR_Vx8a2?%})a)mpAIRJOvUyF`otRBt%X2)vshBgup?#4vYDG_703K!N*E?kYs04p_ zc2ud8bP+BUqH0r(k1D6B)p$9i7~BSw?To&vDf>#L>-sOkh!W@$D^>7d|1rP zC-3;Smqpa?@9qvMKtMM+7}S7C@P?p&`9B2xrv4bfL)d=0zjOzH>ng>ue=Aqth$B^& z6D0br-;n{%2Lgah{~`U+hci=9YRr!4G8hrJY&_dz4<95)6~o2lEw=H-C7v$$>8b?4 zx+kl0pKea5oBR~0z^~7zemHC6OLg>1;^ z8e{pd2{xJ+gD^vJMLVU*2!usN(OI*(kp0@`imRd*kD$Idtn$Rl<`1|SEM@lB*&v+) zELfB|VFkA>yf!jd1n%T&+<&Be|2cZvd|Bv-tDMTvG8h=MU{puOkIC5+TWE`2FQCjb zLYZm9!woLxr-ear(q`$4hL0CexHq*~tfU|}*8^%G<=(q)GWDXU00Agp?@h$Q6a!e7 zATutv(=|KdbUY*9dn(WCco=Uz1pxdGAjA6)W>8z0a2Pa^dP9(V0qV@h=0GLyEfsNl45>#Q(D�UL`2i= z2AQR!RATNdUp56W?Z4C1Rzw1~c7_t%Pxw4WiJ}7f=GOf}4W;j{=Z&NR3JVjXjIIQw z*m7q_!zX~JB>W(bQ*OyNv;LvMs^Ql(7s6)(B+NJEFuv(QdVM8K|} zQj1(7Dhm|SYkr#NwS z#3|g+7jEEya@y#Y^y!pAj&M7s*XbJ9{x;;vn(jdLd%#fS8psjH#jnI$!F-Aa{@Mnsl=YhY}Y8*enC+Q7CC;M15fTcS4Pb`oqDyN~No6^@k zA09%Y@HyVQ9=2pELr2Zt9~9*%SLpNUcM!GcNX?aKoF6x?2f)~WL&6xPe9srt=5e|8 
z)<6G4ss62RGJ~xjG_1u*TZn!hed_NZsbUsDCqw)l`u0Qyl2mM2TtT+TCNs+f*Ebl_ zeZ}x_#+eW!ZY4)-XiG1Xb`!NlrDP}TigB8KDaPokaO#-?$v`mjl7{SHiCa88S3U91 zl;TlZK}H?#pS`IBe6>M}Oe*A43}HLifnZYw{QQzo5Ndw)Q+oWpV9Gw)*e__Z|44a^ z;^hO>9W!EA9AF3C09Z`KAKiLx*ZpEpg}*)HESBkTAQ4aw-H1+g`uWw_P!+IzjpKK< zX*=sdTmaxi(4m#cgxsWB%uZU7TYnd8Y52eWHNJ*Wy6QICM_JDl>YOGq8xZt{{v^x~ zng77*GX^R<|Iq@p{TpvvC|un)qlrI>X#5Npp>&O>!jzu+?(e9>Av-)Xy`NfAtq)49K<$WlC|HKlD)J(DuGZZrP?Fij)Dyy*sw(+0 zLz>byeXdj&#{759JI!ULt~nP|IYxkJ+T-50#0P$7Y%z-)%KfyG)&r>8q7 zfVZ?(t|Tvk+JB|Q=O`lQCc_#vno{lY3C@N1@d4oKF;=3*!=`aqO06`#ydo!2BC2I9 z;LhZ!f@Dm*;Z$k#T3dizIr;LrKR2+QM6yLjZm zvji-@tc=n8Otx=-`&>nqfEa4Ta-)OUdWR3KS|Jh&G5?qLVXVFa`CY6n>X-ZdkFj{$ z9Tz+B+aoW#`#p$7JEawrp)SWi#TS_%RyEJ zBp0DQci-WYrS(<29ItwaEU{}@eXr^{C$f99h2bkGml6oxUC+hxcf$c;go;~l4^ z)+iH-jkJm<7B9$CkCJ5$2*wXnm|LRJD5XY(5|@xkvh$E4T={l14T3 z^LUQ%M}#ebKr)PYC$W%2;s8*-|zlN%>bdsCSW$Vdd zaF{+RLp<|=4djPR;dOJA>^ue&Rd4`ici#Dp_81>Qb%G91<+$4yJHIAa$8wgu>}5EJ z)eSb+TTS)YoDR|SyNmb8LcDKR-TtiAGZ@><7Mt7~^T~z)9CHjtbqc=r zz`J{D6n!JizJQ!h7zWfm7fEEY=1NA}LR7Lll-tRSIsq6|0nQvCr+B-kOg5R!^IH3l zh(`OCKP~#-h*YB4?2-{{Jkalv+>MQtHD-#Ga^Pg_jLxh5crIGY<&sF}^(x)+4Dkrw zv4Jm=Dj7a^5)cs7mNj!64Ge8s^R(0YyIv#$)Y3PB-Im1;Z1$hUe{A{!|k*?)UTEmR{wE{PtN1S_k%w=|2A9j+$F z?}ZNryuo?zb2-MtWTyTeTcOYt3-$hri9IN$wYgfUL~zYl4^l4|)X1XA#-J=R92Ks5lM%uD-9& zx4r2#*S=ODlzmxP^wQ2WQkD6R_8kfsSXr18C8h4?4xh*G%5=DYUUt{W(7(_qW*x_q z?plU!9$Wqf|H{LE82tE?EhKcqlUcWpHr0D`jm8>1IhQj7Xe2vfq-NjHRPU2e;*YqW z_v4mNQ22DogoQMq9$iv(%S{h?cQ$9RcV^tROVwlaYI(b(%IaSdcVTG}MBCoMy`5o) zg^T-+G&?Y;Fw_S4Tl@N;&;eg3O{N<_x-g03SN|<^liJh*Cv$e|7g9NV9j0)4bFA0o zc1w~SS(v0o!OqjJg&xNEMu^AIA)!6C7XHo!n_+AdO7?|bv&;%i>J8u8%zVnA3JP;Y zD})UAir4{tE61haR%kj@Gw%qce8b1wC^*SNGZpME=sN2JI174o_YYLvJK?jN-XqCO zgr;&p($#zUS;5s%p&VO?Sq)0BOrXpcBWnXi8xVKVtEsCh_jd)> z=qQ;=xJqb?)=vQR>+%uze^AN=Y7=I2Tf!-$WK{@w9e)M;)2I&K^UE^FBR!dS)0&RD z1N{z+%%v?#SKh|kHLnTCydw95b8$lgKM|ZCx2W?>2kO1hZ8lpBb(*YL7tDO(3ky&k ztdT*-q*Rk-53ct+K5%GDiJ)pYrKF6^Not)yOX7YKm|5-@`o8Vfcd@KcU)L$^ippS# 
zZyN2R2A1w8=>_jlpeP(WOA7@{LRiaV`VzylKDoWu+D5dEx$wadRR5E<;f5^y%IVh_ zvwlQav=s{BqAx1GsA2FaMk+OIZ_&tJ(ema^29o|zWkz(P1jW6vwkv{(4}C!T*D`KG z5!*`Lb;X(h!M-7$+J38p3H~8#i0mO$EmGR&)^7gF_TbC$e$jd5wri!<NFQk~p_xKJ@At4d4=15YydFWSu-bC&isDO8k!d#3GcywfV}C?x#uI>X$MdBN|7k znwiLo`DZ+1-RLE+&#fu3?jS|JTf0yID820XCa;{u((vGM?Yimmkc% z;Xf0!x|3?ny$8>=ZW#ghIiVey*vP<}75(_jZ?qnPt%lU9I)yIJz;i%mHmSx&@9m7=*xoai@ zlNYRz!=ImYI#N!9r-)|CZFbAtKUdhg*;Lu2uaPu30S-{&Q-^k=eb)6KnjCLax)bky z8WS<6+9b!5<(y@4Aq@Wa>F6&CL%)9rRa# z)Y&K#@&!efs2{)d(!BUSo!-~tjECO5hTn`Gn(gfHxl{kW4egUX!TnpsA_Y)!RNC#K zcYKR}DcNEzvycc?p+qsheFzIA1;xpfe-E<~K%;_7TwY`8n%x9L1xeD1-l! z{dgPKIhp7C8IZnXSbG)Gf3jnDf>jDMkJR$8>bwidB0 z96p3d$jcJ5xS{4;sWF)bNOevgTfWn zs=EvZhpBJ+lM&6fgv6G6OyElZZ+~|RDSKS>L5&Q=H-bYo zzWXJ_>-B(zfI}BOWF~F816o{Yv4*ZNYK;i=JL%8P`V@P4>}_=?sn-{^Pw#FVQ(>2p z>%9wc=hFM#3n*R@@>H13Jk(pgm|K?mgpj!!j@(e|a+O6f`vBS(aHzZGgnvY1{ClhS z_H)i|r#ciee?y+->y$@X+worZUN3K_Jh6%KjM zQA#RLffr5A_ZZaCB=&LMM~#ZT7WPAV`F_MblO2L*MUfnK4@kxHWn6|H&&Z7N=UIZ6 zsES#<(M+A?y+6}xf4S}(YU=45-pT+sDc!t>@xgnZ*+DETmh~Grgn`nO1y9i zMmhhj#IU-xKt&4X7yjiyo&RA0;`uE>S>rw@Ud4tT-_Un|b_8-8c;;6bE2xG(2HvIq zeJLnaj!gJV39fZot;D-J_Q}x{(q=Irg|J21NR~Cb2^wTPrlirsx z573(dAd3YYwalY|iSg?^_Q3aZH;LqKwl-Sh3-H29ho|AQxd--t%5(x92n9Xs%m-`( zY`b0S33vI25DgF2< zDUhhMQx+eE^z0cLO6S`=90l!_0;Rg%$74E&@r$#QKRt>N<@3IY`ezrjSHIdySg!a2#sh#YN5P*oy7*D!A5hp1cml-@S{gz`k} zcz(qFvA&*Wj$l&j(AfiaW0{F>oD?G~FzIvrMn;9ZinK`4Vd+qF`fK#}RH_tR-dD0TTK)1xX;oZL(6-aRqq zRgq5n<0*$bUU?4{0SE5Ba6(jod{wuH#Zt^q6FO1XApE*uht9*Z(`^VKMkc*})0RFn zmuGApzY9L=mQG?KAI4WywLE`XWG{Rvp7}+`E|3G?USL-UHTTq}!V2zP_iwGYix$eW zAP>nN$l@EZr8QusQ7!r$8p5<>G8bWP%)ihxm%(O~NWUBuMqyI-|6_8Qf;AN;5sRhn zG1#MV&z;6v7TQd*>i01B7zY9tXBgW`=k3D2;j^1sb!M|m%c5dF9kbaEyEi^RHM#o_ zdZeqF7zVrY^j^@f+E5h;poOB3y9p)BjS*ns@TWo>p>l-Roh7{X^Wpr)GWe3! 
zdekwd314-vxnt_MmAYgg9~7M;a~b}r-C1D)wZMDqWUOZvn`CySV4by~_e8wghMk!I zTNV8^>rn~9OiIE(jw$>-G+PSu)uEk0x<*rM%(hACde@T0c$PeyYZ(jXHJ=q$%v>!< zUVz6^Bl!f7(+!M>csf7G*eeTe4S~{KiUxlc$!UHMhANG9Vl)W`4*s`O3#oGnE3mS} z@!r&P5%avyS$7129>u2YqP6;-sEhOA*=*l?20ntKYa`MQ*1>yueu(YfsV(9fiGnf} ztjG31e%5Wfg$x#lC46>NQ0D5z>YnT|kabbYAp_j4Wf8t)a%rF6QRusA^ozp4QRZz; z#M+x?(24>;0+`tzXpSx4%GPMhUjInhcT_^rsv06klCGx~w?`*~ zF?BhjpQt%*+gQ;DZc-L{=M9-QZcZx#*n}8E}7PKL^qId5VpUMsoU5QO_o5E zMRx$$tFDATP!B)fNQv_JTTI|uyBUXWNO(j)qPDfg)i_(<&=%J+qa^a2g!q6$b;xyZ z<^dkZSJsYKYhpmjl}o$1`}X0x9Sy1n=7tMCv}{%~vDb%g=hs_yLziO+b|O>eAi2}p z=q~+tvG{#3uhUeSon>s-R`*N$R>5k|25>?<<#E}qmmzvTN0%@V+4aU_1dSuIw0}kC z$a+AkNfcJs^&oH~CM%MGToH2rAwMeyktds3GIjwRfUOvVPm?WbEFgfdD>>eNr<#mr zF;3~V#8Vn`&%mP{gcEqnW4jDL7){iGr2;LjSY=o=IC`K)BLTk92=jdg^Jbg^r!M=@c{tQ5!T(Y_u&D0uj-L1_q~F9SFlpO}6BUPF+KgQ9r^#oE*DiuGTCcBfYB4b@)l>&Z zG+ar9WWO!UG2FzDeT@(Om<-~OylW3r_AbU7>+UvxcPM54kyU!E4 zvdBRq8VnSygoJAB$;n9w6^^!Z%E^g#ORGG80hTN}HadtPI7hOF!z7DgA=wvFqBCKaT2~3d3jb8`9V}hP z`LstwMF?pYD%j_bB2B=o=oncoYh9UIpcByY=X)4Y0SBHtUL@AP&kgZdZ~--&=h{&O zPwJP#=Rxq;zWoy171R4za=KYmL(V?`iq?nqF>G>2^k*^_4`4VC1U`!4RM(oAO z&we+{X*2xh)`z)j-*Y(4WwN+SV%GfxnTRvyQ^1It#aMZTEy^|KAQaVdn`AmHG_cFz z9`ZrqpMVgV@IAw<0+F~a=krS80@r7ohXMjgm+gKfJkrmk9A{#k?$0P-qx)A|Hv&|+ z=_XGUqTGULb0|xFmxCS;Yn{rHvZb3q$gRO=K?(e<*KId^UeEYmRk&#wI?;PX>^UD% z_3OWrV#)&eEYC8BqJd!|&naGR4)UuAqCc6<))us6vOG$;<%egjuY3wF%x4~hn^6u{ zKaUWBLN(M59FuI{3&CQm+OJ0Rv%4I${5VC>#%9!1Epk}ZY-4-BbP?=wm7S)Rd3M>_ z=%7PP)8uLzS(8+1>u*u!(4XwXe_^wozFXsUXmKF;t!~Wq5p3DN@)d{=uB9@_*w1P$ zV%X0}-fI&&P8r^%H5BRqefsg4dJBehj zU!7_jI+M-3rgP|Ak5=rkID-{f30?|?^cucG`Gr*d&g66H$~kYmbobzZoNYLapxYTM zUW*2e{}K)S=A=v1`;}AX-Ru}EfvVQomN_TIyY4t#RXtTW6%2rE|KablX6_dkN(^Rh z1WAha_fsT$2@#PgMkDIvwP#fV7Ynic0CYR7t!pGd&ey^t$7xH^Y5-_iWifl(a)yyp zyc@XaU(|#_IO&&Flp^@qPICi8<2qSX&rf~V?8MmCe1r1kL$+|fHV=8jC_4S#_8f5% z|Bbics4ASbI<3uD&RGZ1L`<6}Ja){STtj4po%H;nGlMR-cVic@>zbhG`}=xE?Pyw5 z39nyd`~#Ua77`i%@se5Ap38{$H`+!(mHs;CoM=mjjR1m0COMXAA8ds`ifB7{3VT$b 
z@{8a$n9|iTs@VeLC6c^fW{yMguQ=3K5M6o!V(Rv7DqlM)Y**B}VDglD$Sj+5GuiP7 z?C_wIK4G%vHDUFi$Oq zu>j>k>H&oSfx}_7Nh&1zGDS}E(h5C+V!pMm>Qke=T;1O(N{B{c=LOftv5iX)JTyiVNK}xOL>H8k5S!b z=k>ZI)qSaYS>OW}OJE}tIM8xoQj;>YB#y7aZ+dqp3Fd$Hlh1Fggw)>_grcI#oH$q=t~5(O~&R&H8NSgKFRw zKQro|KgZ8{PJQ+2RIBhNE{M)*we+Y#3Hre|2fdnVyN$O>&JM@^0i%z|!@qM*3I+yZ z_lxYhTVD-^1r#gge!z~1dHJ)1f(2v&4t&6sDPSdt4wveKd9jp-Xd|Gl{QOHMk)$n^ zh{w`-*W7%9b^`QgqkK)I)03LhezIg3r8^-RKGnaV5qlrE6lP#{}1NrDQ#tRE}wgjATD_Hoxy1^lGM zeW=6UJ&lH)1ULroh;n99tdngwH+b8h5r-B-xd`OE_lgXh6%bIOUHGclDiMT49*-7f z>QUA#2&f9P6CU<+Ley1-M1XO>NMbmO_Zpi*2M5*> zf%-|)e|_sO+1ln&o`3it3;E^fa6JuMprhxYp;lB$!u8>A~%KS zZ(yhOP*g(~nVJ&_y*&oUd>EXX3uBZ^76^7P#Ul6}WVlv-jlO04MIK@d1ytV>-AhCq z52IJF^fTtcjQJW%_%@xq^sZ><)n3aM=fs?H@g-p~C-ib~)|z3%Y&I1;FAcvGmE;TZ z9MFcs*5|ZY>uU`FfW85U4+LB7JgD`(Zmo7;JKDR)~ zmw)cxEmOI64J`4ZNj3bjj+qsWGu1ofj*jLbVV242)R52^v~S-XTjqJ7WRzi>T|3vP zFE~qyH%a2hL+tt5RU95q>x5A+e0}8-9#%}39|~_nWw+v`?qg)JyklYOw|dBv^jw-d zTb+}nSkDT2M{bR#VrGAr;n=1v{!h@XqjT)N1Q95DgA?DDjz`KU4516rIGyuU>>aTO z8J9dUR-q&$ycxgdck--%kcdR|y#)ETZ(CdXzFphXmJ3JiMrtY$X@*l+j6~Xr^=@-T zYVN660UIwI(;-A;DIJhiO3tCM8!rwGvO)}F^U{@3h4az=qu4!x1Oztj)1-1XCM9~8 zhS_e8NLq}1sRS-`8#^ZVn^>VBoR!O;wA?bu8w5ddDpm_|r?L0V4cM~gB=fxQh=psh zi1!W|JB4p#h*x4>$jk#Cv-}U0LHMwh=PV`yZpG_nvo#3OGgk*Q*RqDm!I^feW|=4< z6U4iLUZm+!YS3dBOWCKGmDQhr&={Q`IKsXYxF5#;KGeQv1uQ^0t_&hp2-mcyBO{06 zv;S=KH*|D1|8-bKI?>;xCD6!4q-0xAEDIUx67b|6Xh}ap{)UR0QR^|G02?Bf4YPip z!mksi0u-UX5!MJ2ambFQbYMO&OArh}p5}M^MJqi>-R+dMf)oe^0ZNIGM{IuKe)|Z2 z6)w*f4K{&RR<%8Ojh+daPTWdn^@319_x1KYXgqV5Fffc(KNO4FNW<&BGYs-On0gJg z)py{)nIcl{GOm$ye%_=Jts%-$SY#R+lbCN)me}AKZEWh0Aol42DTQ>Za}7aC+e_ba zP^1PpKQDp={2|L8vcpQz{{fv!YBtIvn>I|q<4W0w_zxV^hoNA>sz_eGqEy{KU-%0k zC)$KS1`|T*dh_+waQ>pY-Y7O=FC_8P>=qlrm+eN_pCwprC?z*4U!S)Dv9D|Tq*#YJ zr_1Bkd)f7hUXEg0%%ph8Fy_K)W4$&*8FEy((1O?I1pX@85Ov-<;we-ki z8ivF&n;x4Z9tZ(H;DMMpY_6~g;Nev}eSJiU@kc_6N!xy;+baI*s&wL4kO_|8AI~Z^ z`ezCaHXxDNMMZs^YM7DljNK}kSDO%oqbiN&5rq3qLd+N?jFbJ3Ei(woHj2Eu*wU1< zt&bcT+0{03i86WcM}7qwIpSy>-@voGwKOQ^{;!KSPI@&fis~1%0^@Sg=}pNQec_=d 
z6t4K#iy+rP;c4m71Bq@y-DK6lKj&k62_2I;&A*u+M0B&8Nq(lW-OPJKFoeVfNhk<< zYc}ee8xrbgX1|uxfGEbMaP#KQj<$m+N4u?L*2NQl@ntM#%6CLW)kPb&6^+ocfo6SUrf~7d>s*-;uxViyzVBo6!qrO3hDcX_wkY1Mn%e{U z{=D**(_Z)UjpGpqkf-3W7#O(W#USnmE0=3_+2;e z_ocg2iA?%n^(UgB ztIN<9?ud5e=hY@pNsJO& z!}08d)bml)f5<~*rv!wtP*_I(@aF0i?QZ1iH1;BX*hRi$?$;=4_GikD!JRPrJF~KI zL?HqQs?hjggP?3G@L2l0=e)ZRc~ptdFoJNM+OEWs8d+qb?|8+Odc-6y4b-_6szlGuh8Ip;{7#f!AWR{+JRU@JvpU_XNrJ(f)Lx z4+RJgtcm?uP<;#gCaXrwBTK8{;)a-oEitE^U;nHtM;x`YtEvH*g(e7ewQNn_;kfVn|%xDRBb)k-IvBLEL4tc*2J zoADltMX%dt2e|@JzsdWtAoBx8hO`WZ)|H6TO6X=$7a=)twRv=RFJ1U9!vjRI2}{Ic zSDauL{os!$di9IYKQ$s023NX%uxw%{lYhG+hamcVC*`K>_B4(7NO6 zg2V#G)Mb}O{W$426`o(a`2Y8uuy4dzRa}Y}%^sH)zrJC>wYsE@#2HQjLKz~L!7J4Y zf})~kAjPVu5z=|_;U%M{MvE?j@)%N@VO2BRzJs?6W~Q-=f<^?q9k)C#neby783<UWCmmf0?z?V zBUYsE?VZsFlLbRR0=9td%ea_e_GVq%{iVR&^MJu-6}Jm#690SkKk&Q?z@E$(8Og)@ z?;ziOLjl7pL(W8j@jKGIOXNSP%g{&s@3pkR!bHUQGoS%asHl=Z;ggmlzFpC(h=37D z`llee=tN-t^S|%B(eSo%%l74ZOp!E3&2!&dW=&~b^=(>tb%0q#`v5!u{l9Yz#Y6r! zb5TFv0un=AF?(0SG60GB+Mxl;i*p9M@1^X<8m|1k}yR`|0WhE|nxF&YeD zHWY&wNId5M!k^|PNxjfYWwjBS!yYs)fJpi&)9PJ=ZAU;uTwz}7Tr3>)xx(UV}L~X&hxGChk4&c{C}4fSaQHx)BZa;@}xKO zRbL@uOnkEn+V1lPM+)PZS$qyqWnTog{L=j$#3@J6hu){vHMTV6HQ9*+N9H0%ZHX&K zmI7`hU-~uT^jDmJ1xpf3#??nY73aNsO?sRmrb|t`OV|2AFllALC?z`TZM&$lnY)?ANeR@7uy4082KZ^6o|^YbwxFepI`D$9yUBYss{qMhE7b@$Em}3&>t>2;-e>cU?3EBCJXYtuYJD9X3#xwcJ(%3CEw)>qU zX8BwoP2a3nwWSoZ3nWySoA2iN(u93#o-Q{!6w+(e?F~DrS&TX+145&KhzoghAy2O! 
z{I@9g<~PM}Z>4Oq{WC}QTK=#NQP_`c`D1@LLp=d!5;@h##Q_BrM%dfx#Ql>d{vT_N z75Vb!-EpF6WkLVTSDx z{K$Bl3j3Nk`wk?)h4%=L-i_31K}Z8&^R^+o@~Z9j6xPS9W1DlL14m0OIvu0g6qD`1 zhWmls6{nPzSYirqoAd#${ItqRwN)Se;Aq*Xz!rN>g>6m_Cw9+pE&hJwXu3kyPN(U6 z2BlJ*>RL(9a8rZ$?V9uX6Ca<)5e@#Ku+c4tk7dQ_vhR{Q4IWA& z4=G0b4e^a4WU{fW)krs&@a$31tB=KXWuN4>R$?n&864ZZHvv%m{ER39i?Gi_j*!V= zAgU?Em((3?6S}+BbVNuL-JC)dF6Y(VIOpnM$x!NZ8q=c7Ufh35$Nhn$xsK=4qBiis z0OdFV2*PE_dY0UsGpKhO2 zmT6I)5@2_{q8y4lG#6?=1HJe4f_JO1!@4 zM?Cmb(fmcjTCey`Pzx@cP`IP+c-BDi9h)@X8lNc#W5V6fB?nZ;h!n=-z+5d z$0d-V@cNsox6}jCJYTrO<>mgJ71SynczmG3w z&oY;Bm{gY84z9U7Z=D84*C2ky9x)Iy!E|XX#+>DXw(HTo)A`{MmC~zMo@xb#Y=k=4 z^_c)mXsywu6zSmk_1ZVR{yLdIO3#YU3?J-r0u&K)nm0L1>C8fg2IfZI`9_BQ8Yt;< zUOPoN^tA_bR@5Yw9OO?hEyqW-UJ-b?oljr8PbzOLI-FUBqf>;mFZUb*jwl9XFf@r9 z?*AtC{|P*JZ$XGL_QxSwl{AFcmWwvPdmDl+o5=}g*8FNeF$Y#{bXZju+fN{^fuU0l zE0m3A)ak0;iLTf@&-dFclw*73l`K)d_HQ=o^wcRTqLNKsncDOa1i~9FFmXW`L=HgQ zf6H&*K0xXV{r@obmSItKZ{M&Yh=?>uH%Nn0!bs;ZNDUy;Qqo8`sB}noOG`7Pbc2Bc zA|Nm{2$Bvd-SDir{(9Z_d%yQ{yzg=N!4J%iwb$P3Jb$&=<~*=-t5GhJX&LuG$zOvR zUQb-H{~GXKvM2e`z0a5_=VZaPN!Zjh>5%@$6jKcEdW=~9J{OrzP@IQ7ZeP#*e-yF( zV16#F)6X6KDQIP|_%*<5y3`Ph|C#&DDzwn2%WwDy@eD^d$42Jjq{QibBsv`{v>P(KTx~jItZrx`&JNF zflFse7VzZwPmhu%JVE(?xmj?6W4=W!iBQZj97R)LSPU}Yim!>Ls{SqiNJAfkODE|~ zX-klKiRno+($*tDH9i4OlF@dJF8l9Yyk7!$(Rg;-^(q6Vy9g9bS7pCS#00MRYWg?! zKSlc$JOy4*mxnI({!n3(z&s=lwF5;^Iyr!o^#8aTt?~EWp4$N@F!I4~x+t%=EnvH7h4^dH!o6v`wvQc1ok!} zWmUR`sbKS?Uaj=gf2c`fQ;CPt4wGHaMQy(}UZzob?}j;8zlmGHe;rKWeNsU`*b(Ip zaOmGq0QQ9TxOjF|SclI!oBR6sNe$5CK{Lc$0*D_b^QP)l80iNSG4g|0dDqb2z@N#% z=)D)$z5;w{Nxb^i;|O1{#{@g@NC?_|XF}>A;~l0Owd_XAxG*+3S6^U9v;syFc=L1H ziDpEp@ws8k4NnQgs*Ww-a*I#z&Vn^i0;yY?{4Gs{XRZt6{>IXX*!|ky`!y)Q#81Bir-d#9&p}>V^z*KY4fOlxosZe?gDb};0;d3 z-b9O^{jOe^os#>x!)eth? 
zQH~Gdq+}HOvLp?Oj#o!``ka@(CJnWFd!~xHMcEWe~KQkhz9wUs?sa0y$R|9uB^_kF6A#Vi~;~6Oi<5+;6O8h<+3-F zgcVCnUIG{xb$pI~;`%Eb&jk{#{|FSc`ugieCfJ!nleo?eJ;WY%o6q3gq5QneG_P^9 zj;VDrBoyV(d2ceA&oCe4@1DeH>96`RvxF}XWD9`pnBoDYgX2sk^fWS(P8?og(#8$u zTlcgE!YeI>}THt!&=Yy-RkTADCK4%Y(gRAdPJzXHHtS^KKPg=i=t(uVoWH!`9)$6=zn zr$o0DYcqt?zw9G~DSAB$)%0R3Yj1T#^sg0Sq4WqzcMD&xC>)+5%0cI@ozkox-i+79K1UDPx=5y2OjW=1PbE4BN^KdQes z22+OD0BSD0j(yn?0&xBA^s#3c2=3>EWBxu)4viYv4wf?DVe~J&pe~6YCFj1o2afrp z=ixT|>$6f|Ob%S;b^o1%Y)=yW9>C~%>k(_Cf)-zHWwyT#zCqm!%>+I+)!JtV5s(-Iuvx> z;XQ`v{qU0=xNwC->Kr(O!c*dZ$bT7^OuZ+6^0hw7-fFaZKYn?5@9Lp0f3n8Dkb$f3 zY37XgM6)w!599A@k=l;4c&$nW3FiQvY@lO?1Pd+Ootz~1G1^K7elpvH?!ZOWtfe$#G{9hU~m^o|bSc6r%^i{^Z1 z6lICj$AvpEkY>{@g=w4r`6rR)*8=L7uIe)Hdwy=60wPEJ-isq>uQF(Z-%Oe;Re z-Xmh&Uj?sU@fHpyDh{(HHA&2`)Qy1ZaCGMY|`is$t zzXRKSg3Ke_;7 zc*JuU*y&`?Dps(#7n3E9G^i^XF?3m_=KpRdegvh2Vr8V z_=qs)Ns46${gfMMQ?MqIb>FRjM+SPJ>1G$Ti}v3s7ltntRGHff+>dkPf$2phUbsmY zDfHHw18Gte=XJb)-~Y}{SRwE{iCAk7Q+Kw!u1I*J%LL;qRUrrO)c>$1^D!Z7r%m+k_nlxs*Cn z-UK^f`(|nncz^f^V1Vpsrk4c@AeAS<>T+{qM=PTdUMj0CiK`Iss-d_tSHZ5gU9H|*K!xS6Z-tP^N5b9|Wj^_qft)1_5`r2MUZ?fM++EXAwC2S1nh?I9lX zU+PM)&adsvQd$mme{kJAtl@@nW!9om#??Ma@*SGA@Rg6pZh~e_~L1V1$GCTe-nT@c}`@{_cnz;Pw(q=13s-r6JH(FL$;$HVd& zJ*hL!-(lX52S9>64vC(AwKw03XS4AEpJA|xuOlO^hT&W9<49FAj~bpV>+tQ*b3i%! 
zrVykGeq{!Nrdbz<3RUOEGrlm_6#>qT<~@{!--DNQ@QnABIzeCI^>9S$eeY^stz2f6 z>cyeDfhRqCinr@D0x`rNjus2C`D1~6bq{tr{!$S$T*IA9U0}13iQyRw`uW25Fz~pq)xKv-aClCPX!I%j(l@ldICW-~|g>t5YN90H?N3YU*3A(kuOZ2ly(^zSq+uo6oYnP|WP@w)=A-aV zJ1EDm{cR`wLdcwzk=Oixf2K4AZ%z-QoOWT4rxzq`J4o7fGxgo9W9g|S`HRZbZ;b-(ca4?5~6E&y6Ir_- zS5imIiQc_r!M3$o@j2Cvpj8mPwKgRmtoEkjbZd6TaEIp=9H~Ulm-{`dw0(@Y zoC;6T5XdaK?%sHPE#&NIe-95m==&zw=z{wkf9*)v`#Ms;FjhS*BCp+02sNpiTv-Np z`Jp&PO%SPmah8{?@G?jFCiP-WoR7$hr90xazA+h^S}bZ)&lB^M3A-gORjQ9eEFdR0 zyj;9jYtgl@OW>vLPel>m?jG;P%+_9MMl3W({+d;cvc0B;aN-D!K(wfZo zrd(lx`SXrnr1pT^LbAyYydcUv2A`aZ0lF$y&G@n~@_PN~OY=&{jg??s)eR5zz$w#R z6Y8PDvG-}#h9e|7M|@&y<|>?kNm zgx<3Gc~BJZcXVNNwx)s3sX(JjJ5E&nUJ7d{# zmNeN@et;%#l=4v?%s$=n_+QqZJXGp~7Zt+xqYq;PmR43Mo#41dM$Ae0T70yk4mb9` zQE}+cw0Y{0a;VcSzA7>8XyN2}cdqjrR`kWdz`0=Cem^P*g-67tJOs&onoZ)JBn!_RY) ziHIM^^09w?bnAJx_zW(*vr^koAHY+$m2Doo6=iuZea!YPE>6VXIjt-zfQAVg%)i~= zn3N^u44eG?!tX-Jj)c!t%rN3^tC0miv4qW&Fb^s5-QsLru7X26Ex{Na$8t*U=DQeV zZTY&E@gLw<@gBv99MYf%YiPRnK`ctY#v$&^eL3;@)&m_?uI}W>Z0k5QnKBy6*3B)f zup`qb%PmVTi#2b>5GQI9Wru^Z5FDy>UFxlT6`dw%s45pJt)XbB`^fx;?Gtg=m614@ zjY*WfEgrV5-??$)BgOc9C^B#VT`=K{XRjH1Hn9q7vL(G8SIgWcSp=~@6O&u;Ba=MT z{dAe2RN~omqn(>{rBQRS<>T%F^!uz#$IWN&u~E`gLw7!{r1nXVvXIIRA5EVz=ep4} zI8`2wRm-kE$>vQ5>=DF{fCL)Dc+O*Lc@Ep0Zs!X0_GZ(#kUQ~wy4J*_Q&yV{DGsT% zUwfKb?`-w1E+OC+gSs@Bw;ZwS!T;7wZg`E&DE~Jy8+5`@%KR(9q^DVg!@PqH%-1Z4 z$TShLf}1;^r!|Riiyr1M-L)HkEK1I+gbHtRiV}z8j>*q4wc!hPh5o|KS>&V8TvO#Wh0J(1;M3cq{S{1Uwvt~B!1=6HJzxsa0#aYW&Zu_<*e?^~?~P;krT zA^8&>xDb(0JXZKhvPaKm;TI+QN2LLzB$){8tGW0FW0iN(4=&5{9s!pfry5F?kH!+) zM2WI;mU*8i(N=3DJcIv85rtY z!4}KV_q!bV5|Wy>ioKNmgVORIWGBL-FZOo|0C7(oK=rW=nk0yw_^e2D^~XnUgW?MH zn6!H?we3V`o7ktzI)Vy%BBjP!$}ViM3bdtTXKnp;z3fLO#~uS&%WmSxknKLC+Ly<~^syq#(Ayab zINJeG`dTjS!e3;_yUO1v2;jKqleB1ag=MOR-0y-z*-FuLNFZVVM#iE3%VH=_F!=h_ z>(oIOlW{D`Y$~M2W4MaRqJ$CS8%~W2iyhQ#Xb9Zr-N{+0dCStDlEM zjr>7fQxT!=4J4VrXa&w}iVdp~3pCV_ep7CWzPtjJXrnFl;}7@_K!2>Wr_9ZVcq%Z< zlwF+2;wHaJsQy}TjKlv)_mScwWBe~!(9Hi*4&Rn@BvnXjjkwqzi@OuLX*1m%cw74o 
z<02u@;HAcX2|j@KVBL5dS;7!yC2J^l1Gz9hyUuwLP3`7riS6FdB zXRVNnI5=!n9v=fx4d|e_Hpj0Ue5OfGc;c_EB^{e@U4?gf zK~lG;so~aNTbHUm&|VZ9Zdy$4zsr=Lf|>Vlk`7?N|LN7=^WimCqwod@WCwLoxp);n zL!B2Ev%g11HA6UqBlqqqr?c(`cFN@wu@MY?{sv|hhn{TnoaNna{=DL97y6sDdG$m$ znL`S|e&xnvc+Kt`Y0-0LU_Aw*b~oL?>-%c4Uv-+zb+@-^Iq|L4>bMZ235oTD892?L0AKsE{Ap(!PveDT@nO>qo8<2QE%|Y zr_+PPR_<_|A*@_B{T=f8@}a`bnYMC8*=xFb*X^s>5qAeJW*)#K@{fvX?!K|#g(O+k z!tMt=sRB!TuEbMkMsF+z27+R+I}qcJc3~k}m7;ddp%Nd&fd`RR(%g>&Sh9uZR9cqd?9q;`sdymp=nlzWZn0 zfQIt$d2#PEYxL}=2T--l)RJ*WNYaBei?yOZVDDOB(kkgX(IY4v;9U zj<+y|KEF5lN|&*lX26H_&iM4eXScJBc!=t9?m^`KxyZ9*W&{%jAFU-N3)1;S7kE8K z!_H-eX;vS~E;9!WJoc@F5Y!ge?1MRl8|M}KHuiOF$#`$K10dy9(k=do6RzMjvIBnL z#Z=a%3E^6r_dyF}D`62J>|(D2gthiC?kp($I#_svyqEmJ-=RRH!tJ#R`%Q5 zEJ79%P!sb+LL|RY@IP>+?R1^l8|!t6CZ1x)eKW> z0ZAewT&}=l+BisPq8vFHc9_<+8ypVIkS@+@hkQcjN$c-#dLT((#RzU4TAr`k_faV4 zvs&FMGXLo&A<&b^v^kiQ#K354;P~M1d(mtCir!erW6E$YR{C!?eUl#e@X*atxel=^ z`~eo4tUe;CQ0n~Agtk+&W8SHGTYKs4_3L4%s+|le--GGpi?o#|UR!17q=CO)72h0R z@I=2s^&nZFMF!KTKubGc+%D=Q&>_*bDNavP&UC5=H_UgF**!1<<)7bq&S~DJL-Ek$ z-SSM2ZQKZ;vm%7M{_3bmoecpV9M@6@_>kc)pBUQv{I0kcvo^ripui}h%Ynljz`;w< z)k;JsrdgCsjg$Imh;hqX`8@;JbL5c*gY>9;y2e}A=lp;V(L=)yYq9SsfBxCj>a$VVUxRI!1 zJ2Xi@^I~lo;qBGZlpMYEIJCIYdo4JvmqWBekCW6}JJUP~ZXC$kKgfj8G-_^#;d>s< zwJ+N{#awt757&_1ft9f)UTBTYWe@(Q9;#;g^!_@+D3<%_GZc$DL!cito;NiaS4rM< zJiSvT;rhG!FQT=lW)Vesr?%on-#4E@2h<)*`yTHEO{PD9v=Mj)hBh~rynkg`q%Gb| z2%}dH+xGaEJtTbx$T+g57;KXalj0OfRIQ7Wd>>uDE4!{6mH+r(Er0-=Wa6Dt>se9HEO%D&p8K&79ebf4s66pfap>vn3*&B7pygk2ULZs6D`L^4xo!9o-cfA zT+ai$@Z6B4Q|?#7(gI9|y3Qm)f6%{c20Ml;T&#B)Vt@gzQ2u^a@13jn*#SXQNZV`m z@3%mK0elO(E&AgAqZ|DQi#-I|2L*QU__b{Bi&6f2YbijBxe!0~Ma7d7LyXS6GP?a2wvk!H zD61q!>B6u-g1g`-!+wQP*60A9i7d%I(CxqGs|xrjf0JWIZ3sB+_V=ijYL)kDm=QpK zan=8q)F4rQ4=5d~b?Cuk_^^E08!PfmT3q zL($otZSK!Q``>S}lhkImy$z6ts#(&${1{47;?<6Vd1ojy*shz>*Y`4%KBMjb`-T&WvI&r7HQn+;12plx z6x=$ouXdLeCy(et0r!j%N<<4SV3e#{DPSm2GqnAYf9DocIN7_%QL{vK^4YuUF(88! 
z5dYV~U;axw2aywtpM>RgknWwSoaL-Eua|)ZN`wFn?IbN{`-vh zW!{H^V@KmV+%AG*g=4tIYyS(k=tqy;zPg=7ueS~MK9lkvX8i&2CC%`k0u#oig>NYU z`aTac0DA{++y6Oq7W4|7!@7SltMPfoM9>Yo2kxqX{kHjJWK#mBPOW~1T_Wr6gFf0< zy9eb8F$^=y)0@-HTzI7H@rSdQOkewE>4Cp`A-d(O$X}J;U2yG3FM&)NMz1%{K9-RO zI=62Bwjm&Nv0a!^D;aTIbj5eyKTh`-$h@!3f9WmjvB!A3;`FY?yp4XHW&6XdmWP5? z{WRvYxQCnZq}zUpK{R~|*>J1nijK_)rzR!!^NXy4jSSEY5n<~DriY(fb6n62TP zVVyp_P<_GKH15@>LJ^+DagC7t+27l;!889&O6i;YUs6ir;==JZZ+xytB^ilG9X_2y z+WSqUv)_3W;T*!KmqdTwx5m5RRw#c+5T!vdVg+|Lfe{j<5O5SmGe~iA!3#+*6-!xP z6`C>jkE}eq^Um_et)?X%5lI)kBV?-b%wr{TBFU>iPz z-Z#4|J_!n8@z7j075UbI9jqG-whKz#XCsE#g96s-ncv}D7_Yz~Me6-=5SyY-pTb{G z+uPk!!)6$51YE>!g1#q~$15MaQ-!TIiB0X4I@cRCvnUIbqW5Y*H9W z&rh^DlQeI|{fqK)l)gn}KEDTh@J>s;Q=)KoVah}O_C%Y(TvedV3fv$&k7AV41Y83v zd#~;zfy*w8&{Sn#W%9{Z1oy78Wp6V@#M1PZb%D%JbBr3%>qv`kf9SI4A>BPFe}**2 zD}3_!JLOYroJqUBXNBVDt~+nMDyT-B#*Ny~PH69`X6gx;e*GV;1sA;9ftW#0;G9qB zPE=fl_jQ%lBfJ1FpS|B#sYSkPq}6Y50;6dnUC_3;xZ1+pSE((l0AVQTtq&;2jlGQaTcSrfDTH|79Fx=$zl zYDWy3-w#x&Rv0etX_ssm+5IergRcb<)z4N$ZNIlhVFU;@@@yJs*my|_rF=>Dqa$N{ zfTZyj5JlRXsAPeSAarpV+wRn$nkw5hhEa#BsO`LXsfCrHTyN{oEHx57dzHLz#B~og z$8%2|e^ewao>oBLw)q_IC|;RmB-%`AdEtcgZo2c|w6Y_I&31Vxmyyhc*1x*rMDXs* z;YBwna)b2~`gubu5U`!?rX4WRGNTgK=dCnj@z3yYCxOtD;O?7obktG6WU*FG1Aoo32me*u~1%Nf86@2lw05tm9&E& zP;h@(RXzRfBgv*UpKtI)W{&ym;+oN`I@CMS4uk#laDa{*RG1N*w|K6h-ipC^34Tqe zh;B-jusYu6!3!WSsJhrKufUDtA-+340EhsD2kB7da>aCh!RI}^yn?2bhWu0Bgj?Ef zyaiOU9?UaqW5XQJZ_L4`f&bXM)C>s^1q`N-J;XdwZC00|=^K0m@mTcxwwukC?kje?I13ODIPI~@ zf275U`Dy|QN(6C0;%>LSSP}*A(tQ!Vd*?q=DSu5&X~Zt}<2Dk^A{L4UWrC-bsBI@L z)0>n6<@V?@oguSw4?NRWpODtWtPL{ktsm{|eQP?&BU3)H7_Fyo#UyO+_$?(vg%fCAsXbC6T5xF1Fp3Sy06O<$lPKbK;x|JXRRGjR1l1Jw#1lNIL-nNI_x1Dp^}Vbj zu@o-vx~wH#^O0Kc`!t5&39naSR!Q(JWA$#O85v^X*-e?)KLb11{UhNmx|EFj`gsSv za$8IL<$=Maz)ifJn|%O%QQ&l(X?�EYVUW4L*BMlX&%1yopvzLd!m0byKFHWSO1k zcv#FnK&jsm|DfU7*C(cob(cZ4G9tjh?m8)gK!CZgOv88Orf|CVsrD z0n5mW2fAI@z9$I{^8bE=3(isUjnOSQQudy? 
za9NvL(dLPw*Z$D))iuH0=|+##WUAuN7*d|E`Ww8c0M>^z%y}8h=jXReyW>RXIa#l^ zW*G^K`$%O&g-dD<;x4LpPYFaN007`)%Y^O@X;tExRtuZGbIw8QRKw=)>R zyGmUg`N}jcsWv(X84)F;2do{~T;JBGkWnxZ?lM-jGMD;?gYx9W@HeqL!Q^*CDFOaS zX~z%jB0l|)Qi!;*Cs86`w9&_9MiLtR7p{V#DA`T|_0E&Iz&9t58w?7aGQa(7fl&T= zB8?qMDy{n0v^8n1-!oTzYiagraN{A2-sF0kj9;q}>pzq$eW6=U2Fp(bN=IzTV@Iuw zo^c_VXybr)=Z`@A5qpy4fR;{UFr0&X9c$%3or?@s9Hz`CHFGQ01obJN2;>J976#fd zQ39_+xxFV90g@GpFSuIxnD*&xe|gEKim!glZfpgf>*XdrQak5OGNi5LB|2p!d}QsH zVNoPCH|Rg99A3ZA+L&FlA#5ejqeeeuR>>?D^hG(#l3!<$@6VFr+O<}Xvs}&ADlIT< zS*~81Q{lq`U>TqE`UH~od0KJGrIUBaFOPem93X{~*Pzg-nhqWAG-Uwgvx&Pd4I43C z&sB_B{@$a7F;F1ovR8+bzBA2vylNN~NhFrFa`e%)8y()gcat5!6|oTLIkm`pjjiox zmkHQomBv*~;Mw7#&2b`2a=mQZ>WS@o9*I`17NJXwX^ zw-s6ok2Vn8+DY@XU#}#kJq2QK`c(^Ea=JcnXpMkSq(O`Sh4}K}@1KE8q}U|7q3VH+ z_h{g)G>`cBw3qo(Y!{S&oyaOl;~s4449MM_XmhNIuHB}WfW?=lNygr{9k#jwqCDsi zo*o~1E`7QTV^pAFwp;no=IPP@a+L7=opP+XsQrY8al==eWcbQv<`iUv2z**k1il^W zh@lME7`Kc5Z6p3w3Lk*?tty1H5|6+?7MLW>R!f1^`>v=-**(IcqGjZ;m*$PzEFTS8 z05~Cx)G~qWuNUPjP^StZT79@eP_i>`I&@v0U~s2z=+%*jBQsod;3Jze`?QYd>)U(U zY%#Dk{am=2W&|$CTOkM;QFUNv5Bn|XSX(UYOD=g}JEb!~sYQ;Th?3PZ%iq+|6LHX7 zuTLz|5b~bqd{r@2&5eYJK=tG^Dal2%1U=ro6(z=Sr}a#43kl)9zjL?B#pG}N4C;`r z5rt6nOf!ufa1RDsbXjy=+?`|1_Y^64q%@`q1Hy^-1E(*xiuDiHrKeZVnvuTSX`3|;3%)JGCU(lmI3hv zNcKzvQIt8eOT6V(Z|Z#qjJF<8o^|5a?TY7w8S@*N*l(aYb|3i)RGEK)7u`6x_?9gWD?&hfcqfV|IX%#1IPcO``8Zt;hJbH#1zP^lYv z@RB1Tn%v6K()mfH50;&faRVZ3#i?Kskpx@(N4Cg-y`sl3sbg-HAQ2h|ChPYDkMao@ z^Z5FoGQGLKOvLcI0X4$H~zA z05N%Tx}mbj(X#ReerIo3&%Mw)tO$G(6D9I!mvUyrdrD6S;CxW`F5(>{vvJWnBYM~o zK2O_;{@Y#uwO|UcV8AeJ-2*%<@b!%4WFEoo;xV)j8*xEF+qg6v8(X!|5pMi|-T%Eqe@>KrX7YxcNoEvy~0yjwMf*4r8oI((NKhEnuPe>?#qaV=zs1Szu zVAg&6%*~l#x7B4ooJ~DVxZ8+tOP*%Dk9KTvfgFhyEqdSd8VRnfRW$#nxP`;b=|Qy4 zW1gW~RhP+LzNa%~?e zzh}P!kgrbBrbV_-ApL~e&xYEbpiT{lMUgBT)ktV4UA5z|!SP;ABxDJPM54Gi@E2J| z>ub)Ku$tTG6g$In3IVVMRxR-t;rxD8b`?!!>>@t#p$ti1<>jyTpaYTvE(|h3RukDj zc$kwSKp25f*^v_Xlil=mUEr#=%{pjnaOM*Ke>-15tn z3t{wp!RW~(h|w{+aut9?Y;#JWkOscrCh|WB*SipaOiCucjQZn0nSSA=TtK)wKZnY; 
zWGo7;*3{f(ODj>_$wGd?X9;vFofIyFW|DwM-k ztp6`PJ;1y;5$k@Psu>t>8rpEKo(#Bp=5z|~hah zrKg+LS_MGI2GnzwpobdrT~zQs6IM@|tTv1KmL;eTxb8c^^op3QP5G9|-vKI;z!8#} z<>K7#H=60!M+!`aN8!aUk=<0$7O^t<7&KNGg#ifvAtrOLgXXumiYP2ix=4+T>rr!S^nvVic;K?*o&iu6liPqW%Cll#7P4(jJrqf%M71F$zLJ9txJ35^?Fa zhYQ=zm;l5A&XN~*Tgc|fEmMgvF}8;vG?JQVs3{D1^ZubMoj#N5x1IZCh#50n802<=GvZ2lCDE1tvhBvdOrxtfeJ=-tvMR)Lv|} zFBI3C`!hC!CKaeCFFQb6V}Kf;9|4eNTf3{S9klO64K;aMd?TA|2negr_E)y50o!%F zv-WJyT6MW!vzj|1*-pFbR)X~9ky={AT7Cf$OLDT)B)dlT8HbPYB)6=N{)YVqr>;f! zHpiP6u`*9E-yW9#`?m*J%;Cq?=+J18>%B(7e1a4AKxxY3S0bo)F>3J`AM`&tq{<84=as;beOs;A1=Z-GPc8l`IhxL(#vzC9 zM~AX1S11#Ct$N93+w*OKzK@O3Xg&VlOJD~suoiIWjK3uNiz)L1OnGePfkv8mK{UsP z`(nii15lmQ-?*mW`*x=NmApxt54*KG!x)KBFsiAn69Xsji6liV4d-H`)oeJTFE}M3 zwXqw@K6oq$gVFL_oFEcpjolOxselibH)F0_wRfg=fk}BnX^H?&z_ltt#gWgpk6Lnh zwFSAf=Gm?uzV+#Up{NSW+)+f3nCEND9?9gsEjOu_sQ@yyWig%AN1P1U9zW*`>JkZj*^!Yh6gI3hqugw8BVn!Y9iCCWRxk?fK(dby+zvPu^@?ptZ7yt`t~tq-N+VK z{%ij$0t<=@w-2Ec2Ny-~%*2z`@6~M3?KI|9F_XCOue!LNAZ-zO5kG z^mmZK##`NCnkj*mpBTWHXo~h4OjWj+EwJ+YKh{lwc)-My8t%^>Ny!^6-O*cPy%xPv!#NKI<3`yI+4)5fQVx|utG)mc!=;5pBpVx&YzXl|C3EhjS;gaPt3Th$uIdy&QdPbU zBrQv$=a48MsIfR(Q3>QO$Okz1$!kH>O{|EktQ{fC~`_s1( zn}EUj7{PWL1_du5iON{bX}$8Q(a|oE?eM$c68&L=v5&8|=Y!U#nsf`JJSmeb_~6zX zJBq;Z^OS;J2;{qQ#$oc^aAHC8U(9MmEV8h09}3I^%oZUB_-u$(VW%9$&L8xK&*|T! 
z0ys2_-Y`jLf;+HFErCe)b^nps`BLDUs^{c{LvQE5#0WU*I;IVG`6gdTwj$K%$zeYL zz48#K2@YcddhY|IT)jgsjdbxki(acCN)N5Z)oB>U2#p7!bOrt;1J`Ux_2C?1CSb=M z);j} z9dRygWLmpqK9qia^PCfOkP%jT) zQpt(l`C@C7OptBC|E4R1@iX3z_^7*a<~YTJgAjwq;(0h-Jhi+2cLT?_T4#)bSibRPRRVZr#PC{iZ_y^@0|Y~~%!GeL zcP10y!dqAOniN3$`x{f9|4^6!HS|AuBnlWZSIa9Lokc7hE9J!<2KA@JD zz*NdB>mquvKgB#Cz6lwz^)Rktko}bA23qSr?}ak(r)8kK_?htcrWi1V@h8z~#S~TK zLN4tJ)^H)p#b0fn9Q@_~j1kzxgceC249@7uA{RCvtheri^ye5!WdMkStQ>%k4kSwc zDRE>HF;`#E5R!1p2u4flCJAOi$?dK-1p{G?c@M55qE-G}t2#!Mpm0Pt=>!hqo;M#v zi20ThQwac5a_A5k!Q8zAXV<^qmJ7_c1!VEu`g_WDfD?S;@CS5vCgwXd?BvS#=Wn0j zV!p*p+oVO{o?T9EH*SF6f=H&OyQIG{UnYj84I|JE-l1!9H%E((X3 z@n<;wwOI73Gac5KWJ#V2hb29-d<2f9Bl3 z&nEcqDQ7_k@G}7*rgQ3;0kOT`bo+0|6$s`LeeBDyVax%&@u%Iv!JJy^jHvKlvVn8O zOuO@^{d2wV6EG_6p0w6?ouLQqMy!kHh(^Ybu6sq9(G`|cct_2vW4Hx_aIK_{-FZ*fq*nWw}y$t zcXK>P_r6kB^apC{#+rw3R^slu-&9Gzn||hV8Y|(s|M)E2F>x=<4Z53z{50Lbg8)Mg zYgsaILnNbKVM3H*E-pJd=yHNR^V$xxS}FO`SQ{q19Tg6r9V)zJ#(N$A9;BSHDMeD{ zv`*DM0|ep5`NI02hw|PGtKtqK-Y4;q2ZbR=Ed7A#ImgN-L!4fvDKwa^iQVa2CrE7K zN%cf26P~qAo88W7Gjl3WcU{V^8G7Mab*YkRe=>0XBJq}E%GJY>uRrv@`fRdm6E!5} z>50O&=F{1|bLoa0tiIko+gRPJ4^V-Ab%*7ZYZan84;e$&11!_lO$B64lVqq&yLYT z?E6Q2ax;yl8|NAR5*2$~+Mv0*$H2Ang4g+_bU@2b$$fSo+6u{Pp+D~*lfN_T91MhR z>sjz*#r`S@?(c3BS!|fX#d3MSlE+P_JH%T5P-^^PM2v^{_1+77Rm%a>-Mu>MgvCMW zDL0?2^^Qh`%QA!dteugr&r1o|sY?e7s;(D52C9FKRk^P8uydX0lW`@sXJs^3$+xTM zKZ1Am7QY^{nm7pQSZ{l$d6GDu%W6%`ymxp$ENG)Aj;>f7Dp&UHIkT8SSQ0GTPY{+s zb2EMWlOL+=!g?b4mLe>1o;$w9KUDADwl}mKI~Ip8@7BYaYZ|=vWU*xJChFaW+4VCt z*mYY6tU7Q5%>Z8N7^eRMZ}G?j~6pGia*wVgFcoUQn0NE{@P^P3n{TlVt1opY?V8+g7LTX1+> zr^6t90;0Se%YEBHV+>kvWc~$Pc_iJDdgqA!B+Txx@-YmbRI4r`fziR22MhGqYB6F>W8(&WAruTwrFQ@JadL~6a& z3>Xw^#4J%MC)cjmLaBF)ujzS9nF>!`yRsl+{jPZ9vj#Tk8%?{?;v&~1`);sd_Gb+6 zVp>pG_Gs;6_PuNnb%Q%eS=Q5{=*_2qr!K6jQdjQo_Ewg!a`~taR(>oiJ5b z**wE3Fiy!bH~|iBg^BQ?JbBe^dNDZZ9d#DfW%U%`>{10nk$ozROk}~(vz>2@db)l5 zgn=p|@M!gq;st+n^E`MJni z-+FWdUiC?e8GNbN2`(NHBjJ4vEpR>Ogl@r*oqgFaeB*OkcvNO8aiPeETPCDV(ieu8 
z$`2}Dm0na1pEwMOjd32|(ba#oeUf_S>ei5~vfNESYre*Q;<=TtUUs-(gBMtT+z?j1 znvyebx)IPBxt#+WJJDs3Xe(G$_n{9H@-TH&IWyx%Ce}rqTq5^ZrUv?A=c&`sHu;4- zQn#UHBug>BI18jV=^n)|hfxLPMLp*Oal}LDO3$vvnoew321g-NL!L2CG3Y^U#S+g7qY)Z?Rd*sn3$ho`PctGF)lyv4b-NP_(spnlj~ zMvpZ>AK$5p7i=L-pFF(kreKXhN*RE;Tp26p=K$C$H5;YJf=fad>h6Ek?7mcegmA)$ zBG~$h&=}d0LF8Lsw^N^U?o=tWz00Z>x94l3Gjo{cX|+UstM zBi5BtYnGW9aa?1iMoZ+a<#;c>E;1x=WAg7;nsRjyrx$)RR?>Gnv)OMaMZSC-eySn8 zLh$a#M*1jM_dagm&G-4GDg2rbaL#{pcq&W%XzRBAsByD7UUV>+UTg0~nD!%Iotq}5%m`K)rExag zQ1om+23p*Y5@~*ZMw)+b+(ko|t^$jx@?iUqJrUj16x;|dC@8BzHZbH{r z-U3+mJ5x!4+QRl27L2+SNW{54d-(KdRK`?J-?N@0Z>)`FbiVLf@<>@LY9BN{>h3C zd-nR&{19rDrD%x-6DeFS&bYn!PBT!Q|D}M>Zoa}8oh>yplW>-*l-hFScfTijnQoiM zCrF+7Pbj@~U*gq+E91T9<97(RV7EqPKxPxy+a9zeo_-Ys50~eQpMR(+pkb>#pzBVC z0y+r9Y1drFaE(=(O2$)+GemKBCW9!%spXG4qelzX>5|(l@$wX76Gn?D)Y)b9gf6Ni zB}91+a&NU**XGT4N8+hTq>q4X`EjX!V^o6eWLLY@{P8bm)Z_ySzFm39n@&qp0RqAc zU7%Iq##F;Caeo?Q_!&~XF;%1=e5n8k>v3+Zwd%tX1~T4MCzKLZ+_2dsL$k79kS`lr zYi}zZnVpYh*UkvA*>{1gjjkY$6^>SU+26Y2Q@X@xj=bmo?B*NIo@{sBpus?zC^eRW z`vuV9hgZ5{>HH2W(Vwl@gUNDU4w@|9I}bT-O_HC$mYo>^`uVG8fXhe}MCA{K^8BdN z@ni%1k2?u^YI!VsSUy9+hQ)BlBnx|dF$UxHlDKy*G}0S0(+y-T;0r%3xi$(B2M7s^ z?py!*3AK6^hjHVJ@0A)|3(2nZRr+*L`YLJZ@fZK?$5`|_Oc+QIB+LdB4`_n|rKGg4 zxsrQ6SV&v3eaRQq*T=zxg_*z=*t!amQXz@CKa}*yU7mkV(a7U7RU1e;_s<+SrqkP8 z++Dls^AIh*1w%AllVK%-yG^x|Oz=^}_qR-Ot=Pg|-m%!$rSIju1Dp!H#uH|jaK#v< zuJEt$tuht&e-|^D^(^xk`Qkym?TX(-Y7w4oc5c+KR&i051H6m(t9YJkAL+n_t*Iu@ z)yk!|fbu2h=5N;8{t%;=d2dbf#bh%HMYLNyoS)y8UKUEV6fu=r+|%QC;(7ziJEz2$c7VF9PT=2ywq)N3fD>gFc+Qc`|7JLH(V*#LOSQx_ROxSR zWlX~y_@vU)uiDh{)%j}_J{{cH;^z6A*wOp!^S`R{5u#c&^Ly5UeGdJ^wf1Kt6idac?uke@w0mkS zFzoBiZ$YGbfYrSS8anU3vy<~BjLrriuGq8II+e&X*X?lUjfkggYqC{)_B9(i#J_ic zyX8Iog9pxS{ENbc5sM@Q5X{2Ws`-1gPS$9sph&Waq<-)qbiZsDksmQ%I9h2+=XgZ% zJt*v?bJ`o5gf!<&?3PL%D#N^3;=6F4#%h@+#+9K;j~UoaQm@D9{@YE}ce{z}`7f!! 
zq8+u}P6O!RNzhwE%q1x6Sd0n=nv~cP zF9b5>|CE%Xi&G2p53AIb1VNH>*1Ilcc${TeT`-NzFU6TAT1Yh!n@-8Erx?7KGfDy= z7)*A?WFN=3v;|@2oO$!S^GXHk0_3Qx(;a+*Sz9y6n^DSHax=p7tJ{WmYyk=Kf!iZ;qV3v`i@RlaD7WJ!4hRozx+KqF!%`YZ3^8-ihEPDmg3Uf+6bOxJ~U;< z41f#*9b<=rIoorXUuf1w^kqdP^k9euAIc$e(~BYEQ|kTCuKj-GF{{1u=W@oAq$cCn z1i=<#Mj9hKIFJ6659)%ZWr4`^C!EHDbI;E`A(K$$H^}zt@p-Wa0GQnzG^%bBQ&9Z; z*r-!tV&pKDka(BTUd@}2b&JV+_Dub!*YOE9)5gPv+#le4O|?)?I=R#SIqP|OL`>|3aK>IfMyu6MG#O^$vAB_T0M1?wj&`mqB{AC(6{jGqP7Rxn!M_# ztbvH_YI1ZVXPqN%QVnUjIHg*Qr@;_c1GvWnoz|=3QjzPTACY=+aea1dXs7GgTCGKn zX;!ovn-yMdtGcDPfe1&_M{+ONsB{ARSQ1 zwC#zKBLfMST2vJ`8f!sdAcUbtXQ7OzKv`Y=j5TqqSjFc9C(>UYZ1XjG1g1BZ=HO&e z3X$er$!;as>9G%g*x;mC`p0%GB5+&AhF56w->o zjrFxuEuN|&yc*>v|)g_G&cBp%p*>$0dH|yPjzDv=aO{ss>7WC5`-x+0K+nblfU%>G0E~NvXjwC+4Hb1;b)1A~V z4|#&xnM3p!ozMI>2tbEf3wTtyD2ox_$9j#XOiPSANrCjIE3o>7m{rCN`m9X<(zfp- z?gy_eOMid#p9~*M#=6B7lB&;K;qN;#a5BM?PN*wOU;l^S6@PTaiF)5;LiBjo(7)4& zw&fU{)xQVd$9(}_L;4>DZQh#ic2&;0%LPI{D|yCxPMH=p{C3?LKwO-MT7>guq^+WN z6S`j(J^M*YzC(+}Vld4PV~kkPQ{s+nR2ieuYlz~;eQInF@dun@i2^C2sbt;zJ#{IF z@Z(V#te|Q6ESn0+a^qZ0FwgPhg@P6F8wjD!>hhi99q-#8i%~bIUl3lSYlQzmt$jA{ zA)~p0?KNX)-&lf2V*4 zz8}z==C5i3^BS;(M~P93U=tXIO@1wi;*%bt;tEG)@3-N4>qIZcIXR0r|XZVs9|{vV8MyS6qjEDVopB&jWk9jex>5n)}l^TS4hRN@i(y?i*5Kh4Gzl< zg$ND~%f-x6K9vlp;h$UNtrv7IYme!`pAcHC-cOC36Xz@RaQ9GdPW+^$1xl!Af-#!) zyq{%7{ay*gtS4wx)^lqJDHkV5$95O8KrKaF;qwQ{eQ-8(3xvFiMd@Y>7U_^^C)6`T zhTnf5s-v=UX0K1x$u7v8_8bs=ArRL>RYd12p_W;R8t zd7<4D;ZQ+L+V&B#j!7Vsbky17v(r~iuGvDwroS3W+S*#_PW|yQV+G>Xmm~EDC@ia? 
zt%c}1KQ7dXHp)DDn99zmlBYs5{7IrEw<1|a2Q=L$s?I5!{>&8QeRx?fca~WBUb!!^ zvZ4KmPQ3uk@jMMhmKPc1!)@=D+2ZkqjFZt6wXZONhH~E9>Jq!>lvI$e)l{CjTxL-2 zqhLPheY>|mD=`aK@Rr4RZ_(lh!BP8KozJAO8&tGqLfG;p9Sx3s5QM%so@!^c^1j~_ zWPWnk>8ZN=GKW{2@-_-Zrd37?f28&7>GnT?;VwS3Mx)o=gYj4_!hglFlsoQz&%R_Y zcMvz^cUZJe@~S8;bRnSPgsbO_rBy!PRVtJ_yoU1z`ty|3Go|4%YK`LyVU+$XZP6E{ zPmkwtakR~e@JGaS$@lWQ)cC(bX|p~#I$8AaEA!p7Qy1MRDIAaD-L8wb!bDBVfakjmsW0TvHjwZW|rUilQSX$ZhN7F^cyX#S8o5_YXp0O|b$ z_zuQ!C%KS+Q2o7_uX6mvs8ev_`&oHPG9&tJbRSb$Vhv4GvM!9~-#$e;y*{Ok1RP0~ zan(4iBc_uH=s<8}*00k38ypvlN$$gs#vb-yE{Fk-SrA(_zqscQ8Rj{tvQefDW1@x9 zhhb~f{g>I3G15Y127*}`XrbMYvrSU0h~bY!93;I{+SFT;eyV^Oyj&Qo!_bWiK$hs< z1(ufA8AW*~ypov%6L9*ou)k;O6&(21l=#{_x>aIn=48iqHERa}5U|UBX5W+(blRD` zL5E~IOtbLN;H!vL{naHHdrP<-LLAdzNYJJc)UDOFzpVCQS6)2u$8c))pl5z=)e zDY%eP^wSZ{!Mc29zHuUftoDh?|sqdtU+Zx^hH@qv{U7g9K!ZuSFy~D?7hrHtZs`1g~aFgr|;V2%;@uaqoRQ^)&-We+ei7fz~IET2(q^T9g>&*q8D0Bm}apZTw7y)sy6mp zkoRPV@8jZc29<0cJpj1sxA{m7%+3Cp2l&s?&8OD@8^U9o*5Wqc3~{}{tnaZ!&VJh8 zAIoE%yIo<6XER9hAfhO{nT-$vhntAinCv-gmxrC+oNPT(hem{r59l!heTR8m-ud<< ztjJ8)J5kQH!UB6Vs_4G%wIHM*<5eD05{*UUfR_(zEO{mtXy$XDR!sU*O3;RHLqjbm z0^<*Ry3D!LN`j3R9;g7TI)9|Blv3?-yC&|LDUtWtXKjRXk9v`u_E}LhhHzobg?q=hu97gbhqbofQE;Z0 zsVm*O$kMNTd4q|+mY?oVRy}R1Sy#%)9*`z2(k~FNVDH@4WVh&Fo-ivY^Bp~HrF=&)?@C_3}c z<6>!y#$KDi1cQLmIK&8+Oao*2DwNR4%ewOkAfIj{9}a*5Y*ZAx^V82}P!dB^Zq23xb@Ez{^wE{Omg!yL@LSz(D z_=ABgcr{YTT)=Ma1H^p?nz&nF{)r3im}Yt+%^%wj*AMIFi?CIp6zEa7PEFqXyk2b* zrTz3YMmv84H(%zjEQX{5i=43-X^VnwbGdclyHi=!x@a+YJ*K?2)6IrT7c~Yl+_P5G z`53!%mGO0U+A?#(U=E=`V+Ngp47l6YZ&Y)Cw*1DRiv7StQId^EAw*D0n8xCenKG!aQ(*mJ4A)fhu)ie=)|VmL%>mj`7Q32Z z{9)Wk$vYExGqS2yYT`3~)X^gPv$zQl=DaN_YSJT-13p`Gtxk${0twuoQ0*q6y@ z1U%M-fNA{wB-%U2-29}3($d=FXg{QNve4*OTV)8f8_exCI@{iAE z>Owl><^96IT;Cq*50r~zRFbW< zv^3BXXDJYaZp_aKBI{*jJT6vox|E{)@>zAIzOtoe#WA)Oyw$3fLR}|yV}NQDyO#5- zaY+ZL+6;cR8mTq{9XGqq1R~L46v!kI2Nn!%f*z0yYno{62UZ;#F|glPU>in-oN@Fd zzF?M2+7WOX4oGT!oJB}m0&D?KTw*pkpl>EunyQ#lKZ*J7LKHfzL|v}z(H$1MOS!T` zfu+xurOi2ig^7B72$ESmE%Argcbyl-k>T~yuKaF 
z@g?8G$h;8)5Q(-R@{W?5It-kC?#IO-l=4!~LlJ?Ci*mM}3Qglbhj#PBcJ38e+6a5G zL5)5DC?Ni!N`mOJCQGuseSjgYULU}AsrDRqnzsS}VfA|qi=gCU;$BIgBTeWTA#0)8 zg1B1ax36(e_{rGP1*z|$KJVJpS~29GZ@00bQs<{R?VcY8b3Jt?CeEWn zQ$LlpVzrg&SN1jvNljmTZ50T}{C+$>DH@465mew!JKhj)i&*N}#R|nNxYuq-p}u-~ zTDpyF83gs*$`8vTM0a;mx>r202BrnZO=a|c6Bm|17Chji3oV}mVD%WYqSb-E+_0OU7xY)cg){Ok{z(}VC9jI@<>E%FAM^w~pp7-KUoShdyE3zszSVj^aNF1(=(E|53mR}3t?;Tp1r)g6yz8L#8=gBb9*3tbu z^EDJ|)+`5O(#xva3bdQ+$#a$!@QTVe)wfr_3}x3uP!tGtI5;2fSR@O2a|yolAfhq~ zlZIZ7WfG3$wJN1V>4( zC;(30LeReeWLysFg#z!G(G&tkrw4Hj&S%%Ral6P6@_8Lm%jC$K!ZPJ}0F&)^G9OP~ zW`uLNOoFs)uW2W=+)r0IdCRn($`%X&vGvatN}P)JF%_-HJHG}^*62v17odT;nKtDV z>Du!@RxZ?84Gt?<=yX6zkRjq-Ll&vkJFeW)^Mc&(sU87u#&{7#Q8sBCwQ7FoJuXSz zu9-WDHfhwXvQK~_Gu7mcJ6@@$ocNGD8@)Qm+-az;%!ePb{^dDaL+s8Qv2 zTw$Tj#-52mt9d4DX@_WURB|lQ(p(M?Na)|(M1EueOsW4nv7tLa8vG9?76~}8cTJx>(s)wi1%i}O>#$Q{9@uu z6fkPCD%vhwMQZ-1C{4z$Liua2A6$z=Bj$@tz`!pYg(tg`loKV3HEhHu)Hz{{>>7%A z#IMIKQB=YWk-&u6b_)n&yys6zgtJ_Hac(MlKS7ovFkDAnCTD!7#J|`uJpjTmUs4W! zg!FMLzC84jhk_Y(fW%eP-h@Z+A*ji5gZchFROSIo8AhK+IquN6T0iHiiup)=-~O#^ z@wBJ7Y76iQHA(BYjVA>_6Ucf@ts(73FtZ_zp&iW6ntd|J^Au#^nN@OPU5S1Ge5`4uBC=; zu%W^StOe+6@_nB9UB;sX&gMX7+wb4W@l_t2X`fI!DNygH>!Fw8S-1<2H8Dczn{V>J zy8nkM7)k(4!9X?3P{Ru(gbaz%*32E^)GV&QX31x-akoejN9$%Ggt!CGNH7kt%82oE zmoshgEWa;$%!1!lSZ<;3`C*#f`?1Q|*+3bg@fmFUfOtXb>l1uGa!^WuRG33!guD|t zJ}frMYopWD5k*^$1)YN)se ziz~*Y&H|w=zV84Xgpx43fZ+Zabf!21PeFv6INgLObYWGt(2Che{o9k`1mz&6G4c|l zRDQ>(3x-_|p9=x@0=-()P<)#7`1hkG$hHvFIMNwYpYls1{6Q5j+BTB&mUc7hs7H^Liv zsn>Y+KGf2=HTpL;oExCE)RpK32vDPGc-B_?nr%kLh{usWV%(s&siye+yU-9YAtOMy zk%ui!*&l%t(NJWiy|YI2iqxHStZn}>8n4k>fcte-Xfog~!+I)Lhmf|)By^1r9z#Gt z+fr>t=5a_TMrms;8EOJ-{ykfV?c(#n=~E#7{NbQtRf+IosY%nJDTV7sj@7 zb5%GiK07K5i2pUVfVX}|QOevLpd>2x;+7x7^5g6xOfxb1ZHCeRk$eZAVxI+ zLe{+ZLcNGt-NkaG$|mZlBRrR8bMi4U@{e{QorkYMlaEfDL1sj$40N3RZ*Ke%3dGr` z#BbRm?_&WOQs~a`au$#=NhR7?#@gw( zssTLGpg42ZE*D^C1So-*a6g_Hxb)+*89g8T&D=_$Vwn*n&wjmlPzD5udqm=KdUt_> zQhO?q;XdnKj=+(Z}J?J`H zm31FC>ge2u@J!1Dd%vG!P@$%i{xW1OzV~VU?yuLuVs-z%miyOtU+4P0?PtNqaO2O) 
z)#1E47_g!6&HXk}Cx>bGQTowGV+vrK&yrk(JM-d6fXy1|L6H_*rprILTN6lery&@` zWytbAt*2N*><{f^nOQcD`#dOXx1Pi`ev9q6VzwxF+GD+R zwy)OC zVLaaF2mSy+3Zg*q^A#&alnP=7@P()k>nu3XnU~$>)6#J{{pPo|nQyfkw0n#=m~}OO z^*|rGL4(x`#Feg~z=9hHsudklYsSbkdk-x__> z16Fd0tx|{<|L%ohKkFfWOmdG9)o#S?7)83UJ=rMi?RFbc21t$^mcQq5(?=?w(l{xX zqlv^ZI6MSA>PpFLg8qbiQ*XAoOmdlccMG+5FL=yQ&$W3xnet@glRZu;C9v>FSN=Si zy$tk#S;(K=Hpu7oYUcZh7;g>O@mVYb)P=h6O32%!UqGHsE#U!oI=WC#=J2AeHJ|u| zOTsFy!&j08AeZ4LS33ly%IK?s*#{Oo6*qCL-H|mwFQZ29ZS43`6B)KSPbb_*r?mht zZ zYE7JH?x*Cnlk|f~$pd}&WeDL``pTsj;bZ!On1~FZVm@3>hy930l5$^z8p3=g?|~Yx zV;n1)2-RRomGlmY$j6}r$8Kas*O(9RM66U7DBU46wPt6nyl~PNvWcNL7ssF06;ob& zy4vOeGecE6Ka>{gu34fD#3O786?BGU5Th2?MQdKZSR@($(qH+PeS`iow?=Bkx<+e{ zQL>({)>)A>qV{i#krf-BUzSS^LZe$7J^&YO4bU?m3+FZ)FjT2rve}kA9gwHU>n{ZQ zMEuGDBO6Qw_v&gbN=5U4>M`Psrbm(6%DLU?OPOTf1I_)V6=E^@fo+7t2^ZpcQ$ezM z7OsYns{!PV^NIj1rBAYsj`~(Gz;+>G$x-U84O#bwE zFiCMkF$2oF;;XUI|H)^-AZo+IexI%B=s*#1x@Q-E^We`!saRU5nn612-`Mrq57_w**E)1S5$lfrq-cw9Nof5!Rw}0RhgLonU{#$=e51yk#gXLc}wAtbP zC;5Zo01~iBHrk&OOBRs&iqMPJjw`EvI1DM59OJ_M@Ffed91hpu08N6?u}@NJI0_%9 zU;dCxU{94Sh@k$^?yz&WzSj5k(YiP{p`dV=%(V8e==2HPvanH55P~*KRMe%(~4l7;sVqa0GCxs6E zxICVJJbr81TVMB>BCCWKl_1GQHGmcMAq)_Q7!rsT*c9yH^0X8v$fz3XHc_uy*Y)#}`{0 z&6pe0O76dpk~)3Qk~reIJ`BhKOeUwAMS<8TU~?sL(|vuTb9SV~3E6mJDOx}hT2uy7 zTtn4boQwqJpPl%j3nvU{0RV&W9zWp^5M@#I&Ine?ze!Kyq}JidjgI|bPe5ihQV!y4*3~obALzuV;@Dv( zR^0>HHM;MHB_F-iYqd`R2;!dm;Lav9Qfv|m%&96{l7IP+L7FZ5-=G6SXN%E~qF#6; z%A&=pQXLPJTpqP}90rIK9akNbb7Msb2%QnQcHJpFIz7ocS=tK#mW0u8o$@)d&`>kb z9}ioX`kWt3R#lU6*<*xuzryy6GmM&^P%!6JnZ6>cptBJmF(RI2*KUl(hAmPgaTq@p zHBP&In9S)eR66eywIjD`EQZZT{Nty8dYqVmOU*m;{`%lSo|IKR zcxYPXV(j44=+xK=AgwRbDdg|hZA{2lqOu~hyZ$JBk?lgHDZVMFmExd*udWqO75e5ZO=3t>^L=_pn904@4$ABjAbR zZ%Sp^1(!X(?Q$Zw-vU!9`9Xl5*+#Ch0?0RU{DnB*%nyu~l5LR4R>w?Z^}1|o_rhaV zx;fAqC;%Yu2ET&tJzKNgUUEX)14u7ugMiHijau|xTVWBp96(QT{NxI&_mQCU2Ixt# z>T4gA@ARz$;cAf-dDR>fTU~~5>Hs`bcBr(wC1^;^-A2%oTEX) zTAJz#Vr7ViuhexIV97Nv;dkSI70teb;(BA-qU7AQZ5r&v(j4Q9t9r2|eQm>?ISfb_ zsSRg!e 
zrF@7bfdM<7E9Zfa+D*Q5`J5?2kop{s0OpE0Q1Ckjj47r$6(~Xt>J@9`5E9T8Ohv4| zCGL($y@zcJtzB%fgbLf7NVHjTo5W@;A7$HA_PJzzMpA6heHBv>%yQhy+%dr%3Wgaj zj{D_oE>Df}Xu^&IM#V4f$ByxepuZ>fE&(b;pati|ig0hHtI*1iWk$_A^NtA7CHJnjP8ZDIVs>zRy0G#s?R-xiL6nqI5 z`X+Xr+`p|cQsLmE*5vhIyrrGp!15Vdl-F6?K3Vf-h(6z0gbi5_TCNc&f;Iu9eDOdn z%S0HN#PL4LsQKiGe5=Xf1lL8@oshf><_P)(gwhWIkZJef%sxQbDxe1qgviELNr#;6 z&ttjI0ol@&0Ct5*!K{kCpoJne!Ef^o8tMh03y$&Kdi_=}a=2!~Z@H0%fYvSk;ntYr zPdE33j%ufF)siN=DMl=UQRjwG2az{%((8qm; z0b6_`jv}NYUuW&-Z5TEix~D#o$5Id-r@{NKj*%WSGcZ8{Q1eL^DR=u@!Y>3b99G-) zDY$bpidw2;pDJE6A46W_?vW4=rIk`Ke2&*K-H+(! zF-c|bQla|7m zcA)3qlDAS){TH@0!OpUj2an7+;QqD@p(O&am={WgexdAHAjb^b0sLlQ7NU)VsJ+ry zJ+p$1plbnBQThXjPS$5~m9c;Th+rIDmuBgjFX#Iq`~7*{*a8PUdTjGTm4YnWqIYbW zG8wki@FZaTpje|ECbbq__mJETqhr7X~?FE0pVZ9r2D7FeA# zuQ%-gp`(2?Qch8C-McL=M?8|5Up)P)sm|urY`p+-c-t;lvW`kLX|>VuFctl6gzhaa zVb-r4Z}XL8lGM75Hj^hnEMoAm4Zm4BIg95#!bMg}zd%U;7fB`Vkd#Enmv1@`;E#d= zDLZtku~DGrV^uf`H;EBKx6J;UvT)+o# z5STJ)aPCsj46^l8tGVWVOnv!rPbpTKXhi@J`F`5599yf&Z6i7Mw?eDPDP#i8wpU_} z*&tJ~Pqqe5yf~mHkISASPdNc8X?S~9rE9l_VrfUUawKL&Hx+--M62=@DH{bV);&TT z^9$L$Iz*k_Kp8=91)5l9dj&Qurr@pAhEm3=sYXR$(|fWe`!A<~kw=Fj7z)#d8GtVT zJJf7^1g0w^>x816!x6*r!&7|tg#}lrLp_#7N=~tOdnSjjoB*RMs3rJIJ{j%}+17Zj zy611T1Mr#UA(~L&-bI!uxb;ZmT`&aiikykOOGh*#x|Kmh2(Cob%JXIid$R0B9FOc~`zHhH;GGQFnvrHp-{_?wiI1WPBY+%g^c|2!eJ!~@c*V-96m zOwcd3X403~cgiG4AQb=l64&p;R_azQ`rDJesigf*kUd7Ir1#T?#Gz#=NVpL5t?mk(#$C(bn#IF`D{;uc z_qwYz`LHsSg8F=)?3{X}DoQ(#27dS(U~?WVwujKQqv`UjW9hL9Qx2lxi|-~m7a2b^ zjN!$Qa$3UWEYHhn;}bdyN6^68jLc)H>1}mKjg?K#3gw>4mx+btwQTZ^myel?`n>{} zi1!@EyJD1wz~XRGjh3TTHoBe>c7;<$X2lKY`7$MxFMY13)4J31Sa1s`2M zO(TRXvOK#1Ep5YFf$StkD`m2Iz*8n~-E{^`JzYXB&mgQ?!S7zvj zP7&+iL#!_(VvS4Mw|Qy?d7@pR8UB*Kyct4&Z4to7>iG zu~1&}wQE{;Jjy(=wf!OQF0Wj)tbQm-QJ19<>hd&z| zRp3=%S#y=4)t?!vtzE3vO|I zw$WBv^pzyJ0SAc_Zs{2LcjFj#0*|j;T9nGk14Jy+9XCI8!kM6E0J?Bq-V~Rhw7V8N zTx2F40B<&sBs`>(c=x?$KT63yl)99aZ$z8Y6d#jowDlDVwl!YA_u8bDiIv7l z14*qZ@1~DMy?S2+ools4P?g{&72LDWmk@e$&vvre<;9-}nm5#qr@1H?#m7a(ZFUm4 
zaf^my9s{lDVa)koEx`X?`u!Jo?8b|4WNUm+q4Yf={NG-Yjb#liT{r!5oxPX5-NvZP z<6^6kL8aqrXii)PHLJB$5b>gQ#T?yV`*|abRI1JSG3jxx8+v-qnmG{Y>3ctw96(ET zmNpk|Q@F?^gfr{);ib)}K%mvmRg!AlO2O>Xx^$)7TV}S4acFmoG37;`8>40JbWO+v z@I~}{dYbYfSIm~GE^xHkPw)8zSP+QySJHI& zHtQ~wu$m~4`T`H5vQ26tS+_Nvn)c}Ih3C0mrm)70fj51M`OuHy9t&KrjV zFGm1L@i2g#0)rstB%!iS$bUD#71 zqI7z+-%9hG9MD7*P@yhtTbdcYf4t4#?q69O_9Yu*(tL-Ig^U0lQ5yPP0FFv6FIae? z8rDgnjc|L?;(3<9q*3~-ja#X$I=6h-TY8%^kB3dOkcmWt1KoI4OsPn%ax?q+(nXi z&zO~FvQoe#tXjb!n11Vd?BGUtu_mA&sX`(OzFL~5|2&vo(c|>pA2t1H9!57X6;~7g@4BZc9=A=*{T)k=Gp(TQpi+n_{kLepHwJX^Db%RdsS6+ z3!Fo`f~U>I|B{`)JA5{y;@-M;d4Q`<*>w;NwbDrFi+py{on^s{M0U2X@Us&pfTd!q>8vCCiJDe0cz@MPWuH8A{d3HO%ca$#`HS805Ss~Ffkmr zA8E2BDRut11SIiVW7S1Ex(h%jsbAeqd_Z0Qy)1gDYDXF51Yip`u!k&W3Y*ZC_brgL z>=eAFk6$RI#{ilp=o;!&t=NZ-VsSXUgS5SGW6*c_yRxLkmwl>WI_b-I+ii*K=L)$7 zVeuyI@#WA5pR=%$Q#Iom)aN@ri40 zY=X+vi{LMT>ML9}vuS`P^LUy2-H|i|``p?z1I~a#ij2x=aS)I&t$8nA-DTMkG!R3G z*>s~ByFoealE-+VRFh&Hr_*DRM?;j`JRapmV`VgkMD~75x@gI0Bj=2z4mTjaZ4D+x zZ4U}VIiO?>X!NOTjK&;&*wQgJ`Sn|v-^9Gki4gWE=IN4j*65;%U{A%#WvjDZNV+TI z=PQy)<@IQ0mp!@0AG{v?gNyC9k7l$&e_oCyNV_{GPf9l1?0crzqWO!5BR5I-sEN&WvotZ^5J2_sng|EX%=FXi3I=!d$T4j5eQGwp3oRItzi8NJ$K zdb>>9*J8X07qrceigM^%aY8#63rsASCDOsrKG2^?f!_{jK(3Ic&71wFxDJZqvDl(f zzlTshaYT0Iuv1SreV!wvZbTdjV|X_GzI|0*n4`3TW9jXuVNbcZLY%iWwOis6{JJZKDw1qLL56H#=L^WczZGBDCE#~VcZucMH}#NSjb?XHf=jb#D| zM&$vYpxFVJ#W56>Ppe|LhL3}bGM`5KLBg(=POW~z^!0f`#2gtHXqA{zIeAVQ9A8I= zeVhdtRf{99hI_D_gN2$yIEgWIf2MPj0%VqIM>-vk?X=K5bNj@K3=vtt7T}>+q+0uB z5IWu9tZMPbAwG6B0ix=T4s?mW##KxV2>Q&pc$WMWdn64YkmTRQyxn4J09 z2ZY)@0n*v71Cg8)EHJEZw7jNkt3( zhSqoL##i^BYdymOQ0{$wTud-5N+lq2Fn)&6UJg50W}{B6RgO1S)$B(M?@ofBpy5eX z5Rdnv<*4Hlmsr@KQ~Z2Z2*CL~ICz7232;6mH6Tik1op&!TXe50fOgWm*FH2^OZun}Yl9JU1$RaH1Z7*Wz#_2RL~Bp9V=TP*sLGT;En*v@v! 
zNMNQaNCUz;#O2fx>TS046Fv?jRX&l~?LK z>v@PA&S1XNU!iMii2u+=%OunlB2@29Ood#W8!jkPHytJ00g7bmy#}k0RTe#jA#3m3YksAkuOMuXMvckhX6i_pbKo|Mvh?4kgt!v~u z#iteQq2NMAdxfPNYXRS7Z~pPJ@4SJ~hPrq;d8e}{Jtx{A;R<%ISH%?Zxp+YE zGaSb$a>YqxMl=wdph!xE^q0I>*;`~uKc_--1Fv+3j_CrvLK(k=wY!H)rs2?GJ1~*9 z-~Yj^ehFT@&uewuNrf_J)L1_=?Md#p_m)j79<5;Pgrr(DExQ*2idqdx>J~)`v)`v>wQMXp3COOV zQE!Q9?P0~}j%;GOpCV}BF9rn%S$5-Z6^J;y{stq4FNU;Gd~1dW6kN$hib`X7U@7CJ z1JFI$cnCp(fG|MCq7jT_Mxh6K)51!Ei|CE#uK|;osd}_tjyKh06E_GsRqf3^0XWW| zTo0u}{J7v&ABjwna{f8!C#JwZnTS^?gY$^~CFxoi#Pc^l!RA=F6{?+gG%uX46GRM2 z7n%+klJK6}Gu|orRT9RPLrBzq|5%cM9F!C#k%sMJ+F<)|R)AhTE&YG~L6^#v@0>R3 zH0;FiJG$z_mw)K0z_Y!{w~{~pa2-(YbNjvTis(E7W5J-a(<9!1(e9_YcMi6vLt}Ev zgrT~h8)cS#r2rD)dgeMRFgjkEX8A&&hZYml+wmv;wv?7z8Bf2(CPayJH z6}I>%(KtW-k7;Z&pWs1^g@Hn)Ab!WSKN|EtvxfDaeIAV;#~tg1dM8m<|Gq|~uih|$ zhBRDCT&uxq#Pr-hP3oQI!V_xuT{1rfKhbwImbtVrJeoiD1Dn(&6i9K8*kO@HdvO^y z_V%9kzPGcOBAH_s`Rh`=qeqnQIE+R3T-C0kOc9-O^K2GzU#Si2-Hu9jwjKI zSZClxzk18h5$ErbR_yj9phj`i#7k2=suK zXr&%&0x+okG=RU$DxIWCaWM48plU)%a@*XHGkBB9&HVPnXJR%b zbsfA7d<;L*6X(Tw3?v%nn68ZYlS$}}iY4l!3O_>OFL7C+4n*-u%hl|OMnG&)?&jM& zr;Pyv#2p=MWW)hLtTk81q0KnTY<2NK)z zG&7t_q;Dx7@cvzkfvKILa?wojg=IuOL}%Z<*P>+8x$cZM*8 z(G;lD(tqfC_wm&@P#s?>)1R&ehEGWP0Zk__{3E@bH^0ePK|9T9j7aQu@K~2v!{aRb z&!I9NeK{d#>egJCvYy|BWtrs@87i49DZdI@d0!nw82o887*9El5k za$z#8T+U-^xC8~=_sc({1^8XZkoQW`8zOKsLy_i`H|p0fk6xWU)zgtJ2kdStU8^2{ znWy$2cRbe$+LxETz)uv=snKR?uu++mWB=-xoW&8siyVHG;zZW1AGRovj3t&$uaRQm zi58E#E99|g_U-4cAjbZv^ElS*1mfk5Z?;*^?IMg z((H=sh5lR3Q2V<(+3lVQn#P@X3^#f;2Fhqifb@T~7{wIxxhA@%=Yg`~Sg1Iz!aD-z zJPBrd;wKSsG}2*!64<<>G6240?|Zy})oG}9gb*LaKTqO`8d`n~Y>tVay{-S2pt4KR z{UBVg(R84TJE6YTMoCrDl&SzxYHX|%vj*T-F`he6z(d$-t@n4M-N*VSjkday^T8h< zcu}EOcTDRQXAH?5+C8~r&-9rQD*yk>|Hs%@Mn$=`VGoU@3P`8YB3(m+g3>8n(x8Bp zAT@x55`xk>fWXj5N~0*c`?{}qj1~@l z7!$;ZHxY%}omiijYl6SSH35s&cZUbMe{WrX{dfBd3IT$Jp;x1HdeiUYinPIn0#RI{ zJ*rN!|5hp}4;OGreg?e9cTkThM!HJ~;F>TnTK92e{`JuTrP|V|P*ZmJ%p*J>IV`}N zN9imGlK6(dONiwIb<+%`eU~0R6n>KmXy^7LKi(Hi3&Vf8<__nn(W5Fix3!uS0sClj 
z_nnPrqOL2Mkq*xXvT=w3FB?Bu*cUG|*mLp0v(Eg_oMfCjJJdURbSUMYhCcXe8lrXO z!vCE}(xhiB;EvB(_`2l)_eldh;HAM@t~B6uP%v&MjbguCQEIIN{^UqWICZ1{X_QZc zcTE4g{G+Q$t()IQe^dxw1ChD*K(3ge{cMQ1aa#W4B)p@Y--b}&gjMCSqu5Tgn7ZAo5%(fRvT6PclMe$k!C1pcF z0n@qE9OV=P09Y!4w@g)bdY@^CNO3gPx< z9}lZa-H~?bYy9#zws%>)W1js7@kZRY`CH)r*MF#)0q@hrW2*^ApHoKz7CPq~qX@v= zI$F5)F3>6A0TQ!36gOQl)Uy^v`EpGj1(Ks-!24WZ&%CaDdnseDoh^016*O6%=;#Z; zj9%kC?{pzMpX~uU1D$+>Z#8~#ngUj{e2R%|ytDwGwjXy==6%t2h4k&jMBRi0>Lw(X z9@YQNI|M}703PrxR4G2NYrRKbD1ud?jF*twRUQ&1gqnHcWZ%$Yzqj-od!Od*HeE*H zGfpRPE26-rcys|nV57||*VyU5>4z^|YLc>zb(J|f z)5IFUKJACsIA0@pc^{+ps>h=pT}}TP98}Rcp8{b1wUlRE#CNfp4A30lQ8}SLS7!k` zbJuB~Sk%l5iY>NZsJ}e_e(fDAWQ?t2CR$Qe)9L=#nt)jdo}wA^C;Wdd2UZvZF!x<# zAEQlcFjT6ydw8&M0#;1GIsiA))=6JY7XBCW2Zm7LVBPugq^_I>*%g>J|;06|$3Y9tj z{QOll8Jn|z*~sa+gxG*4zp`d&O1!Qg%I^US*6=Wvx! z!uKN9?`$E|>7)m7zyOvQ=1=9s;Jy9}HmCXbI=UB!H^DTbU+2ay=r|+N(nrFg{1oJ} z!gqexYfYA0sWea5xpPiVPGV`YE5$z?Y4Y`UadpM#9T4%{V~BgG|4B1P@rnP{zCzfp zFrI$4d?cGf6peuAo>_r$y-=}XO%x3cjV-uNfa7Zymn{7X>-||Fr&$`LmU`n`cSX2? 
zrr}$8Tb5sR@c!NAKXmga3$3AAU}r|SZ43!`pSXS<2^=vHv`8A0@1pcaRVD%ikpn*vUeuVdxoB{<-+%j|Os8Qr1Tcbt`?& zUF-+oXx*UtS5Q#sisK~zvjsk~(!3Qr1aCN3CEH_rD)^}jIXU^k;o*{Z^{Z+5I1U}W z!Rz=(W72mRG^Q%;@~wN58{`STy@3>0?k}|BC?@eS?^e02bTzaOMD4+#^>22qBdd5{ zFFr<2Re{C+c|`TgN!j;znHML_ug$SO%hvlFqHM;BpD6!Tjmi9_`wso*TY#QgK8#B) z?fUYx!a|;!oZIBf!AkcqxbFJ*ARJPVZ%s0y-|QzbiqL3v+y2a|hU=In8;mztVq9PJ ze1ChohE&SWheNl72F;|>e)K~ctEm{|HN^2Aw@HuIMkY}U4vp9pqyatRd)St`&1z55 zNcAg24!tsl?TN2JpB_B>u_^iXh>eWPAhs_}Oaa6SAC7mCzuEz(npd)1)MQv$!KkVH zB+=^gKYQ%$biBvR2#hoqwRGh)u{XI``1jPo(PdgLw8k`wuJza(BymR zLP+hP05a|=#~4)bqOtQ0oyTnj{T%0-RFe3t7T=0VyggzlvgwLt7nd+2hD4rACzCNm zHMJ4k5KLf%sy6#wum1`sPp$rvW8%I!7Fq2y*SACGFkA|CNB(q>*=7_;ZTm>y&#rFp zo31#DcxVLDw&w#Yyg1&(fho$^<;c)}$pbU} zS&UfqXGxEs8fhbu#aqh%tSi8>_=D1Cp zUb<{yvevF5Hbx7{#Z2g={Cf7VRcWN}6Oh_tKSb*oFcLK&q`Qy6a7M?)N{AQ>2BkS0 z-u1AWR8mV!{9E^nBRdxlk7_EUt;2X;xcP44G!1fn;ZseGuw3cpc&XM{k$%wUFo=84 z=5moy?I+||G4=ZFY@_#Z*q!?#_cn%d@I4%B!8y$Z8;k~nJ*wMl|4Yl`e}-0J=y%}K z+Tb%eQbk_B2*G+9Fa(ckY;3$WYu5kt8Z9EAuziZxW%()k0Gje3vW3BFV=!AGmE-vO zbWUr3@z;4j3_2x^ByD+rX9Nuw?aN?ZfSvkUPZEDDx2YsH2kidSMp-UKqi`Gj%@H)W z0-eHpVvh3-gYqE+l%8}j3K1KgQa{?{>|0n-h1&Udi?l(NT1@{fa2{7S^8OYr5z94f zI3EnE+rjcP4t2P6nQJ#t?=BO7^;miYwJVH#$lLlKy#%cO(g6r;cL7+SfiPS0%UC}$ z&3p|Ed52cQ@sC34B9yA1pM7SzGnAs9hXb)*YKzQuQ0D~hGq}iDcCCL?d4_N4Z}n)3!)A2Kc z#88L&G`&v6|I*gLD1G9%9S6X-O!dQ6@Rl0oVMV2t1wuoK>8-=~Jht>yzYc{wU&1b1 z`T3s2h?*-Wh8Y%CSo>**b*gM&_i`b%hc$Alt>nkF0^8Rsbd%Dcr7{9rg zBf<9q-FFcrcCw84fZZR!1LzH^6k%*y|Llv>N6%a0Gl`G`EemMmgbaA|7&J__X^}5l zD1fs_B~vR~bgIy+DM)CpTqu#it0{rNB}_Z*sHmu|A{j1P2*no(a|#(+QnI$Hi#kEj z2;RYJLDNI;8KjY541gj8-w8abbOA}AL)G&qU?Eb0Cqwn{@p!W>$TTNh*7{UJiD;R= zRGT}2t%pfhS2vtHs_8P$;aU`$la`pE_=-x! 
z1ABa!A{d1Cii5F@|N1`f*vxt-~YBV9;c096L<|8O!H{ z*lYjg0{E6iBXz#MctR!O%pm4+ak@(n&%Oto$;u+&eK|@A6k`ll`1z_7Pse3hxkTJ~ zDxYt&4MfyQQUH11ea0tU$pJq)5ce_13FQ}l1hYT+KuX{V&mW8AFlq2~J4Wg>we|Hy z1(4a_W_&_0o?SxR5}t#Gz|f8MQs69Rt7xLWr`i00lwG!|sYyI8C52inv&bC_#QPyU zBBj$1_cSIyvfXY(9ZnmUxa-OP+lD|5WXruwRf?BlEu_w08L)sPIFbW6V=%tmI$Z1D zsG0Lyq7q|!Ng(n9)18MGcE^6YCtX79G2mqQ|G5^fffC!pn+RV-HvXW+a1L5qu$h94 z$b`F!xJCbnXOKZx{LG>Pv(uQNsPe5OExFBoGDkfxB>UA)1g7qUvjk|>GY7QLVC~(3 zlGd>u5`=}rozKUps#qOWe;zXa7k33?(xMB9x1rzq!eb6G55Sf=6U#(ORV!GduCA_7 zIEfLe1kQAFM-*RDs+{H|J5YxZYx}Q`ZkcpQy*lg-UFXvx=EoxwgmP|E#a-(FseKree?dtCX$1p{>++9iE=-R$q+ z9=tMR%0CXO*NxqNf325TQIXhYEgudLTgZNY!C9*Q%fM9Yemtf4_`4sGBB@GVXpPkM7Y72|-RkyBhUtl7FTX2Zk* z-j|p;TWR&1Bl)W(;dL~7xZU{)*$0ie%%bRBZ+${gi~aC!3N&PTXrm=-qmg zHf`P>oZUIzqoJ_wA7MexHemtzNi-l9P_DX$di((NgipI?F(a7{YQTCVz>ex#^a5^g z52y)=K#-(7m#0ELr4T&nyYTwakC(4AQ`cK|Xk$gai4;>FVJxI-SCp#h*iRTrOfQNK zqm()M}&xoE_1XG2{K4YP-);WEK3c>%@*>tMToN(n&4qIfR$nV$}HiRRh4gSLC zQPeW7*TTVvs_3fzH$BgQp)_-3szA^I=~*!Fpiu;j-z=#NscZm%hT3zTd2*ZPQ0WvJ zOKG1P*4Dg?ZR0hCl^YozwdMkM?>w;@F_Jj%OW=qNl;KQg)I7DK`f%@cV9 z$=DiG50M@ltIOYf9}4elB(q}tLo8+`^Xs`K(BsY)!^=IP6Jd(w#Jzq6$XlgpGwlDG z>JUzFjv@+V0~>=LzWZ#mzd}0^EqurKLaVmR^n=r8Cee+KA4=AAeDGvtwNtsTgf3S6 z2?z3D);AaD5ji$$8PR}>?zXzj+x1C;wQ74Nt$gK-J=!dOwL7f_@=CADNG2-uXnvO7 zZLve9>&D{I!MaogNDiVhBa&uoL4Zc2I`CIMdjp8iPrR$0GJ#43^Y%C3&Es(}Kbw3^ z71Jn3(&YHJILN)%RQrpizWv;<$loS!5f+snQ4FJo4K3T#Y8CNPEmmEQ=LIzVVr4a) zqfRq{K)y4uh^7yWg8+2a+Xok#c)U#Jr z8+ORTSBL6roPchs+iib_lEWxVz#Tc50<3+^j^COTLJhUrE(QW5#MlALas&}fqEqA4 zQrCZwCT%yAljos$sJYTaiclyzUc=oF(aQ6hj;egqeh;feN*(ZVSXghpXm;5bUqScj ze)QmK$N$fzDWo@Yk@U`*Y%|aO-!(<1#ctoaVZ`3Aw|{5II5=PaoRJ@oq#ZMD^p+ZG zN}CqBvErPJQ97OV@!NO&^0%_hd7D(@b%<+`;dJg*Ns*pND0W1uG|=bNoJl&U<3QY^ z0`q~f?ib*_;X)_`6U&P#3EBNVezVQ4zUDv{d-=}`#V`dX73&l|+MSjV6ejiHX>8(< zO$S2qjAgjUYLkrZnQb8S+@D<*kfKluCaurUx7W`Wf+~T2I=R}Jd3&lZ2ErK z#3(QPyhIMkOr^;CNSn)iXw9+b61_HzZC{|5>{RkcURH55Fv)Ku%b2w*Q50W=MljZ1 z*!BlpUJ96K2vvC?o&lCD!t9C4${73O+Jj~O(NQ#-WTDptK%11G%ExW@6DFD3hjDvA 
zNz}#Yl0e;f=I)?bKOKRLVtgUENgan#{+V`Qo8qRpU0CI+3;mA|%u|#f_1cNEWB%9% z5Cr&{sN&liqQkP_ScUGGF)s*+UvLd!Mj1 z9Y^LZ^Oa9n1XkD|lX}$G05gsN7g7dap_mgb(n$#sWG_v|DZ_#x6VYXd?-gr~6J#E9 zCTOgsr{fV=<=$$=(M$_b$#=}QeqIXvT}<4Lhp@1PcXcjuKEB~T0O};Id~8p*Y~X&1 zNEEVW3q!LZ>ig^w6)lN_2%4zYaGvjZpIr%zlE%)D_Oq8qe+NVBju*6LU=my3uY4Zr zs{hbTtu8S=?ClSh)RV8{P8Un-AL9O{SRQ_hNZH}!{WnVEykGRud%#z$Q!R}vHDkF` z>ys`qd4bXIbo)`KpS&_%<~jCN2!qZm0vDR(>cwxT8)LJJmkne7ysba_BrY`pHzu~# z2^+-#u~?byd=Kn-dk)|I-z*jYaTTK$XTjFU>iKmeGGWIVr*fIAaM7EfWa&G+(X$~H z5y|2jUhgoGm76m`b(MBMU=UTB-h8=&g`|W+B^a^bRbN$2&4qKgjbd})(ig2lc>({? zdx~i+T-S&FQX`1BqR7R;gf#If2mx<0v17_Gvemh0E`gL;M2s%KP9lCt+;Of!HAROD zF45JyzdR^+6NVXqFlrRq%|^H^*wSA@$SC=3a(;G+qIVpk9JtR0F{l=FzHx#SeVO|Q#U!O?uZR7O=`wCzdsmsd8g5&9YW zJ2dG5Vx)LH$UDF zE7SnFJA5!l=|>`;|7Jg+;FloF%#8sVg2ZvC>*v4F3S-hRTtv|GDHzd(MwyvRfl)2r z@$TFlCev(#m&I2$rvmN#j@jAt5Mufgdz=UO8VD3=Zhy)DoLr%Dv@ecRH*ANvKYYB@ zRISQ>Gz-+NuJ!u^QLbx!8sGxA1YO|#(C6lQ$fn>qhkmH@JS2{7f2Eb{i^T{+0|kxA zh8BfU+}zwOZ*QyE^rgNZGWL)~cTcUa9R}RFIHeqR6+jM~Ni}rMTvj07aor~Oow_ZF ziIQ8{4cqX@q=ERDA(X52!WMGyBmRj!Ms$m#-Ch81P%ZrQg=f zpTEapP_7I||MrxdGiOHJS+W?I^a8u}1BnS7&2<3;iI@lIr}m+@f!Qj`Z1a~^>qd$v zPh!qTB24aY7+;mK6?VD#ANI-5+kcT|5QoPYoc!d2#AZ7v zg*$tXB6S%AI=9C(;{$+?` z5rDB4PTl$uG#G|~)D0s~t%j)5ZOKt*vnHUoey?UnYiYCr5Gl#wkxbe|Vsei+bYwf~X~FJwD_XkX7A- zAm^dkzT&#`BzBJ{PQUxs+kUu66x^k<_;K2w$M%RkPij2>e|< z@%y*e_Rh|j;hZ1&T#{G>KTiw6kFv2pwf%oc`X+65-|Witf2^8aX?7;F>iQI!cPp}D zwWHVSNoMEBV4dj(;O}IIxUf*RkS(SK!b)R5k~D-JEuBPe%Aek{x%5s84<+778u@)2 zY0K=+Sj_)zlb}8$3Ew^XPVn7t^OF6*b7w$?mP@QK^k9iC5G-C`#A|t}u81 z1mD;CpKsUW07PcXX&|(NNT$#U_!auL1OuI}tI{r=#M^av6sUkBsg6x`iDJ*S5$XXa zyL0`l<+ zGmo_cdx%~m;?(@}3uy}aQnX6T)<=&V?DZNSdgqU5`O&bBx?_+MBxg1l4B1=|L(!=c zshmKxYXzLhz@ioRo|VjA;<=Jn+rt#8lnUACDm1x8+Aac0O34hJ=fvd-t3?wnD^Fm2 zs*Ms84?zrdHOF)=R@KgW5oa@pesxx-J3}{Xb|4wg8aOH7lqai~S6TmhanWe|Oo(0R zUSe{sl^CmVZX)!QOh?37F zD)*|Za>zY-$!asKOu4C&GKNYVi5jkaZxQ?cNPeZ1q=oXO37xOVXPnhvt35cIbX%*k z`*H8xSjlzAI=;S_h}1E81mVtw_krfQ+ZrzR;00Ijab};S)kVG2+PwqKwY&4q4X0)w 
zGWo*G{H|MQBDRNXKeyk%G6EDWmwn>Tb)^PSsi(O_J)yn&HqPUK(W(AHx581bApa_b zpc^PTgEBC3CHrwsxAFK)VPcji+C|nj$5DEqv^r0K8T~Jp1O@G6qQ_t%%}jWRRWc+c z`-w6}Q86(R7~Tu8^i{r` z>${DE{TgNeT zDA>FQ(IvT63~K(!8)z+|b**)+}+u)LRT#3j<%6ZBR;;)Ow>d(bw9#7Z@s`=gQVv7OrbSLjuGr-`}|g!qWGK zW#Y-4+}b=$Z(;K~DL;nt*dbkE5q#I@ zJ9CX`A^H7rCx<_hrIfIcdRoSmhW}tdp!*YZ3n{&)3Q|{&HimVmZp6muS9C*?plTVC zD+ga@YF&pxL#=m~)$;Oadpe;)Z%_6Uzm4jn(fz|ies2P9Zf+_bGss*}1Y-++tPO4C{z9&b&4{;3m9&cklQ?M-@H~CPq5G zMcw9`(BZKcJJ8<7KK1*llx_w5;-?|wNrzrz{gr&DSHW&{%D$#>RlT^+m;dy<057E( zJ%$izJAettq!fW*@W_Xl;-RlLLvV2|8zJ(EBxj*~(&|6n-P&00{Op4d=|Q!PI!Q+D z+LikuJs2x2Fah4taGsqEp;uHk0}viG4^)RIlkmmPOzraGqAU-$Ja+F`v7udGs%RZK zIV?YdlK-yqr{hdjmKAbUpAx~oT;pZMr1Z<(%mmlUimHjY#6ZH{QUNOJQNCfmh2ycUAfwuqEKv5 z#b?v~!;Nt42sF@~x_W*H=sjrY`~)`WM{2p2=2$FIbZzQ5#Xp%Jqq&mmRarJ6qpMSq z0x#v=NBzf$0-$5h_1gXcpwUuv8OmkS{H`x}V<7_48-SG!wa_Us(yt*0D`$4ORX36V z);sy2CE_l#qBCW5{7T>u%h>@YCZ;n>geb4ine(IR9}N)4g6G9u(wHE_;5!Ce@X1uJ zfdfW4WQf56>o;QFMytZQ*9lHk?t@3l*3veZJjlC>!-k2#dTK=hQSmxhzJ(rl1p2ND z`ghYQK_fWC2(0dFe7vriec*lZEN1gh5qX{Eh({~tVQKq?*qu(&r=tufUo_X@MgZJ{J9^i(kWw>@J8N0t;)Wsh@zKlxh>+qAYI{hOM~G77Q}rhw_CcIoZHGH zUhk;|_Ng!6FKDRk{Lf4THFMZ+C&7tKe!4Dq$gpdEF)O7mzKkpAiA265}Bj49g4~h6BMk{K>DJB%J*Kcf|^(4O$_K2i&n7nd03% zS~yFf28&`O<@<1i;f+mC-^z#Oh83hhEdFZXfHsrZUHu}sh&53iV;O#gXXV(zz(9bZ?JGzdfDR_h@L#n^;Y&Xja~k+&g+pnpA5OlB$++FCx`vAlXw^%* zBM1u6QC}?SO#K%+smZ?A2xk4xF~+N3G1q8X=CRm}^PC_T&`9Wdw2n3Si8x^t1^;7t z>s#reYZ&l%qzWJDtg=m8D7hexX7Ld)MjmnR${l02ZFDo(cfK}|DlS@srZDdfk%k#U znD^FXgn9vY;+FMK;BK|LIe23f4i-OtSfEVz=aE$WE=$Los zGIQW)w^`?sG(ALvT+@23)lvx-rC@lJBQCAPMFvcmK(qIma6cOwy{&PS}J=f1{b=8NuKx zFB~IyxDnBjC}n{s;eVn%ZY{}_L95Y^w{iIXSh&!*bJ=ZdSCsCHmCxheY0dsS2n%** zE1G}J?`k|BVuZ!Bulp%Ro;8@{>zY;tDkV3*Y#DfuAezy3FmfSj5~B=8IqZ|4gTz;3 zq_nxwRTykJ^ohIYhikQ!c>034Ez~V!iR9GS>F#TNv?p6=R|65DD!p2Vql4wXvW0>K z1^X!$S^;y|_rviH(WB8wgGBk8a)YtmA<^m_A&%Mxr+nK~;;dkFx1POwh^>U#l3ZBG zDdD*Xlljag?-mSIAlhPVew}SQThRM_a;%RFsZ~*(MnR>Ij_gSjN)js8m+75y&*CHF zaHZb6uCE^MknzoTB^GwE9U5BPOa$oKpNCtk$WE^HLF9GK(Hy##D_v1(`*C4{cD1>t 
zVyZxrXpO-URRA==nAo(@r>}0d!6Q!G+k^2>a3S0GKYwn*sD+!PX@j&*gD&uq)6dcG zNhI?n{+px}qY!`~a#t^Zr<^p8zM=}aEqEtXFq~K|Eb@+f_7elpW~-Xz_hl6o*+u#l zF4oGn7%QghSymkCB`ZryBwJ+{>h0RkS_U%ZY9k0&yW(+S0npAF`chV=pu2h{`3ECBXY`~VS zyl(d)8M}T+>f7U-iPwumrg(k5DUwMi+f(wdwY|6Gxu!JFIRv#s>$peiU8xK7YZx;v zh;C-SxO*39F%8_F3%QANPxZ^+9wWy zmTBRRlNXIzzKt96v&JB7fXRNQD`snp8dFT_8VzCwEZ*dwea?eRa#UD-kH|0T{S8Fh zQ5U5zs**4ZwotlU$tUvgAYxhfXP(Wyt`q*@i4Imgb2(rjlOa)XigJ zRy-8U4)s*Gs)uSLp1!X0Te-`XBo=fd;`*t4txGP_5UQDu9;{us0cOI=Ex8n6L8LK8 zQ`_-s02k`NN*YOr>|7FXnAWhOXVK1i-=D9a5(WI(T(L1OcwetF#TB2%TVJZc$ElgHNXAm9@$1#a7v zTcd`_rANOjd@BO)zV8^!$3ZD4W%y3at_!TLjtz(+QRe6F+oW~5>16!Y4<#$~xZSWG z;BU~F&w)OGN=LN)!bDQFqn;e#h323)lV{Ifhw>^5*+@xodhQ}eqOd=vw=lZ_THpg= zR)vFuw65s6iMM6zz#dc+c40LCHZ^E<8+v&B-B(iXq85uQbxI(Y%<$x^Y8ZEcK$XWi z(@43mE{Mh8%Mr6F{1}~1rZJbGoED~Okp3_3kWittb|_S+o_9fyg3oq5&|n*2S;U zZUQ+8c1oq(SBZ$htSO%tMoZEIanV|LJGgi!H_aG6wYEPo^q{Lua~c4GtFco z$b(1^B>P+}jv76$a*mg(Le4+l;J03^MH`#UKVY-uV-&-I*tf496Zk=Vh<4|S8K#>t z_cr)_7*If5n@Wb7mT$vGa-af)Q%yU64DtQiwlgH{Xrr~==?G!cF7usB%-(n|3gwcg zGVKN@Q6L&bTH))QjIbEgxV#_qLd#um|M0|hqZ0b&m0112{(T(a2l`;@+n)!orptg$ z+X3T8^K7ys&u-Tw`m!ehF@oZc(i4~si9pCjvVH!6pruZ*Thx$EPY>MUx*+FD6c+!4 zERhOeLFpzIv3;XGQ~kudb6TLy#cA0SEeo7a~}tZ#K%RC@fnJ0|$eEAjAKTejC;uXnNJ!UPq}K+}lkzhn{+t zQ{p;)z%(?1>H2`v@|&%?g101}GsL`13Z??_2NKdOOLk2cdCl?Xk)F*loi7kLaMc%B z<`Uj+nN{^s&L{Zjm%p=jrR(0T#dmsbl+xcu4xLvl3pr!v^v5PG=#OpHBGksKIYSUB`|zF(5`~dT@vhr=XwipT8x}6~%e0H4RDyOa=JaUaoUN<8h_{9#U6%W@`Mzii zCBN?vy79xtaR&>&xtm1mWxb}XRrFl91#UkzrCj&D9?hvz;q!9>?h#5+J8T;-yTf0S z{fWyCZgdq1PPi_Qn%C$tkv?k2E0h29r_4xDEV}ovVxK%F!2$!^0ks?dCdBpoTMA$_ zbQLgIKyvtRkuWoI_s`C;tt<+`98H?DV0y~5?*_|N?iK%5?cM%^ac`=3MLQltF%IIs zS~BxAe>iu^Nrx*v;flQIOF9R2j@qBhf-TO02?|OoHXJz?^9Xnfs8a@F3w%qHDtMK9iC1(k%yD^-A_IP>PHWl4S&^ptRnbswgaqLb zLOS_Ply!v zy$Y*RcxcFz^2k)Aol^a~OjljUf?%d0B$d`?olV( zGi{WrQw{r+PcwM&rL2(AZkJplK6peiPSSI*C*N{ugZ#+t(YHT1 zq%=Wf`x0r`>lz2>`4D}Tv^a#!e|QLtamu&e_59DsA4G2yp;*ydW1T;~Cm| zY*?w|%dvLnC@iN_I8oUcZzSQGu2i6O#j!SEki*ay+j}eH6&0^v<^}jr!19z|5vk3l 
zDA%saDgC-Ji!1dEp=fm-8FK1sFZntMHJZhB{ zeMt}CmwQYpf(niW(`8AW=fRlndzz)^3%2U4BG*Ta7W0`eOfxB3@nUlrqrHL>-o~>3 zQm1r-@pM3B-%~fJMGNW_KbAyF&4iXOcRYKgo%DILWkI{dMbM8O#|;o@{e;;E57Y1>ACNuQNt0(p&MiH#%=YJxTQFuMiXx?pZfr1QFE8HLF3Zx1eF(k z0s5LZ1*Ds1(T{X44u}3E&NjVHQv&~bsf3cw%ifyk$#Jl2;5e1@i108W+M1LBPAp6m zmYb!3i<&8_kPBBWX*gH$qdPj)3W8Dz48m#Qef7dlktOX4=gcddXZ2m;$RFLI{)3z= zHo~c&GC@%N*iFxf~OggBc)@PTF4jQo7T+G8c`N;s%+&s|HP{^#CR^2^v8W> zNySDZqgu?Kzd8)*QhR*7f>Lq_j284(n@VcoEx#O@wD}{Y40fgH;V=Rc&+m{PNGkjt z98R?Bm8z=} zO4#z>M2&J3_qE)RmE>axULuCOS=;7h1!uDWD=_0*ATa5uN|28Yw;e1|db6XKlT>4$ zL&bBSZk0dDlt&|dxLH^CQNK*f-o&uO!|P%`?+96vIKj{AGWSj$X+=nI0}i+oNwQsa z^xEVutu0epq9+E3F>}P}x;C*kpFl)fmMM5TN3JT1=u~h#hg(Xq)v=oR+xXlAhx_-~ z?0=`K%jRJ5xzC?#7rBY1)3=aAD4A^1eU5gvsqKe5gS(Tk3pCk14;cn-e8$HjK{DCv zN3Q(upt{`YHpYV7^p?24!ZTpq@%C(7W<4({cwWzbwWIYuFf*w=Bs2SyerTgsaT<|H zXCO+&7s>LG_WP-{lXNDkE%KO8ko(%E^(Q$@?QmI!$@U1(kuH;> z%{~G6NhVkFmHqRS9-|axj}y}3-KPT4S7p-%N_2vtw(lSQ#U7S^jFLNK#m2AwgG9o5 z0hPGjy5{*k>mb0eMA88Fi#UZ}^JB|*|2j`Rjs@EilcJqh)IT3Gf!eBGf*>Aq5EfYl zjQtdum4M7JfDMjPRFysBd-AXMX$JK^&0OBEAi+8!!zCO!*R3iJ(RajK`FI#)FY{^> z&7bUSVck37@v&gDZgH95+CNmrj`Hi|Mm$UyK& zwV(@wZw?FNd-u;xJ$6EU^WE!&Q3Wono&c!lomAe*1dn`=0iLGd{g=;1pVw=>(G`Lc zy)tW{%E24hvf5;SbwQBXDDk7j$+#ICh*A^ru1p^G^uN^3PUU+Mr2@=>7aWblfQ5XF z#S#>_o|*$6Xv8zeM0(3*`NXv>!||Co>d7rHX(yWDiT+h&9#ZOOk_lKET5&t5E526%^ zgKsab_BE8zT;W7hru_i6kdHXk)raP9%p`plFkimSp<4^h4EC~>3xWd6qU^aL8-nfa zJ06WWa3X1nDLY{Y>IugykYPR2^uBQv^t!DFG{p`3W>EA!J!H9%?&7&dwsVVu;P%~-fb`wD#s#4H)9*|h^&!KQg=0)5uY_YF z_1+>YiYZ&du2;M`5jfVnLqVjF5x?{rY z>WQydDsU{g()lCDN)6<1-DS!88@xzg(u@-<*RmqNzCJYtF8`u^R5FucyJ0f1plA#s zdgJ(8-MN6uePi-8<5mQ4LjVgQ_t+%7i*s1x=py~e|%_)E2zd4aFm4S4BDgHQvafwYKmtKlQ>R&5Be^00z!H zm7-opx2<&6TK)?oJ$H}=C{oN_wy>jv6<7PQ-23iMb4FHsnDMvlP(ae#yj8m`arCDP z1k5Vwffop6ddqE6V3mhrlzCgdyv|(>=10&^ym@7^n!4uBnXtF^3pN6>jYZ9TPBUsc9n?^iS@!#AeBn$?VExA@hf#9^0ybczSKKSVo2g#_PRubEzKmNs|ToJqSsuNeHcacEcYv1ah_Y@=1-zhIa1 zcp`SadmhrA2Ju2r?DHm$53dk+s^K*Rb8ywF&nsQfil$M6r!t}aHXXMA&Ds8aoBK(1 
z^y85A$~VaTz6l#_&nd4qN^JFFlj(p{qBY3|b)3yMF9^l4Rz{jZK?Bikj&EL%mp?uO zvhDn8@qejjUdp4{*w`eCy0YZ`Xn#)p>TRo;H2V1?&Y;4Rf_QQ$yie?o-V)<&%&o+;h{1iif zowwS|9Tb2MbW6Je8fQ1qv451(%JUUYlUh{d-h0pU0 zUU>`t%C+sN5oY!^?KgF~k+csef4rLU}gIe@5FK9_`B^#VkoT5==K*NnX8x z=|L!Ap)eHyKnuR5L*qQ|E`aC@RLa}iR{1he#*NZca5KG)1@(k`%nOT-KG&AZLfEzH4 z#@oCHZP&k#KD}&kt4)eWPs`n2T8CRP3KgIZ$2kf-mrtINb{X{Lm*ZR3FkBGQKA}w> zT_T`qbyQ&MmwtcOWD5-!at~~34C9V?V>7q19zoEh(M{+Yaq7N`@+-O+h1->VB~vj51mz<|#CpOU#4|Quk2y>s3L9U2EGO={9TS6tb#x&< zMlv0(0d@csnAGW94#u520;rXsqHG+1Lpgx>yb13EKHr2^8evw7&E4dEyfpYkAhWP6 z0sH9n*mwCE2c&p1C}>&vs{9~wT8BeY5YSReJ$#Ale#IJ*Db?(^qKq1=&h((DZ8LH}+uW4Ix{~3U;I0f;s#^T0l@?a!AV$+e8?~fiN3A9AQXYf`YLxgb8 z(i=2$R4Msv7PiuR3VJrjdN<#>kXzzs{>0IT85|*Jk}E7gR>pTzXqgQTvs%mtbv_Re z!Evxh)wn?#;k-RY`(!C2${a&b|j6aJjF4GQJ1aGC8yl zXg#4vLhib~H2K&r@_Ly;jut{JeV34}m{?CfR3{ftIUh|uqL*Yv;6Ece;i^tFW@i;Gw=?#<)$+)6P;*odb&J53A;AhUCT@^RVI4MvPnC_O`RD`yb2*c5JF99s~ zd4Eg$>j^0^!haaSs+LX{)C&bVZIBH+AXD_+@e#=T!CSFfwml=ZES%#z04yN2} zkoOyo-S}rl2SZ=1pY&zP2!Gk;u)tQgT>e72Wt<&n3X%?D01zo_!qJTZvjjX*EVLH| zctPeCm6)mVh@Tb7iWXwIin98dJ^llVc!pku>KEwU*T0%*S&ivkw!X1x>Hg* zB&0z~8U>`JLApb_L%JJjkPbn*K@lXSL%Q>>V|~B4n|KElyNIwlg4ZP`nd^hx%Tw1IsXpFF2M;#rtz4r~u3tkukYP z%4T?mYPvrC0so<#dK!pmH8r3_~SYk`=x_hZ`YRNOhj9e~pl{{y=gFqQLdVb<9(ju?_Z^ zqxKd=ot~mO0!s`~Ba%~v(i{!$m`A>%Jed(iS97T(ghp>Q=H*$_g^d@d#FE0{(!Z3e z2M*wuOqCS~cwax6KuY4XjXT3e&Dt)k^txTe&k>%NBcvt<+=thqPKHM!W`^Q{P~k1@ zbSOR3A2ZEe0BLs2SImF{Z?IWOQoieIMlmxOpilZpvxUVhN&D`gIB($qx^Xmb7zHKt z<@pbkUL?lqx~`4(r>On$i$1Go$s8r60rPv#9Oe|)X?Ka*36 zV#Qr5GbgZMyBo^jq}WEi2{{GKYuZEYW551vj?%vn6-~G<8axzKi3L3uBPJM&Q`->M3*%V zd*#ET_gU49co_MVpdH5N4XsEAcN#25{GW&wqz2LmwC5Ch57~ajs|sM|Tipc>h}iu> zHDV^a@*{eEDn#zr+^9y*Kfl!sZ@7pTkq*-jGm>GuJJ;B8s%#_vj!`Z1ebDJkJqp_a z+k&=T5rvTKxjVTb@~TGuZqM}QLJ8x7pGZP#!PJXkQ|;fx_C!s!m z&Ic?_2pyVeVllyEPqZrM=E*ka(>VEP7;Dg>{Ee8@&-d?9^LVqJFI@VRFB{{{lbglC zR4?5?u-JsT2hqn!;aF=RR=8)3&kra4$bKfT{@2uSpb!wR4|p33xE004$3DpY2N zE5+Mt*Q2O}D_YKJ0-nzLE&g|h;hr66=0)r8IBlI7MD?BLHcX@mn=q(f(`5i%u#Cx* 
zw#W58vFh8XHE$Dthjisn=VCsSK)wj!g2~{G%GvVK+5W9As9oXiB)e&ExXW~q z$?Ly87S<*o;$m9BuzK{8kd^5S0O3M$;pXK3Vgbzmqkn)YkK4O`yAUeN884DRB7-m- ze#sN()VrGP?Ffw&Rqo0eIs7{Je<}xorF{!J%6|)LOcF*U?Nfl?tt8V7lO7(CIf6pE zJE(yYV@{bhFb0vJ7n%lydLXDvp$ftx|7|A@nt?{7X0GtZszKGxSjfQ9pfX@S;Qg&H ziU~!tVXM8BS^HOoEDj5eEcwa@iK&496~N6%B(DZ&>So?(7Smy(L4S_Mt(idgceTHe z0Z&YL&h8pUsYL6#W3Q@hd?Yll+N`A4AISB_QH5 zd?cI;ge6p_Wz?kGxOlX5-f;uCloX)q;rZyG$7atU=y@Ea1EN%K-`{WZ{0~Hnv$uN8 zSI45kayJ($LrGZuI%l$&GX5osy?swm=@iWKFLY|&uUWMfp><{!$-92B*<4Q^BS}L4)yqq(s`eb z0|JK?0QO1#zM{ne&1dH;*%AJ#2p2O_&0N1PER*`=F)r)-?<>aCUF`g{bAk@O(8MP4__%^kgo-7}uZ zs$ZB=Li>?SO8X~>jMbc{G7DzX7uti0^;#2Y`bu*bODhK}<&5OopIuRd>RH2}R(NoJ zs@D$v_>7vWu-g1PjMc>>wWoLXrXRSP-ER_T1~nz$B`8#EP=fiWszCV18Ek)u0I}~5 zBYX;&vNCg}W5=saMg|+?%aHOwSXGSv1!tQMm|nt=Xom|pn!!9Tuk+x3Tt-v7KyOH! zaqqJhfk60%waEj@$&cEIP#T0ZL6@-kHn)LANF{&-TwY&bb6HITntm7g>KmF-AQ4A5 zWW8Br&io6Y9C<7(}`BpD=wyCxNc+4tE$ZCaqqtRODupAaq6nA|XK1)1}qpdv`5P=2xF>MzwN@q3@1s+mAEog(x?E zkebkG%)=gEZ(=j3RQB^E@x^wd7GzToD0NVL4%;$$GSVoTTxR{+aZtCJ9BDcCA;)v^Knp;&};L9Dp_Eym{7W2Ch#E30GgwBByW1cc!5GzvTCLqm5Aj#_*kKMJ*u3?1+Bjbg{k*Mdx}Of)-J13J+# zp(KFNsN^$EZ-R$pziO;% zwO3+p4$(9Apc(+uE{_l3=IUKFe~xG}WFI?ieMx-P6D;d>b3ywLlJ=uJN7DS~YRTlH z3y7zsn)7nz$A_*R@^)NIeX`L9x3fGJFi+z1=Cl2BK0$ji%IFw)g-S4-!EAU{3q;Ob zZ+DQM4niSc>a{E%MJdmjwh+C64$7^}HUt!`GXgA7?5%Q=VuXUA*n_Pfv+Ace|^px2se)CjpJ%L~d!f*uMeu-p@; zyg5vK%_1bB{q%l+&adzV-a>=H*tFF?!hoJ6uaPC$Ssg7LP9)0OV?bZ z%#kowJ+)V)HT_Cb!v}w2i~1!)_za#YxGjJQK!O*FHwehBCD9$L>_Iy{G%GDb@fjJ+L7^$7?5yt~cxz@NWWKm0MkI7=|=k(=&eV&ZfH_6Uzarr-GM{TFzGm zKngDPz;EZFcPL59)KMslo(&9AnO>s^3C=Z4=E0*4g?a3J<@D=-uwe@*UnXU@SKjNH zfzxYm<~VjoqCj?ZHJQVix0Z9_1-QrN-=_njHaVxfuqj9e&Si9|N%#hYaN-eLk-QhAZCOe$OA9b&u}q*c|7^ z{M=l;U|y^#W@NX2QaL=x7})Rqb{of_AI$NNjWjilc#4GsvrC7F+d9W!(%~K!ixpg< z;|)>q^&YMZDzQB2e4EPER3h}qN?2_)0Z5@g(0iPW>;lqJhjqvzwSLC}l0DLk#;Bzs zwhVgFVZPJX0QEZ`S+gaeK)~z2#ce)pfMh}dLB+}b}X>gn-)1xu5LA_7>fIBxcaHALEY z`NZHe0%P|lCYLayd@>!$EP}J&n)-SyAlu#}4j@@t=sBT|`sl&AJs^7E{a&J*qZJ2) zLWG(iiF6~Wt z@(|fRw1?ueZh)e*9T#f*lM 
zEs_yt&(1H;+~1nkj%Jh$=(zQ#@{(JiA+{V#Fz+$GL|Iq)idhI% z0!NBg_PMaA8|&amED{Y5kt}$(1CFnGrsq$D=d5>5SWC48SQ{`-^S|?yfwDdb?jNMn z;eZD~I++6e7IOWH3Ekt@{Y?G(^>l4=-!+(qMIcE8S{LzaMR0{V#(bvKFTL3ULgRD6 zA=NG)Q_nl2=nV#IPP5n$sS!x=Bv)_@ji19UKhx|Ko z6uiEaI=(kHY$ml?)e1vraSQs^Ab?e=E=&*b!1FF#Dj_bffhgyU<7Qd^)vK|arjM$p z((`0f%bE~L@38wo+eK=yI@ve<9MyCmj*z*xa5Gl~2~^eO6o0K*KMFpI48o>;WZat? zP~&mY;w&AzUg>9V9mofePDOGx;VT~v5KLGQK4N^t*%8R8T6e!$KRAk_KU2L>Jt4v3 z^h4MVIk1r1nw^U;KOz*0S|`Ms?6{0(KJ{Zi6v{nCB*S2PFm0EgPW)h1gLXteOWSF8 zye3PK`Epr3GeTlD4(cG1HPTm@j6AXf#pEeG)>{dW6mRNG_ZK@ROEg&Lrpoj+?wp-( zQ$z4shxP3NvnI%zjfNBZc)4gX-a{o2(bLNx!lPA8AGCk$sGB7gU^<#9s#>6sW+VT= zs|eNrxDjeR!>n!BpYG_cFV-_ISaomiY@(N@_Sg4LPWCQ#M$0t4%G_8!**Jn6_vIur`)%?Zt$A0GQvcv zjGanrUD1A`nGtvXkB9oA@lX&%M!oN7Xhbl&W5IWOJ>-9Q= z^3G%!5Z%k|C|Nv}S~w@9)|R(W%U(@6E54x0KYZ38`y~F*?18?joXNeJ>|%r4wu5$-kYrK z?tj62dNYEn3~DOIemTy19EoMCrKLHDXJg*;K4c%1#;`c;emKmvlBIM{y1=u+PL92* z;$z%TM7AQ;Yjw^i8>YM`6J(6?+Mb)lAl#F)L`4&ZglD)_lu2Vs_wvpWRBnR!tMPuI z#ZLuDnz>Bp3k zVPqf=0xSMFtkH4%IiQB+4ZSzz;_R%n^R4pYqdr8TMEygy6NBzmr0{K~0JzYlP2PCR zXdodRWQeiWY*)J%aib0ydC9ZDxlh4#GRA^Z#^)+BL%DOe)6}#)f`&9W(bGB2bQ;}B z6|Z{<&g~)*t|V^iK080&@S%zjDXRl(wBMW#opcpWacMDWZJ~C&Onk|+5Bz-gW*F>7 zfplIEm{p`0^*_Nwp_ivV7#Jd`GDeGizWb!Y$sC3PI8i?`V2w#Uk1{AlR?1v;}A5 z?Oaerp+`?j9G6Khi?k}f&n3DTK+HDu;*9a;uq0|NxYCycs$f!;dFi+(4z!1r@6?gt z4SShHzecp;qX=mEHle$DX~idzAIS_{B3XGIho^A;QXTfpC!Z938=dgLZ?L%S?ou03 zF={Emj@$4u+h0}~l$a0Fl|HUM+-k@hK52}X{dQ$`aG7>3`AHB$Jd&_NcvYa5Yp1(J5({Vq}e)y63TX5XYssapAir)l)|iS&C# zE)0x6@nCi`n@1*vN_8vW%P!Ph0Br1x!&Y~a_%-iyw$r{G#6~$10LF47n*I3~aJcn7 z8UPN9sj7*da;CEWoq7@$6{Megv&@sou1MyEK_H^s*RBnuO~hpplwwl_T|?-!M?l4cpvwwhIiAi;5jIe7eUcQP5& zibpG6b{2Zw-7d{rw3CE!0Sd?4p8IQ1%He}aq>okIh~TtJ{nFCm1yD9>tmLjJo#i4i zQ`I-QhpxZ>H&r&~Q^OMG^vr9C(BzVkE44ufKJL zCyK;tpS%i!wSM%z{&aU-eW(_}kIFf{Vyb_?GBrHMWeXzIrLmY98_EcGzlY|OQ$7BI ztmG5ORO9PHit{b`*zPA63n(x8!4-0t)!@6efMl5=Sl!7`fF9!8Jyk+H?Os{qL`Z7b z*`vW#*8O0@Zw17kWuv@rGRt+# z_>m@CFH=%h^K;-7WbXsrH<3RaK*@eU!01R#0huxBxN>ZR3we6c(&i| 
zlLG)?=li-@ewjak;?g!3k%j|-V<}vG`HQWgRJa{^_!9H%QH_qz2Ytit$vN0Uzsdqu z^hGcI4hRt;!~ScrZiPdfqQa&_I@mZ^Q?sm7T`#@-#t)kyL~#_v#4SSNET4t*paNrB zg>8(xR)4>M9dWB2^#b_ArJI1kq)>JsFpl3VQxT7hmc7-@4P9mZEkCgq+!i1x3u&yi z=w5$lL5(~6pf~(A_T@RAlGkrm60=|4RBrLxKMvoRjzC_! z+-nYWWV*Qm;Q=uTLMRwGI}J7zk2xEUS!=Z3cDYb0GIMhtbGbK~QZ-u~X=}EDBTqgh zfkq+q87P*|V>j*}9nX_BJKtZ@@%#dK7?U|n(B~TL-+_rN<>~3U!|T_6(wjQn;`Z@U zN=gn|`JO12)r>L*IJo;$cvxG!?$WPsZp`j(Z{h$|Dw$hWM{YHcJ^nGq*7wVwfF#J3-1Gu_EMe6C6@=D2iEXH;qMfleX1G30v&OaPv9 z(P!B?kdfjI&64U#(9sIOVuU1wB~{yA;JR}!Zu+q2fQmeuKn$(8<v(q(5g3^`RJhgWw(-1&qnNb|_ibQT6-|+D#N+eoi>q+r zkX#DYf6gRF-M}6h7^%EZPE7ccGX#@g0f7(d=Q6jyz}I{G(s#R<@1w1M<=SY@OoQgum`jCfTB_*#C1K< z{vn%;*RMXr50j^uzaKH$bq%~Y-un_iA_Obh393f?1{ncfO;T9<@+yW(lRansAolVz zd8R`9sh|7sOTF-6l_Rx8+EhxztAG>2Pw_w&W&&cpKz7ugT965s3d?jqbx+mI%MkeW zMoZgl553M#(P}qWKCvN1p|2X<_nrqOI904zkhr3C^Hr1chY3vD!Thry=53z@xzs2} ziyeFx>ZBc?m+04^pZ5x!%&MPT$@>ooZA<~?d=;`W+vF6P6y0;lw=opsL;046n)q;tAme6zCG5yg$Feex1Eg^mGJxsNYf$4Lb^;wA}rJ$5eNC zG+wvYP;iZ>1E^OgXfzdhmICVZZPH%mqu)kH^V+{|oqUmx?s@gte3I^V9+a>anSW__ zeJy+M>(A}&fyQSqprRiMQ5PUm1>ki(_HXcuy&!uzq8l#td-;iGKEy8*!dfpsrB1(!q zCnPD;6L9NZ6>}D1mx0(SWuE{IXRZYTox^NW)12FlTS1vz_cxr@X;JBh=N``6WXJr( zOC!!S$G%0DaxHK4N=aKOy`{`lr{i1Id%L_dlo%W%JuehZ*Nt{ucVPM#YGSxl470Hb zs*+-gy`I8Q_@D9pyj?u)Nj-tt0%v|B%zJ2P-|a07;S0)()tZ+*~>#lRg)= z_G;W`l4-gkPj1>ZJ)G}Yp3;g8jr!lV`mYKkH|ci_A+e#&ZG0^XYYO#B9dp?;7=ciP zd#EeF4jjQB=3cuw8JHr|9S;*6nOtY%bWLi+ODk8mo_m^N5?0Y=Biar2HCt(85fG&f zLPu^7eGO7t*BBy(UJYV?fGd3*Far@>6C6##Og1=dVy*T@`%ypDwsxLsH++4(F}AY1 ztAL2{C^v>0u1W9Bw+bIX+42F7iD(iDoyn}CdotZLEv9DMUfSR&foc{c7g0ls!T?h1SC9x=ZFQ9bE<-AXxjBb4?DKZ(k(|SbVW&%RF z0Psb{Ep%^E8t~|VX+4~~LR8A@eqJ#KkY8Bj$7t~{r-*BWP%XNmvj z>z*m!6VwJ^XE(%^i+mjCn_aQ z`$>ZGu=2qzlf#MFS9GljsoM8-bl!YiWzuRgxOL}~fO19WIiV7CH}Qj|lim9Mk(K0$ zjkioZ!^u!gxo%T8A&V9wlYOIpLqkI)4|=9T05b0VpW`Q45vQ2^{QTUn?oEpJFgqW< z57~F!5`WiHv&sd~wySfs*E3+F(o-QEvqPTT74Y6+h|6Xo3`-Qy-j!yL`2|Nw(8#=s zvUM=J5Dy`pI&7l@D$y4=F%Ljogp7#J=H}9I_*K!1N(#&Q<|HrvY=o@@?OJmQ5G~0E 
zA@Hhl1=B2a15C(B=Q|Agc)J6scCUMyeETCE??Gy#Llk8l3t`J&rQP&xlSrCr-IDT~ zhE{v^v2eG@BCA?EMp;0NVVc2F{C_l8+n zU6lZI(&T^^o15gGzo;gjOu=F>2cUFMfbH%*U!e073q1m4_4a2a#5`6Tx#J^~yl;01 zpfU`AGvGpxfChtvFV4ftMv?uC4;iPD$>B<$XnX9blEa8^WJFQI+Sp4{OP6o}kalrm zZsUfUAXUklR9K!p!R&6q7E^x<`~fru%3l#sOc>-r-3A{3B2Wnb*{{872J2W#QG&_< zQb}VHKA=*(x(y9ApjKqS)*;(`B0qfBf=z%$mv8h5hNckUi!P8);YI+E0ufWQvWp#q zHx8ZS6dGkZI}JJUZb@*Cocl)R!kqiLx@TKo7-b4)Y9x)Igemgb)J!0sMlxVwK0x_< zV8d&=P2+0EO~2$x6dp^Y|0+`;b!hLg-?l zMHr>u0>ohsfF5P|t8^ku+SqF6#(AuHXD}17{W2KDS#lJTB0xuOsX|f+v5yLX|Jhnx zSA>-SB$gspxoTG*09WF0ccyy=@56U#ZS?e9b@aD&m1LR$;P&5|VXq?6vZl%xMsr~l z)U`{%I7&0Z{n4KzP=THh-i~s7+*Oe~ahHfrZYp8xKCZPHL?Q6fCI82Jl;tFZv=J_;AjrllPNtt|RgxYF+ z$DbQ#iU2C}O<+0vu{fDWAETzVD5=xDBtbaUpBG2Mg8ynwJ9N!~Yf3Q1hY_P}DF${g z?#giPwx9x5J(Z$-7S8dZ;2%BJ?iL0q_)2)tbmN2kcK>PFzM=rZFLWwh#Uo&!Q*$x4 z`JF$5p)+8TrBz6MvLgy;BAF>PrC|zqxqt-AU0C@9UQL8(MtnYNr(^}lIZ?c@anX5? zOThM$rR8Qn$y^QjjR@KAiUA{)#o5*EnyXQy8MdK;EfqjZ?ijoZ^9RA&j`5>?v9uCYjK37h zvS;?AzzTD=x3n=^UPl-$3Y{v-bNE!%F&9rA46Cf|(;}rXb|V#Ow)^bkea#?|wENMT zkA7)Xsrfvhl&s*bc$o_v>7AXO-kzR!Ke}tfM*5(l9>xO+v)qEO2j44;*|n#E!=b%P zLXrBu@+_sAKJ*}#F4v!h47)uJNp!r{sT&2a6?@pY~ z*K*uapkhhae^5G8!M~RY11b}+Q4E=J%$e;S0#wL7kt6KxH<$6~51)uMz5SU@#AOcE zm|>zA@kKx>R4Y-3CNMy5Jwc*7PBgm?j8gXLt;F#TP*kh=aoD`iP;sglP>4`RKR^mE zFPY0?9IzNoYtsOd`v0k!zn=K6cM*JB4z41pMCRic;gZaic+y6D2|Y+T>(?cyf{l$mJgN zSMJW@Z}8jw#R4=u1(f)&MvqQ^YC|bh6IIkoUqojJ0vREI<0pcO1A(JX?^GgCqkuBa zfeAqSw9p>NlGxeV{iwXTxw+s4A~89@HrrSnn13lA12iQUdPv`a#8iakZb_{=wNj5( ze}DZ~^-SsVr8pKxhT`Y>g|(#A%fIZ1fvV4uhSm7Bw|tG=50U;<7DxYLDkWO6)r5E8 z4MZGN+5s)kS%FT2ow6@H5;jy021@}?{ z>ON4myfI7d>>*b$#wj1_dd0v-E-p#9!cVt<{F%{igWKO+ok^l?edvPZHCZGMbV8N3 z=a{heYB)9#t&asT;4oq?JqOqZo4!>aXnZXVXEaj*Xa_m0eTFc4^923jm)!(fHMwA} z#pL8rUhR-Uw1G*I>WLu9p^ymK$Q%aXL=X~b_p0GG9reCFU&d&(TMYrYNdf1mVvTYI z;IY4MOBWBu8Us|O1p5F99|$CXI?p~$n4Im-aRQ%iqlS9S`~#SHuMQdGJZC{91FEZ9yTngVNeK8$>QGPS_zdHUS^jYh=GmpdWjOjyU( zBl4&o=s;OAW{Q#k0g^Oj_C^(;Es_W&u%v!L=OYH$fqYtY>&=C6a5>v0ArW+q3&CUV 
zS3=ayhM3lD0LRBJlFT?+Jl6Ge-AK_z9^p7f+}1av6jl@#a`%vT($6S3!S9*KD?mKg zZcvHK5n4?c`&x2ab3ABJl94Q4`CR!@+{NW*#+@takE7RI?jj%c(#gxyxSlq1m*RuC zRf(OQzLB!Q>en02m!6Olhc_|@Kl~R;SgJ~8*PwPfTEpj}gj>10hTgPLc6*N-V8<8a zU0tAcVZf)Ai`R$wYSb4KKn=--8SRUvL`>BBQl^_y?PQR@v$XU8e(TM2Rls6=js%bn zst=PP-6CsR2e!{)=@UYu`vrTR0?9eCtizlZkc@IWUAz+p(qK!&0LMz<&Il=pGNyW| zi%<&V`0DmyBF$8bxbV$6isHQ6M?qbWx5sT9nlG7+W!1wP;f=_fQioOfO)w7mXQLzL zw55m&$MjFy?rxuAqt|3tN50)?eVA@Sz`$4E|2B}8RY)jP(W^D@MO#-6RprE?vDe)Z z?U2CNw1`=>O!p#&=OgT)W>>X@HRs2WZL2V%7Uf$XRZ2IFWY>B)lD3aRUF$<;$M3D$ z1Z1DSRYf_Nsn7#~=hFT@k7#M1%4+q|=<38m75jI+oXkm~PvU)N5R0m zf=JWLrYw1Z3mF5!lMTQJ$7#PNce?G`sx zq?SZ$+vO?eKH9QoUk@HN3ZHSnIPSK#ip+DNj|<=3O4uvbk6)FS7b=wL#gQuuHJ)D6 zg?}?HC#EbNfjnVHMcYE1Rgd0nY3*H*Oi+qsN1-?T0egHoVYt;GRB=n9?{&4>;K;Lj zmxD4HjG44bKBnqp)k!mv;dS@)!~&k1DC*Z%+dhumj&m_Kv0mePxVV>2NT<{@-o&{4 z%aCUlhya~yYE3BTt46tAC9496xtoDp=47Lz1-Iz#XcnBw1f+kmSS>U)Ev*mmoc!Lp z@X&yx49K&(Z0@(;Pv<*5J)HzGs(bw2j37WyQKKD6%u}XzhQ@|CFSw5-mVo8p^Rn9C z5x3&rB%z*%a9%`4{9C<@r}0%ad68bNa} zBpo_B!oav0e0duf3TnO5aN}r=5fE}+$8UH*5!9zP~!J1cnwdSh!S}JGXOMcV!}>KmtwT9LH9Rs?py zm4{Pdi~unc5EN{9h^!n#s@o1mR+bP3s5jjj#$e#Z@je1G<_nrE%e zlGv_4!btl2$A;s8vXveLT-;gN9c{`(Ls6dMG8-hL9vN967@&W>`|%m&76WGa^?M1~ z7y2-=Pf0+1a?ZVJrn#xLtOLhkW#9LGW|zt)2v5}yedR>cgzqk7j5)Xs2^{n~ki|Dq z!}}^X-~(}6zT&Tx2doojf$Gcz!{`H5K0|CRx3PD!DUFHgBS+fK#iSj?bQ&R5I$Ecl z7{Y}vxY?v$=Si2AEE2r2iTUaQQU~t(?#+sTwPMxf?r@^KW>Hmt#?gbNAznYPjPbKY z)6LdHV+Ea{A3buC#H{j0-m0`ttSoR%1qLH^B@?*sFGLimO~&<$sdi?L4CC1KXngk~ z)a?)o{X`mO{61<-o^ddK!=b!Sg|Zh6)SdJYR&RUry%QHGW;~9WK$<6QJYVL~bUy77 z)S08phsZ_am(LZ;jCy4T-=xub&h{?NQ7h4gkWeL+K}|{>?Ea+7?KQ9QNP3zl5I8)Y z5v7p_RJ!M%*_&Y&?XS;mLmp7x6NdBttmT9E6Ra%iI6pI=^?aH55ZO+u((?9To|N8U z>VVIWaz!zXa+w5HMVV2;b3-&|=$=)-b3f%;S_k#`f$e!RUr$$}z+N}_rdc3ZJ)L;? 
zO#6_KVC2O`Jk2x9c+2Iep0v|B9q|HED>PU~&LDPYH{8afEewYEtOsg?vyw2GmB`)K z-Ae1z@0vJmEF^~G=o89V99ec1BjAmh&Sfdp0#~Z8G-|jTzA~|7%nCstk`_s*!UvwmBd^{6eN= zBC2-89 zy;2Vadp}vUl8;}(u*%2-PjHw}X3(cFo2-)&73C zNCl{V`q0DVq(GMeEoBvmO#w(l=|}?QOT-X>rTPhuVIjEHgQ0~VHjVpEUQ}Jn3~<7< zi{1;t+fNbI#A3IMvc(20z8!Yq4;9?E`>u#_2MN0^hiWUjI8aYOg-m-rp|Kq0+b=J@ zhyh^^1lNchMhs^6`qJJYW_i(UDBHfMpL*~vIO|Bs5wo)M3U8RPwE95FO z_f9bnM;bfSr|N?h@M&-=d||I5)3Q$ZF`}&7hQT4-z%ZZ}09+})NA$Y?I?7ZiBXHK+ zZ+~U(kEO*(6L1-q)F5(6FqSS{R<>m}me&ePNG0sbZ`D#A`Wl>!M<9kVuakePJ&@wh z*ZaeV{xGpk$PVR5RuRLUQq?)*0g+-FUVtQB&iY%v*frRq5+E}0mGO4R8g4Hm-Yaa; zgRw@v{@YWx87+4V2zvj-3@%}hCZ`AY_3)X-@K&$nAmA^dnY{25 zbUN1V%DHA3W<@GmETb7*xTTjQvl&mxw72WL9*F_1eoBLHdOvjI1b$gD;WV z(fyl~GYvb<6#i!)sf}Xg%G`owLl=LJmwMy>U%apH?=(i{<7I1yg=QZZjX{KftwcWK z@C8f(DYv%+j9eoGvn`BfsSB#=LmG#3Q(-W%LX$qqhxIcis;3GR2)5i~@?UI`(|LQp z!AxEr?Tzr|7>N)$&*41Nl`uZ)T6$iFqIg=hfHqUNSI-xV)h#1x%+75bzT4Iyu7X}$ z@MDvtS4hM9%Y$vU50{-Ohb31QYa87&=}MY%ML&#R9vM6QNcO5K7a`g9kTY5_Eo`rn z??-%dihL=kuTZSqn(NRL5@;flz=RB6(4J0NU^-OSGFfb;22Q+B#zTPt9=DE`gGN7p zpLtVY6MXr!xQBzUhzXt%NS0TH(p&e3%zE;>RsEQ?`)(_t6^Q3}R6pM3XIZF|j~6Gb zW}Jf}8a$2{PvRGru`jTE?8m$i7HOsHO?i0$+$v*k_f&o30oi0iVrWP)IibW(OwCxl(&3!@I>LJtm)A$wvk2_d^|lC?*Kz~P%68; zayt#q6Q>Klp90RQF|-OI9{k){50kRHSxG?vuInIH;vcMjfx&>j5Ie1QD6yEr*HC68ZYt&cjX-JZx_)159=sOg05c6o;Q9y}sjTxSs^;?~#;I z@By$qG(^#x*&ckP(P!P~+UR_PuBJrOzZ!!3e2%tr!IGW5RDO*N99-7uZ4R%E`7)dB zF(c55RazMGhMFG7=zu|tdE+nlj~%I^2E;#@KA0{#16Y9~`099v4@p+EVsDt%|TCy<$2SjXbti;Bu? 
zqLl3$Lmh;Lqb!bzsoVwc`|92x%;z9NiO&TDqL|7!xNNX7IMg|Gh4*9ZhnIw}KBcwg zZ>ASTOw84eXqBD4FH33t?$Km1g%m8hcr|^rTD|wW->G_Cdets(H7;Z|Pb(hzx$fOS zJcr3ECf{l*m`Lmw2Mb!7bw7kJ+*ZvmjxyBjUJt3{PPh!^J>AZaVYN=r&3vj%OJ5fq zN(O@k{qu=MQkq<9q9B09arDjlY|$dV2;wm8QW;w#dKId#hwgOOZtCP15}2u}Lvf>j zNT3i$rejHjzGu*KIc{riuur!CONvtNHzfe?wb z(xvpmoad9$lWEJ%hVqN;9ZwHOyta)#Z#k}cC6?hz_Chq;^91@dh%2L6 zot5cCJ(Msbm>}oglv`MU)@*rb=4{m3?2_kjK=Hixxn_}8WvME?zGSf~9diImE{z~| z2%mJ@XB{%6CCMdMm4L-68x*>WSih}=}YD{Sm|Hy(JbSg9iibQRO6^Ji4;z(9St4<{zTNl)v@u% zyxhP32|*H%2JcJvN&ZtA{T9_yzMV9c2rT%MR`kwR7`J|m{|x#sszo)>h~~zltPd&K z3Z}E)LjLwv479g$M=F0m^ch(QJl&S~11qez^2)SGJn^y=zZ(6_6Bh)=&AZ59jxE)Fpdr0 z;aGbRA?DXhImbgJNXmEJO}Fz4v59!l0TaTf?4RPVt1>D5 zNq#lv_lU=&gLlYXc*PR+>!sRDKqH(T*Pm+hNTiL{SK|KZC?jfcwWf52VeFrt!~q>1 zOfgHflbo%9WyO#v{;SbZ$KZ0R8$5fT-!G*{Eu!30!cRKWNWbOZ9sc3>GoO($f~)&v zE+3}+>B;P8pu=&NXkJc%p0ZJCp!&T+a%sUkY<_%?LiMX7_ZdJV^yV%`zk>edw0a}) zr=v8b!PV;H>)-zLB&G!D@a+3MNKN^LvHmIj)W3(HN)TNB|EJvlpK`xUMD{aCHhMU` zFL{M&Z>0gtdhMUdBLb@_2c{FAHRK)bZ<_;y_3R-mh~2~*_RzUg4M*7K|6Q@O(Aj== zYv=gy+6T~8=gT(zcilvZd;`Dn504^&+MnMGHUPR{cQ~h{|4iRtQSiK1JJ=?Fzo5cI z1Q&I6(AXgVyRjMgW`F3Hb^84s3s^$7NFj-TOdABL%O9-d_XcZZ|E}p10XJsfp_ux6 zS>IFe#o-RMhS;A+Q(+oImkR3pxj*X~f&?!Cexp?H{P<6Eqr4NrlMW7n9@?K9V|qbb zw~%fl`R_@#z}_mPTA>#FU6PNW3qOZ=#P~n=h1%5p!Ee5Qmi#j^S8yBgNM^S`rX6%M zJJe9VdD`&skIi~!1HQm=hvW75^97mDw+F!f^M8H#+wOyrB(n#^|7`ul)Yr z#2a1*x*uTSuu}e=Zb3-k(T_U@MgP50j1L%Fqi+&rK7THYc?Z5&nU*c1@<9Sf3lWfu zNNi_ZoP<8NogY4egnupVvt8bQ1_;^=9cv8QbzE3X$Finfz9eCOdomTK zHFUaskFXI+2aBU8_WQ8Jze(}uu_7T@9c-3Mgn5(Z)Xo=^6%A3uEmpBeDGO;`zF zO#SB?zbQ0Zey=|)^rywsyf$0R&o<^6tHKx&%MJlv& z1kN{;DvDfQDZKXioEBf1CU_0Xc;=mRHk&}w#%xM4iCseLqZKo?Yyzi;9 zO%QRKl0BnAwcjpUX57m37)W52C&kQPq=x_g(K;iJK}{yz<5V+tiiQ!j!GTP~2QK`7 zjtv;0S+M+<17qHm16<4d2R2b>Yefa=;PAjJ^~mem)@9KLalG)Mko${t_d{tyF7vpR zp%e{}(4YapXEr_7`xb{Lm~4w&)gXF&RE6tU)_f>)vEPX?wD4i$)zk7Fi>x3%kn5F4 znNH?&ES@aa7oMweJ=rpj!TEmCQm-;yYr&$$ZuV~#r0(6 z!^e3U+AWb$A7>6wu)-<20ZLPNmXyI*M*~O?;?E)u+aKZ$r!yAD%iZDZ0JBXgcR{ok 
z>39v`-ey^^l@YclcqP(ajIX`;p92>45Ugja9wA9f+?&gj*eQK4U7#Sv^f?5V;|H>M zLWbAv^h&#C)7uYmj2a4MZMUw3k6-yGy&LMlem2SLyq^gqjik~&FU=a()7{2{`9SF} z+YhwExo?yFNt|ZMSzFJ4NH+r+6e@gOhiJm9!vWSvq7f02 zPL<8I6l*;Hys>{4^*lRrgl#;@Dw%)e83GsDjIoVb_OT!9Q?9&1uH11OCl4i;5AY(7^xKvt{WZN1u!6;DGa;Ujnn3?=>Z zdYhL;dM%A%j|69&zxz{cJ&{fJI{1v=EsJP8M9#80m^2XvvZoHQ7AB`V)8nA_dqWg$ zl;cy`BDsGBH__jH{;z`tY;rw0*lEzHG4u=tHOM&0sQS#fP0+E3o(_UKg@`u`t-3-t z$10SEsE-7`sa7DzsjP2|n>12h@Ho?L*lpaF`@-Vy-f?tatu;!Z0f5Yuy%2$Q z<$Hzx*LeL~9nw<3x{h0#8Q~eV*uD`nTF@I2x_T2^LGV%;$oa(qcvwn^23s7;J10ee z)PPfO+ZxMxcYV-9%I>tQGjZ=r2G{_o_H!G~e}t|AB@dd%9G45#CXq;YRTpk=q>o_W z?%iJtj#*bLR^>HYD*P6((%)L%8GGOG=Jsl@o463*b-E2VVikp?WrxFNh3=zWE&5@s z6ql5RpiZK?X8pI2z`unO16-uE-0$P1nxoN_Qbu4;qme7ITkV$|6^%$1p&qi-BdHOk zSW!Y%t68K5#)d*go>A4$+rL?~cg}uF*D24@&!$SqapVu-D^orY`|lPF z7&sqnU^4bUVB*3i1@jT$PaQv4o@1B|;=3POto^YJsgpsw?#-x$ND#i= zxzR|v;6_~?%;6Qt$;zfX|3KOV)u>v+{7$=^(s5~OZDNnQB%X%wv5*~L{qI&gKwFLW z0Jv8=WFa`rTAZD?(F^yoRCw)Hg8Bt!9YUG~>BiFICI8!@{GZ42Ob$39SoRA~wy7w+ z*;rL)#H1bn9}`>eDXcOLF#Y}B(}MqV`cshudj{eKdOG7Emrm`pJDU{f_SqNh@Hn$r z11xW9@Wy307J7J)Knw346AOe8t!c|T71s;(+z!5#< zDT(keRbh*1ZGY0BzlB1RLkAYrN6sgzs<2fy_KkmrIcNIlJ>a%$_YtSQ^69*`+!e@E}m%`%6E18Gxm9&Okeeg5F>CHhvGmOMee}??Eij2 zQZV2#*UUcqIY72C#U=+porrDM!G?-q{$>BI%U;Xb%5tP|0w`8_&sK&=>aGNyV|>}M z0AABp6L@|)&PxH~05r|HWV+fPf7s7h-U|#PRNu=E7+z^GmxlxlTC<0N?KK8PsJ>VW zUy9W@4CmaBhG6U`fX1I>S^W=WaIBTp5+VuWd*2C0%@|-q^92V3_0oV_bGY=5RPqbZ zd;@bv7ApFu3CwhUGyRrPND%%%zxUqm#8zLeN$K$fAKSEg(fE|#YwJ7rZV2nnAt6d^CZ6=5jYAnyZVH}dY;B= z_rpLQ%nkjSZA?dClvzXzzhpgqX9VEpYqh-gYrUE#Y1A3+hrybPoSo%t^RlchMr*fw zZFe$xGV$yyXpszqgzjelEcJgrECxaZ9cPtfMlFOcHz+AL+%HxW$3aODWu1;)A2p3C zlC;YlfjChR$8g05o&|T6Dw7c_0cEoD-2D80pw3AR6vrjSi*%dnbZj~h4zG_!h*^nr z0I2`{PYhVu9LaFt-T`F}6fR(=eM+{murjrsR&a}D?V-m2|*#oR?so4=JJP*tN*I5e7 z1{?lr;JhnB6v*(z&dod1 z)jjw52wlcSR+4~{p@fE?k4R^v6t57;Y&c%+-KlSBAq=9&R8$~3%~)IxST@2X*EH6 zB~C>_`fS|~5hIzqH9O{H(nktQz*yM_KKZNzyjP^80~-b>NmY)25|Vg)I3B$d>^d`};>1;oq_T z?-xNJkz%Jd#`oZ9uv%ck@g!h95j8J1{#dj<*Z3Vu2h$M{NHIO>WzP^=f~t8bWXc(l 
z?JrNKilUq3+E%>hUrwERQIyDV@DflKcajOtx}eAaFDf0Q(iK5|tQY+G>!Ul`c4=roPua zU8cfY3u%yf&pvQG5B6_U;U}LV{BQ|h?sM;njLFzUDqy4%Gl4rp=MWO$WDCBN{-ixMHyYD*)LW;)6BH7T6 zPH1^hmi3LxV7H(KQBF?IjpWuy+^bf?o~c+pyewm3hjISi*t=^W&(RoSgz`6qIwta% z+-6gI!*4D@lXTdDi9=D|S_PrD{jgW?KZYsok5R+j^$3BBBPzKLRX-VxV8q2R)-UQBC%>)w;izdHTAwt9s;9GJ5 z^0AWv=aSb1yAr+Odj#2kZ^>YAn4dgXCBQCt5~5uO&gbq<-{(8uR%?w7Gkcj5Nx1wj zPL^Zb?*?f8O<4X7&ES_{KZjpQFA+%thBX>bBz%$!(t9#Io}PG`di|EF25F9u>KG>8TEZ)Y}Z`VK-ka$WL&&R zI|v>9;OwQz7hJ3Pd#wL{u>|P;G{g-4Pb9>33P>|MLy9*u5Cq-eZT|Fd-8XYqAc4 zdOSdBqYv2Sj$_CEX4wC|MUh6B_2jrj7Xvwm1=bExXh|_0tPis@{@0jek-LEpj^%cIf9*nf_Fm1kZ9BFBX)cr=VRXFZQ zY*^I7!BG5w4@LgZ+#Aje-l7QkAS#(vusrxu3uOQ#N|f0W-9M(F4YR~egBV=2!10p8 zxc&eTC+oBw8~=C&MT`&L zFIY_ae+~jN3`6^qMa^xavZF}^Frd2Zu%kN(G$E&>2V!7E2vn_sZJI2dz~t2Z8g zK)m$(?{)RB7tHct+7nIa~*ko=wk=ekFQuC{a^~ zeMmHH>>vDO7!>R_veuu2nqcF2&Qn-T0E}yuD5mm{5q1N|DSl$+tM7W_d+4?9(^0^7 zo?dtN{l}1yVGdGe>c$y-C3@FiFPKY!^vt|XzWy(f{y*m+D~uPj$H{-Yy@-%tm^&A< zkWfk!n}=)0{9{P||6g=pf8>I4zJ3h#4FYZ0FWKlhPz1?}krGqIJwv5$uLb=6sneS# z={^1O@px;x^b62!g$F?A&ajx^gG6QQJ>`Yz5zRdjwxL)5e)xaO^-Di_VzWUT)v9$g zv?CbYkU6H+|MK_;`;RZ)i68*Js?8L0yy&8J?fJ6X#0`o>-(j%UpW|IVT!I_105qnv ze#U@ekHZPW6iQiDBtJz21~p)qW1H@1;#{B4A^^fv1GF&%)yw(@@^)Up+QeU7p7EJc zJ)bwJdBU{Ya1gCuYHR>FO~yh1c?%Ryox2Z2K?HgPftDmm8iFl-jrGfaP8>L9j>)O{ zt^%+d3KzaGnd94jEbs)aXKGS>_%RH^4PH|*b9Y?fa&SSqDk$}dD!}BrK8qn#wty(e zi(&LXkpr~p+p^|Jz9yrlKbzbKwYC&qpumZXR>VowWS6_Vfw}g_^At>YX}bL5$Z1rO z`5Qo25Y;Kk?)KroTX)rV1@t5i*b3Iyk`(?O>H;x9{Nxif>6$B-ymEJ_&h6NVY!WH4 zWKmaMFj6A&cv2!527senLgWXKb`JIj1c!S-1|4M>tv9|gRi4rCXS4Vz1#(>zOx);Y z+CcVD7#T|rfCZT3_7tRTV~P#`0iGBc2gDr}rv${MG7+0(1Prr9uzOwpy|3Y_;Aw&& zUHw-X;CW75!VIw=@odgM8`BT~x71@8l{b7Ab77KoUrD>*VHjMad_(?M@>bawO#pCN z0OcTj!NfP7{OtDp@^1H>SiPcNZQX0C0I90Q+3f5ChvC zdx_}Jfqj0uZ_%@j(N3r&Kli4o&+6O#PAZEW!q%!RL&Od+mOtAm4F1Rz^%UITVMl~G z!vH|2qR6=esAJkc`tI7FXyMP9`ylZhXxIMUhh-N32=EtuFLoR4Pr!9EU8q*+lZD`T zZbRP6MML0K5(@eb`faSvdJew@s(^GF^evep%Hok$8b}3u4GhiL>{H!C)q!*cN|TZ& zZVKI`yXYWodO)FQAaW0dok;~RVv^#&Cjv` 
z&mg|M*XCDNzm{C%D0qv|M(D|iEa)y`3ku%C0)Jw?*!egGX?s)76h$;8SAq<~s?Hu_ z&+xPEV1O#Zj4WX`u^cA=;U{)8ycqacfb$jCGZ29rlb>kr+gq|z)z2%KJp^v5&($x~ z9WBsdVNnEy<}E9HMGs4p2PPUAdaP_Z^UKV>5J2E)xyua6Uv=$a5^qP7PtoR3je$rq>Lq0D>aAQYE zz*L3+z0DF2#L0mg0FpfVfo|FT8$X7M)))8gW1tM}gJk;)xfk5FwHkNP!H9bsG@*(G z+dVg20{YnZ_g9tI>kj}wGawAhgMVugTm$@C0$^^0%%;DmxHfUlRNIS+yOb1QB!9HJ-4QNh@6ndpHf8ttOh1J3^;G-7zga z2XvW8IuA4-w1_ExTJ3l0LBWihg#!?zzg&S9IM)ZaV1lsFXW&5lygy!P)AAq}Xmxop zpc*Raim^uz&!Bu^CTNTQ=0Jo5x(C_vyZP#37_lx3m!IH<(xC0!fs8}61-OeSTG1iI zrBQI$1i%YIOpgKOc)P{{^a1TsSOc9CHI}cD;iqe&eK;`6W5Fsyry|X3EYr(k& zR5I3Fl0}5nOJ2SEYWGM7@9dg6HQpKSa6Z_I`Hlp9+^AuzGH@(-9y0W7Gp9Evku7ai zVu#Yg8c+dgxmevJKyZPSR40(B@>Y$g{N*ivEHip?ToYgdqo zQd4_HLa)Y6<|1tY--M6jrL(EFMZ#De^E3QEpVksj^gW@YZSq>_AzN<6uC{@YdtE1G zxD$MxI3B1c_d&Bz4O*b!F5gdlVFvY=J++(99#yMWwRxL z;EZ5#2P~hvZDf-%!Ytk~!dqA&KJCjKf94) zPg+K*JYM;>J>7ScQ*N4^h?yKEkfPTeYAoIu1$!9M(F^l}SDb@90Na(7+T&MC_4WJt zaa7Rk8K9uPeiidkTakV~ehqzubVK>rTux%tjfC(zJmnBgBxDaVZXNmd-hdHhJO1hOBl1&oNr!Zbw@37E{VPB|sZ`NiH0h`5R zazYkAC^LCm$H>h&R-|GYa4%5H;WONYmIjF{(X;A&!X`0qKaU*|qt@xZ9S)X%HC%D_*HwL}}{EzZt)D8Zyn;{P7Mtu7_b$jDTc8a6^wZpw1 zeD;UjXWuR_&IaqCt@2QYwkFFSQb-xPBuI{~J#`)gg&O`(miOM9O9TWzZr`f@mq&I; z=RfEqn`9n`i2^;U(HY<0-&^$K)#{IX&mTlPF%v&ni|a35OFX`u^ZFyVPqY(7cKyboWyUd6ieTg69DNw|$_d}2IW!!W9knPLLLBMg}1u?;vV?Cs{(#O*-0XKzI?Z;C6Dq1J* zR)}YBc?wC-yd=qkZmv`H=G}rPzrV$&?VmhS%F!;{_SlMAUY5?f()@Z<_vtDLm+i}Q zG5=2UjeDNOQkgb2*6ve!){ zJpjER$6_a25XcTwqzI!%pUY~}ta!G@{G=22Pz4lV)HHTE1B&jk?GrpCt_|oMI-vhp z?f(6^@$vD%Y?40;=Jbf`VCubf`ADCqkw>~52JGzB(V3yhk zx&J$IHnol$Btzr~hHtSGvcW8-l&2%GIfqV<)C*XmC&>TGXviB)T<_Of*EhRoV5~H6 z=AC&fqgqZ60*P+cXvmWCJ<#`lmOQu-x~qjiGW+<8wIM1z|K;z24MTnwQVf5gSob^_ z)xcFu3o#+-d%5}c!?%c7A1_2Xql!QjK{0fzR;QTVo}U2f5hdx%+rmy*(mq zIjZT~qy9_R5B?M!sHw`(pVZ?@uGZTru_hKtfMbm}|7IVXce;+YUFkhb#k9QKgP!B! 
z<6hx=PSFS6TF>&Sh5^s(40}$|13VLUT=%l{MLnu#y{62RGwT;aY1OgKXX#{5knH&W z<88&)&F?_7C}WU_*uRfVQd=)F$Sxr`rqWUlx@YvqDU1HCb zdlQ~d?Gf4P)+L9o2o)r^CT<`h)7|9mGS=!XfnmXJIY<-X><~bnrmn-h+{z@TNL>d2 zb(ib7X&V=nO>&d4oNe9v4d_~xLd48)MB97tmlC$%1yD)DB?&MPn13z|EYLGjx|z>2 z@2+}%;G09odnS#Do`i#%Z$tN9W6u;L>_N#h0xg*WITf@t{ zURa>#x^0{gX@59DY5V8%g-2&?N{;=ZzK`0O_58E|3T^X4qZj!``; zkB+=IEX2db)3+wv7CR{ptVs|rfTAMJ?NQ(A&@h5=j|C2VigG?=yPST~VP(DrT;iq;LXFs)D9ednyBMo)%@F(M*Ffu5i``jZW#Kzn|nN%AP>@+7BjLSiJi>QnbD810v zaKs00@|im|F#T(AF&5m0hBK?OAAg;Amoq-oP~nQhv*;dqVy~bd7R|B0i_Ori$t;OG zB5mrz#MKTLr@jXgiu6$FQqA)hkB15N`>YR}9#ts78;Bfr229V~+S@i1)7+`v2-W-f zDbd$-&%Z2P`#G^8UvXKN-H@{;;lFsX>HNEC4OaQ-mJ4TV>~lF6)W1;sEP zKzHR-Dg4Ko^*K#V0O;&}op3MUlITm&`xQT3hF^8P(+u@o6|dr^sVrnbSWMV!jEiz4 zP#TxxtpJ+&27NC&cEoutuhA(5*q*3+|_&s6>5x z=o3gyScO$Qo+GH_0^5v$lvTj7797hIHj9DZV;FN7Fd_U{kyQwhLit&d0Ii>2{A=Gk zo)@D>EaN2)5_y%Uksy~Pf_bbFVZiD;E&gm0$SKf3Xm7sflh<7ocW5tfxn%%h%6kL+ zT`Nz53sy>E!xT?{;+;-ZXzCyJ11MmO?OP!iQ`W6`fngzvq=Of}%tT}rt;$New*h=dqVjkR_9fRI zQ%u{aBk1KINNQBQr8D8pJN#QgPmto0z@Fh8)Q5yQmVMsjI*6zAf;-4=w|<6Kv;cLQ zwH2)@FGE+GO@g9^!eM#ZaipHga{>co;2+pmSQN@wDzFV#KFUqVN|{zTcKU%XHV#J^ zFT(AsVPNaCQlv-Wuy9K6e{S7+0{SPEaSK=j@Y{+Iu@dqiw~B7cCMi=%&&70&QGc?# zdof%s*r3Pse%J@WQYj3Ie)s{BgD6B!<*_W}@^wWirU?q~?IlvZ6$ly%%l)EGk{12; zpqFQ?wkL#APoCWC&lBEs-ux9o@n>o|wuy5e33~Xx-IMZ)wOAjQ7hhE)o{@k5P;l@A zj#kD_@&|j%AM#atSGBA7TJZ9w$AC%pT5qg&0dp`nGtpuGlwb6PNd6ECpCYO*Q0=)0 zgfPnxU5Dhq7DH^IgLnEAG~ZzZ{UJoMV~87r+=4{nZg|1(v?z?yIQRlNgpi!5odwT4 z9$kgcx+(jt=T1J!r1#N`eSYZ}=Vs<8V?#XrPOE2UsJWcFT$LqM-9?Fhfa3l)=lB;U zgD}TH05xty*>S3tFnhcBUBCD0O}6L-;s(} zE&zKX_*xY3&$W4A25cAM5ziFm}Z)3muEIYbr&3iPE%irlA8bvHw05nz+ZzX zwO2|z{n6Jq;Dr~Zy*v!hn>EhQ#1D~gTN(Z+Kw0+kFx9%tbFGapW*sEykKI{DpCvzQ zpgX-M-Pf;uI8O8VNxo@`^$6NCGAUJ}b3Da9Cqnde;>&f)PYaI)E!rDD`#0*8*u%K& ze~8!=M~&w689XDJ4lsUB9}s+P_astT^1Z*L)VTm52h7-aF{K`1*>Ua zoy}iGd6RUo=r;z1A^Ev#v!f5Me+HtohS+;y#7d?Xt$&E^5&CDVxvY_#{7Y&@`Y_c6 zl9+VROX>mGKZBR>a$+KoTD(quTeILEQvap2Z1slMNPcN3ZYxPPc|*NZwrxXn$0#t< zr6sun#R4RlVE6<<5f4{s 
zgF{aV6Wff2s#ZTnti;b2aeylPIBLnbv#^?iYQR|5&&OP0I&owjh130I?XD;=$h1wc0lh`-wQZ+hG@<_Ww$~+s`Q*d%ZoO46bMPUa z)$2FBV9KaidK6l=2MG&IH3I}P0mo*GC(c_eCLy>K)EO-vo&#JIOkpBuQ)G99@XEd% zbQ&0|H2mo-bk;%37!mv(GXK>sz3z9Lp@EK_Ciw*zT4jt8a#LR%B^#s`29@*V-fWS~ z2bl7<=m=2#GCsX~@vFWYaV<*=Wt|2*6?TJSyL-~F&PBj%>)?gPiULq5iDgeQu)k+G zN@uPC%aEt^S|~(abT&BECFzYL;a;7ThVSd!U-ZA;+kU3Ni57148)~Nw#~kthHbXRs z&|~@E-6tk1hcNSOPBTUu@ww0~T>r&Z?SihWufP{ncoYD7lc}de^mFiYbKE&>ZhtQD zJV9Hv6Ny$Jl3ib~yK?Ma{}g^3Y3Xv53PKfXuJJ`b;7CW~+gNG{dZTfCHZ!y7Jlf&S zgdMotpHy_mk@n_5F`A4Zg2=qwqTxxEuJztwY(09AubrXBasySpsf)OOkp~rw z(1hyM^*B$wbRmv<5-m6?NdtI|%-E{kl<3hl;aQ8qxRt}4o3c6b*>>ym0xx0?wg;lL zyUOtn#7SLQKLLpmiUG!!Pxg!>0DfbIAXOtJU@UH9fkj;CZ*m4PZhboJ5XEj*7ADZX zu=;2Xc-rfDp3)K2DpGk>*qQvIW{|JD8@$)W`oJ}5R+n;R8;xIHtrLeVDEns7Z!$3@DE9}o4B zi7)2f0daGiuqDCVr#p>$Epjf5h>eA_&Qh?F9`s##{lBgx&dmkR+v)L38?0*6ONDUL zd7X3->h{+WJkEr1P*LWSxERJs{__fzc~yaM##xNNz5Xd;^{wFmv4Y(vaAAd>4}?h` z=) zmb-AZ+P;#y$SiPPi-wW{O23XtZisYcuK>7e@3RuveHAeA>Y8JRzy=!_ zWN7nsb6Ieb6K0<+_^>V90ZV-h8`0UrPiGU+5s*(ppMJE54l$-m!qj2Pkl>Nv17Zdi z&35q@?Zl$gR@uP+Se6(jxC9!HX%Py`E2QEB)0`Uj7{=3_LZHQ;N+GtjFc>~7iI%Sl zkKOy+411VTL0xZEag%!JM(UJm6pyu-#xD-z}i^~V7VV-E$ z65K7t0t&@j(8v6KDvOfW<|JS~Zj;L;0KMDZx>rJV(0WV+jBr9TsHwUhnz@hw7IxA} z)UH9m8UO@dlY)d-^I#HZok1}SK9i@Tl;n^2{Qxumj03c?wIlE~TVnKy(3|BNDuEVJ zehMAjNYx05f)XSFErOhAm#$-nh3NKs#yAl?1wiR)1l?!#HU*^`^;j*WPg8cUo!Y@Y z77B$sDBpjNWnX#Nl3+4G$b{vVq+${#{$A8yefgEpzUV?8{+9gowaLe>>9D@QEBKv7 zJFK#Ux*XQwndpIdb!gL?@CN{Avqo*n0%Kt#3O^>W<)e#o9 zI1?n;p0Q}EO#QA@opPHYb2o~Jl$D@A3$F0mRBAQjVzY&jLLJ zpj^fd!7%Ao)<%_i4!oG%Iyd5ZGoxTZr$LPjFy`6>ej3>?huTeeOcG?5Krl=YdqL?f z_&SkK**=6*zwp-8usO~7)={R<1ZT3zt+#zFu8xoPjlO%+tM!|f+IbCYeYcjbDcZVD1q$>9In&}gStMQBy@7|@)yT)@m9(}nbm>S8l3`h92;qFsGe&s)5@BoQ?6y(6 zhy|O9-NIo`r1VNX6)dR@6eLx$J0M#13ezc2x5)_V?mwmNGoHRJj?emSkU=WIY5H9= zJ#vabp3%nY#A%u66$hg#HK`D3&ES9@j6oJXG~f`7$F+}zbgl~p+-j(x6!r294N+-A zxke30zl$F`VoGO$=x<-X;|^D;)J8MQL8X`HAjV;=G2wYX2V#8fSpso6oh-Kqb_ z7gjxkVwYv%ZwhckpCO*Yg}N_Cl&8sx(lg)iHs}ioB07Qn_LjF_iY2*otR$8^K?hS5 
zlD!SORdrnl%jF7W%qgT;yoWFsr}{_sGS}_BDCd0?Y>}$L`E}qz0f9Mg^qqZ5uL=ef zR?9NfWp}dT=88y#fo|%>clIJp9nLrLT{M-?O{%EcNaP{f(bSG1Pl97f^-EfPztudo zxeR3=F}f$#_5v^a4H)sa4@(ws?Rp}~cod#oM~U@|nh(uCDNVe%&44wx9a3vAW}#h0 zn^pJFl(fA2#F4n15pjZY1bSsWNN;fIHD9^=JP%dL0LRG*fHjJdAlN>MJCtpXJF%ww zVBLBYtdd(4A)q()D>3orPV9OL<=fAl2R&IIx>q6rH5z*=2{C8n6XN6I_NIU5+v(X) zR$6vCm+@tFf=#j$#C5+DIcd4M^OuQ$vtd4-;Jed#V3(Ntous!vvWDpA=Zn!S519=E zZi5BsAdkfl?{?l9e2M9~GQOE0Ums&Zh=kgZxR@Go<*mKZFwQ+{6l*& z#o&{pV^x@Ele02#TbN)y{^~j7J4t&N;foLYr?Iz{a^;b9`}1WiqnU~>r_Va)KxI%l zFW2L~4~7qAl=f{6D;+XsWtQvE6i@t*ML9~!SIJLIRwgQ`oMZ~5Jo^_vSg-9B`$8_j zHMGx4;)j193^9IL6<$60Kh!SI+OF2m=DXge->(LO;>{%K%00Ks9TBqKPb@e)qwZ>* zHQOIeK4Y6qNgPUvUU&Y+u1G(StEfiu)oR6aXX`5kgF(_E)pj?hZZ^qm$KJ{f(G$&J z;76y2N+!|78@D~^uD!#%&jwG#Qlyv4QmVQS<(Q1UhpQ#{BSPZID>}ZOW*i8E z{u4QB%1NXH8#-@K81?)2&UZuLCUsRLM!jb$fi6A*IQEa^@vP_T5!`aN{6%HRslA`8 z>OvO83ZkVCjLlDyQM#jIER!y3NAe>Wl@o{>WrSh;u>0CeQ$RH23epaa6*%WhhnTqc zTio|F`6c}=n`Hml^eoi^Ec`&U`k2>(9Yi6t9UryvWqC+TlxNDjyI82f#WSM%#314H zv{v#50n+!@7x!!&$-^)uzF(QQkp3)tonpIHRl2&a`%~NE%rP6pm%_rjxmLZi`-cH% zIS=DzJ&Gb2wT8YkH9Jdld>G=kLNSEFG#;$q%~t;4?)VyEE$#Kr)O-j^>goWU5IXdi zYyMfBKp+mY`}VS4?w!RW^T}V*){qYeed}-ga6Rtl_tLIZDl4fsYi%HQFX|uoZ<;>T zmlQC+=luIKc9G(<_cd}ii})9XKrJp^t+M%39rYo;4CY9$U$7Y7vYc?ni3hn;Zw3V_ zUvi3TG#WNdHSIN1>Wkg_ARk+vKeSoQ{XR=(bP6seSMb>c_YJ3(mS zg39BL(H|b438db4lc#l)P>#UsTAEDjr})eC;raRxcXQ&L)uE;^R$(;#Xt~Ym4Cd0yL6tUh==ax@Kldi9`_;LRE zeTOoqs+<=-Gy<+)_T={{y*?EL3Rbu8a)X(QuU!U>#F5`DisX>Beql7rF4GeZ4oR# zT4&23{&L`!r+7mc+|oZfruVBY0?4?iB-lz#R0>t*0UammHU z|Ipoqh(pb-B`)(^)y7K4*HfgZ?2XOJXBe(KzwCh1Y+bxBpd!K;h}`5kJP)EDRjm(P*SO~rhc~g+tr5`bm&44oX;e;R z-xHgL10iD-3VlTwKV@h4>CK(Ca_vTld>eYPeTI_hjvwX>YJT`(Obg+`s6PG`mcTG%5f_YT;U!A<6HEX;Y3UEGl$Y-ZuKGEs)^lEJ z!Ypf8+5!?Mh7IwB=$23qiuB}`;++*Xd87A(1{7-dZE90-K<+p_E54p;qqd)IOw5J0 zyjpmTvq&UpZEtVdKCGg$az-%9ieY=t2e2?bohmca zFen3V7Z&H<_k?YG+CkKqWh9vHmj{mnD1f>VrIHD6>Cn1^TFv>TsBr?GCr zuO1vboaVNjv9EXgXemWca(U2c;k}skQ%XDAqKfzbNSH2^70>NKF@!8a``GN93!Vqx$G5LdGK4I%V 
z`{wWT&yQ}Gf3t5J_TSd;Ld7$c4E?$MA?R45u0zfg#P;fm7F%VEo=x=Uo02KB%=d=!ZQx%i?}%*OgEXXaESJnhL{b89)#q z|4IMf@*hwUc;9uO@x&)3wU9xyhbdT_1<}z0Q?BuuC!^rS)yW?gA~qW(Q13ig1JZ!U z)D8V|dJ8RjM1k!4ry~AoCAwM}4_EiqVjK#SUF|dW?#O+a@`}p46#|QwiJbjS zHK4*po*n&tZ}v@m2z8K}VF6u8Do+p2NVUBdg^MsK^+&2+%YvSoYyY`v-45J|QKEP# z0+eArXComWZP@MidZ2_*Lz6C_NfDo!yy1Cl!^~53o?MV-mHI{)#G@wr|VmADs70#A-`(QYBq%W|g&58{tz4U!FUjJUzM zbhKlMDp!5G{V`bny+XKGgO*d%Yt_`|GOA9(`z^R2_pJ}?cuy8MGN) z^{exXi85}ITsTja;@7Ip)@0oI5*eD!^hz`Q&GzV*hgcM!fy}n8#q&n5Y&@0+^;CFF zO2SA`jA_6*AB0zmblG=W$ayH-`LcVjqHu)vZFKkO-V|5`<@q zBo32|h))%UXzFRiS;Wn?aMnyga5d5nq!@3i{KpF*b7CGT%*6B?REG!ux~)U4SY^ux zmkSH7KxyBhjepgwvUE=DOEDb?5rDU(NH!7RP&%~Y3X(Z z*!&oznq^?@vzaCST|<7pFj@EpCw@WU;rSOtE(3iN7YpljFQlC-LFr0*xTSrij`LAf z&UdS&ONloy74c^%t5!!&8XjG?7d$Dze5neYZ%!VJi;dPio*XJFpAwr&J&~Up z>aq5T)HFd<<#~c4b!dQsP~{aPg8+9;W+;`AZT0eye3;0K?&o#8>_C5`b)PfQ7| zE$HjfWrBA<;w5qVy*eKiF$#HP{8w!QBFY+%m7ch${>YPL*z@aONe5DI4jzauMOwZ} z$1FcBC(ky%czb(Q z0*XQ5YFAgXu+`Hoo+?x40!*CX68{v*hLxVS_;u~S#_CQpwT{xJ#Ngikel~rX)U$Zp zs~?91z23J3-o=S(Rh01doj`HxNPB{ zAIsNFEnU%z|5-6Y0{^pn>L*8RsQ|MNOCfieLBc|fT;sPQTbQA_DYN*$ys>@WcBNdr zWaEYQbLluDw$(U4nu(sJufT`8T;`@3#~N#}&gWrkpgQ=JJm>qYO`NHr&{@QkXTNB< zNgYwe(1l8LmtO$*)xU-+U*3V*PR!6me(ba+@*lF1OR|ZTMPqc&;V2$WnA<^T7^JPh z_t_w6ipFZ8s&QI&eXdTS)c&V;bl#2!A0;MFC?+bgKjPBK4m8RAe#uwfJ=f z+3xzJA=TUX`C7*K$;SuDYEe5i@hn6WaFrUaGD_y-USd7bl;$7Le6f5o)4-3M;KVGJb*Q#!DnQVUOMQ=-v-F}IBy5=muGhBV4Z1J?``5cd!JjE_{=kV7$) zNd?;HOJxe5@+@=~a84{nihuI1h6D+co4Eba8C$PqQ*gQD)gM*gvrN(c<0XyNi1xpb zo6`z-&o}B5rH6c)aY@}=;`L*E7Pq<6O|q^EKcL}UnYfKuX}DK-zwaH@}UjmgWFdh z%N~^vN$X>88cTQ3RrKBTZa>0N$vskW959~Kjms}3aLj-Gh=*8hCGOz&m4%_8LEfhK zH-mXbD)$8}iCi0+gO~fg;eFw)0z|96fP|8N?QE?9pR~6xnWEa+I}wi&u>zemvVgS# z(^^s&+og#*Zs8I3f!lK|&s zZr0a;@;lGG=ihwkn$eJd(B=i6|D7q9zWR86Rsy*C;^UonW%CDzDuP{Hr57#}j8o;j zbDSTpBpx9)GDXQ79dZd8612(v#;bm?o8iUT_O7nl&aDNH`s{br9FxticAII-17{cf z_^Lsp>!(yLqZF6H+irbIEg93M@8=76wF)XRy;PgRf~py@cuSt&f0ufEoV%m1I>>bD zVV77G;*n<)OT)OjM|H|95fNyr8$KKDMOA0sd=}T!1J5yi&FuD5i@am!@-4i)f 
zDr-;p=Vv+gy`lII#+UT=>O7RGOq;Xrc5;U64-@tFA2ff%&8hIuUpV)d9bN<>rd}py zHr=)9g+QYo&8c4ZhV*04WGc;=T@jkEeqFhbb4)Mkx;`Al-=`h$UYIR=Ja`OiIzYd? zemmq)5;zttiGaV2?Nv>ZTdMn=DA5;{nFDv**kJwDg0uBqHZS~#mfa1-*vUuE_H7&> za@bja9b1f(^UpbW8YjHY_lr((v3r+F@djh;>Uus@>G56MjU4?^5u<+xsf}Z`_-n>w zwU#0%hIl?mln|0BsZaFT)D@EaZiq#zu>W+M_zYJoo~f^7-dmA(Y-;3q@RrDm5R_}# za9fxee+`$maEVYYk3|dOuRfoh&i-j$>yUF@{lcnox92WRv1j|d))eDcn63W|0&X_n zUw((r@tEOmP`}UW;ZsMX>8RUm@-rXe*Up!8Rna2y7!Yb0F7St%q6&U@liwJWjrC4U3~;Q z`uZRF@ADEik$$tEa)j!A@tu87%DW6SENU%`6Cdd4aHNoV zm6fDh^sNtb+HdIjP@|Zj_a-MdzU|d!NF-9yaiK#UKG3;o;59NKwNV!q*=NlPxZD+i zty_4W1%Ti3ktZe_8T6uQvuFYKQEKP!7_*J`0GUoA)GKpNQeo zZn;@hA&B_(oaUuDezX1hqgx^`%8wphUM!oxt96+7o{;|KbvhH`D{?Ac`|Hfdv86gU zCt^uV^wL1*l#fz%p%ELDt%XMXPD(Qon{vN>SvvpVK`s6GZiZi#LP&yOcbWT3h7ezu z((Olb=@Zp{?+NTgOlg*8&NhGii6jt%eZ5}5fB4Pyi*1xY-r(pi3v_{WP@FfpHSbB- zLGllkQ7JF9=oT5lb%Z53;qsvQ9S+YZHxSjgiM-49rU8aT%u)`8V=6qPVb`%lEuxyT zHJ9VLBBI29DgW||hw9*od&4zbn)F&9AEvkOFJ-4;DG?&3r_tPJtTgp_p@p?7Pa9iX z#x$VUHG!oswtWrvW#6pox^R=({rdY#nMGV~)Bdi8OxU8dX8IkckCf;T<99|Hq^{0h zCK5fY?0gy{%$q$Y^WrspwAc=7!%t1-DaO-RGj%InwRxnWfGhprZ&ZI~I|ER2M zDtvQW$34H(ek-S`j3S8A2zV+M@p)Y&eoZkR(S%FD)Hc(gatLFy)9=Nxq4|q-k7OB$ zq%?-#vZVNxiR$*m>_z*^GC8%%aE5N(lfMSv{5?e*iL=Ru5^lZyY053ix_)den*DoE zrJ!Q@D_J3P+{%5fH^{Bx&+I+z!6`fgc%n4Zo=}yZGs|vqm8moB_O`OJV1DWzQ0cqZ zYBz?0|Fur!%|```i_^_SR0iP}QFEE;!jo=8G9Q+3fPpK9d7J%1HAVVu@U)jhrTHJP zz~R&22w%w%aQK)+KE#i*lpAT8c{fuZa?qUMd)h*4mqxJh;}EOkn)ESKv_<}0;#Y~< zZG{NheCPXTW%6^itKUf-8PXbqcHh|Dn*39>c2{AVn*kz~&+u+)<2Xk!ChnfjI)Pbx zwXd@lac>{TteEaD+3)t0M%7VUi?7U>W5?LBr_cXXeE6?M#Ml&>`6To7OcWql`pLby zuu-Og9nLuSabj!Lv@x!m_)?pa2i-w_^ss;F_MJ`R!zK5Kn@8bX{!Ws>U`dm!6|(4# z^F3!yBBto)ev)sV*dU-;6JZSoTt&bMoC)LOa_9lJI2eIj(xuoZagvcKfy16t<4zVt zgBbCmX(|1E#l7V60?+7BHopRwy`%fwQYIzc1nDJtMTSw7UQH7|tsn42X;rlyLT-MU z)DzArDxV{QEa7O;^8_-RHOVlnO<-WA8oze?sGH4#W51BjdimyDz+>UV9`QZM4;k_6 ziRr@oDQ>0TlKZjd6k#c91wA3s3-U}k043As-CZQ6|&stHSnZ#GP*d2ZQyxG*^p z&dD`hk7Hc&s~O^f>hKj=+9?$CUjkFu{r)P>E{YqA9|sM_u^3TQ0j2Um4rA<5z)9Zl&8XO4nK480Z 
z+23JjDBoV>-fSrTwMJT0i$|2sachsYw$KgxYxZ!>pn&^olTZu^_^&>=4pf6|f}(W% zJ`677jE7I=PZSGc&GO0%Y}(&phfBQt$cTtuc*?ooDO(emNC+W<3QoqxU>Rh^I<9*i zsy!{rQbH7M1CP=ep1D~6VP03#*0rS}Ct?$cf`m`N=@q?jVNElak3EdXF8|5SoA-l3 zK1pwJGySpcem7pWrvG4m@GK!e0=js6$$B+L1nL!xM+}wx^m4M!@4rTwQ%1eB^XFix#7OLX;JC8MA#_U&?l}Um@raTGp`lZ77UsMz0 zDd>7w2JCZPuM#yV(gRZtM0F!k6?^+Lo$3XfjqDz)nY-2F?W+`r9l$U*xyAaSTCh3P z7D&*3RL@2Pdm{eY#uI0JBQvABzk@uIBl^W8tqQJ(q*{Z%OJ5H-syzv<@=ArgsfV`O zZ^cDxne8JQad5ZwHI+($?0ZujK&F6k}gqtNR-0 zGRtr)4?>1#&K_CI5Z}5<77WAA_kOM9tU$BC^t&>YYkZu24i-}MQ}2=e$y-X`9Wqz^ z12;Aw*RHfS_FA)|O#Y?2xbl7C3}?Sm+^c1!B23o<5?ZW&-EjnT2j~KFXvmJsR{_#ZQ>|TmO}=AU7R$Yp0g0e{gWZt zA>P#a_f(2$eq8VEpSwJY!_Ph!8pI9V%@`A|safoH{J^!@LV?HR2pm=AO7CB~1zzpm zc;UZgKD3e&42#}X$_F#)Hpk-gEEis+5PIgnIA~r6nUT2(v3~iy%UYw7E;!@hufCz6 zZ1vJx8O(%!L(?NV;97EkJ$tG}v$sMuCtPY^TM-_=13NIFt6Td1`t|@Gn<*0= z{Pb6IvI_1}_v3jdAxceUhaO7*5yKI;tX`@_0c;47xJi2Im*t%O1z4EOgus%`@9rzo zWYf_I-lJDS4cc%&v9r#>ps}5K(fhe+2l_kZMGu?Wq}GeBd3=u0E4Un2e=s3IY%b0x z#C^L5$ClV!7AC*282tZm_SRupKTY4T(%s!ihkzj69U|S0(y1WQAe|y9-6wI90s_(! 
zA`Q|Yf;60TH@tiPuKT*L=Q!^BkLP*+67ajbv$M0aGqbbvk?V19PUaUT)>y_sdH?OG zzw*k}qVK@{OA_SkLN3z6UMv?^6?*IM@ zS&-yeF#KiNl-jai?#VT>7*eO75$e)WBGTs zS`UV>s3=t8zn0ka{j78rCoRGkB+3snTZaC7QCaW2_V~{NWJc=hrUswu^=7siBy0$g z<3~=-MHP^ux%VIn1QKidZ+e*y>zw@Mpc-w`g)LW%j>V!1uy+`BZ{88y`J=eHczK}m zu5tUx1it9v;MG~(6w`5YQbhO#FACD^m(+Zrw5TIBDNW3wlNp?!m^gvgfOMk&15tA8 zh=D4c5*Lkj+0TCdZT8(Ozl9HZywTKyOB*shMlBa#no#ymHu`A39HW^<=o1Wtw?Nfi zRhw6?#8OIp6Q1`tOvB*Kd?^%kg8Cs_l)z(uqUqYFO=_(xLlZibyNu2!!-&vEvpfIf zPTEo}TZzy*u!dQq)=GKqt9{Y}$g6jQXX`TV%N^fh+c*nR#y|Sgcx|31!7Cz)f8MMc z=DL}lG1rvxYcBXC@Dqj{UcanxBTudt3_W8`m=>#gs7hYRkbBruPoKlt`t(ntq2$O+ zr`^5vy45!y-MO{v&ab#6VOd87i;4@RbfLr4B?iErOiqem7!E zEbXo+;k)k`-ibsyN7PvevK64|T3%{=xx^0MT^5&if5Jg$WCJ-2%FQ)326~LH?EBLi z`3GNn1lm8?CD4CZ%VS-SdBayt4vO*bdAU_JDtx4;*zPeqM>1P3Wah1)AK+ z?mLnsDS@A>>l|?`-0khaKQepT!M3>U2<}|4^wlXNCEL#%H;l#fV-yYQazy(2CzQ%N zrYEyv)$L_2ET2Y`<8M1-*MN)D=TEa0C0vXXb3|%=~%u*`xJOF*n_3xI^SN)6^Q4&5l&JB)pn3{&l6IR;z?IPmBo1Qf_V@;C@OMY zYCj&6#Vuk|jcYs+;CFLWKY>cvHZmCO!p0 z10{$Sb(~%xpu?db5S+AH)qX9#L>}lUAmVuC&*043UHJntl$uv0*`!fLtAfL1#BW%-5wgvW2TalGlP&__jO4AEEKD7cY-r zi^p4Ng`miY0}iO0WS=UOs)06(!PWjMZjXZu+#z@+%;f(`{W)r#Riu<9Y(FK@WC^o5 zY0OmPro(+c=B_jNDzN%$_}0+1a{M$tr#^+3H6rs@Q+Ou9f`#Ah#bOfs{mb@m$^C>! 
zY2nXX^XajttES!Wl%ICqY@W-G+jL!HY1-)UdbMwqkyT&)KyiFnCX;#Ed*8-lx}N^# z8nbxsYF+zlkIyrEA(w1#_D^nysoAB?h?X~mPj}3uN4o+*HU;-Rc2=%!(P#O^YG>mz z(htPld%g&ZC?Fnb=fYDxMn%<0_D??hyd8ez+T6D2PnVJnQZLwPZqrm2;xw?<8Jg;+ zFMnMBz91EMwXIrml__wMER3g`9Ho%ZM@!164D9f}Cg})<{>~CPoFp%%HwZjqttQs1 zONR{N4$pmNhGG5667L&6*(pXmz6Y6j(&95O97eb{CZncBkd7& ze_bW}X|Z~~??}JvHeK5~d6CA=^DX^)zIAY50K=bs+!IEkzuwuM9pfBW(yP7%)x~4C zd6PtF$CwGcRK@R~04U z82GjK-PPx4Bxl{{Le4|54>S~KQ|k|1(O0}g&v`{=;_*Ov z-H+X6g+W>u1MKe&-lFL16zz0B;q}hk#&i=(a`Lj5T^d<`jfjsd8iH{^QG;;|acz;$ z&sO~kmL9u&EQmAKYURp6dgA^5dW3Lut;zaWr5;zm#CJPEAq=@dP(l(ffKH*?({F0O zoBf-Upp5W{&9$pG*NYX9sI4E)6 z5OXc*_>D(%ULF2cSF_p??Yl1KMk0$2BksCjxhaq2<}bgCT2}(JY;XlJHs{@5)RBoH zw*53c(bOe3UlQf)=Z(!v8vU1TY=_Af_ZuN@6Q44RRDw)4lu=c9rza|IuUo)1ObPmB zfv-u26UO=gS0n7D2$s1--_}E8b8~?oL;@}02=+0AHgnp^?}X-zS$exqx%_;9?`2E| zxP&yiLs#cSwdWP~vy5xY3*{X5em=v!PFnBKiJjqulP_OFI$R_fFE3$Uqd4*_lXl4a zDo%FXci?RT+LWA+eXXciSnpZksp3i5AH}u6>q(*~Bm?Rz%CYtB^9Soj{$=$NhP~ft zcfNrv8y?ATEq2hk%pD>g8hMbrr9!ghr@~5W+7Q>FAMb~Y@_CP=D2*27A>( zd#~svO;@BnKk%sU-b=;j5qLE}jadRR2dpvz_BES|vI2VDw}UVqBgI+7?Q|OjmN!)ykt2nBklyYx41C; zsH4#-zQxZs@W*uOeUkKA3|tHgNh|xW|D6jUX!B|Kj|&n#cmPcr zj4{HXQ&*$Dc;~N9myCcgpJFubRagbh$1tWm^~I-A-okZ!=N_=?`+93mzUXZ8y2l)p zvW6|-EU!PziekPgkKX^t-P}TnDgWg$a*T!?zI4}$tN-k5SZVGRIaylTY!F?vfki>^ zEP6wvw901DRrQj|Mze5^`#}@SK9qlSmEmHs&33faePlP?bMcHzcPn1v%Fq9RS52N+ z9rh+0_ZM2C$kNtLxGd4)Z+U28@M!I@nP%VbrHRKuD?`~e4XFqGOSxlX9@8P6-pte2 zg-xGqH3{y-ZgLbu+D%`hC5s^_z>>u_3{QIAa4V?c#0N2)*d7^cGc73H8oqH;!L$rI zMht$0fH22;)Wa2Pzf0Am+fODF;CD$fqdQG}=WNcM`dehmEi?XMd*zymNz%`<#@F$u z5wHJ|Pn%2n@7S+8U8~k@;U9bRn}s?|14mz7gh+f`aON~R&_Rei*HU5{wC5SkfkLp6 zh{_n$WL?HK-y(=9lCI&&q4G}=x-u1Ct7dV}G*++Ak3uma$=spbmS&2qvbxTQl&Au{ zzV@`we*D%(Pa|A1{VO0z)P?XKs;$;AGmIQ_ZssG&GN_$`7A&+(p0%JsTgFI&2^q3_ zEj4S{a=mVPJv8l*)tK$lpp@=yo+mDb=!RSyEp$lYQ#;6}_Kol){b6P@fycI7F0HHU zQCC!Lfc8B?r)!?x7B|h6Wzp{iszSzT#reg9&lO2YlbrFx)RaZEpPbo8?`v!STHUbx zIDUq$zuF@_zDc+Bse5x<_D|@>HnT%)12yOT)@Ilje1R8EW8r~yCJ#3eeY{#Mj>Y6V zN=Qi2bN${CNnR>(r*iMoxr=*7i+ke$^;>AH5nT>@w^e*%%Aftn 
zhwZBF)5vnDIlnHUMy#NxuGYP7HX(i2RgTO^lA7Rn>-#bS83!J5EgM6f>}@G1>}f(Gaf-2Q;xK@_T`m%Z_;{lLz0B=OH>y-I&}13s(f2hs zriBpX!00)c{$rG(O>33)8@i;V zZf5`WhFlxl*I~2gTfh0C3VA~Awq?IF;zk<}yhY-o>-+IECeDYAu+U`bNRSHXJq%)L z>zKH5k-3+}`ef&ZmP)PADKP>mL{*uJatzR!JmOPHNCLt}J7bJ4I8;XRkeX!CDI=*c zO2!9Iktoh^AvojS$r2K(AFzjilx1|vhQAC=tqmwE8CWVcp*{Z4(N={)*J_QzX8Cv- z@hUix+2k_iU_!P`4chQSF}T4Z=OU3kO`|{0#PVeIH%yjrL`M?oRUW;T4=hS3>|+FO z?uSXkqUh{9kC0Vui}oVQ*g;)4W3}#yUJmaN42(0?FMpGG^rQ0Z=SvWE5r&vMT@`LU zB#J4S#_=jY5r|El`G1qq8oXQY2B z2XOl^+7B9`^9NQ;A9pyUc~sHC3!}FpB?7wjIP}`_(=k)u2(?gOJ$6InrF@{&5gn?O z)?!&`6;SV}4hu7XCixcc!xKV&Jd9*^kw5~B=4^uTc#*K-*PVyuf5rO0jhiV^_u+03 zZV(R~i0JQ^7$9Mu2tsf$Ae5(VUsY3teW6_m!kIqp`qss{JGZ_c<0F&~qV9j|zK4wD z`^*N3pv#ZmshrA|NtQ>a*=@KkrpF6GUKUHicFWkGmyitkMjW3k zHguKr*Sp9GOUOg1sX-pi-3Ed!vtsAu-o({T7CTGnBzbv^&pI2`F9K{RB6^5SZv?r9 z;OKmn!knLOw2K%DhkdZa&m<96EscAuUt%-4E=Q7QCTKR$b$oxe4=AANnVV(0G$3`{k-Q+5;H+KgVb$5kj&aF@f!9Brc z+&g|Xs;P;`%{2(+)X!V7vHTQJlBJ!H*m5GJ{lf%hHJ~8`ThdI`Qznat&}H5*ssI#1|9;RkP#oI*7&AcO-k*H<=Zt{wuRN<62u0A!NH)~P zwOM?reREsj$ux~D9qnTVkKH6ZK7?QKBGR(`8d^Vp9{a8%FCumt z zbf^7EBEKCXGigg)Jqq~@KdR3?(m0%%tJXxGJGl~8gNDe&yu=I73H6R|>P!+@mO$;q zvWZ2fRL*@eS)L^Hmb+Z>8mNt1;s#1SGPv6CBVlV6gQe=aMftq5^oy-ckSQjb^bbiH z3(^UKT_|LFUt2g4;oRcA3p^-sf=Z2=?N2X48np|?Fe83$;=61k~{B$?@ovh=r*gF_> znRgCJ$S`~oo!v!dc+>Bam=PHgROLT~=G`=@$(>vNPWF+q)Rr121ZN{7Ug+{nReLFC z=lS5{oaatX@Ab8ikC{ucjz6FXs$LS9#51h{Hs<15vO^-Al1&)4O8riX7!iq^l2iFXnGo^tJP`}V%ZJdW@v&Pzu+Fc1EPBuc@9k%lr*oFq&o@L`(m4=R?6*^92Wn1Av+RF@I zbM8E1Lm+D3My1F2qF(+i@u85AS1wF|Ep^+3917}4MeKBZoQ%5Sa1ylPLHO|}&5Jjg z%lAo2V4$Zeq3wA&t38(HZ5aXSkoKq2TD-)x1F93gWyyGvA=~)%P;K!%dgTb1 zXYLgvSpad|WO~!9Him?>3)epcG&>QG)Tz6w?+>z(_bE2iyF4|q65A=YA~f=VkXQ^@ zY+S#AjA)m|0ffI#A{}i|Ib!$n3pLDoEz{#E?Vx=?8o=jsTw z96Q{A$q)!m$Auit!#N|4!yqPq-D}J39oY_vhPlDN=u8nAW(!Sx5qVK%<>%s`Fqm8V z@L1m>#ZI!l?W@}iS~iQtbrdM*ZAQ1XR8pEsBQMYP#IY`OL6Fc$7s1@K=)YBN=s_!Tv+9Bun4DoNy##NfGxJLYA%NtZ$*s@ROlPgmm`;7Zit` zZBTHM4!fkQV*JHlar*B5r)BYa#w5e8LSi(AF`w-Spc(2AbeTgx1L_!n93%9wIWtFd9bd!us*-F33 
zH|rt|i8UjtZIvp}nfOSK+!4|4utGIt^5a0*gXyVoY5ZJyb9h*ObTAb<1)=+1QD^}h z35{%T@MJ?sveoJ}hsW!m=wdYZM6p%5TY59(Nm_^BzoSufg?~X*D?I9_^Lr$NmE>?u zul<|$(Jh6nMzTm&o_V3oR6Zr{$t1e`Sipz?(yO^N9#lTb8y-3ya;UpN ze*R)z^pf;DsaxJK-pzy*TrJc@YORO^%4G>#4&h&V9XIi!X+?x)*Sn+TS!F$=<(=Li z$BcT}BYB?QIb37PvbD=G0M%eShx)wms=JVcb0_1*W0 z?P6l+<4|ar^SUmT9AK8HVH{Di@A_J0wmD^5*Vaka3}w$n$vL&v9(X%_n==iUs)buG zb7;12?`qn~+HkRY)N@pRCCo(-;NhX}?scO_EW1%x~zM>{jB3sgf zRT~t)lh%v3`>nfRTh|wEz^0<6-j?+xgk>2i0V9)bDEUFhEvzei@IiDzJf6Q?zzIW? zB%#W-5Pu%!Fh~bef+J&8`j9+SPRi{rFF;b>9|cJJKFe^`=yRl3*@1<7CAqRkztB?& zNS~y0deaV@ueSeA(w*ZyUHt*eJyV~xh1mYcoNoN$&AHed;=CaY@z}JWn*>- zJ9SZ97H&TL9?6R*rKP&D^eXe{vNe#<+pM1nZLa_${jp}a&+GMqD%|^QTYvt%C~eED zP{dygu9rKIWYDDST=JbMNwgl32WAXet(d}bOB_XcS(3{AvSTHpsGQRG#j@gQ>b_bmIp=xj z^x`7+<1tSfxV7t<*!6XwKc0@18g&nASd)~@+EfeK&DBF?=%#>^a-CheIe#IhsbnM` zF7*n&PBmT|I?iC7gzv-*t#3U8TPpg(?A0D;yv}~PvH4voo9c#V-NN&Ab|4`#@zN55 z$7+XTw%c(_nKutEPDz#OI$*W4fQwgmqLJY-7 zr?=70=g}dbO*7e%w8TA&%a)>is{BU@!yr?y(HKd9F+&hzt2SlB-J38IK zw#5#j7LJp}FY18gbpMfpz2!?N-j`uB^vBMhqA=lYgVlF^X=nr9H`2#*0$r#~QY)f{J!2RK{M6e`$;(6lPj+3pK!p~h23o9 zxL>`u76)lG5)BF^St<7g;((U@;_dTDj*fuC$Zoihj%CG;LIFs5Vnu z%&b$;UaCqq7McD`?f5zAY~Wq%@r^^%bzKqf$_Ejx{C8}^>uq6fN_8r3={76hT(2=w z2hOf>>LG7ac-2hZu5$Q`UUy^En(X)~Ol&QRI$w2YNE-8nQ8m4^?wmZiVcJ>mg2og% zu_Y)rPo~72E}3%~ut3x|H{DsdKkr;rvHjNJV0kg^k{q5g2ov8~d)0;`|MJsD%9;+p zmy2;p>(I2$_KYJX^RU_Ken*?OvwhKBTAf4T&dh7cs^qcz2B@jp;?z!{S@U9oh!ykl zVk>s$AMx&ve%A+L->KxBr|eMz6Hte6zpq{#vi7Qiv@jDzhh*-`?Y5c0&WyJke~VK> zW0|n)b7N*)67i#du<}n}eMZHtC_K~qY#luVVn|6b$waTjl%z-ebIaK_{`SW#)M+4? 
z&g2)r*le*FR%kLmqOD#fH0oM@G4B>DfLdS04!J;Xu?;ZWo_BwFlTq7JY&OHW?XnKUPLMgmF?mI*|U6+z(ElA}QgL8#4Ak zrJm&C$L0EmUabx-^7P7{pJ!z_O^eC}Xd_uQ3|&3QX|()luVDu_jAyEk-9u^I%^@HCtM+Ygpa-jzUnMj>vD> zlbd#4iupdB<>XkvAzOEC&(QmL?1E#hzMG;i>VRufqf~6}r7}u@#C|2RTRvEYHjLv+ zkQjIWHa?N5daDjK@QYG0wAT8(3_GrGyO0ry%}=zIM{)6BqAx)~o$c)}Wo72b*gw5P zpK(twI`0=N{K-rFi?HKG9FtYk4;6EED=l^Kbiu7+L07tnYbApRU8Nm^^I4HMKT|?7 z)~c(Zf0L@&t#gW>mlZ?*#C2bE*THXAwh7ObC1Y!DVVfIvxAH{wctJs!|0lKt-8vb2 z{XqF2v|`m$Z^?_0CVu);#$ZI7n78m(vQ7=8d^RK8X!%lEaEQ!7B<)Tl8QC0idjDA^ zIU)Qa=Wa&`!QpzvBhF+cov63bqw;uM=4M)JN|AKWtWb$6cCtT6xwn}NA2@6sC`I(~ zPDoC5sZODl_i}U=?zVocK=1x9JCNQ(4~k(Gc$Rn+I!L|5jTk?a?>$$*pme|CD>)V`>3so-k;KeTtd&X zVrQc5t9ic*zhu=LK|L)-#vyjH0FRtRsZ>_?V`MUYb`lYFH zS|)6_&J_f56QtFiRuny>LU&EjGLK&9%oy4aJc9mIrAwKyyi%XL(WzP^U8Rt(>i&(D zyIsTK`?}ru*1oant6Ik+BKhg6IV}otx-xrSOb0IGn=Aa!VSleRm+`h8ZB(+U(muGg z#{9?*bf@Cl2|%X>)ZSBjRVxS!l; zGAwg}vU=_t8#_S9twO?GG!-?5SC&gg5L1ZqIW_8WRFc!nHqaAASdDWI%sQl?bqZ0D zuam9vvgV32Fe8rHGu)qN#T-8Ei!RYk4=IQU@S0M8GaGWZevuCuNqEdO?D_Ng+qK^O zVa-Q(H{Ep;rG~~~7Rbuk#wnNNdF)wqM<;az&s|y5Jw}q{jde*>V67093=T=dl`3gu ze&x3&6;^|W&gBo~_@@`V)GZe15eF~I6nqLCO}ER|UVh=onY4#GWO1n+YTOI8r!N%d zzX}bx8bz9E^N{N>=tvpE79uqTkD8Wejzei+U^lD1nq zxjd|E!49&bw}7bC#t#Si#N50JU6bN>4@*R=bWBNV^x;p=S0TV_kh>~%Wk?5Mh(WQl zF8VW50|I&})H1i#w#AQf@&a*k!m7HQr6TRq8hOnrDlY=D4PliuT%FCOxaMgMu-7Nc zxAh~Ln^?LRcA<$`WoNRm!>k$p+kOr?ze`rYx6Ji3oP^a}Fz2UU*a!WruI z!*C2C?{!_Nu~KNY5ke%y^YblAtu59*yc?H)?g5G|9@Qz;rnGWGcBdpFHOM z9zis$5&T#S6#?xDnl37&Q-|rpXS>hr_aAXFEk*CmaPMl2#jMa6uHP(DNdMB(9uA`q zLsRd9TqrZWs?uS?Srn88-Y&FH%Oit;nbpo^Or+RuioMy_qSl={Pkv^3AQmN36U5r8wuHy)l9;nfOTbRr3U?! 
z)8H|BZIX?FN$-Y5*@u{}3$9+JpCJ;e6B<=JlT_)?3MyIH$VsrF-L~`;|LEf=ppQxZ z(Z_}VxB6IeFG7n&vYQxrb&>L-ifO&EhjJO!eMJ=0BrQSzSn5z}`c}PKR5^->wl-92 zgMvxw8?g*>Fp7Hm81K;`O4&5qLBPj|!H?#g?sC0K{OJ4nFzRHf4vx29KbBb@?lu|l z$>igWZ?Tiw$ZwPoE}E6D^@udRd>Q_XcO%`Zf}-*auX$3vKO5L!i5SUZ8S}ZUq|WBMMuL7 zq?z#@6wEvAS`&riMSiW^m>q=Iwoxd!bGOSnPnMu@c4#M-^q**x4V5C)F8NIdU{-+rsOVfygF`CR3Zm9N|a+Ux>3ZRr1E z2IF%p$b8~q&M8`xFD-+xqM>`ttMz|6c)lkjrvX8bMtVy$w9r$(pdPe~{`6~kIt5Y^ zub2Uv^OIpgBpo~YovG2mhW7SSi8r`xeX_stpu@}a+c#=~bRhw(2)I?4Gn;Qb_yj^< z6TkO|G>kWb2;aAw$D`5KDvDpE;j5Teoo`KS>c1AXYh*>OsN z>-f?O$!S0KAj$H7YP-E3X^0_N`KHgcSY%O0GN}0x1RfNyQ%ntH;F>b@L&MLWFrX>l zcC!4>InS8I3iI@l&&xA-@V@(7F{jC3apF3uF&D-Z3g;J^ov*41oL{7VAl8YR`^OoL z5gRy5i73wJ`cB-vyR@sSd-(|>=^}2^`*!%iSu%p`HYdV;qC}bb970_9fftfP1|_8R zgF8IW;SLYQ*E#c=HnO_FSibrQ1(+4A&#|P#l(nQ})AF{&7Rmt0}C6EQ<5`@gt5TU}yeVnuC@V|2F zTAI23CKYq$j-yT=X8d`c1b8Dxw|m?Dihzi<*G}rJG{AA;pfP*z;ehw0iBDo-?|*n~ z)oo2@s&-P4`J)jyfhe%U`I*?zW^Xfz3@XQk;3hQH-?>Vr@PAG# z=l_kO%Tc-mt(I03V8=hNHqs-hm}9+AH>Wp$YQ&=cu99Pdq%zgLgfsE{jHB;MT>aXa z!K`7|%uNp7`-s|u} z&EWLs-!|OFiUKXy*1&*Vwf{fKUpCXI`gbnCzYbm?ijsZtJhT_ioSA1!NjB99E-yJ$ z>fMTpWP}nnqbaxXk?$Eax zSnUz`p!(p-^smc__&xX{>@j3{-_;G9ZG1H3YDjCEQ1L@^YA;v!BcA;e{((OJ4O}9H zAWho&&g{oOeewu+yU(HPktuIGDie##rpadt+hfz(P4?3pOeDl=YbK!v^QNMwt4zm} zqa5o+vO6zg9IUNBJ#$|;aGk#KZ7AjZW1h~Ed*;gNIy{gxcsB25t#(k^tiS8{ls356L#oYi93jf@Q8XMayhx9K>#QNd-Q2Z`hN*|MD=(w|%)#WeCe^9uwg+0K8&!*}dp|dlgyKag z1*1-VS6weo*ec8W`S%*rIY58@O&e{nN~5a$LAalU(B*o;V7) zyx;hl#+KXSi0y5roO9D*e9x`5X*(QnmT-NHHEo;y?KS^g(|h5Pwf-{@^r#9%sO@+V zc(8dOvo*r~2tbta>&}ZAKzs$VT56Dq6?CcyNIa!2{OSzE?4CR$#X0kG7w&k%b>a<6W(C@OVXOK8I!Tp?$=SjppL#de4{q!nGf1m@| z4(liWUVX7so-2OP%DL)dnmHvDd($*l?Hrh%dh>JWmX7#uJ#Ln43cYxN(P6dUqpHgM zz_NquPoS2gx2Wt%)889WdM0lzCs>KBX$$qg>uZmvUw z)31N)Fk-2cvv$Hqu#l>3{t6emuat>duJH*X!Vma7j|e`1cIc0Om4SA6@OE#;2&b$; zI}GJ=mS6C8c5;>N@8Ip;QQ|v(N)S0G=RsP5ul?n-uM}88CUW3M6RI<$vpE4`bcFFT zWg&PBj!LVXXBZx797gHMU{HX~MtsMMgzK=CpJNVSEI2pZ-cAxu+Jdprl**Zx;bXl~ zs_c3WAL}_Ko@3m<#`=Gykj-P4&jPLI6PVZ0Ydd~2gr-RFW0MdJ9aiVe{?A7HF9oS* 
znE~fx26iO>&*XpR^?y6ohpU-r`goUrv~Dwh|NLt-D;$1J|umJxbI! zMDSra0aFPv9o~NeqDp;32jPHk-<>{#7Vy=7eN0CH{x)md4Vu1!(`}$p=Nmg&Gb&}o z07x{#{lq`=uU_HbcLB2F{5j$Jn*_eWkojAFf{w8u{P2#?4h>fS(=mKREI1Lt@PNFu z)W$1*l7N($!W?pc0WB83-hLx{I4k}&!v{JG73!Pk|1UHX-OXJBG&u;sK-2=Dng!fy z--!X57|}fk00LU5cb@;MJY8wb0)UodTdLlbT1=Jc*4q7gVh`Z2M4q#h{06gA!wvMy z7Sv(^@uU4UxTk&mr9R_7e54abV+@{57x5_D1^9JvgvxYr--cuWMwC|rK#ZHdx-Ko= z-2xg0xLS5=brtDU8-BZpUbFyM4-F9ySaMce_#~*n+%eD#jK$Y9euvV+tFe4*e?xFT z1C&wsauI+)&j?_jQ*b3NrOp9dYUEi5z|m6?UKh5>vCFCr=@M$6V$c3|E&L%VM`n4bv3fP4NA+Ob@fx=hA@l|*?4{|`rm>nVi z5>xovAmJ$?LESgfZ3hr>#r`_@1CrPBLU7od(nW;tIy^%Br5RY!4JehOY11P4q zUJ%3O*1@5ywF$?o1PH5xw^t{acVYljQuh7ATg4^-zzKm|C%QmzUJsIK=lJdm3j;iP zOr8+0ZLnYokP)_$tS20L6>rkC!BlX%0IUT)shL;4PPyJofC{}FNp*h*>es&jytE$6 zldh_I_PTf0Q#cH+Kx7X2?tXy{agAo`6{eXx(1J?}1oyzJ_48>bCPl?v7jDr^?v?3{mHN(bx5AzvaMc6JXRu zPb%A(kEc&ce>|H|msB3k;Qx~*FAH;;{UYr!Q}q;A{4}1*edrm%wfiAr=Mq2*+|-f8 z3%#W*8Z)%bW|9rZN&q-|hPU7H3LF5EK|=ulM7ufIJ$HCTu81Eem6vMz&pwkwlXh|+ z?R5Nm9pb&wq+llQ!~AOgH4|#0#4U)df^p*K291w9wu>_tm;*skvbe2$FOR{uKdD(p zC2lJGPKE`!vZ7XGt8bU03n$4X16mi}bYoWqppi^c0*Kq1ll*j1&#E2VyC=^hd2L6i z6j{d<`bywHjraFAu8K5c5de1cq5Pm!8|`V~aQ`kQy;iBE24u{v)mI^o$`CIAnG4in zD#bK(nEjHk-7Dqh)NyxSWy)te@~#K1l2znvzQSl%ldfEtj5Kv=}$oSEqUQV z8ZpIQMI%oe7oGr^PiUT-Oq1LZWaQ?cgBz9VjnpjcdkcW7sr5Q~1=TJ8+%0+Oe#lI_ ze?I<#TlrwI?N9C@(i1C8Cs4|sdJe#3VE%Ygu?xybeJaL$|B?W%Y|!&IR8qi^JfLK2 zCJCgV^;sPW<-et%Dee)yW;_Ke-7RyoP#Rv&>6E^?Fn9J(+CURE@iQHAeurcRjbJ&n zP@J`4@Lzja58E?Cv0H<|t{?TI{fIJcCK{@{Vllb`?q!+c;M#boICWGDuZ~1`g_DI!?hf(2^^vTf@FSCSx>5^Z&<6Qy`@drN?poY4`XfOuo z&=h-b0GR9B4$+lJhJ{U>QECn-Tq_m23d%IEPOHaCF~Fqme+QKtX7#xev{Z zMp~$!D2LKY)N{F5KeDnap&830B&DWq4y7_Ah~Oc}djk*{G%M1(7kcJ)ETh3gtlGt~ z@s-i4#-|{V+rRlCyeR8f^wMd|3NK)IW}(r@?$$D~enHO* zXyY-{wtHW|{uavS_Qp}~=rAP9U0C-gGO2xZ8xh|8_J+rtPLdAdcU}&_fkoU<7Br~x zt@5SfHRyEd+yKPuI$HhypBB+G9Qqev-HGU9h>(l0SMj(g4Zj`XCW=^%9nXECi_jP& zQh?<#17&3T_zI{Y$S7sLQM}<_nGQ5sOdADRVibII=2B?IfAYWO7(8liSnH~Qp=R|b z-YDYVr+847)CX({OKnJ2T!R#=)BxUockyr1*d+9cCIAj=N&%z8mr-#_76T_&JA)fq 
z+mjglc7Cu;N?*`9YgFYK93dZD14L?G?_;YH36PPikrLmYby#kfEV6jPw6&MG)t)y( z|5R{M3W$%lv^m|te#b+AB220mz_s?Cr5|;EA^$Nzs4k`nlKf7CO4E5PgKs1u8Tv26 zUfAzXMEz6&)9fP4JmineCR7umF912K;a|#uTIxIFKHYqiD;OKU&Ede5ti>98sDeRq zuujBSUU0WN?-2zT`kudON%i2+Nb1hP&jvOLLwG;M|6!BRqkRbi5H}O+huyfJC@8y1 z51r|mu8>JhG~UqWa9=ifji%D!ce&5z5mLyVGxpKldM*N`TdGWrhTN5mBYvvkeRnp4 znG@p{(Ez-Rx2p=IPl&LDeby2b7wQU`JOVD4Q5Ey>R%Go+nSS{nrRUJKeE&|WL?!c^ z&cm-%mq!Ofm$6 z4Kj2Uohm0}`B{L(4|E6 zjwOKZGo3pyA1(}iZ}u9I6EP(8VYVvgXXbPrVA z3l?GgibW+f#%AjGnXM~_;7_Xc;AiwvZ-AJ+uuAWKnchC89ve^JuA!i*ft#-)Q;FD* zs2SXWC5M}KjUbJ9wA@`CtU{af!@!~|)^*IKWp88rqv8PLy7)Vpk16CY(4G-VlQAdJ zNEj&j;v6YbvgJ!g+`L-um$RvT6ZY(!{438~I47g(u!;lx(B%JS`$L=Fqk;@!JNi3+ z#6F|`#Vng?rtKat+^6_2Z{e>wTHctnH{#%`BdI95 zYIbl#^hR`1GSn%IWRmT&0gsEM%F8_4JmBibl;#3~v$+Pr@=K&B#Qv2VQa_p=ZBPj@ zyAcPOH}XDzD1Qt$Som_mtizGUn5y%b@-|2#ov0CFZ2qJ^S0eE(rW0TO zn4ZiiPfbIj2;8lvK0mkm_bmTwvhaT}@IDN6mJ};~q6t?)LtwRgzvS;p8ZNI_>BLzY za7Z^0dJJ@*3OMep0JP_Lt@WU)o!fN7FCbCWMmuT$HsAhn`ry{I72Mk*k}!hUS8L?U z%;Ht4QfxZeapZe@i7fxHVycJRu<)?$zvuHW@93Z3gpf40;t41jj5z-JRSwPtcq;RM z`%wRB6K{n?=iXO32>{(0CHu<+k+6cC;0n))TzMzrf70|EAzm1chfd{p*h&5e*Rd6R zl1v2%O{SVYXZ&B7^5n^*bUv#-JmIap5G;7KNdWrwG%KuHaR4@Gp8C!s)H4d;0V;x)gK_VB@|6J>ySSjdiN+lE!1_08)|^8K0Vf{r4g$gH%T;9*>MLk_F#Cao8tdZDu1FY>?>bIlLxWJjHMTec`K=#_l6f80wr&G^ zEk1ZY*0v6mgK3?8Vzv#Ij*)#aue?i%9r2_egtQ*1)la4 zGUzU>RlJcE9)ZE2zws@G$2^KZS?o6~DP=EE`|f!ELi_6cTd)0QUlha`qQ&knSBuBl zD0o}CKu1KNpWWBsy#Tn+!OH%VW(C(Hyx;Zw2jCfi>#`Quk?=6rE2I&ZL3AXIovb%q z=p(Fit?%4k^;IT?OUc05LeH~d!vuVljc>Lx)xcK?)vOihg|FyQp6x|3LC`xE54~!$ zrxPXWsTBMUeZbG*wjM}UW-^hN@CBg^t#&_eD2N9n%rLJ64`&Jv&euB@$%J8OmFYmg zm74%f>?mp9Qn|+P1^Bpq#QG+d?9?|>@HopPJT}Cy^WKsb&_8SY#1u}nsbcSZ3Y;Cv zM2ULp$_(*nTALf0uAkw4$+MqS3;;%Iz6{fI;cbM8*K!zF$k@scka{*(YkYSHEy;0H zK^p*=e;AQb;0Wl4d!k7!>Wi@JBsCFbg;+9rfE~*9taQ5CoLjp@%_IF2g@6;;Q4^kq z$q9fnR|2)nF|Xa0{4e@}htf3VAOEaWa*O2>o1B-i_}&j3FXJ_@6|VpTfZhH+3W^gr zktOU7QBGn$27!kDUy8KU`ldrsdgFvajyQk%YZHn$#|+AA;nDS4JDwNK96+BvUpa5J zgU=IwrKc(n#8{PD*wM2Ug0qH%k+BqTGaS@p)lUEbLu?*@>5jlnjp1tUySoLRze*~X 
zX#>I?Fa=X*%zTIMMv_F}hFJ?v*D7#JQxI{4r1UBb=;52IrNRH^{1{%>_zX7n!s?Oc z(sPGNr5u2bDh$MjLBbuNb?#Fn%a_`;y4gJ3U8I6gjz#&6#LX_>Hn6O2{=5|Y!w9Bm zsOqn5`;V^rG0D=x=8ZP%!|6S1G#oAk57h3S{GWp7gr8vbq4GKzF24g2-zo4I5RO24 zjKTRIPxI~h`f>(m8sFV#(l1wE#7_$ERMce+^pYlvLZZ%pB#8iOQ{Jxj_EWmHE)#LL zJ1u2452x`&n!X-R1W2Ka*_(lSVQ-_1Ga-uoGznR)j9#aeHz zXe9-`;_zd15=@d<06jV$9;i>Id;ThAC^>({35yI9_=!*2)K2oJ)rASHu0|E*j^xBv z9>~*?%*k3;kgm#T35{uF22+3&fK>zXd;(Xv{zh}au*#qM*)KSKC&k8h`%4amFpYe< zhbtujx1ym8IG+%nzQauhN+oq*PGXs5cYu)XaAMUTMH%l=w(PGvJ6l`ol{)#f#Z5oo zcDCQ90Eil8KN)M>)k+v9JtuG{Qh?brMS;K7D0;*>?>c5 z_K^D5IIx+iuk&u&5-16Y2rlkUr=>Qm40MSgSHo9FH8gL(_RXhIXM4OKJ=euWc9z?}C|P zb20F#zkX5hx#~ZBIDCB2eVSV94}50+&SV@%!CwP#zLf97qVTm;@FReA;yhr$Q9q(F zMG8yQ`Y>`Pxw35(} zhieC4d~2T>ich4aD%Exk&F3bg>u$Q^7EiUt%IB2M9pHe*FE&hNMFaK;>+8B@0HGHX zbP^_8NVQ=mB7_SEoH?rf_3MAeHbJaLyEa04Sh}H}4-CnbVzDw1Ai|M7!$HSg(@q7{ zlK`gL_H=`X8{-YCeN|*H0d@@}pp z#n9H7U;$@;*Mi&`@xbsTPYl})U^6bIVE~S)qLHvz|G&jCd7u~%wkAvBFDh3bfIT{@ zNK9&Q7J1oN#bV z=JMADnV4O7_+7`=hgW65P+v#}sO4kkMx^1jVk~t$vo2CTQK2dLY({OWi+Y+x@IslU z*wEP0zg+*eNGCiIw3(_zOCfs~|1vzX#P5X99HXIM_MI5V@I|oJg~bV36w* z`*Wcl`-I@wL{Gid5<>YB3;y4$xe;IM|4zuy>7Ym|1s!=Kv}OxFiw=V)-18a$?DP9s z07F`Tvn`p4^#7{`03g+oSMJ?xPt*Q*t$KdFiDIT5$pEDiUwwH zBxMz6hB0{AXDU2chSJ58OTRyQ#5P0{eHir8`=VTmD0WLVi<`7XV!&!vB9uWyS+%`}?wi|_MVCn~X~Jl)sAwzVF|?)4q>ujmZm}N_YuB7GK|TV?5cJ}3Od?D9DqD8l0$v~* z>qD!{B1Hn?c#IPPN(GZG$Ep!+;sGOEjZ@7P>y#cH2ikWE_8NzqV;`VF8s1PpE z*RK2PQQ7GQTegHY89v{qE8h3J>_GtyoK!s0NFkjFJj}cfRF%G!d1RrARIWW%BPUNh z%@4jWX`Rhne-I?0?|-bb-GgWD`}2lQSyH6eDSCLn0Tae_+XVSFP71-C$GWdc*D2T1 zkoFO0R~so(OEL*T6t{Wbm1I=I0i9b9$n6YocHhu(3 zll^}DaQkxe@7tb0gEMuNkvZ-fbd{L;hJ$zroUdHu2rEMgGXc{zi7N!?Db5SsB^@pT z!~xwq1ql)+uq;m|hsRgS25JK#1;Fz3UyW`~@7wNkqsptb5Hc<@1h?H=oC>^@$b4h6 zmkmnh^3wRx`k7LjlgxAY#uJ5h^_J;x!sFk z>jxH>g)}vKF0Y*_*itX!qM&aD|8E+iw<>9%E?kJ=45qsu9YNqHQvG8${+o}ZCMo3i z20+z3BJ*@8cfZs%NddTy7nw$oaAM%y^cP~x8BkQ)abi=1g2oS};RArQ2nGG4Bn^fY zqpRSQ`DdG`@xc?I6a@*ZQ{37SDpXAO7xXHze}2ZVU!fJAhGRv?N)lbxH>vnHOLkI# 
zhI~Rk;yx4@4F1&0y;>m|Wc~@x|B*y4OYs_uYnVM&n-GoO_9K=M&TBVReP%24KrjhW zY!u@7)LpQQ<3Ew@O_%!i;iwnE=kvzNv|;Aa2u8L`pbR_+GMDzBy4!EI@-M+SdFD8;O=D z;5zwrSQi{p8hzV59THU?*Z*tccCa@i=%Vq5UNd+A&l;oNuI&UJ1t)x%)PO}b(E}~{ z5O=Au=kXyhyPp;s>q}*+s$lLU9)t2*pTjZ!-#`PFwG_Y;n*T=;PfvhxbP{Las-xAP zE&AM$-B#wqdwWyzX!(UFhI0YmX%xl@Jd5UKqwMj^TBiN)Q)%rr;uO!WLkd-eT(&T% zt}epUQN@Qj3<1qmvb&*SLE&dJiu&tH!k0$bpWE1MiHTWZV0D`6gFk5LnXLfYrsK$- zf=$XUF^w9}0Sj6-70OD@k3^SnCW$32+%IRvU;_-mtC(U0d5O)Vfao`mzxl`I5RKJM zVKYFYy@8E!RZ2vW|EBtVJCIVhiZvva{3R@oQlV4M5Nl8onQ8axct;^i3F$fOe@b^T zh-HVxRFsCZloyl(wX`g#!4xVWGJFgS6_5qlg)2sps>hA~QkUv4hg@pp0kB(t8v76* zEwl1ZNISmsMH*V(a5G1 zNv79xck~v4pKo%6r_(S>g8D9UEb<7Q;y9AvT?ehsO^O3bdSEE?KXp1m#!cQA;af@# z(y~CA;R^as`5jbKVgD2u2Hcq_WO4ZL- zJQ0{!!Z!qBd65|h*>hgQOgXI;f)f!;M6x`1g}7v>Il(vm714UAnI=Rw#}q%m4JH(= z6mp}@<|DZ9C`T^RSkzRzBK;#IJIip2ZDeb472y(WD>o$Y{i?DlsY7YXblT)!Cznfk zCV4Xg?JgOg>3P~0erDi>O*}zX6I3-VE>87nAj;4dlY+ly#ywU|mKW8=mSaw|4|}28 zqoHyZ>xh@%TdSEvYx*nmNqA0A#m3GH1+oBlW?UA}MKzbqD}!|C&7lBgeFl@6@sLpY zn+c}#%~$vStE%K%$?hLU;ZnvKs~_A`#ahQl&B&J%3)?ja9L*sNnaRG#Sbo-H zOX9aSP37Y?lU&*|n5D$R1yhw>aLUJX=W(*MS0?#U$;&FN`<*9pm>kJ|8-IVYnW5X! 
zh#KU5q+aB1mtJg@O3@-|(j_g+8FN`iU|*6d(|7hQ_2rd*5f}&GpR-mjl|NF-dw+TU zO=Vy6^SIti&TzkehKM5(op*g%g3ojoCCM-BX!dtkdwXdp@pO+LB_Hoj>EKHDc5@2b zDM+kxe3M24ch)iR54~q$YF0#KYLq++hg)Bo2 z=@TnzVQRr6VGv$-<&ESo#o!w=p4uKfqrJDQ{XW}VxxlT&_BkIUagzAJ6=+I_SMh7 zL3Xt!BX<954DS-=(XBU}-9g3QF9`zVl=-Xt+21ooO1f^oJ1@0atH2G5TWU^u?gOpr zP9F}9zV>$en4n5R)M=1QfY$C(7@24J)8K<`TLZJe!xyu+`ssgR_xJX7gr4wMbMeqR zbEjQV>z+VMvF!hOtr5;>Cm1NhF|UaOqSi5No~=d$)m>;V&>xOhK}u2;BR1|qnz!%>G42)o0z`pOWM#Qb9(h)+ov_pFRv-)1sEkwRxZhKIplNz!UuG%XF%A3Ai~6Xw0x`1`?QK zgKJlnfXy49x03Dr1OWZRlD8KwWS2^?vU!z%wG7-R(iL%4FRAuEE4PdTGP9;9(R7oW zvSy=Py9N+8*<=S8oQ7gR`PY5SODb8Ssi2r1kW~erB}caz_j50KI?1an{}aE(fD-fg zvmib>r6E2#m+=P;EEt{KVj9TSzBV0K&YTx8w|JG!c=gt&Cm2R~1;?0qp6D2Ejl_kH zJA^i5>E4QdaHput2&@!H0V6lL(l!Sgat(QS0D&u;cu2d}@ckE!T<+&XtEzwR#di;b z6n2|+Bkl~!#~0mv${-a_uL~$$uZlXpDPfZ17R$Q+W3mFG5vtydR49&+(1c(`ap_k` ztQXmSPkv;EF;b2EM!Al7$F3Y{JWnPj7U{1*^sYht=VpMri27d=CRG4qd10=oN^GV& zLduq-nxj`ALmw~#_hS5ZcdFN{46wgC^MEpkvMG9Yn6m4|w+T|ND-HROmlUb4?%n4f zI?14zB-pnZamL2{hft%P9V@YkyJAjM7T#z;w9->iNLE0mT1R<*JEKCu%FuyMZvZI064Pc< zstA}_ae$M)(1U9dBB~%u9)k!Aijb%VV0GNm4Pp7mUE$v{J@OuD#4$dl67iM)VX)Jm z{rhe3Bn9Q3nf~qYGcTvW_EX}yNx%id>h6^$cWF8z%V-M17w{FpY|yfNBob_SQ3^9RAKk2ph_M{%f9fnGI1F}h*V3IN6lPI&*I~VSBR`QoS7pN zu~e?(PgT9trFS1xtx?#-JxyQLBS)H*dIMJK3y-?cI}vmJ&3$TI5h9=QITi>YA;P-) z`8rzDo($`SV$EH&tjHZsIe{Hc#ZZ+FBjpo9uBHgQyhps*FE+zexqRhS$KT%PXz>s! zH1s|M^lfqaGFo8u0vzUNB0R}(=`iKE4~S#9w|X;yzXDJdmJtsFl0}R;`%D?X-(h*4|HxsjNd{&e5n%|~QmSKY?)oUq%^CX=On0ciIX6vO@xCcKbMFd;*ghKKDVSl( z3zVW7URYAGEQLc8glIjKIU*Yo#M)+?{lilEVuE%nEM(^blO!W|+3MwQhlHRFP^W8! 
z?%7%Rd#)arxG@{B+@kBROo|Pln|79UnkbBP@+W;jv@rYYYFA3W^zWYN$_2`kacQ#J zp=kM&rzZ3Ig~L8+Qxd|O_dsE1qJ!Mm7n{`Isl4=|^enPn8$7W}x;xdBj7i8jx zQ)Vm?zHoLv?~LY4TrWEKocg>-TX475hx(talr`U{JntVvGmqk;5L72KB=3;L=q_8M zg+^mp&i&?q6vV5>bkeVw0T%2o2XX&x8FciFy!c5p*$n@ zpTursnDtbb?;{AgnB|V5d#xmlshHSXeyR(~8h||n{`m%Knxj|`&R*Wys=s&EtjWFB z;NEI}lN;SeU{r1=cxzpK^X%Q@{p|Uyp%Uz6yV{bvWSJ?OL!z95_Kj|^z+=%LEkkdc z;psrnx72U8eyZ$==J}QjBN&R;zO~Mm?wbVbmUV2Bz?J;?xpn&8nBWmx z?@`*sTp9#ebpl$0BP_46N z>EBb~T;6%-kj@cj8?y6ZBpv zoOtUs&_Y{DABI{tL~C)O96vRMD2OA^Q{TjSuh@Ahx+zqV;9dYEeftD$4iRbYRu<^$ z;E(kHbFPrhAeLH@d=4I!(BM5mNen>$|K&Zw7RNCTza$e2iNwF(k$dFA5?wdz4+FPp z2SQHe?TKuZ!~U$bzo6O(LuJ}XO1-`X4h>{~es~lJOqeW(w_lP9!P&wa9^r)j6*e{X z`?4C;Sj?$&!IPNAR5)Uz@;p@MBa#Ct{?Uj?# zwS17#MH&I#PgNH68godXn>hzG;8cCEL!9qq-Su%rae_C}W-@aB#DOa{j&fIYak-LoZmt6L2 z!a#=j!jrLjR_;?W&j1Yxf7#H}-W`!{z^~7rU{q3eH}g7u%%urN95!3lsCYrz-@qBk zs=(>iJSmp^9-Hq49%=aC>Hs-{OIpC(`E9)#MES)6Fb31|XktClza`k3CAv5YUqwNu$4^_|)`z#7z3UJOI_p zBthfkM;I*sJ0;o|0Z#RLBgJ(c(Tq^z=7S{3vVx$RrHc#0{<{-COUuY8F@tsg&m^QQ zXAcl(0^Jer9y?5UyL42c?o46l-gH}adr))rn@mB0eKO8fmnuXQwNZRY^z}stfoOR< zHw&cYsjJi1g6h?tEhN7G7p-rhh|KGK1(Q?@>tciU1QI^ix5~FWrz!B+7UJ1x(w7;7Ck$8k!|#fc{XX|?e(tm^W14Bk<}cz&9Y z&hc2Ok!5TElQ_#5I{D14ug9bVMfso3O#xJBY9dw!iot`j{@RI zU+;isTX5DsUsuz`x*wcIJ@}ypOzei5H0cD0H1LjRN-ViexEde!nXP)W-DktOy;h^i zSL&{RJ8qql3oRcwHUJ{`yLUmt?S}Kkm)vVx-WQ9cP zgf2O5a%>;2M0L#u>gX77)+;ZdpwLm(#%f2w{#~MiE54Zu zBI5CDR^hJV>~4FpC|_ZFL(d<1T2-3tY27W`lD#K06XA3Vz&VEo(y1Gf)g`Qx73oOH zm!BH7?5vvJ$$S6@)$%Jb{vY$|)kx0*d!X{BIN#sP&X$@VAJ#xVv9qIz&*6g&?c3CS zwj6ok6?SH6zbcDv6xK*}*9`KLXp6}J<3nO)b$FnDcUT`!ZEv+xm)hj~zVjHoy7m}7 ze1n3Gfe|Jc#qn>a%i=UHH$-DSsKnBW6XzH+@b4|TjuZ3+92L^^{lgFVCs8!)l*Ui3 z57syC(m#fszz}`Iy~I6;m1>m+25j<~TcLO^$5}iPFILjA21%XaVN3nSDLGHWB|=b0 z^ur~(A@1;Xa3-SyzOZr9LCR83JQ0be$=+yP4JbYKLOyxoDavBOgX7M*>)rixfyt~# zZuS}T7@U#p6@%xIg~{SSb}p^BRC{?>iey1GTTf9t8PPE-b$0k`w$qMc13;$nhm3h$ zj=#2%zBC{4jk$MQrW3qqN4;t<){MF@07I7q39ljBBA8Cv?P#vwp~m;{H~QdPK#RyK 
z=sZx=k>VN(Adfa-y*eIv#}iDy<^uhW#f9kBKW`)*cYk~Ugc96d6WzjN=$Q4MNr-KF;kMLyqJH+M$sjQXL1*0uJ5mO z$nlF4jFi_XLOao*)BeSJ|NH7^pRdNAGk?qOl9kcuJ1K)}$}(M^nXr?Xz#s|ANYX*l zvZ)b%WSxFnc#9z+^+>YJURHL1h9Gglkk{kwE(xVZ6o|9sPw|59&#|eRig5}k)*E~c z>WL$!aKV&lLFx|Uq^f$BvT@4TDydAt(_+Ir`iPyWB%VYePDypv7M)uHb3G|@`8E68an4pFXcJO3`)u>Pl=!xE8PWG$6$oR=db6R%y2J2BhvnZGgBCF zXf&lNeCCAi0NO*}5D7az*4htxdRo7Ow+H(@tlHk@kJDo9h5(9NP_w_4UQo#gynr}z zU%a{F8{k;mja3J=0HLSeFVJ0-_@7QS{zT988zAc{{e7$RhUzy8;&)1ps85X0qdw;&!OWkUyNrQ;MfR*R&5ZcuyrrT7=`vR}n38?pE2(+U zH47Q5>1i-J+^Y=$B#<%bpc!)@V706G)ag(l6X^Laf0V4W6bfm?)oRL`{fp&Htmg8& zIs16jhRZU5N7Wp1m(bOg`p9|P;pAJ=wPJIfyU7iYD1Ok9+TUax0Mn6P_xCqorSywq zMg)&Nc@KnGC{%|3JVs$ltB&T(WlR!HzqAtk8Y9^6E%`|USkbQiLTSOD&rO(TS~~E; zMgYZ!ZDI2NglQ@S^s1&nCAnIG%7X-;Bw0zcYk260!7=y~x721)Hc?0L=`4dKt2uWi z+Ef`qS~zOlDLh@m&Yt>uK*N`Blc+~J-#{h&aoq7d_F7dxLCMB35@7@c8JfENup${2 z)eUwR)C@6H%3}1UhfKdvY^owHeI1K0q*lD$cN^U(u#67ZHyus6Om_U+75vs%X$L(w zuCbC@E(6k7OHG8S&aRcWx~Rv7m+;du)~Iu5hcydvyghuiUGz}wcaji_Ia+X-?6fXq zp} zde2IC^CTA_(ur!S1ZXM#h}UJ|4b-dzrl>EK+4qbbB~K@WEyR=y*8~g;Bb4RDO}&_z z()>Gg@Y5UO-Uf;-IsjKNy~W-u&=ZpbN>Yt(WdoETiVjT`nca%I$}~p&JTLGu@x{Z! 
z@dtRd1vD2@yan=7*UOLY66c46ithAv{a}P^Iz^#B68U?6RpOqqc!xb?7K9VdlP)?o z*eZQ1$;W%S%5T=dp37t5lc}8Tc&nL%#&N3R#;o%jUL6Eit&Ri9@H-Q~^!aC!{X8zJ zfn14^lo-5#Q6XVMRIgm6_exZ%f>W*JbkYoxVt4&2F8!53dwAR-%3yMOvPf#O?_=H;|lXdmPp(Oo_ z28^*(5JLPS^#wKEhpKAt#c}I@SmA~`)am#-=3E>SHn1I zWq)05K^=_lJa#&s8vV-T;ybZM_^z@Hum|gnug?QO1_I`+v0npNwOYi$hb6Ec>K5=ed63vcN`` z#e=RqC@V7Y(QPoEf=q*`L>yD40b3itYnEP>hw5mg=K0>@^qXUr{p&9OQ*#O>yzJv` zY~zu1*jf}N4plQS(w>&M2N3N@3OTJ0#w|U2r@%lW!%>3uuk;knRP&zpI5Xt2;XcR- z>cNs>6^KtWgpgit)h~-iTQrz3o}jZd&B+|5ggQa?KBvD;k(nkMF?u3e7t;8`LWY!( zjep%rO%*Q|qXckIm~zhtHBWFUK6g;%lw)M6b${&j(Af!xvt#7vIEEBemd%if(aF(& zDh$64nY8hYu#Y~DT0QmEiEEe_X`M0mjVmw<4fK8(<>Of6M@@L}{abTdsxwKi-Pz8* z{6NC&QDvWmeZlujI0Ryaue1D8R$Do)CHDQqS|;hP6h{^?9YuF=qn1FaXpk6uhQVGy zP{i6-_Cgdaf(~UkiLQkfYiM`*Q`B6099^b@coy6iJm69Mc--LkPpo#6&ZNP>+K*Zt zBHfxyM&@5U=3!VIEiSp${{7_3bMno8Q7$eb!AV=xiengDOg)|U%Nm*K7YkWEHk(0d0G0ksU5YqaguxI9?x`5~LH=*_p~801i=FQu?UTm*^x| z=mbt#`AB`G8M-0M}}Ay#dt>MsdoCrBclK0;)t^DDPo-&kO5d=Sci-- zHobtR7v&s7fD%!ld|HpK7{S}zamgE?8#kjgQ7cHUv|2eKQQ!r;9 ziufY_2+B$k&0{*uqJvZrVEGFQ6G;AE;7bCZ z>h#&?e)6ywDK%LSU;vWZFVYwP+3M6O0aQ&2paM5P03goBf$Z#T<0|6Y7={qed=M<1 zP4mOo4z(m|w(qoFTGetm*ufzs+sb6Zqy+AN6v4-+xD-M&^VNF8A8V~DG`gs(AEX>K z9z?|*M>zLrgzi6IP?edUnC<4qw6cxKEqM98wSM+EFniwQK7fXS7CTc9@X=!+J#NJZA20Lx0*j8fA;#0kaBcCca8sRg|m5iEy8%{ zkH0>}cpnmX*!+iHoSzUu@{b*|1%4sQ;a7lh8=%)$iDC(W*8eFP1t&tjwv0d^>w9L| zv3(Mt;4rR%Zq-@hTi^ddSx&$P~FtGHB-Dc2bg&J?91Z zwVRS*ztf>rqZ)53t1kg`#m`bTS8|QKn;tE5sl{JdZmQGW(PLR1`AJuCSICG6<0hcu z$#p|3tM-np{#Ne$nUD+clHKg0(5J-L$i6~ucBSF$dZ;hnrmdgl7vvST=!@Q8Qdj22 zRrD@$6{KNNx+gB5jrP(GC(57^CD4x4w2^;rYe^|v8p=SOPAqNZTU%D@_~V#hrQBjF zX;J@m9mllAG3_=5TB$?l|0fKy$$pNAkKM<@hbgrRUyHq{|Ca`i=k}sGR}~Wk>-e|2 zvUNrrA*98*T3K2cN|y?1=65^Xw#TB~wv&@la;O93jw5)Tj`qo#3L=P z<7PQkl4%3|pDy5rjSILTuNk#WO7Zm2uV0JWI@J9g$SshpDmXm>d9JU9L+z78Rt0V3 zz>DwsYO*qXY#qkG(N%Y(dK26-zA>q+n`__q5EbaPP|`Q8RNP#r8-~^9AN5yPdOhD_ z71ge6B&aK$w%De7$Sr+|T0jhBm-5G~9{XQ+`{&P%1Q;J49?Of_I z-Lw2#W?r1rywitHM(W@&Ar16Y48K5uJk=fsf=eoEM6p0^#F~6528g++>g`57&Ot}T 
za(4XxOa93Yj?k#ALr(vX@H5E5LKnbHl+KGV+w^o1Y zHbgYyh&DyXp^Zox0HPn906X@`ikimt$yQr#;jzDI?*oH-yVhQItN0KWtD{brM_Z86 zaYzTo9V=^@cM*wf?cou2#9f>+vN1Ubh+&+faZRPqd?zuYz)E$_sC##lp32j7@R?rL zP>DbI%Fttf6Vh#-wc`LkxqfG8Wd%pN5Da74vW>4KS7UVqJ}&n&vAf~c!zJ`05Fd+h zeDOAdd-Wzm>&93T4 zmRv?BaL-xuzn*2=NQy>9`rwafc?UKgy8Suq=Qr}FyR9Ux29~GSYFJiW0KU= zMkfEIyQ5Z0rRtICMEMNDLkX~@s9SYhy-eZx+KDOvA z|36YUjr_sT)wxj_#*>VD>;vDYTvgU%cw8w25%S%SG}p2jD=iN$qP28~%2s-{YQX3Vr<26z%WWURP=yF)e(N=1vVAqi zi270^QGB=-y$H@{4f$K=ti<>fy>js`mMY9dolZwmV*f}3AOnFAEKkjxRNXZV^Oq-v zU^-p?$aIhc68B4Mfu~2V2f+WK&BkU^e;$6pSNIUp^If|$fIM#+4~H%jG^oMic3GC( zzyOWWPj@RE<;Z7*VNsUwrGjM3YeT7kwy@LDhuG%B%UPEBx2E_$^i={bSk#JnK~JKr z^i&o5g=&1LS@v@}%N&+D#F<1Uf(zN9-c2KUeYlnT61Sf_H@sa4Ep@sjybym7F&b|T z>?Z=azLjotmw_bEsl6kI%LF=euQ_T;=;re@bU}|0Pu@tt%d1+UTS)H)^T}sQ^}%9w zM$Pb{YsMgF{MN~z-sjcPKe+ptoAe3CFeW+x`Px#~gh(0H6#FH*80qa`ieR;&biznQ zRuUV;tHg?Qed9liQ5u{`9H#Sp!~LX3D-Qf)4k=4inX2$V#YM=1I+w4Np_W^UtH;&t zR)p0cXRzGfZ{YbZ3qSn8K!!C#2%vA8ji)>$mXz;3rwnx&47S{nAS$^6aM_duzE#xM zV|DdA&?luIC%MspFVywU$fVHo{~TSUzZQW{Qv8%!5kN>ak|ZF?^F1JmNtG`hApEEY zxgeziLa`qtZ_=MAHv}5W#)_$i)<3AWnd8^F-vSdWHjKu7`7ONj*@8^!VHFn0rg+^9Ci7Xnt8T#|CsmEL0>9*o#u&a8LFp_{t<;BAJf<8q98jc$B zS54$hLoP*I3H|96S??M+zrX(W^%Qlx6(v29i2GZ0hKXjN8MUfPK}jHNlKv&^7B!)# z?>N%fMyA>2S+Xk-9h{~CtZ`Oo%p7x25LiFana4cP*miL#_>JS zXg)%5o2V4ZW8n7@-1nDTXohaTt5zQi5z3j36a5;3Z`}XFay~YNSra5w26-8f`eNxB zT=e)A5bs*veWy3tK=|11fackkJnq86dv~TcK1sqj3fzl_7(%WD3BhL_c|E9Fx&+wKtdv~Bz|9D8-1%~}AQ)gyQ%6MUzl-4od>vgWkVRXCtVIQHLp4b;1l;sjlPe;cp&7xJUYz+zvi7Na-` zYt7T!1=&Du6MJ&vBr8(gcFpmZn3X8jUdiV1w1ljG@|V0hYHT|{vmHJ;uf8(fe4_ib zNjSv*hwi&yzYq>KofQKY5a4rvt+fQdYxn80FI^v^F$+)dbZ7lXx#p^6@;X9aWRVY2 zD&52#Ng-H~435$w)b*~}!{Q#JH5AXkIzaJVCv^*Dfy2US24bWZVYr2|q$lMAoZ-vW z98lwxu4{C%9;GrML#)72?nq@ccvQMS&SSbjrnGLhUKni_Ck#yUqDRv&#y?Ibx-gaJ zh0~W6n}E^Fj?Q!pDHn`va0vEH83Nx>665&^j@I_TkL$I37U~W>KRaS+#qQ1^DgYf6 zuIBia^|Zcz2s597g1CE(7+Um7;9pTCEDRHg=#JBshJyJQL(6E|Q$!Fbt8I&cVyPBc 
zetmzxu6+1sXt^7wOLJ=PNL3p{=dj15wgP@Ydo|S=F=|V~rH86?u2c@bFOl-ZNLYd_W~;wTcl3?PxMj^4tp2Cfj>1AsaTKcZ zLQqvd&tFB-bD|~4M-HqZIW0m`rt%z`FLjnW)O*XB5e8jsDL;!f6krcTFEa-Tzl=-6 zJg+q0SWG`z{zj|TG#Z8hGT;X;6j7paTshj~KqPB;R!7C9;=_qDD9+GO*~tqe)b_7@ zA1>{Z<;n5qre{sd*6$ypIu(Bmq(JW~8&xK9*YcF!FQ==lIpH!ltxdqQM3No7sg-=8o%8~xM3uk?>R+PGQa!DKhW=yLE3y?L5ja#8aH>h7pfjV3 z67ITU)BFhxwTQ_*V9n9~em`d{vBg49AY0CVdeRu|+}YA0{K3uRQ}$w}hpI(Pe7e4E zQ}b&r!Z7XVK3x&AupJVifOKyD2(M-iSf-ku;TG-3nngifZMh0~bdAPy_`t8=kLehJ z+)It%pBDw`H<7#POoBO0l%gcg6Z^2!eo?nf-VkHm|NLc_NCalJ3GTQ5JR@NsSNG7_ z&ZNnh{nEjdTetamuHh<=rGV5EIA!K~Kl&pkH&t7wEOUs+@%wJvEnqrz#+znBiDyNg z(hRMLb~23_C?-foV)4m(F|AQgd@leb8t&<4Wbg=2?*k8og#JEaRB0c+?F*UA}Lj)Jtda}(*mOWo1(XHRxf86%%JSSZJ-pfXg48UL<$_mK8yij@tt#a;F!0$0d2WL!gf7B)x?D?lT(=P>Ol^@)vSZhJJK9B z-cysD`hw|j)^s`%Tsu0IOe{L{NraW%WlUUHj zsMjgGQf}-)&$C!@(uOq_(DW}9Pl|{>)^RIgWqvZuM>9SCC&2S2yxRPsuv<8+&Dr(# z$W4n2g7Zd0c|=-QCAthxd%HMRufz_Ee9RzTFA@iv3mc+rPoCfs6RiCu+Q-DKrJR#;Q2$)vDV1N*~-@eH)wf%C*$ z*Em9n))(H{@z@?2s&r{no@q-St?L@RV)Zu#rp8cnke6GQY}pgy>(n!eG;U@2^}uOp zq_U;2g8Wyfr>xBRA! 
zKNS?vH~*N-p-PCph*^17b>&hm=LigYlg5pZH-T~_wo~XA1~Kqdtk~CGv%W`+a@?aZ zbP7S$G>!Qi`v38eDwGN=b#@5UfXd%vguQ=e#q3A1u2WP?`J()7$ki#-SZ&AGs#h!O zWlYDiX*&5Oh`7{@h|c0n%i@x`VVRHwm(-#of|^WPiYI4E&=vcft*3MG5L zVY>?27TjTWY9$7cNg;CM&^heH)jWWL3(wu=ftR8bRYQtN?-8hhQW{xL_ezD1tu8N$YUixoe39xxjG%T(0q`~ zmDT4^6?>UYzf2<|!ihm0U2gO5!lKVvpbWg|Vkn zt9>{)f6n~5{rn?Nq^_@V=Re%+VS@sM|I&OSL_1cjzOgbHwa0ovk#c{7+2kPuRPFBb zCE*HkQ@a_a@hX!YR^T}!K>1)S#S1|?x-oPzpH)j0W6`M~S;dn|sU0=$p}}HxnlmcM z9c#*b@7>TenRJ#pi-2sXfCjMqp*!Bj8E*RDjrn5p($S=r%5gZ_+Uq%6knwC{|Co)fuvE-{uhqXNZH%cl zt!2s=i}<+Bn!J6~Nb0Kr`MD@#tH2%e@L$Od7zBAa&y@(ymk#k5^aGec5%KIp+nHaK z_T+LXK;q@T)09ypLY)w%;t8MIdue7|?iJ})?=X=aYfkFWWlqYuht4jcqT$tXIIvoq zUGtHo>(`zGs84r>iMxUa&a7TMcVh%glLm#oi6>?K*={D44kE#gr%e-tWTq@T93a^4 zFXfSFm5zo6$hx5+W#1R}^m z=kOMm1UA@|x4Ge!L3*jufT&xe6r{1YX0G(_M@(~WQ@#I8U|}KNqQ2f|?`#5x$xX8| zv7LXiYS39!X13LtBXe1en-JK{DK}jg&2q6tDH}^;Nu}ORn-z+q8kKYK_&2~;`B`}u zZ@zC}hLPQuSBx;s3vg?*GQm9WoglOuOt6yn=^%!L?H<9JD`I)%pu4WumlDM^Q2vUQ zwt$yah4{l^OIY6X8~38ogP}h%my0IzZ@v!P8+!L>y8IXNtCu4kjZZ>S(+QWmz|>41 zGH0WXS`t#rHqrbYE&jmIeXO_$y%Icrx49>@s0M&i^4LE%E2TSENf+EAnv?|jZQXLX2)rIf{Fid@LK|} zp-<%MrSA+wRd;RT)GjnG%F;oXU;nH5eKcVa-p!D(SRpbvW>b z`GPtebHzT@nN}cXu*J*iOuH^`41+|Hm<RD)&L$F(*SPo>@c3P3HrsLwEWI!GS5xVBzdafU8yWW@O=J zXdy7}^$1(v&r2Dp>@|HQOEDR2ZFJ1?9uc=%u9`@A5p7o#XA_kCVV&w6Gy_kla0;I=6bUNozj$Exq)8UT`R0$v(u!|s&2 zN7*ieJy<18-d{kFJD|qaH9~|D7)5K!A;X2UJ^wiC$_irDGWSW>eK)&nm=20l`^7!0 z^xf8dudH!1Stq<>K)vX*SE#T`78-|}dTJGx7h@%=X}-C!T8Ya2UxV5|r>3I*nNy_( z4bh^;t=s%;>NE5)QlqT?$HfUvrsL%AET@yYYor!l4TVOq1g1DQRx~gm7V{I!EA;cu zBs*FY?3V`0QCfXO0}b#@Hr{UGkR4vH#=Yw6sUFq~q=c|wfWBNk&o-;bak*Ix{ZED8`Dy@95g+k$Ryy7f1bY2t(T9#h8 zPnb7CeETT*Tw*T&1^e-b|Dft2DvsK2=kCWY^RLs1PG)q}&lAE!QbCYAWv)BPtjmuk zS3JBE8mWZa16LD6zh25r2=a#x$_sH zPK(D`IFpN3$|);0+Mza+4%K1DHz=Xr4HAC^`sQh4Cs%OOE$IWkk|n^*t!IG(p@Zn$ zVtL@Dy4oGPtlg4Czt|!npB1yGcHXh4HsdD&!T9ALPFmHa8ORpqU{#%mUv$>8?w{v9 zOXLe(D5tMR^Z8qYmi4$XNO0jdTG`np$p!`{o=V#5oL7fg`%>=+sMWrAU^emOUzdZL zuFmRee#RY~b{z*>-EORtrg#$GWSTz?l-rlztm0Xe-usnW1PA`OKa7l**(p`8;`8Qm 
zALWS35~rNJue&%8Hpo`CsE$T(r7-tSN6L%IkqT2420}^y+`iDoK=V|32G5|G`b^8_ z3{(Pp6%WpE7WXXGB7ri)72GgpSdVWjZUIb%pN;F`UAiilUbSyg={q(*gJuP|8O~wo z|HIy!heO%_f5RmeX|bkF#Mnh8)X183Y#CejqUk&SCIF z`_udNKK-5(s+bmIM`nztSM_@wqXKb4pm@BDEeO+~B*5vWDUwtW7071eupt1k;dZY? z3!Fph86>KO4frLsaO>?C#jj_t9ZWi_CMg>LYDve1{gzsOPLv01x%wz1G%Uo;R{A|B zeplT?MXp!2yz;JdHu&`FU;|76PQzFO9U8LIb>*y&6@D9fRly8vbZ=H2WT7c}ThE{3 zv_hv<``^F(VZ+DqwWyTr2dPN2bDd06<+j=F6$DE3;fSfh_mqcxe5W2i->2G|zWFxl zRnYq{?$$0Zlj+b13_XGULjMjIz)9khBJ0MYT~VgQz3jEH@<6xJ%Xv~lqU*jcRs1#Q z<8mgc>^Pp5?8mDbCdAzhcHeaKC8&$-#a&5HAIafQG4kg6r7PQ-b5TO}w9snKtn{@I zjm5KN_@GDOlA>j<$E*&t>r@`}X;mWf*|(rjyG!1*+jD)-neie}p8lpB^tx$r`14Ni z@o|(YAnAILy#TSd-Ka^P97V<}9a_Y?TUK*c&v}u-s^!)@l!m5%23jdb<^)3x6%uz1*0q{{h@?(G2&=&D35JzQ5NSB@p_8U8?4s58 zPtJJFSqsDVT?$!vI+=y^UFkLcRs%Lfa)mDusSFC(3 z_lEa{`Jc*^druu*#Ilkck#b~xk$=4I< zPZoZ4Jn0*ISoPxMLPU|q)&`bny5KN%Fo^T;*ema3KV_cG+rt#S23wi$cADVpUXIS1 z;{y8RA;@l4MG0T>U&;Yv^KX`OEoQ0m{g~lx)|W1t^ZLaS)M*+d48aqIpCQ9%?YsFM zjrSborgt!iJC!Pl`u z&O>sYwiAeES<#M*lnLWiM$@A$!m}yChNsufF5{Ewq#OyaRK-iVU#1471RdbZ^ek*k z*l1zAbvUJ&mw!WvvpgVXEtA`dD;IyxULd***Jg&e|9@P>WTfdr{ntO zapR6j{?|$y+MioaTMqj~D5;=;Unw)Q@Ao%HD8hJB)K#xg&q! 
z`KGDsf{fY8OPQh{MtF3r9E^XITl?yU zJn&G*^68gd8==b>&B`HkX+-JDu)LY3ol1O`wU9L34*vf>dW( z&sQ36mJynrq=^DLoto}SeE5k;TNUdyJyDck`Fm^@LkCS!sfXkWZK(B7K(sYvA%E?E z1npPB^Rb`z4Av!;3Vh=6fPw$Bw=F)K%L*+|rIUh6F4URl)Wzw4)L7>7K_$tm(3~S) z4ZRq~jC5U!8;pK+kIYv!S?@E>WRjfE7ptlAZbDP%v)O*%=8y;N}aUN&l)8Bgeo$&N!#PXjj>i_C;l252VF0=1|4{T=AZ&7J9 ztZ%549}7RiYd!<5LFtl`ODbd!I8rHj>Y{(T^VPjM{>p-q1>@&5#aj*H38TkbuU;bm zMN4y7$2IWqZlfDX`HY`ZOrR$jhCj4V(m)4`sg_pxeK#%TXukge-diI}G|>kG#kadc z`K8aiiGFtNb*trLiXUWjFPm=gFt9!(;HmlfRl|2B`dWH7{Ji~ux$*qHC%k$vc9Qy6 z8w5__HC&!7P#@l1CG+@wq*3x!SX3sa_X^^V!8H4D-wxvn>!xzB#;cs6wdH;H{=|KRvwo&CZ1 zZ+=nG0v5$pb5!y~&4e>zkFP!yp!s$W*EvBAjf-@9tIQ(@FLd6>0DF43I9=Ot#8l=l z@hBMTI$6*HAIj!+uY*8874;-5J=p_aioLv8xo5is&e8N7Ohuo87dr55Xy_v4kC^8i z>8Ffqmvc|;F)vACe@Ni&$Oc}-htV>swQ)I&>tFIR4Rh=#J8yk7f zKyD%pqcS}3Q{9CSqdl6nnkwTy8n9LjiXxR z^T00?sraaf;zB;7imp2)-{i_ClBni|pOze|Xsn&;x~ZSTSbRJ8G{t=~1*X%FrAZ@! zfHfWXqrIuMpuN*)56<)OkV$N<=L=bUd2^IcGK`_UHvc-sEd_`V&s@6GCdFN*MSb|0l10wxmL>G|FLt8R}xHK?u;3;Ty=PI zUt;q)fxmshF0vNV-)9HT<+GBmOlmU2|M~5|za&ovUV}ZvagSF63_i^#r4fQ!|Go6QDp4n2gd2K)5_I1~G!Bg&it*Cr{6Mp$QXsS)|_t#+Qg}--mpggHR zKHP=o++)#4_V){3d?c-J>$`8Pw2(IETJdvK|9A}_cuEb0@(%{4N-$%3vv1%0$7@If zm_T{xEYnpM(j-~XwPXK#OYNpuhflr_-{nOCuPJkb?m5}tFYqFRZ{U7?$h7PyAk@C& zRe3!Bc!ry>sy$YWt~wLK0i6;1qU7Lzdg^ymprG&XA7i234Ua%!L#DCQk2?)@gVaz( zHK26Xh$QLDdbq;T9ltD1R$z4(tiKp(Nf$|4u0`8-y%z%~Q*R zp39wmF5_5_&tzWbZMx2h0vkQL{BC#N?V_KxCwp~_-EVgqI0PDk@TX<=GAIfb7k211 z28RaDB*s~$4`q+ri3d?XMiRh!|Ys#{@ulQM34t4Y7*}KG$^pU z2-F$$15XIq)CL?cTI|q|t0gI;%zy<*A$%fI^e{TjK*oSH&HvWNisY33SaEG$W%el*W<63twnT#%rpW7tljGHM)92Pslp z5Ft+e7;YMLxpv>dV_!W}QwOJ3e-796f@tZ8@u#(=8EY^mDc#xtGYT^pKDox88}0Pi zpP;qg1(K^~B*o0Fm{qNdN^o}Xej<2Pn~7l&e8P3jncP6YINNh;bLq>~Bi+EVnp(qN zxb0_%Y1aY{ZE!aTn%=tl^l%~6dk)a|lLQ_0!53pa_D<>O(KWEPu>mopLXd5KMO~@c z5ngfQE!k9iX0#a4=Z*NM3p0zr0N+Z0Wk)`|S2s!1bkH-zwGcSmE_XmV*y1#(F)q-Y z^BX`{k7j3ARH_9L)N2CAO20ihx$53Fuc56JKnU%k)h`66n+~TQ%e2uuVv7WgnLv?& z*U(LoYj2@n8EF{*-r=xk(0X|X5G=tKCC4%;odWkR;MC<-B?{bxBau*lSYrL2z{QAt zY-i@6wB9xJOOdU17DniDH 
z<0L}@i6^Zs2w@K8=U;Ctxu|$sLpv^6tJ4RfJ)ayrZsNy9iGD&a#d=iF;mblk_$j;v ztx(g9lgY6nvE$&$gQoQjyHw;MAjMWB6;23MZZ%Sf<+hgvv3jF?n0oy=Qp2gdn3|!i z{N}vuqMimSuJ$zUySAeHN>@J@j^n-JJVI3nhspA?1X?#~|D*W=^YJKl9|| zogecZm*Q;l6k`n8Fb!;DcBWbQx_WQYN zd}h45cAyy>dw^~WGEG`BrK5Q&wSOCv@y(IK>sI6)(nv zONdc?`sApQ^f1XBL5HY7#@+z{P%Qn%%V(#BvQ=6Yhw-21Vr3$eZqu?~dtD?c9LbJB zM7QGc$?rY_!Ogz@ijaUJlPtgDJFa)*AjS1MP`Kx#{Hg+r`{v5Hcm}5-K3Sa3nwXMb zG`D25cvn+H{zCu*I5!ISJ|_=k&1|*wA}Ce}+ilT#@}KFducfRT`+@)RV0wSD`tzT8egDFv)?# zB#?$ny{7p@wKMD>nml+!-lRWTbhhOE;VVBR6ZzTk$rA^#4Gb3-a|&`m2e5NX;lsno zyaj@W+R7RsA}l5SSfHU1xK9(*W1C5`t(`~d8MG5)=i%D%w&xAjB*}QS?_nR+x1`!! z5t@!D4PnS=Q_v}XtYO|q6_{jyPSA5v3d=H05_o6oq1!0k98HM8Iqip z4dYGs7+tyR2-sW6Kl~Jbxh+jx@zp8&k7pWjM-;tn$|h6U(ej~3d^yi?&&lQ~Sn8sj z(e9@deMcw`RX*m8MA6Eaa)02ls2|=3C%zZohK_(IGERnDmOF&sHxw!5i98{i?itRd z?0DDY{E+6!EJxh8_fOX>Ue>IhquobK154+^0rWU5Y)hR>6D{x4ydoL#$rd;st&Cp; z))|#NLdz9~mr1K4AZC%QRvD&@u2H`YwOiKV*SW2~3vt5DKrMQvSQreL++baFKb!2R z$AUr3lAN;EK<@Ctt*}_t-QjNz#J0}T<0FDDp%t-(GMdWofR2bE+)~ObJL~msvd);#SYfM{NWKcBM25L&5RV7Rz$#>sB z_f=|zr=QWF$&=m(d$;}aMe3_kh;XZ)Ts(@_Z4Vq>qg$KlOA#xq z9H?k?V2mX3S+Es7_XRm_HGI-_ka8ruI;kwB1ZTDPr06wYJNJAB>3UyI--{bYmKeNV z$dgn}VFvf81s-c#DHUw%lK9F3FE)z3Xtnf!EN_M!UK8~}WB2Gt6pA=;LM$KolQ+FS zCDGTVE{JJko`X_bTcakJH3m^_@sDU4H1!=RWZ*+19PKwkyA*9u>gY2j3}advJfT)y z6J6h5OpaDVXtH_fUS*fk3k9-ISkX>nR+<+cj~0$$Kc;Ze;y$KNcG<{!lP_wa6|p0B zj}!uG(w{?kG{9_{J;BIuxx*DfMnK%BFMS!k^_A;heFX4f>b8u8u6GzXsAw>MzA~>| z-=n!p&msDSVj6dbVyFU`X7H3=$-SP_m45Wv4>ty(jw=xf94*PImnR%~ap~--FZME_ zJmR5pg1tp!)#cu5&lj+V3-9{DnbAubuj{|;9O*o6~_mwT88^?veEpwdAd-uIiSn+tfc$kdl28XW< zLzec81=2wXf3sxemKlR?oYe_#GG$B8jF*~5Z5~PmlfN8J z{5?5>$;84$78-$^RIbRfrZt}C?bV5x)d@AF41x8wPPo@5KVBImb-dVtc0#WuyH9&H zs&$xtG6HoW#zqWn?RYwo=&G}8aNQimWXuMNdU zkkbbyv32c-@Y8y?R>8(xqk#B<%mRg%eoJXnyu294)d;Jzw^{MtD?CXVpi!Hk1Z7#R zdQZ^R5u2$WFZ}CU@0FKi1jR&`8^f&jS?+(m7m)0@><*DjbDV?qy6yZ=P%A#{ zf+&0XkG0sV1ai`v+nl=cQgrtrvD-L5hs5~*+o*!{nF(dE$18-gVq>Yd0}Kw_z+h4O zwD0VH34iz@$F%EK>`?zg4n9*VQ+=iRP4SoWUv$15`7hTx!?mTRO$yXpk#9hh{SE}M 
zMG!4nU-SOQlfte+Iu<+1!S5|uLT&`w>r(@j?bQ z-QL{IjkR7d&om(|w1=3GgnfrkSsc0Ym6_WNM@P+Gv?2yT;`}|&*ZwJT`m?z8CEC)1h z9>!5$ns1#th<`u4dwXA#6bGzfP6~VeOo*f;&E>n*8%1iB8QR&giVM?Fm3WTSabE#d zN!nt8AJy`osDF13wGKcGC9`}}Jjcj*p7nv$vpxvqLq^(g8Z^|sV?^nEc9Fm11TZDh z?zTJtLQyQsSGS|e5N^OdQb$UTkm({`^Z;*V9g<$%a1NEcE*G97Ck@gidVWm$DEfO-}$>;sJEp85*V*F7<0A){uI^6!pU#ayb0Vt z4&7QkPCCMSd^Sx42vL4C&t~l1n3X~|g?FR`J0;q|8^->Jq5a1^H;D6aq0ypYsax384&0LaXIw^iy&dxgoLJHpB z27_T_ooCxgfkR@UT{zkPY?qr#*!|28z91bZZh!%9jBQ>$$Edw6WX4Poq6>p%YD6ch zwgOg=B5(uf0quR`JDxbhFOUjl!dwhIDZ)Q2&SP&{HopmuuNytH74xd1zrH?FB7br} z-eoMN@ldHv_l4g3!+|s4Bauha?B=id@y|Cb7nJpL0Eh3%%=+P=8Ny8SN$^Y2dezzckaUb3b6@B%ro( z$6WW5I$QA16%+@3@&lK4?#B)nz~V7!!XSr|jVGh z2Xv@b=8sFhB5-8mBfu#F_7L7NQh$y=I7NPMN`r(sGV#pc?BQ#is;tz}d_pL_`a;VK zsYlcBUdt1-8iDko8r2>fv?*bi!X&Ni!jMo{mkN2tr<|z|jK&3E6w|!rEK8PMJm4w} z&Q~OLa-LO-v(94ZH1scfdoVu-X6o;Q4Y)NN)orhUVhN@kN=&jH24ME#1c2CD^dt#~ zf~?n;gh_!Bkt>B?St+>Q+RTmAi<=CAsmp7CF+Yg^`nsnVvdnDoQSAWLB+u+5rHB(? zy~DL7y$S1ZH+!^!351~mYJxpghv+?%S6 zC4K9*)Ie@&SsEtrH(6@oMsc?u8?Fmp-cX`gnCa~l1~X>QtyY}U#&>3y5t^;ffZOK5 zw$Tiz4AyD29qduFbaITACX$0)OJ9R!kju+F*x}d2BTY!VOAp|zI&&{3%vIuq7cH;i z$FS=%zVCDEPFv59e-^-}L2~2~!I3IPG6>tSl_Iu-?tnRn(yAyC6%OEH-X5)u-@Y{aQU*W6_?;uJpl*~un{_lHCl)Gv*;%cK4YW1Yi-@y$~0$-7-y zy7@ed0*HU>6EAx5XM}ILf38H5kjwgS$(^(vNtSh8ng-lKCI8W3%bUfghlNo6dwGs4vyfsog&=c=H{(0JV245*5>;RE79RGL>1=#$v%2daa7xUG?%P zy#Pj2UshlqD(C}LVlI53V%ad-$lUa^s*48;*G*T-Q%&;26F$ahMTge=BwXyf+I-t* zL|{ZK=?%M*Wc@4nP-(zL_yTI;(`)pMih*aYK6%cH@{Tw|nOS7p%dbPFAR+qgF)S?(Z2Jj43*-v z#St)XsvkvYBoSH?tkmf3Ql@HNR2)?xc?S1=p9J9iJ1*|AmVMiRVmwYi8DEeaiX5Fm zZF&Y1>bFjx+w*$?mfSu?9lr;ssHffqtz5&m<{-{#7LP%<&#MdhHH4D4_!lK{Ir6l@ z46^AjcU~6wz8W*)AFIk8cD-d%C8uJp!5*ma(qwW&oXD%(fL@tl0o((;^=`*2-C8`N3AL%{b^;$OnQ{n62&IWzs?Ve2z*ByWZnn|c@*CM@`(fD)194hTW{1%az! 
zY*!cNa@xWjQBh+Vn#nJ?PblJbCl#35gN}w@EkUb7OOtUmf)7J7&#A6GE#`V1 z8H%K}(8|#KiI*LU_^2L=G{IM1+lT|@qZWUmdW5X37L=yv3(w$)e2PQl*&v=~gs|MA zZvjA(z}GGoD`*}R{3HVwPra?vfZI;mK&K?T!&V@EK~W{`q2i6bXBCcQrC&lNP@p-p zv+Jj16|faU=J&RznL4xz{o>NN*-xxr}>IolF1*rq~@v1?78z?!1+#j~mMtT)+6vrMaM$}2-(!kTvGpYvEw5+qcAs*59 zCn+S=C`3Uy>f5a={jta}K0SZH#xw|FZ8?Z%v}^&I7xhe7Nx&}yKKyw@YU18dWEc_* z%6@vIXyn5*{|1Fw9zLNF@Q9;`7U{YV)<~X=Y{zprV{Rl5m$KnNwLCuPqK=f6?4-qa z&2yJS9_ke*NZ^)yol_6t!?{cQ`kxZ&nXKh@Qo0*15bP#A`!m>$a7rn)YT-H|y*Gp* zmrmiJmtvML+nHpA)41I;)mxjJxAgqsJngXz4Y~puJjv-X zT83>ejTjv8^uppcq$sUaeRrT@<-ljVxhse_h_MDe{cBvhvKj1}B&Q_f_X-CiGNK=> zYK2s#C1zeLNMQS^jf%HA$6l<7mhV(JN~{rRmqV9D$3CB`xtM*Gmapvzp+3T3cJ?Z9 zBuI#DC!d5M`}jvv))3@XcfX_+%BJ37A_~z@5gY_f5qZ2WEhTa{O2zUzEb$c*FL^9@ ztge8pt+A%aGniz&t(JPoDg#af#ncdo!zFp>i`CmnoiH;iq`I<@4WT&6)seiUM*A9RVZY}PTR2DQCqDLHy7Y|YfU3Mcz+jA)s1N8vrwU%jc;eT|WrtP2p?A9pP+Wh60=19pz~-}sTj0sMc$0ZVim z>n=@dT}zP+pQLCfYycYNynI0 z6ZMRcXzIW&XZYYh>MmZ_s6|5W$!6OIpQh89c5iV&t+|Ef>St-En$B$#2FfKr_D&j zvb9F(k2MaC;*=G;4H2B}=XL>_5z>I2Cp+!$M;Hn5f=3mnj9*V*Zq3bb0-@vVxOg^pO>9PD3$vy`4jM5xqZeEABZ`~J>JeJ zz8aGz@Z&!x@Ccenk%6{DV7yZ;n3zlrW0H#PdiZ3MC!YTT`@UZb#eRQ)4U7!@XVK|N z8!KDtYXWFa4SaH}BFec@2EBKF19G-35{%}5z)7d*42DPP-1=H5@|&O;`Ocf8Nqfkx z|Lh^1X!#oqFG(T*#y_sh0u4FPZ=dUBDh6iXhzBzKxY*cMBK>iVn0GbZUHa|3VdS6W zt`cuOGAVIq!Mqz)`o06MML;I>zU@C*!~;l>y&9$2NNkVpZ0;B!_>d^8+rX6_iPSRH zEN5W|JPTVnFEtS9`+D-@f*^2Q>M8V9?@AL#=tUmw%-s-3v)>O>@1!WZQdC=Pgs4UuWoM!29Wv>mF3guF`_x8duP11C5anjbMf5TmD9(VWTi$uN$AHw)9o1budnP1^aJh8m<ivqfD%}jhk`eTim3%y1)BZu5))&@@%nOU@@LLe8gX1!ZnuZ-aVz>!HqOb zzM<{HuX^pf?*n=J%!euptE^WIEQS;Azk0Wk;;`orFjT{WoRnUOxh}j0C(E>WSNfD@ z+41;0cdM2f?B{#!r);x_WBP5=442cX=ju+MFc&m->s~c-xau?OKL>7VT;C5nY{N7) zanP~@YbSl%acr2Q>lX*94i%c}3l-11F6Z|z^>l7IG{sJJWn^Y+zbsUU*VIkUz)WF$ z8IjS8S9+@wm$MI@5N;r56>r+RPu|AOuAaOeJP^H>Xx#hVMBtw4l^Nyb#QgQouGsc^ z;@t{gteg5E2$8n*ZRuv~P3dK0rXpNbbkXwf%<99~J(}@7kSu1;yj}ZcTxiwy7U?{r zeukr-QHeP1`Qv3rQoFjaTZg_8t(F~YR?=C!folHM4^`{y`3ICr9P4C=DN24*2Ye;L 
z3~B}<%h$90<@z~BmD(Yq>LJM03X75tQBDhrX|4sU29@(sXO{ZfG-g`H2VTQ5WJ=&E ze7G^Gw>AOJ%%#<6drZJ;O&NCX(+(oWKyg+)6Ft-p_1Y{@$a)_MO*vY|?!c(=q+x z>dd3%fx_PShbs;B{)S6ky=C!kUfEg>w&j*zcL^jp4Z2CJt%nCsN4@Rr@?Me9n?IVZ z{BAtJ!L>V@AFU9|clPjI)3${Bgbt&$C`MU{l0w7rq(_9HsrzM2?j~srV=Jq?0;?2B zLFNgj$xn_HuAO$PO%t%N)iHRGFw-_I={Egx|GN7L>t*-Y9{rqZtF}bfKH0Nc!1{1~ zCv5rUOPO(3op9eR4XTu8uct*tROOhW0Ud+!(yAYy!i9+)FPEm0WILwc9CgwwJ`EHu zt?AM6G|{o-jWXu}cLCAG>G|nBbzF7>qO*DB{mz{;n*u+|JiEST&fvxq?4}Irst*=j zNy2WdjMGkakJ5`ye_Rs&R#m7opABwXBd8z=UIS(H|? zEWw$X6F^v5Zwtdp0 z>C!+Da+O>F6UD#L=>MYoZnfpIs;iX`1@^{I6f@DI=jYPbi1C@H->dWURf%77BsQNt zvRNlE+6)vAc6FznvNdIP?@@|K))&ra>t!}yJALnU;7P&O>iC0Ii~Jg^k+%X{OUmNE zbO;aT<<69T-`qSAXg9tmxMXv@TUPbObbnT9a7fI(EwNO_1pl0pjweS-J2qFR)I>Mp z=QUEL4BM8aIdR1MyIW^N+-H!Sc&B@`qw<~Oc2AitP(BlV*=bD=7H_-V@4FuNW5j=H zXd!Jnqmlo|h0!-r*|vRR$u__@u@_Lks+7{adq4A7TaKLAujdfOW*2og%N?$65$?Bfn7SujH#)X~1s?)q%PNO0k-E={ zp6%5=^;wyBE1HV>Jvb!>p775pPTH+1ZAHkR&Z|rCeBYRy2SrYtTMZW!PE**Fjz$SP z&l*VBgFUA$7`{6J6|48*?mbzP+>E_HHW^25YHvxTAbyMm7V-EJVz!2l2s;sMm%goi zcRBOQq7jA)d&E-%_fl-%3g23z>?xji@`+yK#yx|I3t}^~PgA>A7l~`1mxTAvj6Pm$ zE3)_|z%AkL)MNV@T?eJ3DXQX=g2KNyxfdm{+4{L{IpnQ2C@|GO(iJooY;RXzo10gg zAtWCuK7T~Qa(U4GsrzzMr&If(;?=>)4>#v(_S=4a9co0pa@i$}alf(q&(cD%C4^|Q zTTaB(AtLeMu?v0T4m85wE7yvGFNQw9JfIkm*x%@vGj zqjkCGoMZXi&}k#$^t2IDRcwqR?(5@_*4hP~(L2~6eO2O>3mZcE$yQn#3o2Vm&BcnJ zKfTUt&GC-LzKCjOAIDl^h_CLB8}${-5XlRlY6h%CIGx>~YFlTnUPhbGQ3=`>wrelb zNQG$yi?5X#I(g_MdPH(zQpJ%iX4Ugi(VA8$#;xdFMK2fUY~b9TpTQZ zf3TAi74LjaLw?aE$fhbS`jQ_}J$ytr*-A)szK^PsaQuEFwrHcde8nP@?Qqxo`R*ALib>wu}4d3RbpOBGlDZ!!~LjYQuNaCcph3ooi)fE87ymen7=H* zPk(C8>PQwDirf?bBYg(#UAE47OVc40(`Gmw)pDeZE?D^<=2Qoli#i4Coz#jejr+=_ z>%#=E%JrEFh^-a$e@(hi-=EACs;Lqt6UvVoZZ!_kFTUKfC8R4rU0uFKxSp9B%u_nL znrYMeXgT}Khi7h_J%OD@-*7=mE(_j*)359{%OET5J>2j1lolU;UE%&xmPmHpan9aw z!GW_HL}mBso8?tow~M4U4!I}t@E8{M#8$EVO!ARf<2oVBzA)YIr>8e z_*i9pI7sh~Dv$748b`$~FJ5{x?S}sB<$&TQgQlRo6QRhLTT>0YTv)GBgd#KW$qG`1 zvQ&&cUo+Qw)|so8-<4;5uARlI-6I=6z}%m{^tCd1W85K+5PI&;pos)C-eD?a3hi+; 
zQm8rgkjUq@*%**-c#`0?9Rka$L9n?HEct5}4;zMlVC!e!k?Fk5{cFA+t3RJQ)&8`7 zFyh-9sd?oR*FC=aWIaJzCSRX0_6z)uA4106*9xOgY8e(N*tZ#bB*q}S6`yWJ$^>u) zDyCU*ho$xIjqa&hW4@Jqx4auCQaM7!pJ}=NS$yS7kxfN*<{`U`Xe1J2{X=WRW~d#Hz8(ogf8fZ`=ZTi?&H$wWW2 zB0A3Yb$e_q>TJc>*B&<~ky5qM=PqXmVT{}-U5pB6tG`M#M)15tY)S((}kZ#t>`YiTnpfz5iCjCyO_Ts14U(u;Z-{)An6nU3&>j7 z*~*baUm$c{6S`aW9XvjkY$JEL{HK{rqs;Ehb;#K@d$$j}JoZns+ugp|hrm7&c7=KZN-_@?RQ5qEoV<*_3x8_9X74-*| z|C&fPPgD6h-;M+>n1_QvWSE9d|GB<2>B(E?+PU=>8l{bw<-+8)Dwj(Q4UnBy;zOKv z1HSUH)*jZ}qwg4OwH<@et@}y|C4nOS2l3%8)rx)eI%EJ$)L-DdepeG0B%b&gS^6mW zqlPUOb{5XS^3K#!y1fFJL|e~N#|!SYHu)Pt9pl6^Mv}~KDD$oED>fg6HXY>9=2fDH z_}b0YZJo<*SPpZcE*JyGB`_C2wL|-Y<%#8w<;f4J?q%qE0C#e-tB*IQx*LZQ`iYho zI;|-vuj1poD2h!A1EC_mYjdMxQmVfWGLP<+H=#nOAZ~kqWZp2MxquQZ*{FJ|#9uvA z>$OSX5K?4mkGS^MgaIYK@T_QdB#UkRLFCdxy}W~a=uL(_r^UY63D}~-k#CJFgDpI# z?I+ccSZiYPk?xOn!DmIxu?d*|(aM>iHO_8J)#nx)uJyEO!~K+cOnsDkQeJ4gHf~*1 zT-l4yyCjSm#I;5~Z_Y~i#kUwYili663|2*pvBG9&X%=lH^wIwm&Ud5YhS$~#%}kJ; zP~hysP)3nxy_{!uXT@^g_lIH9^$rHM2?OH^lRiPQT}0PP0uEdnpAordULrPl7Ca4) zvmDvUoUVW4n!mx*5{eWpZ_8=sJ3~)lj?+Zlh+gVWw~)oYPJ6d(g*sLJ z3O>dP>8qA#9fmBl%8r}MS6Ge|Ry|$x>XyR)V1M^RVKu|iZwDnj*7}9J?y(u?%c-E# zTo+Hgdj2z}=nJKP+0@;T@I?#ic8}{{RnzdrEpoi^T?{R)p~DERRN|?eNL(} zajuqsYMF%n3=c@NQ}4xx^OuuCpXNo~Yvrm;0ts|+;!1>$EF*WHn`rl=k;}*9XASaQ zn9!VY7bmH9=mdTM4`@C<+G!@kauUti{>FDW=tiI>4mj^ZFF{$-J5Th!Z&J%K8)atcymhm z06u)r$Af<|LQo{~LckZw#~}Q2sw8N=J-18UdcJC-*2v|j?%G(Rzym!^NQD{#Tk9PU zq3*igdbg)z={cSIh%(T~9ufC)v-2H3gr|FIcW1#e<25;WM8bQ#5;h2#095!tKoqqSb(lD2W1%K6F+$%VtOmsE;);8sAH$@F;A$QVm@ZOGT zTiEZdj7In7-EPc+^S%}=LU4!3wu@{p)GTYTocwTc2aG4-=n7%aX2{;{7;RZJrtjZ3la{J?|fq@^2(l_*OFv$gTc^ks;IW}mk8j$zGvGx z5!*$cCl4PNi(2^e;}E6(r();7Gz%-{{=S_g77}J z)X~}1Z%>(RaFN=1KyFma}9xQCP9%zBaQ@&((=n20Yem zdx0!nI{f~$uw*Q6@PgmqjLdP*unWOD9Px8DFlBovz# z&YO}-Vo7vgy+m<$n~p5qB4S2nYP$gA&WNVwht4W_Fu6mh_97=ebiY$?lla!UgDBL( zX{X$A~(=(#qBo;#}j1VT*Ajo$ewBNY%T* zaVd6(7t+o#60-)HrV9998|O?gQQ}`fwM`9?fH#FAi#!rbi^O0m=s1ICQ>6cNRXIp> z_L3VIQINQD!j2Qy)}X4T@&OTwE-{m>?Aacre==zAL8SNBgT0Y!#H^}S;?j`6KvcZ} 
zTHf1a>{CpmVQ2AO|?lMZ$sSzkdiH!`}5mWMRlQ|M`@Nsez#qk?sobt zwj8m5X%gz}l4o`wX^ZbEupCZ)gs{;Aw~q^vL}=9lDDUh_lT;idN00a3tu7?7522QN zX#C{G`eU-=Hb7z32C^Rrv0E6KV=+^etb!F8VY?PSHlR<7q5Ha$)BR5320g3I`FlOCb+VpMA#z_)1jNz z=YAPnX{Mn*^0mjtYB$h4XGZ;4&Alg%Q$~@>QpI8LgjFG(rY%VPOSR<_f@T4XT>7?! zk@y_bU@<43^Fqw^r|Kg}JXZi^Pd6O!i0wKt6b2#Un?x3Z3d={g5|B{w< ze_5dzuoI#1u5$)h#G8qX0!B7(-7^@H9h2u`iM{EYLBzQ}Ank72QyyWU(|3NdoT525~r z3<~|2v12&Lbf7$bR6?}VwH6}3@zUWPT4WhlFeH+V(KWmTu7F)w_9XYH4HVLk-u%m> zDZ`^(xbbK4$k#1OXh0qfjWpJ^6t3g4brmpXjXcPqsNfEX2pbo)N2fp;sxfIyWmgnF zrjrT{y_s%`pe0V{IZRR z#v=^c9ic=hD)Hz+%h%-xGl>e{UX85z0t1%F7NU8?{M}GV(mxd1-$I1JI2vq!#7BF-Si}0hm3zhRQ z_#D*RD0TgM6z>p~>=ii8xSLRC&u^f%_`C#7YYz%Ict1-3p}=Rc!$G3Wq>5Nk@Ws{O z*YJs=!v*oTKKI*HKhe3hxhx@&LV#H(Mml%H4;(qPg9WU^De6eHMDordnV;lyG-t?3 z-0nlHs*c#!WV7aYUG62n+LqiAU?LhRO3+>c>$7FqKt{gnH&Df4_tZtfZ+)Fx3z%d# zeF<=kqn`zVpWt)+@ zi@cScI+dX4e0b6zjjg%SDx*I+xf1iZj8)I^L_`orw1VQD!3Xn!YC0JpE78F3xi%xB zF`|~?hIvE;#MhjVu0E+OHY&tplT{|l^VlE;Hre8p6XF?EDHNywte;u|?4e%3Q*OIR zswUxKiX5r>9~q8Q9r~U`ChaYU*Sakv`+>`ln^7fH+^?eL{1{TMGD&4phi5D+ybO)I zkLyjW37^%0`mIlYmNt$gLVUF4_U7Pr0F}%B;@M$e3f!i`1lNYV{7q*7i&sI@faJ># zMMQ)BLSvf57Un#UZ+~Uj%iNSrR`r*1J1>%0&4uCG--!x@xtb3pO$t=+om59M z?D~k?3AiL#AU`=A{AWyc`OKF>YaL@QT^~2CODizgo9aOeWhW#i-!6^WPJecJJ)=LofxYJBr3NW&~`-C|T{r(u2GMQgTi=wK`B>mXtdF!25N+MBMvNvuB|M4i- z5ailOGaRcfccU`m_4MZLT{}DdE;3zG+YB*)ZQpzv2E1}yrz;jQjeoK_pN=T z-M%>uu!MkubB^JUyKtR`CmHIV3X5dL_}y#My|BIT8^E9Mt+a}6|C1fMl|~$}Io3YM zsBM`Z3Ml8|G^kqGhPxn3Ik+t{v)vYwQ4kH~9gLXWzVjqC3yHFFzI-2^F8|Q&%iA5b z4lJJ$&m+%w|8Xz+Go)MH0gp5a5iI-mRD1`Mf-+j5a{Hg8nl}1xkGu%n?^Mj|x4-5e zp%;YzZ%B*u70R7{yalIroQ;x`hkp;Jki)xKF|9A@^Q}XW=Xh;G=JPG zmF>S9hcu)AyKzW~)c=_y$IB!F+1uXT9JQHG7>L_4qM0qxzkp7=w2al-o>tO8>!hUN zH{AXwiVb$V?()rnr6D_ko-`_n!X`Nz?F&9@dV}VH9hSFdm@V%zJ8yOc#otd+S+xjb z%3YyxB08n%Z4eh{eyW`=?ltQ-Gd{bTF;eBX6kRb^u0LQ~XX_rHo!82WcxxLNINwO&81&LqOof?YLere4(4|*)U(-HW!wfXhi5xrzRRid)hUFP^D zFUcL~u>5uZ5q^1xb(-O9m1NwC`RQTiaazK2u@Xod0(1$kIDb>Wd7M0_sHa&BP{EB-}(=W3phc4fEHdC~D$LNA> 
z`E1eBoQOd0ehK#*bvBrQ?^9#NcZ#fyD?1RFqU+{Van<*$f`wxjH~6Z@2BSCc1Q)o} zl&MUa#bp)x8jqJtR9Wx0dGK%}Z-nvWIn#%M31U6V7{j)ytp2DO8}?;S!7A*{G&QmQ zqZ|KMdtVw(b@cs}(3zRpqofB*a3`{KU2&wXw$^qh0P!`^GJz1G_6v-W2fpDb0{uF6>|HnJ59 zlFJaL82%0%Z<@n&0cY2+jND52>$>@p`V`%^k~GaU)uEvilI}N74?aD*9v#Ye>3TH> z$|feXYw?>1<2VT5aL3}g(!w@e9ZjMsp3~MLkxP8^H5IGtmo!2Z42=Yd_-#skz4VXUd^oN<&-8%-x2nV&gGSqJB z#60TydeiFm%?HuDa$GAKa%mhmpMTPyUf7%N;D9Q{*|2maQ1zCr}>z~d`pLb>*HfDa@(I;b&dt)+T?3!=JrfF5PxFu60=nUZe3>4U^?FO3dT1?MCC<+s&IK z=6i1U=~Q*GxQ!$w))baGJ&`ofA2H^B-feVo8>@JD7CzK}WIlT?N2$kpBIkWCcQgn80VcD|ae{-nNYExYT zCFivhDd-XHN!HT!;b0yu6;s()d0t64kM1QCFUlgq#K|@8;<=Z-tyz5=%Z#ciwWCVL z9M};zMrjHACmWKsdYsy^=aw$Iua$OOtB7yE5j|e;u42wQ{8BPLU2yU#mPYRC6cGM2 zT<S=DY4l>Ml2_>N@?@@DLn~+;@H1Oww`tK zI@&NOM?JNWshWFV446eE3FQ2&#GJOY_7YVn4#TaCA&Ha>yTFb6UA%Yo7mWdbhzP zM_vuNL&-_Gi}!C57=?vp;Nzai93m2N62a`Hzb5lG?MVwWd*!%;v@S!67q>VqcEBZK zQcVjX^0D_O$p%QFpD`dc`%3xDE^M|>nHt39mwiPi4p6+m3#iR)=v#@x|3d}($WNf! zss;FC%l@BNrs<7ur3*#rZ#;qr`YvAPdgd6GDZ@4PQq_DY+De`?-{H92`4b8XCH|J9 zM_R7Q`=Q>`+^`BudYac`S>3Lg@v&+FUVE#Wc+^~mP<(#7-pm;{SJ|eQ zCPBZCw6d0eTD&G5tBxUSk|QIhWRPMTprm#>ZL9Etfr6UNTZ#cb-Os)(5ge*qZLVA* zvv~~}xiqR5xfnjFvPq?eQOv4KK2zpw+B1pH&HUkD|;(*UXZdSwvWu` zD-|`&sUMe#?`M#D#Rl)X@4I*&X)wHQkwYqVfaL`0DDnaMh8^F3VzX6JWIjnW420uP zqj!*gl%j_INoRk)x@WUHK8}0X@Cs28Mu&-yVhf?6@I}AK4yY$5{_ai9K%g{DIKV-A z7o{|`jiK%X8>!nnp(*v+ayNZQuO2xH(m66(d2&-$48VV#pm;L!g#tUVZBozR_0K39 zOo%^ANkLQjNKGZDTm|v`Zh96+`tcPac7r`z-L8~`j~jqzk_Lj>>Fy5FmH(}qWTOB4 zNWK9tUCmA6Bt7Y*l(+IDElSd37^alGm8&JMxJi=+;O|g_a#(;k6kh+jKoU=-;vYR= zkowOC(K4{%`x*a?{eI*vp4=n z@tqZ36+nFC&qx|1P~-d1ZW;=EQPMXb_)h^WGu~6G)r=Cw_SHm?`U9i#h?apcrMRg` za9;~0FnJ~dbwJ&F)Si<^mQTo4m*pa*6$!&T{G?*s7{1FD_UeFnZ`ruNcK1#h} zprDsnd?kFlmzU1v*56(}MyA29>K@WJ%G5r4j$OJ{^>CNlaKa;6JLV{og2Gq~LNBV^ zw-j1o-(7%p^F7zxbc$rQtO3l?t1u4)B`3CKvNSbtw5pD_XNLKSp54+|GM^7HbbDt8 zZ2xGU5nzZVKv#oH2ZpC)pgRH408z*_HGs}eUHqdT{xb0R-Dhn{%H;EvGAAd;v0kt` z)p`6|FEb`mz%WZAT{8`+TuT5S%`^T;+bQ(Vd*9PB5cc;5if+v$28#LQ^mh+-q?^_} z{$aT&Aj0`p 
zWM>j85z|N#IhoH@CN_M{V4ImO@exUnD}ihWL$iqO^N+`zZvlcL^i_C82Zr1r zX&M>DI~X~d-e^H%;)^Iavw_R`c46bMh8P?+_X_8oE1$hduVC8+Ca~p>%7MK^1$=SR zC|AE;SzqtGp|rYWXuC0SH*%Dv2> zHQgacNb+I3czZK|Nrli@p zC}w(l5^c5km1=YE%Dd49#yFK6GT77EdGXI9?4g(aIMO4mzbape;2K^a%G9i z3Sxjr}?Y0Y!9e-@ttL^xxIjl zSKV|aq@}#Q`}>E%aL%26>B%9$o11}sR7Vy?e%L%%4Zjvz>_!>8s_k{=VW_k%n@MpnKW1_Q36c0tVfKg<)REKy#wYrpN^sC>uQQuvQL;? z7j_;I)84jy=a=?OT@5G>`ObcN+LtOdCG6ABS4~45KHqVZ<;;dlXd@{$i=(`);K+ZF^GNv6dUfv0& z8X#+1fK6p#F8uZ^H*Y$EknEHImkS(Yz&6pttpIqJ)t+EnlnU#`w>{OQc z8B(Bh1X^(yUj!mKt;rS{jb_!c)J_z$>`8Z0@vRC+Jr9XRyP++zsiE>2 zF(p1^VCyMwVGnRqQ18b{i6sP~lHVFulrPBg@Pz2SzO?K1LeMxKi!td8Gv`eF+V;W3z82-&u`X%6K zp>?O~6J~+Natts7^=e@=02C8DbF=>3*rG;bkGPV=QW7w531{k;YI4Rp?oHpPN=_K@ zb?ooSOH0|;n{WP$IH^DINTu$>99}?d)+1`L%vuP^2k=)h$J%{5$gP!i_!9O0%ibWY zAGJkK(o(|guGIRHgk=xHV(0utPI?lDlz8mdm#62r2_vS2+S39=+Zv$gt(-D!J#vsJ zAZZ$9r4(hogHD2Tj;Afpzt~B%stSlJJtRa=G^K1_NLv?nCB!M-?TC9p(nNoVTiD&Y zpKQvn+&fjc0ZI_wRh*hUStQ3MER&6zRc3 z-s7#HD1+1)FftK5TJ6AxmI55_FO)gYj~(3D55imBJ0yAm1mM2*LYyd`#}Eo1Pprm) zTB4!#U-(>AIj)MwCY?$dmjMMXxKdR~920qW_&LvzeY-yyMGX^_sfe6~(-0csB@LmX zOd0CG5VOAqG-#Gk(J>$IGWJyktnXZqVdorFTeXaJ=Hg6roN?oWj|-v<+!WwYgrZ1( z{kO1GNCkku1Zu;|Sc6$54d|G7zW~8r=8hwGrEu;Gnn?5^ZW9+J4Fi|NAo(%)P24}j zz;Fjz=7~=XM!Nuv({o);R&Gy=%fXAj2#B@@e@V5K)^oxdK*>GuI(Q znA0%9JXuBW>HK*Y6a3_0So?KGWy~-CYgH=gA=r@IKe7$1%B4zLHXuU_MQXRyGqeS7 z8$d}#Fmh}I#xrP5$5zORm(s1@kGX7=nO`pzIHw4}fUgPB6E0R1irV6!WMX9PUSGL~ z!7UTR`eiseB1a}x%fM9@%B^Y-T;bRoLpJAV>`;qOhw)0|sVgImR}iR#{YzI`3TjFz zqpkuQG%!51<(ex!J$(MO`-j#PB`qLG7XrrC+vbY435`h#VW!pC48-|yzPxk4P^S9* zOCV2^gG#6JL&Db2dxk_O%U*wc!fXX8%gzKvP0h$4G$EBtUhtwM*!O{yn^Vh)af(yz zfqO`^N)deeNZz49D)8yt*_LhG5Zg6~(zd;p3m(HF&JC$)`_&jbib~-g9@T;a3sje; ze}d7kKhuN!@#4!6CIDe$X5Oc>K zC<4nKb$aCuF(SM7beP8OK{$R$(Aby1cyKRLl}G3*#K3VBui%7>y@cZhrWWol9|IVX zfW1@HtQ2cc=YmYmx~Om;6(vrtdNJa<&nFOxS%-g`*;BJAYy0r&7NmNt%=bp6C1wH- zvcVqH^e~h0H!-#?(5hjNwc2uZvkk1RZ0NXuWd5*XEN9$ZY2eK(ZO_sl?V0IypEO&r zFl@>}i)r$9(h5pKg~;Iw?lI%!r8#Horrr}>7}(#+6KP*W0bOHXjPm23PN4BYg3K(fvS)6u5#R}otA*x4$fcYCujX`Msk 
znK~zL$xZNndGgQ)tZcj6^7-q8k;OCxL!!0w)dF#sSvA$~kzzBYaK>AGd3E7vj1wWn z>Rn$DN;GGOacS<@6?3Lpkj>m^*xNl|$Tge(zQgmk4)oaqBE^2AkCR-fj31ph|?g$~VXw0<>wlHGVx@ z{}e=537+B?|8)tK0$>O#j*pNcuEP|~7(>*dUl#=s2V}?$rr2ft1u5m>2{G&<-(@di zC(rpn?Dum)Xq z!CcP*!xa~1)DoYX0BoBLUGMA$L*ttK3kvD8ky^DX^&ssfh$U%T{CY9SbDTzh>RIx%|la-IXy4bX>-X;qzSz5~07)f$9OOpRxm| zyzqLh`DK|N!^dX{BSbI0nWVUrN|vyDk`gq7;kDFR{0njS0${Go5t|G!Sb){x^2e+&+&S9p3L)Q5r+5q0-ZA{9>pWjwH!F62Qt(02HVdg5EL$|=?-ET#_b@NIO_ zxe=q#(s$_!W-ibdePx3~e9dr2*1>$8Z{;c_edAI!nEV*_wsKYOU*!e1qDBUe9=gi- z)^YVE3SM@AK@) znNS{^!fO$^$Gi-cQ`MUNWNbJtIk#xvak6~<`P?rt{vNSTRD`{#r?uRda$G2bRr`ab zCXqt~u`b~pnUp}58He#CDz#$^vCEG>>&2H}Snj40`z=fu`M^`{K~jp47(_-?L>VO} z1c|?aGT+`-h)E6&s1@`2Q#ALWo3ZgwI@oL8ka=fX9JjIrzNS#Q`hD1r4uuVoD{l3% z`}35Cb`)+ivotSMtBzevcZk4SX3)WL(Chb1dUH0`^17M_EX#Q}q^>VDeO5^`o0w$9 zy&pI2WcQdgJ$^Pv_}lo-Qr&M!W1>1;^qN-6z)Ht=vmj&jNc z7K<*o!Y2)=nnPYsO^-H^Enq$d8_GkK-@61w)()k%l9t4Sp?yk=v9mSt`qTpyQCVkt zvSKpiv(gAGw7YUYhE&s%mE zVV4&sF+eihaza;SoQd`5n%LRLxrtH}TjqLetvzt~Vd-GbX zqiWJ+s8I1piQqZbMG5Td&u}K3pX3~LDk-PC($waM=9j;Y`<}m{Yb+`>P3!b&o!evn zCC`B|%XaNl6WK6`l7umhIDb1u1G)>mMK9(;=%go}t`5FkS3MIY^UXF&l277pO5?H8 zZaWMR-k@Fm&`h*#rM)OC`+TnqS(#Yu(`0vjJ98e}dymt3)}GKMF;$EQ>@8iN7WFhM zy=QADmpP{0IfS{meD5=ASMK||=2-MVH%(1)7JraX7jCveZ$!aGj2Ck=K{5GT)}7og z4M(5c+~17{cyV*OPg#K%mR?`pS$fsU34BwwSLi{0q9Y=QqykMPyWmAw*31wDzVhDHj-}? 
zmH*`7dEw*?{S^;bFYj{7NSzAw33WTqTH!R9OXy4p5-l*xci|{?6F*;}XC~yjUe}g#KJ`ry zjB-MwldW6`3ioZ4(AGWR{7xG?bYB`3vk@|ROg6-Z`5B{Q0ei#wV8cv%*(mefPSv0E zumibzxM}n58Rue*gZ_7CD%>!|8=2A3!PVu6Io8Z=d5wOe&Bl0V-iC5tGphIj3z!gc zPu)rOf(^kkdg(VO1bEy73DTaOK#l$ic%Jz>lwLHsyF7uURM;JH)UooJ9{E>$ZRX9m zMU7_zP$tWnY*P=cj>&~^lnz|>vtVLy@|=n^r_8*jl%)k!=xDD&-#9i$c7n3tJ5vb_L83a9j<<4^5?D2A+Y0SGKD8)Xe0Tm@#Ku*4 z({MH?Rz^6e*8GUxY;nN~Aqwn3w`pfJ+gjz3fej*tSulk$*AVLR%H1eU8D%E}3YZgT{SclW7$(Pe^hQgE&vPU*Q8#6P+p zFm7m1hA!qeV6YOYo~_+g%%i8Xuoekyd#&@s`5+@+(aA7wdCjTV(9H17a?AMVn#`X& zof7l!BzCkp_(jxd+Q)OVYJF*?UoZBPtLmOeIXE$3=f$F5)bntNUTIkgno~3Nhpu=m`vF_klDue1&DX!aWPfaW#H!VR>bH+q zC_JX-*Yh;2qe!VJaQ{-C5IUE!5(9z#K9zYEt@v^eci7;j)Kz%9 zV;7%o$D4)Qc#8@>bxNhzb&kxCJ}bd60BVk`AlRP6-S6~7EDO~r-~RimmlPm(~OWihY|^LN(%l{zyS|V zGQi_Yi|Q-&G}Uj|Dv!94LQTCBd0ILpGAph2Wjkw;l-1(x%fVaGERw#zMuZn%`BU0Y z;Lof6%#(}hOjEnqKGMxM=h))vbby6ujo-n7kY;{a&u@Z4JCbZiaxT|z-z4Viq;jC4&-fZ=U}!Qou{k(CWfXGADxEo@8{ZpiYg_s@ z)?IMGZCwI67$Ml^V)#kStag9CO_wGd@|m7-m!I$Oe@p29FW!-u&owRMvZnsjmG0Bt)iod{##b za0pnxT?F`wC6cNF5a!TX6&?`x3D?w)!%_^Qng3g-^ zt{I}?VCw-4x5Y;G06hiuaSb-ifEH8xOE}E4SDQBpD>W2&rg19hI&yAB7tZ8kI2?lN z4loDM-|A$(yak&VSISJ(&&!~?dU63Rf^%r)0RU8q;11n*a@(+HdDL)&&^Ctc^I9I^ z6zE3IZw<42U$wgLRa}r)fuAmpops+2rqTMsf!##Yc#s?PD1ojn5Aqv zKqjCj{Ry&J(EARo^$t5OD>dj!sY< zj(kZ2Sr|ofItwtDL#RhN$|uZOge=3Ev%hL(@n&0e6=X73YUc{*bz-@p)3Ooco^<{h zzZ4CsF>o-7>*N(}&D<1NTYW2bNFlUmj`moII*s*0veebdEwLehL-)7o^(ji2Ph~u8EWK-J4G}1|RiU_OTVoe%%veYqJV>F*N#Ro}jQdWrxUHgxe+vNaF@Z8Ry&@+BxuMCg^F9pN zjNPQR0~%sl$$omXOu%P$4>;spv9c;nfP8v)abq(B05WCeLYEv8YRGwR?F3UJIVf+ zu5E*PdfI5u3kpO4Qf}1XY{lD;YP^P)y&ZHVt-1Okr1)zgw;cShPMER&pNU9IoLGawW9Ul|?!!>HUmm4}`5r zHh_$R90^mVeh4}eZzpLc5^hmrV0!$8>Ne9`#6MErU^ytt?(PH}^Pd|K`}SW)h!i(% z-o;=93llPdBQiw9ZZn`G{_&ZD8jF9jvH#pfsSRUU`K0;_sc_@(U2J6_)W11d(ok;R k`v1E`oKpXHjM4^uvg|Wzhv~U&GWe$;t9UU>+UWlO0Byu12><{9 literal 0 HcmV?d00001 diff --git a/docs/assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png 
b/docs/assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png new file mode 100644 index 0000000000000000000000000000000000000000..94364e593fe68cdf46af8412a2afc7ba6eb33aea GIT binary patch literal 130810 zcmeEv1zeO__dg;kSd@rjfB~pTNTRFomkT4=f9!cD9jNF{&C_8&b9w|m{ZX+8T zc2f%@Gg~7oJ9cZBJ-7t!8=1kZ@ePzvmn@J-BSvl+K6XxU>zI;}iG{U2%FdjT=QQ|y z#@Zf+1b>6e;OE)%;D;9YpNo^7ix1br){&9t6c@K3JLhq5Ll$O&K!M(%Cg2YjxFlm? z3xlJa!Nt{9xG6ZoZ0#&i*0@v4ZrGU^Az`@7;9+RC#wc4;m@U2~m?JkMk2E9a zDR2k+!y}1bANXKxWQBiJ1uT&b|Gb4M?pdufW^7z%q%E|irR0pH&zqc)HP!yHH$tCQ z&a$>fHs(qw)790RI^$==&BKeEl&K5uihz(1?t+;uVI2Ih(1r6Bmk4WFoj5nAnS+HX z%nsih?n!$T3Tba)^Zi8=l(jX?1iwl69*t~mQBL1CgQJl6)#2ONfQ5h8J9OdvPh<*R z)wD3R2fGZZi69?-deC(_n1z`+VPvq2xLH~m5gOw!*qIxdqMUHguU=ZaD>yk4q-l#n zf&SKP+S=QMMpn*}FeIdE1Ucfz0ebV>pZzKg+pn7bd%82XG)4%T$go+T5;%`gmvXQ_ zr^d&|gA}_4Z_TYQu6~qe^ZwOpm+BJWuP(p500GHbP2&yL$Q?-U4E9`db@C6bgXtDhOM`?Co9fe%Z*u9tGO1!C>Gm zY+bZKQ%-h>-mHGqhCUKLNI~%T?Gprk(5Ih)S8h%_l!NUm-h&qFE)x$u_C~g5fS#?| z-@5JayaSQ}uGRl9e%r#3M)nqtYkL8V@UMX4b=SEb6!Rm2WKNq}I5Kkcn?b*VzgI6o zt`Xq&Hyz=>=m1fTRiDXbx5~aCqR7Q(1NIGi41b@{k(7!dxR2`$I2Bxf-;V(_2QyVdNrM$wSlc*2OFd-*_73bH%GwUl?cX$U$isb; z6gS{+HjokkPGE#Y!jLHFkN`%tf!SIBU%$$@zq_aM?Z$tmGR`mp`Gp9{kJQG9lM@by zuK#LkBf!ZiDG8qVSwjN{itpLZFEfK*qBa5qw}z)SU)|gq6y*|HkJ&)ILSp{=m=m6) zuG#m0C5qzF@js#{{*d{vM^U^yzN0~Z5M-WnFtWC{xU`BvKZ`&=V9ob_`8S?}IE{q3 z1bz!xa7zjB1LE^1fd%2{j0X!|0+U(`7VA;2uP5uj4_M$2aO-Hzf5YA3d@qPN8JgNK za!Uf(RQR1P4LAVE`LE$~Mo0@YD9B_2m>lHMPD5_O0)#_O;cr-3n3_VT^>sbk_iFoR zxE?`*rr@=Shwyo=Hu0?EdI;D45@nJP7c{OK3c;w&idGH=( zt>;-+9QcFjzuxoU<*^Qi{BC}H3W?+f{{x_qvaqu;vNtjRpa0<&;K!pj9x?a`ak;gK zK_IkikNkvoe+go&d-M~;A?!M;^WSnm7lAJQ;C~=LL{KYZTkzX=2hP9zj0ibRLtKvJ zyQ#CYv;S7-iQ|Loh<4p&f7k8@diO8zyFaivK7v18YefW$S!+cE%li9R5k4RFCt1-c zoC$s7Z+~<9S!+DFN&nAse#ofeLL)!@2_;j0ng{xtI>h@Qeu9;(wH%NuUX6>awH$(l z{e3KlaB};rC@VOHd{+Vf||7)7`BU2LkCo0mqQBgbt{EHCj&nxo4hql&6 ziPvS0gpK_ZIsC7o)qgxi=KfW|J~fn$0^t3cxavVr{a|B*1lb#?{DBao1KjI2tvyPTY$cp19$b9IA1yYGKwb)djc?30U- 
z1zKyLgfPP2+aL3+_ZG`fGJ_!Nh*~W_{UKTUwf^*%DA@lBO@Yd;n>!Ks3A5u`x7L(b zfZ!3lN#sP7 z*THY8$F)ib^#FO2zd10jSE~7g1B6+do`T*whf1iCMs{`~Sy8 zadPv5hR|niC~wHl#l0G{2cN&C4xtZ_7$N5fwOsRl(eHex4!^M9e>7XOse}TtwR=jq z_N!m1`!$O9P5pkXmmuIBUiZFMdJ=r?|93j+XNTnfaY#bokuc;xwnT&H=i|@6yH5G6 zy*YT-fSXfV1^iGovPReu2nPsOf6qdHFt^{sH`j}8e{Y_D265ygXvmLBy2g|UO0}LT z5ftmMk5u!lS4jE8e*G=s@!tk}{CHEvYZN~xFFPTNK+u-|KJu+M>OU6z2;r*l=;!}b z8{p%g?9qRCcY~ix=+97Xemp-S7}PqW{&Q6us4=9;f1qlA{YnUkr$QgJS1Ik+?}zxt zXrY^5DecdC+kY88^Yi0RJ9u^C;ua7hge(7h_$)vm5x+3%KUay@3(EgKeEy#j15m*K zS*I4A|EEb;Zh~I_7nS%|jQY=2;y*VTAY+TN0{e^l^=GfIiGg*(!0TN4&9w;shQr+V zbw^xZZDP%7j$kHhPji32h$}CCC4X|IAqZ>X0%EId0ephE!!L)z{)@cf&xup<^YgOv z2z;+Oe_X}t&&R63u>WDue=e4;7pnbj2`g^ipZN+bLTt(y{IdG4vx+SYs+>7xZ7O2{ zzF!Uc5-o1d(@2!bcX6z*%?Z!{H)C7>=v*on=MP+!mjI7zP?U#oZ7qxv&gldO@}~yH z)~(mZ`}98%#7^^HBR7P_uDFHcx=|aDQ@2xXba)7?O1NV9L zTco~zQc{9WH@G|M)@%F04f310*Y|<%^+Mxol2||MasR;NwE*{zRbv7Ki(i8RAOMA% z%vu!SBGg&@ePCcccl5vWDcCwZaICjezbp60|I?yFZh}`K4EDzd!3BQBsQ&~`wLYW) zg@tr>9ZdVR89;8X-^i)faHZeOpT0pFbbk07KpO84sXkr^!6AvR0W>bcwY7joI583w z_phflgctUIk1lZkT3z_Ys=r1c{(Z^--lYELYQxV*u70vnzZxoj?)ar-{Gosl}F=4>z$y{VX{WwwfT$CVtpy$pW0mhx#vocYCybh%=>XGWfl zxea8;q_(j!#qA-Ddu(s=X0o?Ts2;Ybz62X(cUI9YKX&1{Ykh4@ZW$uhZcqR*nv1D( zsxzm!Ohm#!w1HHDh?wxJ8s*~~T5$d~SMa~?X1c5em-*@S(gXp21-aYG3Vr0BTf@QJ4-%`@8Nbr4?T zq=JFCl472-lS3j zeG0IXrypP3Ny0#(P0WnsdQn-{*wInAVou?6!Yk8Cw(XoieqQRVag0ohY}+QBI3!qk zwlENRNl~`NV_MOUYEFoO<%g#jl1OB7<~yw25{qWEqFw0J>5XXdTa;P5A54=<@>#lf zNk}SerpuF;R4VY9TO=yBXyvi@0qOAz)VQV>Hf$*8&fY*wA-8?Q2_xppVBdqJ!5P$} z2Aj96j!g!3O!*Ga-mN4}%9rKaEMhlI>=6&n=zGtc;*HyOhLG@$Kyb#SDr6#DX^PVl zkG-SKvD|{I`XxsJcH~ObD;0)2j>Kv&loz)9RDF5keSloti1CNHg5G^E!s51^W*m_! 
z3VnVGQW1$OKrg62Dqbh{;-$V%4f9K{q`W+bLERpGpeY2ZI^+ zF49NAV0z8NbI!LS<=ZAEY0;-~20&3$3#802O9J-9l{Uec2y}BTa_`Yq(H!*w8g{DQ z^caO43z%MF%hUv&01$mYN6NMDL_b0XL_f=}SV06tzg2)o?nlvUK+{Ve?BlaOVN_LP zsI~1owQ7STJE9-vw*hoRV-3HvDw;bCKv7AA?mduTxCN#cEt6h_s#SY&JGLPMk0}66 zcLI3Gge0p~#KBhyHij<^e*f?FxD_;c{(vgGVler{-V8b%CJ`pj2qxcF$gX}JOkRBq zp7G=4|JuiuNSX}D7Al3{GyB*_CSHk0E-gN3?_&;ATH>4PQALDDPqfGJn4rhu;_jxi zpSa+CWeunSd+qL#l;`1g2L;=Z@7(92V-p@e@s4If@o3=mf)1Eu)5+J)40ouAEz-T+ zrxtHMl=Tdr%ZD*%?;hn@S?Lta#w>*0nyXA%kbyIona#H9WoP!ndkj-jwK44BEA#?y zN8)IvB6(Me+mYv)vsy|$dF+LJ(clcmkd-h~$BtfN z?RyV8=i#!V7^x{L_|+9N$13Y><&zotb!nY$&x0B>tIDj=&Rp3wt^5o9Q(cM8!PIpT z`8E4jT2Io##zsiyzjQU6Z4Lzbd@p}53(ih%0Xwlc2s=q3N9L8I`J$dFq<@|!pBsaf zXS(Z_&bc)Zz<^+rY3?Gkfu?IP>F=h!YrE9g4WXJ0*N`{B}^_Ac$cUdu1$;2Z`) zHSI$p*X7%Wri6~Hx;3wlz?U&sh2A6~escKEtbALGFHGsmR5#tnoi&j&y>ER7%Ch|X zgY&pVW~H-Tl+|L#;+lCpYpmv{U0cW|+3I41g_^Mkm$oVN*N<9Gi2DKa#~wPe>O;Jy zseqeo#opWxYzXR_&$qvCb69FlZoB(Jgc98xHUbvPF4`Y@vnI00$F4afd}Zas@oW*N z2CEaSJ6du52W+&%Fe}NPBNI{Tfv4a!nHN~`BbT5F35HFO zF6pwTond7e)wx!Zv%BD2_D!vrrNs8?09wV{dz*QJvkeOcv1HjVufaQG&G>s$Lzh1- z*ScWngXXYHB6SDF4r5E3Y>omX`|7FB08XWx<+OJn36(sw%?PR*gK4Lr_NZ|9icax7 zvEenvTiZN25xt^Eimxh3zgV`R9B5u5*N9rkuEeE);c@DQoRkHKNj}-c(}tGpy2ET1y=$J zj#@DiB~d*Ct-ODF-=OJM#GKL#mo6u`_|utJ>H1duJ?iBex|VF&JYu>O7VP#f^thgn z_Kx^n(r>dn)$3tcBWUCip-V1`!72_5ut@?RQ%i#LYZAqPlk|;>r`(V@LH4MG$}uy# z$7mC+mt#u`!qTuT$XBh|mKk7Im-a=Rw~%1)13dQJ^j3c0Ia{azq)#wc4OB&1Iat;$ zB{k)e@URI01FgSBExYPkiAdrU!3tpG7xs}bWRL;}!*l$D1en3a&s6+)_hbn%=Iao~ zABHf#fF-w5l}scz&FtNeFu4Ol+07hPo;Lx?mXYt>@SSrvmInQ1+G<+fil=h4)NgpU z>OC1wfeDk@GD&&?`lnV%<#;}yujwvJ#I41n9XYVU+TJ)Tle?)8sT}W?)U>TqJO(!K zgvA7JpSJ`tsni3iSugUi)a=mqCsuX*qLs+On&d4Dt7fd`@i}roEUz^cj-|gao0*M` zqlAwZhAYiCjuemici7caWGg3!DII`0I<|I=lw9)D&PKP;FFZR1-#BhK?LJDrRKt+X z(pobr>>!$yyRShN5wXKH*^Qk1vA4cv`b9C^>KN$2bQ(Wz)Bq9^qIhi2I1-QM7U`A_ zDsz54l2!^CQ68`*JG00ou8WNuia&Ty>k15TW*nx^auLri6$le`e%yWo8>T-jWs_VV z?yomGdw%)iyDQrqb=pUw(s{~O#GcSsWS?coaJB_H*EMjuPC8^L7*t00thrk}d#4J3$tdW?`*K8>_!>z 
zFN|kyo7Y8&XV(P?=(@d9eB>14Z;x%WZ>*kNoP#%4g+;f(o)-*yqR>Wjwd2pLCw2Uy z3@@PJ$Fn!@=%yPzTNfkP7_Z_PBcu`7f4Y2<3?+DdEF$2k+JyVKeXGGC>LG&~X%AiD zg(^3b*-wq(eDA3ylE*I0PMN`3P?1LbmD0~woC@3XH*z~h4c8x73T~IY!LjF_!MzfB zk=_B;>;ZO|$7>EaYPw-DsPG(F(Od5!dYrgb4g#r6+6g)0W z8ru?#IL4ZiJ?Gm}7OukukVgD(eKq2PW8cpQ5{2B49VN+V%Fyg>AE{2{8S3vX{#c$+ zG+?XT#zL2}WmDnpa@&`cY3ZrE9Y$iu9=tdvP>CFc3+ya$vPf5yUud%4Zb=s@RfIl; z@hZQjEJZ)*)81k2{xkwDggNvqYh`fR0TaWad*sOF^zbn)K_j<{!tlOn70L9a&oR0@ z?y>ETgAuvlE6uJR1;&QW5DDAC%7?YTV` zM$0|Eu&E#oGwPkA8`>1icYo-ho&M`vlm&(OS`mGJ>uHhE0S+2)pt$Y^wQFD-KG*J-QJXQ*@}f(arFanuM>q& zjiztxY4Cl&W2b={D<+IZ?@*RD`)M~FS6as9L#WZT1wk-FRzvaTlv3*CB|5gJ z2_q+m+ZDHdsq0)iplzM?dERX0lQ@&Qp(0D9me@swp;hIwe z-2%3lNBVEIo($)Y?h?&ZWE|-bui3u=x3@mqz~1VeycSI%C*&os`9f?!I$J$TE=Xcw zxZoy_t#)>l5-W>NSBg5*r6i__A}@tegXhwIp`QX>tiFiA&qjZcN>zx~SeOl#W_~7; zrqvqgxea?EBA+CuDgR|4%-7Cy{|WhoT`*r|>Blt3XDhNuMioO}yJclJqU5n+(`DhG z^5VL#rD+3#cr_29lr2b7Ve82pbE7OWQwk9l;7HC?-p3xAa8UWB#_OW60B7-RrsA$Z z0CZlW%O9wWj~r+%>OH6JMBN#uVQXXY2-Umm>Fn{M@$ zNj)C&F8H&0bA)+3+>E;B#O8#OxTV?ZK=^ofW`0kNrfz}#(Fw4amr1?ynOiueTL4%YKf zz-7=*X1kXb)nije31yM7L>?2(<^(BrDS8N`2=C{QooE)yD1|_7Z4L0jX z%pO!Oo^?d{n5~4juV}Z#hw5{f7y`>_rMcBI|4xX#dw#p-8-<}xfZx`wPaWPWQ?L{+ z#w(PB_>7N^s0C*F>mB0s_3*6AYRbyyOZBhMOTTz1u)nOUr{gr{e zhdcwbmV|%KQ+As^L5YYPc&jL(Zv+HPZ%bVZwSs%kPh*{pWgeKFKe-j z+ogfi{w|68cY`xvX-Q`c2lDjkM30_-rEP`Czt&hE8oB96?E41}sx1i33n7|dV%Ap|!FKA2+{BqW9@#IL^xOm;j8{t#T^c546wR6ZgzK!x4rytc%M#(&BQZMuU~##Ez5$_ibY}otZX2E3a#IcxX8Srrxj7Z^b7nwddXJ zb&pQzuoiT8l_DaC@eX5Bg_K2?q<;V8zWqGXlffOSIanus`^83+r2K+C<0@oDdv*xUi=wf+UOf7o|h`7($+S>+dgX-iC+b699_OO^`SOC~kpFdnCh1NA{))`;s?} zS-e5$5%<$Tlvfx!NcYnsKo|CkPt(oMuE8tLPBR7Z|*-atcNTPc-QS8;%3n1klRj zgQjuHKsFBKM)V}A$_yuhGyI3Hvn|6CP8&_FaEqAj&3;lg&{LEr*CbYFShVERk~OLx z<%-%y|0FdoM5h_qrW?-JE>g5~GhgiG2>L+X^;hjR~ zguzMm+~npwur>;CTJ@78%GBF`dzyFyX^@XZ&9!NL?) 
zx7<9Uhb1~>eG42truNJdjvhcLY0;FjV0@aR6uI(k4$_&mh44i8;`^Z!FKgNL4q**m z8fy>L+=>Wa;uph;+0xVqTTkq)7!skyI2OD>CO6j}6s{e)T4X15M7RoUig4*OD*V|h z6M&z;^j#02=yxgPbQnk6$2dDb*<_QC^XZzU-+WU7H_4_P=XjPsCI;^Wj56t-|G~2Y z3j?t1LH%iMi$+AjW7NyaaQ7SZqLnk5`Pzf(wFafUQ_JaBquT737L;1g56`}oHEk+! zDlA7{6N6iY*j9PGS552XUx@W*LS4*i=Qj)ILm~M4X|o09&C*JqKZl=guFQxDl5b;_RH%YB5iHjaoU%2yi_(JFhV7v{ zIr43qO7)}_3a;U!??^7D>+_sblRn>qZtEWT%v?KAUr%c8+kn=u^;mvO*6CEs?--YV zW?MGOkjr&IG27kR>4U?8Y@R625_`3Tfb-Gs)cx|}UR<)OSIqV-Ae{zu`$3n;5mnFS zu6VnZSdq$TfoHxWzRHiRozk>*Ly1LS$NQFD)MrvJ4xoOmBfI#r2u{0Q;QUctVpkB* zVzCaU9P{Xy9a5&K+0EzMw=?a6W;Pn`G3}PFTg!iO^^kqfgBvqOk~#0X=$xHzT^r@p zUtTMeC@gcaqb5SWt-vipXfqz!p!A9J+&09aJHo_glDtcLmf>5LtJK{b8+&W?bDUC3 z;WxzKvx7l5pQ%r^^03e?r1^hH(iiTKwh$?td6nALVaj~>Dk8+)>k?8l+r)rZ7bcR; ztiORv`g*=tw{#?jz&_{L_Kko}&1%A@229gZ2WVTG=1s|ZK1cL-r|UC>FlGsephx<) z)QvgyspzgSWH0G^>(QieI5on1cY7Yl^ZvYvy{@=|Dx9w+(IZIPa4E(Ag2H_k)l|9c za%^+G1|JP8ru2b!0s0IrW}`IYh;iT}db~2*17&{Lw~f_{h~zZMT@A8@a)G*#-Y(>P zo1&$|RT(&DEOPI(`2FH}3@BSI@<;=@&Yf(Q;|z=vXvl9iq)Fb`dvQfWz}gO=nx|wjqlwV*I&R z0*?z0bjBUaFS|Z}l-b)A8R6@O5Y1{TX-O#9LL@F|#JZDS`1y!{uUWhYOT}WhZr?;8 z2u_?_^bW@Bq#KBIOfgnYfWuNd>Cq=tdEUcivcmT4#3#)viM?m^%X5wH)oQ5ZBTO@o>LC3XL~;)>pl~cq?B(SOWEybjp)uqlMfya z$rlyLPJJ89&OMSY9^zAWDI&XODe6O0!qpeu`8{WsW;f-axzk5ILhLX(!Nh3^4h;!$L3&Q5l725F0sp?n|Z}fym0a6>}D{=RPQF2DeR`VSm9V0tw zH_S$=1~$xfjHDN^Z|anGrP&eUGcq2R`LbKsh%ep<5k2Lbwns}T5Ox5g88o|qJ(9C; z{|D8W?v78dlrv(BkOTVC{tG>KBDD-!X{eenSz6h}^CI}jdL>sPwa>a=OP%XF_^2N8Tjnm9dY1-mTLupmp?}B zbyWRDX`AocJOz%3O4gud;eNCF>RtIA9ZSi_CTY(`uoW1?Z)K_8WprDTOHH!Cd@(Vo z-qq3+0pp9;Kt$V=%!gZ|nTw{a_|(hA`gpR%nF@?`lo|4eN1CFeFKf`vn&*dTOBV-G zl`gr2^OZLg%1uTn?_bzdI#4+M`C)8>&n4;SvgEJSW;7nnKbPcr@;28OCVi{5t!|{K zm3yeQMzE;Wk>>eQb>4^?A5wjT_{B_arL4nxARsG!S&kO3FQg!BD*NVm7|8}wQ_`dH zRC)BGXkmTnfR|SLG+Qn)=g-`e`gkkt#k3c>sW+me?0nvh!V_q6VWVWy5qMSj=zBjW z3;k?A+Zi#g^SsHi)FmDw%MRHw)~=7S-5Sar+^MG8)6w#bR3uB%_N>y>bjSNQ=ssn&hi5u+S0hB5b zQmR3JC5)H456jb)tX|oScbbQm*fED�w5zSVk*$9JXu<*yf>Wf9ZqIh$yC`=4lY7 
zEJ|g$LON+%d(W9b+tlP3F{bl3Yf&P4VZKOlkJC)^2SQ~SN1i%w6~PmwIViKb%A35n zDIq7wZR9@#OS1{Ii@zP)dqnIdu+nddGOsmOstsAGqxR*B#ly=nbsn#s^sY;O-!+K2 z4mRB60SM6I~y9pQ39CG zCMxJTVoDpG?GU5dto%X>k9S^xb#1!R#|z>EIV5*g$%=Akkh5bP5x&s(mK#cRg?rfv;T!D*z*9pSsQ^wy1(e_xh?ga0K^m9&H zax8)8P^C`r1X(&16h7Uzms#yHh%d7cMJwQ`%4<2mjbz&NK&-9$7Pw{CSAH%S$}7E| z92~~4Och8ZG=g&ptjqUVz))Ky2JhmB;)aAs$sY>B>XuN1xMS}{seX_l zESeO1MvPzi6R>gxrLbino?=M6AnmVr2rMs+!kHbP96WXez{Xx9yW>F4P-X`rV`?$b z%Evddu969$scWX=4^8Jq#d8`cRJ_$|FzdfJ9XOZ{{Cc?;7_sjiBNCs7ej5#BGq!yQ z|xAsis$Dk6Ca zBF$1D>@7`vMt~90hPxl)?eO9^1=@L}Z|in1i9#rK+r1o6^9dxyY0nxl}2Bz{DqZIpWoO7@RlsoUTuUl9XML z$o+`|+2x^H07lu`P58=-Z*m=imO4EzrhEhhpreYfrQ+3~5Ewvf^dV8e=N*oMCc?xY zr?bE&IBIj+QT{|Mmmsl}lm;|c1F}oTGJY_+`DW@CKCP7?Cr%P zpO>R@Q%=3SbdhJDD}K8vj$H<7M`RuaPDT<8$4Q1;lFghkb z%OwqOqL(G!0y};y z548R`AF>(_dJJk=3N)nl6RM6R#CLbOkxTumFql8C; z?t(Q1*$?dlttHCAdX9`S)IxHCM!s(hrcNW-a1Gi;fNI7C-@p+h#?Ao2DV1HIfkw>& z`QJ5)A;EARJj!KH#{`XfY&RG+Q*Hp=Fg?_)B(7iNSU2i@ct0Nk&5CHtc)(}==00gj zKf1!3j9Sqy*?nQQAo>}8YrWRc7>WCOBp+BqLUpZ$_Hh}PpMH$iL9lgNCZ~=pPBrAi z>mppnmnN#{(Q(rL(svv`y?O4~<6>5{I9Zcw)tTpVXl1HFJk@1tphw8ib165qc=EMS z^h$&0a2k9?;df?a~if!ZXM4E7vE!rwC znTg1WmG+0|j(mC;!2UGgoiCeiUXCcXt9h&tla>@})(~UmI+A%qAg1}OcsE+#Nnd2V zcnbOX-SZfgM0`r|18{}=Wz&s#Ajuu3^eNH`krNqy{O}~*J?67{`8}6v#8wvVI?W&6 zI_Y&haB*S0tn-6Ehcza>taS8g-|e2vOG;Hc_Y1z6Z8xKL{q*3*s`ecMT^FVX>at@*T|1s6#_06NUfJ>t z)PO-c;t?^^ua_u7!FpN3YElc02+W}qX=_xW%PMusnX^|KW z*VMZxP2)&b*i+)#Su|dXph~>8PoTb?vD3OJmG+zgGP6SQ+(6(FS)HZ%8N^A$4_a5g z1j-(qxV_B}CLO{ku{7JHz|jt-KE6Z;%VfRoce&SdWy!IeQnoEMzelIyeFP#clO?^l z$zy3o6Y=)(xq74!uFQymf(F>~V`pL!NVnB4OEi^dQPf8XHHT=!_nuu`o~+?$@q-=c zOg9uS8cK*n)`oGRGfk?ZPTm&_HMnHK5to99va1)Gtrr<@V~Rwoj?OI$$)BHn*g3D+ zsvhK;+oF-p9OE1}>_(;{O7ceVXtmuf*0T}3uj)@uDY`C<=9k!!i+#DkF^owxGPG}$ z<~OU~&+%dT^Gov1woDVN^pe>u1@?lYh@?w-8; zG34ZHPS#-8AljOs6bYw_^Vm8|#vFA4Of$mFl^sNs-jrOL$wHORRZ^!bMe@&hr$lgs zEIBmkn4|_-=&xkeuxEEW^|~L*G@#zz{P$>6?s#>TeV<<}JItf*-4$*U5=XF36di8gwNta=@U0 zA;h{Q%H0P#8^CYf^)xBY_(THOv 
zwe}{>O*#DC_UCm`B{R00)2>|Dhs^MzRqD3N?=GG?>Xyyf)HmVoq=6P<-8!ubwp?*zO)E*&K5Z5l> zJ*%t2YUn=GV$g4%ddR?Iu75P>yz&HEYVb%1xyT4-`e}y@t@fbf=Zo!!E|_Wu8eE)+ zOuCNB?{@6TJG*>oIHFCC6=U=q0Osg{0ojD?i55roB;GX3UT}H=?&GpV7x4Xk16(5z z%&15apV?J|?Qu;%U|W4CL2V?vL9EE!SX(_kxGDA6>Zd!ji?7kt%*r{AqaAD?9=Kz> zTUj0lv+d9ggvDQExuuQpiglZOl~fWrMOm|~GmP!^?B(yWk!;vZ)7k+G)!$83)4{FK ziJTjHM>UY7cR;ZYWKp+(xO`X+BO1V(F-Pf_IOgJ|yo%kr&cPX&th4r%i6Z)Va9>-z zE4SP0sXvTNELBP4UYPoc&l|~4zKh?&qI%+iMoM*78IzWJGNa{HCCn{#2c2svIWmG> zYUT>((C-T00Wno6>|v?Xt;k;_wHGsPKU<*jRx{9a$)o8EvkM~oB_agoztsoYXM^`V zW!2$_pV8fDYRMhd$Q#gnIg8asbJ2w!{vf}11z=^WLMVIB6TdOc^J{2iGoDmjJOW65 zWK{+&Bw$Lia(z}7Iyz)W>6ry8de6lPbdoyDc&hmFmzSNpho+m)V$oc*9Y?K$OQTZn zgi0l(C1SRmk6J>Uk&q|ZawI)Tk?mA+byi26K<|k9Pp7iO9OP_Ge=%6JE$`# zTD<8P*IG$O?4;wZ?z}Dkq|fqF218)_XhzV9o>F&LRCkWwRT!|g6GwM@tUTXBpR`ca1c8_vP`S`3Vjv8F! zPu10+;|JI>Q{>T9*-5+7ZjgF#arQFNd~E8lp!uAnPy{?- zi9_2%nOl>G>wiY$eEPT7wiU-os&0S`v2QLC<8M62FemL&I6 znNf8_($(tiI*6<#tGr3>>G7+%gW2_(C!ESWmbMOEvBS%r20Hyb-Nw19-2D)-tyyj} z37E;#9PTaibm3UUwpdcvKwft*58Rn@u#nd((V8b*}|CI-{OY@3w zD|EP{palMHQus8b=IBxi-^*ZgoqH)k)}vhduM@Z|x1}>FMFfEvL0lE0WXFLN&rCE4 zK~gjaWK*_NTnjJZ%M#F(SgK-JAoBKD5VWk&zq|oAB;eXXfPTi^e$EPY)DNoO*|)&b z@Tu#|68iy%?EA!rgvtU{j)EB-_Ik9BK#QqlL8-4`C^rLip#Y3JSnqd)a72)UYVivf z>&&4Gte_~k)_av?tWLO{?9pk~hA^R*9tW51E$kgyTKs|`pT)EpA}edfyBoGg4tc=q zVqkWIb+8y{eR4Ejmw6Jn=Y7+4iTs~14I>Yq)3<#Ne3_z@oHs^pdp^1Mekq^1 zRxh1^HF5_XZ^ryg_xPCM(yw?8iC7@UC3L1}B!hQHc&3c^esjc#?j6RQEazgc3618OI8myj^{SvO9hZc}+XC+hB zF&aGqs>v3snyA9#l^?tGtaRD*ah=^z+ATVP9HHz%os#((WUzx~PEQe%M2xij%4OBO zM8$C9=0v6ZgEc7^wv1fzNiCY6&&)W0Q#^))B*x4kG^a|jz9PuQM-%8mB^E?;p3+G< z6R7mw%ZK9g!x|$ypA)j>r@{gSypG>XYbesJ6iXS$qrb&vl60u&nm(oWSeV2<<24`C|hbXpXx;6CWce8I?$%{r<+-rxvSM805?;o2wd&D#6s-t-A&LLORoSQG{R?W|6hnJjx|Eu%N?Ji3V z85hr2UOHT$%`>ZWv#$1q3fSsNk{m~u>Z;!M*O+P3RNIdw*SzMa!njPXE5!)5$HSd+ z6qX%=bFI4Bd)WmKg8d@w{e_FS+Y7(7Ab6RLJn{?wL z6((YoQbDq!<_B(y$NRkMI`?bj?-VS0Orpl++V1@<&|{7pR_pQT#lC`9-k-30gt+ZixqLs`@4 z&=nK#SA|RA8#$Ok=iAzu18$xRM&hziFa|p 
zG)_W%=zU_2uCLbZs_lEYjJ??!;b)4lemDAzI5k!yQWEWCYI>|JGnb~n82TIw?Vi!2 zAfA9+er!XNIhV1Jn1QBuf!&RGxmULXYBF{~2B%G?rpDSkE#c|9&?k^`KJ+&1052_& z-UsNQf3_^X&a&LL_@o&8tyU+pTg%R2(7}w|(6d8iYKp7~*T<3cPwPX5^0rqL z%a5C-ftPWhy71LEMvR3F*fko{1I2IWm|(>Nt>QoQ3$MPRp<+u+Ls?_E;{Z}N+Ad>e zr2XFL1Nkvyb|OMi`I&=c#zG!FEqSVvH+JTBF1x2gTC}+Nas>_usEA>-HTM#gv#u(4 zc6Q_;OxSdDSzq1%1QsR71bhnjQbf+VLm_=bjYG`|ogdGfA1P&_&&*5htgxfd!udmK zMis|Qw>IsC&M24c6!l1rP2{HYLA}Vf&?CsV2q$%0*^7TX4vY`$ARTIj+<^f}of|U&!h;FXHWe-)@JYjS~k*LRLdc5{;xm z>a4v}Uq<=gmC#zfQa-Tt;Ir1)?YZOz^KF2siFcY{OY?@4l63&9Pw`lq--LbT8-M6C z;2!P37LnC=4$Zz@iOE@69Nf@pzA%IAchERiw`quP0x)=Ar;CR-`)J+H-&b{OFRBw+ zTymfZTkT9vf>{Og`}HKRnN|VHb++@BZemx8c6RZO*oi8|ebe);{AQJUMN4QAMeybDBt7?>1P6ZWlpP1BVW$S|RHsR&b?h84S;n%MMnw3hy*H!-8G-7j zAUIi}_xf%Qak{nHh+sZ#L-slFD^y%_oJ+pUJS zeW>Cy!wx;uajsWXvlga@MpdVr5~e0oG@XBB)+UlM{GYi zqlv^#;#HuWWKyxsTznO}aWrmUgTuLg*c87qv)jybr@XGaEN%KuJ^F9vqIT&M-#Fm$ z_|_h5NowJHzeYZ)n#17D!TrabE-3PW-F+2KC%*joUUciYv96Ot$PUJ;Wbn2HFugGn zdG?S1VeEwk6Eq5)U=`Vwx^@5DNOyYMs6e+-Co)?J5{W(8_C1`>Zi`Ip=^T3|wzl`} z#zGTz7hZonjxK)n!LhRf%jA)Qe>H12ql#+g$uFI!90$}_vJSFL9}5FqMBK=5$3bKL zLi^)S$(3(1d7XPG({5R!k-Mbaek5YEvHC}#A4dkMw!c1ox!N`>p#m#emd~PGRDUTo z2cEUL(tMR?-Pa&yq^TJ+bV}H<;_x^Z(VM0rnI!VAl4m>%#ON{uY@d@nbd8%evVL>l z3*u6r=zL*;Ah3HoNQ5(i4)97-rCVpHnY9DbL`Ppxb10yn?b)$YBG>!6U3> z`Q4T$dpNG*@OKl5JX46FtLNqJ+3H*90z{(lgof$fOg(1rK(W1<`z^3i^}yDlgTFQ@ zg|>1}&YmTVxrpU>pHc}%k3B-8H!gjpJWye&2z$k0%25K{V7Sd2) z5tHag==eh+nA05dAbtOE{NXWeVElIlGUA1Scs|zMDzF>u2sQMuK#B3b0q!fPcmr-a z6xH(gj~dVQJ*V&J&o?yH4;|An8}aEEK62B3i%-8?5wmUOA^ZjcW2v>6z%-O~?3Il% z!_0Irgl{xeTZP>WQm?x{{g|<$*J8`Vl87>?Zq*DJo3T)P_U_Cmi9BM zC^CG-vVS2;(CBVze#d6V52q3}diLP5V<2#R%vY;tm)y>&CP^-7VY>7S;@^FB1Yp0ru|h46nUc`a`YP+p+tHz{TjAxl=Qw5Zs6j?dpZd&8ups+wGKWIW^K7ZS zebDU2xXVqe+=;v=dwDiO0ggLxVrkl%d~w@*B{Q!Oc~UnR{a(;y>g~E2I7*VaU%OSR zi4&Ln@acbB>qXYb+%bT$wvZijN3vGBfkR?=-c&7m8te3mF*IkayQp&>la8|L$nmWR z`*7joRdO+N;6+op)uRGL#-93xEx%?-Z+qsou>?^wm>BC($YT@J{Mpi7Hzd6Dj)O$C89R38L5|E6d|6 
zwg)WF1T6H28jjo+vFyn%IdhLGuSuSz9gK!#Ny&I|Sa)I>MC~01qn$L`3tA0a+fO8? zsHzqvV-|B;4YE&N-Ry$m$ZIop9Lb#c;!#VZ(O5BQp&4cVka@`%MDy!fuG0mu#A|ts z6$~P0N7}X1dml({!Jtzk9Y%Gqqv?LchaZ4FqaJRf+OaKD?{J2_t{U+pBU&1>N5YXO zU9udk^MmwV<{bjFl7&rdQx816l0t>9qW3Ir0pZ`>CjLo*Jn-7jQjuF$w{9xDl!k;KR@4+*C?~EAL&;)@kWwd zyXy9VRi+fwYQ$ zyL-jLi9#*0PdO#29wym<{O*F;#P6z^zf0;Fe(hF|b-JhA*PN>amXMUzx#+I$LgI@BgVW_9? zH{W}F%t?~f%%@75wZ>!9#I8UMjmvb#1#YpAfDM&aExpvqN5JYLXcART<14bnPt81% zKl{PF0{CHe_J&8;>src%H|}wX0eZF=X0^+YPC1&l?qJzaf!20z){vEn08h1cKt&8Y zeJGFY-dpwbVqft-I~^mE*)|h$*U4ARv_dYax#+A0v4xZjWsHr6n6b+}CH^N9@=y>S z%kT~)g0LQoAbOrp!}gH3_kjkjHhQu2^NVfCg(C00sVDsg1`x7hRX2{*`1xQ*+Ow0k z!7|+D`a}ImrhA1G%H3H%bL)dh5)J-TbJYlaWGp@Rs*=kpkFdL-OsQxJc)< zjZWrRZ0X#kW<^9xV}zJ3nyM=OxE$O9QKSHu&9-^~>uzn-Oz z&+lJ~Mh3+tO=ZRJ#^@|(iL6|rt10)T9(_qJ&YN0c1+pY9T7&h`nAEGmqNzH$S$t@% zBZ+(kR6?U2xyfj|VGiAdI4z8l-|5A08eIuZO3QsyaFU|AfG*Kv7w20ipUVB# z4oWXILej%t4_cO%b>^dDYwTI)xB$}X9?0-zD?1&KM1DzMR)sw?<=X5GdJpH|f_dYX z(7Uk-C17hh&Yu;9J(LZ|1HjlWqI^#VL#ihD9%LPI+48%ZrXQ#`(?+RYoUUMKOl9`N zq-5W=DjZ7K=04{+-6)%oXlO}=sbS5&S+YEjNw~6IXDV)nG!kIJ;XQm_hR*LdjdvNn zyzb(8NA0OZ>l+u3=FoA^I6Eus`mG7+5s}1x-;`20FI(HbjuX|{A^ zZs|<*kpMlP$_-ogtF#12b;O4nSYA^4(25eq#<-4U9&l_kdSK@uZU+)wosF{e(ZLz$ zcmumSzHQw_B&Y$#$>6)Lua76Hr8)#xt9$Xxq+U69W4~?nVRr@7E|9-7;K~_}7!jR* zBbnUO1CO*qKD)d@y-btVSqR%@HwvYJoTAWfr65onp;w^9eX$s8#I)%FD_s!FkS<8U0Q$MHWFpy}3S=RA*6)(E7IHf*y_OP}`Q&X+Y zz1=G31bmD2bkdBq537@3UQC$zpd*kL;;*GKZ8*i%dG~y3`0jg1@|F!cMIs{};y1RyML`g^KkQQ+0?&e&>^SZn{HHhfT($Gs`?vS4 zF`QT&RM=X7(4nbF!zbE)psso_KbBb6`uE^2VdU%c#dv4}*Rl9YJCom;la}E;PWJd= zx59#!Vt~H?{aKI)Dfz9D@se0wY#{N?j)LZT>_vFw8T|fU| zIq1$D=*`~;oRyi_>P(1S-S?!D1=qdoai=OB6ZzOCcj`-qI$~BI4Uh}uFKTzu)O@AY z`DP|&#h}Z+Q4Vd|i~Wu}_kgvs4SC`QjiOKJ8Q}Htb|Hp1#wy^IXXDTLSggNPT~!vp ztY&>o)%s=JS9OQ_7dcXr&@j~&mk;y9iyeX79{HTpP2iR2G7~X1F}(YB++&gMTgw!1 z`Ajl)lUi+C;q^V~pFOekEx4njPiUxRUGKK8(ZK`nw!T3!rT;xT#ST}M_R-d}<>ky! 
zSKr^(zck%r-${sp9xd17YO%b_!IBx}6@{p1$#A7MHkyKc62+ZS-J@|?F%G5qRICWV z#A3-61d}=5K6h{NPdj`sN<94f;(f~>3(+ZofYxFqhyu1k{0RWE9|aiJ2$NHG>Zcbb z{X^(N@SRD393cC0Z^Gl$M2FBzR0@~x^x1_9Ei|erZ~H?9 zbnNA)j2^A0?K_`4RThsf4dS8rZA^MvhQ6-b&RN!2-9<(j)VIzGuM=TTVW}X8kN2e< z>vsD#b_6 zbGDk!>z%hRR>F$J82sW3G+8A;g%+n&sg8F!|AK${@ z&PuwjBB{c#5?9>z-a<|RMWs^0y|#K%M*tT$OgRhKi*P+z5K0F8L6&Q{BD>t9)RyYC zR%l>VyD@xt$>bxiT5hXcpVa!O$eREG!>&0<1NYS%v=#vJG$*u`I+7wA{Rg5d<_?ba zrv`(h1IE_YT4||t#yp9z(Q8Uz;eic$wq+jlnQ?j|N7XWknd%s0j%tYrt!%_{23Pn<*;gOz0a;C#4 z?$47?UaOBcTs3-K`nQ6rRwjej3=09yO2uSH?~s$vnm-*4YAb8nX7JA zJ09b$@u`beGpuC)9H$SMA@AGz^dB|#C`~#(=B^r;dU~&o^_K7Z;z(?>#(Vwy-H*wT zx=N4DGq7Zb6#Vckazp@}<_L`p^49x;s|#P_&z)1))+l-ke|i2#a~WH_68vEPfcEnC z?%gerVz-*a)K|0)VT`^!KERSDT*!cqsgstj?nGpNXgfI^QDk9F8nI&4$f&_)4Wd2g z!MPOoItv(~0i}v#G+4g#t1#YD?cYGLHD8abOHVB4hlaHMIdmz9YWDf|g`(tr4;DT3 zJDfx=;e5q}$lfClSf>+*0S@kD1+)*2PZ3Ecf5Oa4(yXj9Lpn2ZZoBatVW9Y_0_q3A z0%S!Rklp$6kg*28nMcyu7d7R8Ro1N%#)y&(39c5+fLBN7 zik4$UOMc0Gwv#V}vrZKD170&81O|-7_!Wgh9e$Z{(JiObkRjViw4V@@MTAEefM!T|dmR=OV0UG1o7HTqSEmp3e6hJT=i-(4U~V`W_2ZhDRN^^sU9= zMy~FY+63Nvpv$bMAtXW2M}it}-qYK8o-@v)C63f=W5XzwFg@8PHIVsZQH*Je9bx9n zabP6vcbwhtmnmQtui+$1*Wtu&qJ!TCe6ixz$Zs^^My#P^9p|frS3%f*r}x@QNNO~z z1)1UB4V~sW?b`(2KZ{8tStIz8b`qpc@3Ohp;l#x**Zm_`Q|>(K_tnWY?nS$gl9KCcbK2L;NxH8yO&HR>2W?1`I01{~)LYG^#b z0;TOQc~nx#TjvSF)pMg2e)ou7>dgoZC3RE=eFT9J*s5#WC<8q3%dL6kAhp;_ z+X$y9h(+<&RsizylB8@qHJM1~^irXnlP@uEY8!RzBtkZ(kHPLw(zjWsovFgu5%;%6 z4Th(&#T<&Z^cW#<(jMHhz{Gdj=I?Ovu$Jr6C(HeciIEFrk!$AX(`y%hEVL;q3GD_aWEJrl?~MPmVK zc;e?ib>?}Kh5a)sfU}LD1cDs0Isew0!{Auj3SBJOr+#dF75DtwPXR--FryvY$#uL` z4uW;(iO@Chd3E2kIN2`7zCErw_*P^1#@<2H++zl@h5D4YSyutoip~AgVj+OL9AS1> z=TqlOoz1ajBh3N*hEQ3`-PTId2YBMlyPcJ4Nu;CqWj+?~OCo^f;R#|B3mN}pA)J@j zKnM%bpK)v;IT>m=1FTL6d%j(+U9JQXzdRV$q#$2=Ii+mI_BEe(sP*BoxZ!*V?UhLJ zW|c}Z6yL(??=y1plR$0=~qhIQ#27_CkxkM)8zTC7f|ldvyprGd&a3H~%N z!-pH{%65!hfhK}bB7<($I7*mu2~(_Y8f~%L=~^vTK^q3U2P`#cV)IfYnub3 zjJ$8gXA7KeydYf3E#B9=jfY6_pvu0&J|y5z>d* 
z=cG89NlA3ljmJ2+c@BRhmqW&tSnpme#)uznsa6O5vj1_Vu1c%9J!;US+r3_w1f(me za1o^rib0YwnA>sN=eaE}OEJs3%Dvl+*uTIm{8b*3v%P$W!qb~|5lp+!OLoeKbQLz*LnGBgcRlU zx`Ljv-l6oiay8K070}?aa%<+K(r0>~FtYNjS9&cGwV3kPOP3domzTKuD>R4FJxHbZ>c9=jvd53^{amMP#W(akgw zY%X{9Xeq(;j38qt`-dgmM2CPgz-_HPB5T+0LF$gcIyfo6db(Onz1qu~?>bm#z96Kc zA6mF^oJ^g8NkeCrOwbT+#%`)|=EPMAd6LQ7n+vU?TFPo^_l>(0Cz{7F zoELwU^jW@rZ=96HOy+XEuq^!1$+~nUDK_&DOqaVtXZl+ESCO6d_`VE*5>%Z|$Y-0I zK~dMH$l6V_!*w&<^+Lrf=jxgB)n2|g8_8y_eML{ar)aA3Jp%MMw_4jKdstp>WQ~(u zEgFWZ+9tVHnoX9N&qZB-HSfEe>D$^*aka0@dvfM6{-QBrQ!o?-nx-**Cq&|Lt(psb zCMs=-uX(1a$p6hNbbEU!bNu416%bIx4odb?zOVay-6K=`foS(Q>1 z1XZnqBth5%Gv2jXmC>&Wx|BAmB*-NC_G;I*vY3srm$+tP_lmi&jvM>xz9!OD`Q2to zkH9Us>t3>JIR28*%=6y()LNQ>^Dz1GJa)|yasQb102Xi&-qsl6=yqHBu}bS^J%42T zU^Zf0s{fHyvi2X*h_L)awQ`0u{_A#D; zenqE3rrXBEy2w=7GPfHtBxH)IO~hi^W;ejqzA5j?p2zZwrbv`$C<@xhZjlYBs`4Sb z!Aq9E-wT^&S`S*TKzrWgsZrNo$$G`AOU3axkJ8et;1$G5maQByuLe(UW|U%3%tWw*k z3xaI%#_#$%z%wrobC?QF& zmoh-F)KvRQD>cEnaSF;J5YcL&2e!MC|+tU-4z&3ZxatLj@YdMIBTtS27{5 zywuXjf8T9^CNv#xybO91B4EhG*5`z8}O78q2Qg?~W^tcQL0Q6CzIs7Jk=xz(%DJ$$f0`U7e@89KE7)=|M3ZwIqw_{6V96I{jj!Pn-V|VbRE59dnU#??Cw5`=J zhnwZ%X#AX+Zphe?YDVR<5@7Kk_-|dQ2sp!U_7?!*AGR z=GjRLKYoS(^7@wlf`XTnoB5tNu;k2n`7r{(k1{kokC7bvTB_Q*S1P(OwTrjyTyR`I zxl7FSEO|n4n`Qy$hws=$UCZ+YWAFWtbCGl!gj_A zH3q$1*J4A#%4Z{KmuI@kJKe9=Tc#yzEsuR|c0Oy{miA5QQzbdDj6j!_&;_&nJ7XB4>ZXC`Z|E?8{sePg?)Skp|x0(By;N;3}eZWL#eW)^De|OBzATd?1 z%m}N@H^KW=MWX`Y1@c+LV%m9L+k*>1bxO%`nspn(_=IWRwQ+l+?@WH?b$%XFsvSNJ zVl1I3r`WQet_#5Lute4j4hMSsE0#uURO;^z(A8&9xUfBy65@dx-_Rz3tVi z+*~*-ka_h^TgRe8%PTevw)vudt@`bXKUzk==G>>ZwF&%4uz&I;9_na~7>r}{pQ^iJ ze4`*0QhHrOs^hS|^Jke2R>7$!Q&QWaNmiEiQ`bOs<_;@p3K;yJCgU3k`Re-H`c*=$b=}VX=k!O3yn%TN=(rB_!bvj_!Fc6f11j&d5OHgCiQMH_L5*L^>+**sn-Om;0Tk zkJ%wOYKK{`ovz&uXZ=yTL^^}8^GonoWhaW>3343BjVn+DnBc3{5P7=b=W_2UE_EHn_ zuk2)|qPK>cKmHXiw$3l8Fi%Uw6wl}2XYRGg4t;G|j$dPlA-rZ$h-pclA7uLfe zwl&ed8^Ol}m6F*82Ct~rFCBDi#T)uIE+fh1sJT5q>R(K{jXCb3nn!jyVIyZ(%F2iM z!VA}Sl@H((->EMyAe@a>|yLHQUdoJ5IZ-{RW9G1D? 
z?P}c{dyOQMPBCku+YK*CA6=d(I$r!|mk{Z=t6}U$2gh3^6Dbx#z>d?H9t?8oshogK}X9Ln5dtM5g zZ)EKqWHsODq;6CU?WTp=OqEnAI2&|HdnI$y!M868(tWx9soJuu(^RhWFY$?++3%G* z+KP$!%K(p=+OgAXTF2+xIwo{$XVcTpm0%m1_HMY-5;tr@Jwjum)De%7qnhlf1Z*42p;3L&bGUq(`g|We)r|!6Ki6apb7={Us$+HJ_ zio-m@sMDHa%>1E`+S?=DVpk{~PFBcZq08IS{s;NS#g)}rHGYo|dXvT0q~R7nziioF zGH_3MHw2OqA|)l)^6i#c;g{Gu)n67joBM?A5=z(Txu~Vd(q-*7 zF%~_yB6hyAoWEK7Ue>cat-d#Ktauon2KDnjX-TWs5}yB}*%DwRAJuBpbi7Z7Y*RWP z>w7XVwa)fAxaE8JrA^Fe|54B-%6@BYVyW)Vcmz*D1cT-kj&7 ztce5ij(Wv^_GW1bsuyp}SN zd^&8HUe_cpHdw2_`?mbn7hDfZ*m;Sg<7kpum|vJb>Gqk^PfAeolAB>I(q6zHi~8i> zW6A~YxihIbSua_!^YYH$6_EE8orj`@FO&^z6DFGYu!#_fM?g6XCx*@wr8 zJ9U|ShB@XY`^&s)YJ-I?uE}gliFmZ8UiNmW^ZQRvdC4o9=$DtzTHtCZ$tXbmS#Jf4r-J~^9IaQSJNm@3_b41?O)LUF#ns?YMKiK*=%3nkO zYHTLBskET$_XMZCTAq&9F|xuORQhL6-!T#VG270EMxy--zfkr~; zLGjR}VZG#K7oYhVSNX>Ty6*6f*^72#P3u6?_EB9Ue|>wt#PShi(`}zpl%~4SQ94Jd zUg1OokN!(NF_f22g|fc&?z7G2&7zSJ?Ws;7Z|l#Od7DKSHGRh3F3r@mKl$r^MNX~j zu*}I>qx&qo%7dvffp`G~f+W&K4ZVQOzeAqX=-TR-sRE$R z_o04AbnyMf&O!8>3+P6`7Tb(p-8!5wQlMQ=|80vw3Le9_8lfoweD@)l3*DwdLKVB? 
z1;kps_3Ut+-0Z6h(G!eGQv|>?jcr$c&oU)}OM#9Mfh4KAc z^9L2$lm?NdZGfjdPaH%8*F!bmc=lk;=^Qg^S2S%kQ}#1Pel-rEu@a9wpVgXj=nRUA zoc|OKYQZ($tk!)7TBb9HqNNoeD)({7(7$IUecvMs&sRSEMcuSZ;SG>WwgAV&Ns4vG z{u~V{)=d{{vF-O0)YIx&O;Tbh_|qM22L1`N!@NoDpp^ z21%6jYS)#qxmBaQ*yBGEaz{F$(S?Ekj2EB)Z$UP?5{vQS^FnpFX5SY!GZEE4S4V7+n0u7dkPT^2|@Q9o5vXQA!m; z5I8@b<=hpVl=TEK9=e1oAWAbfQ6O6K9&OzDy1TecwUYUr^*Wo z>IY!SS7t8g8B8d2@E@0E8PW$wR&&#LP5s4Dx7o{K@!-wEMC0N&g{o_mW`v=2n((~O z+ESM!*zuRW^`Yr(=X1_pq+pOI1`L^M_nB- zEHb=l1Guw^PXx(tWSblOEj)Ai8<@2fjB-k}c=B&?o7ZVs7?gv?x(qZjlE@eWe@i&J zH4ZT5PJm{86r@;MAHnPaJ4gph(20fC`Q{Tfgy;??q4Zp!n^OV0;fb3Jm8vJ0;m3K# zCo+Jtl}l_zW8$UIA9K;Y`U)y!GVXz{40*Ctz|b%L$^B+41tIT?evW>r1Kl{^j{3U+ zcFt$;ug<=h!2v=fx0BZGJmBlS>teY%d3m55)Vq!OFo%FXOS$qd)i}3e{B1oApNkNAzKU}!^A5Z!8cL~@c3qV=uh|jPe+p$^mkDLDjbQ> z5C6j_kI=O?83KT(I_F<;uA&PjV#+C4{+Au!NrZz@)i->C4bUINk^C)ygVw0OlnRWy;E(w9>!gZ_Iq7 zehY&IUAu(Xr|?35ZQ!o)A_ICt1Ow1lxl>%|F7^_RVBCoBh(|3y(|Ehh`%`S$b$K~u z+yxJJ$D7-Ju6;oxr}Gyh90yV+oNbrVpWE(^v4;=pPn-h-{eM^Tw=h|iX`@X%L4nl< zN_G~CZLs((s$|_M)ix?hyM|*%NVDa5ze4rsrzj^oMx*9br)|QaRy8lb4!eD(fcakm z?}ggs5}gQhx1KNF%f5cD2G{;aeRsrum{>0ous^gS@&f1w{GnKm7VM!7uTkr5uX4Y& zn&aSbY&z*2`PkW~2cUld4(1EmulyDZO?EGSY%R?kwf6gn`+eJLe(ro_any9aA?Pe) zgNsbWikckl{K>uV;IK^iso3DLJAsXJH$9%8Vfd5vqYSAJCM#kk+1-?uXDbaoL~()> zf+7?eHV?(V8$9~BvU?c8Z;G0JMZ%(qZR=h4^)tbr|upc%6Yr%&qemk^Aa3ch&*RiuWVKVD!zf@}c80%6h->%`>=vAqU51V$( zFdLK===C#=Pdxnd-H8!+-596X#^dor~%f>#?xfku)l)z;Wd?CFzpLQ<%)VlUz%hs@LzsZ93 zFxO76L608SylkYAB?+>9fu$@S^VK`dA!V@M%lm>DIAFn;?6=v%%pAYLHH9nQoXJ;j zyOXnyu_k6g;jL{x0&?fJJHx)PqjbVic<7!`g07FvHIB%^l1l0in`-BL4x`SE8o-|- z;Vi29MJA3mG1F1CzPc}l+Tv?Judto0&g`yu8G19WD6G^;Lkt^8A}Gi0Y3(zyUDO7h zvgI_i=$4$8_^Xn}M|<{4_3CI>2jXVg6_MmBJ5WfTExi2Y!j}tmN}JjO?DnTLr!S~gwFAY zdzDI?tF5oh?@_+Zd_i4Iqz)=|{n%vwlbO(CSf;s-Xb^W>ibYvzpR!?eV?b%K`^xb> zu|ajbxg5ozQfVsT^!Wt;?g9(Otmd=w!)Of~`nyf$!&U-~+X9sZpUky8_ZBSK4t4+9 z)Jgs2FaMpVo*Csl*d@3cieX$ZSkG#bWqLq>VWuhHR0VaHhSaSMlK4lbmaCVAh)fY= z3RceSe*Vb>iBGe48zsM5lSOQB?xiH6N$UehJ8}F|kp70W_iq=6nT19Ao$kd%AG}@s 
zSxM6OqYSg8T7jIC?Wu?K_2sPpv82b92*>I7&|BlSI>d@XV<9lxz>_$GvwU#$Y`qL_(HQGcrv z$~U%^!soYZ^NCux5b8#@NjE}IZrAbA;*3stcG}V@uBVd3$k{G+%Qm7?Q(6Q_@9pfS9X!Yu+fzANCjLuPUwtrvt1aP zD}%Wbo>w{<{0@aMvG1U2Wk)EcxM^MwkXAn!WH)`)&KWjYyD=t}%b`8EoFV<&jWX$> zCH$F1kaTDsiO$vK)R(i_c}e9%H<3jMoF6h3PU>GaD2?<})#ezrGr03$a>PvmvOQbS zN1zOgG$3#r9OtLltn1G&I$UxJMb-aaZ;Tnl-xdsBWj&le6>+uNkb)>2c>iTPrc+;f zjekYg<1IRHm*V-Qb9$BImEHz?Wc&Y3!^lOX9W7ExrN@jd8@<#(nN@Y{71CQ&3)s#! z#I_MrOdnz4yHRJIQRgcS(y!2%rH#k48)i`Oru4#{U*+I$Y*lU=~j0(r5 z;`vZSMsmL~_%io4HiFOiu-P?i13#F6sMCqdBg~rNHaL7|Y!yiI_tK(nGjbO%QiX5a*8!Sb2-LDEdbbjqpn$$YvS5{yYw){$xU}FKU#>q^!!tS z%Kg%9%e;O!L-iNu=hs&OtcR>wisehaeJp;l%x*)t<1*ab%WJ>mFbpa@{9d7KERhz4 z%tub+*}pj`Ha{qlAPwIAc#8HhXeDp}wywam9)gysoxAUiNRZ~vu?6d;$#rT}T0@1X zi&9~B4nCz^vDYcK*lQC85B*=ftQuN?eeZYZnuy{Oke6%_BP%>Q?nW8gur?m20 zrxxL|&L`SF;>;pFrZ?g1=>2tzj(P%JLgd>Q%jTVBoXb2uibO0 z7BXR9{*pVRA$FqK@=Tny9yvAa>aj=FS;;hVa{oqEGu;kH{W+l2Rzy1#c@Hz7!L14& zA@aAC3eAvdzhYT8a9rpcQ`O$+bn)~_DM;~IP#>463Kh{!MU43N7uIai3kjB0$`__r zL6yp4uvc*mUrQ%G-PCWjW6NzIQ_?Bxd)=M2|Hmzw16b4!(gY=YsU%V3%?JzNhe>{M zzF~JGFf0CjaR*W~bNnF(oHEaRCZkp>QKU%JT-atYdVSkB8BSGy=KbcV3NTd)&f#-k4i^%VX9 zTLR;Rga|UBe|{1I6t;i8v(}Bm#>TjMh%}u||5Lv{{_bPjE3r?^h9qbmi``5)zPS5q zJ$w5PgF9mL9*jqLl`~DBD1cHjnZ-MpZDA_PC7~v?T z8LJ0u*2xPR#)YO>N(&od&blS8G_$JOuvW**U?QaWy0?ddCTr&T59@AC!v?Fu7p0Xe z1LX;Nup^t<7@(4d`a`+wf7h1X?4dnX_CUXTl@vMbmlU|;<2jIJZJHwHMn>G0l9$L{ z8~&@$c5$y1DrOz!$MU(el0=sBe4b&%QuRBcycxw7Ze|~gsz(#?p$8z>oqi@5;mvtR zzH94nvLhATmw#XlA)k0?e|_@dx8Zq55NDNJH>$H<+w4 zxn7s0ct6R*j3LxfVuj~!$E~s<%trAeJuE-A#`PwU5&Pa1ZxJ?-)qZaNBWQGgEOR_i zUnNK5sW$bvF9Wa7`r4DRGX-uG-_E{>Xg2dO}Up$MFAabvj1aKAUkVnC|_q;1|Of}j(c{tu`gA?+8dYBbvj$ z(gP0j5StIbIUWm43^_=|V8j(P+Sb~AK9irguT}oBn;L?Ok~jkRGg4CR4hNp~$;gs1 z&WXqbXFAsr-k0}`He};nf7lgME9z6f>_h#SfO;;$!yUGvby$PT+1zTW1aR)|;;L_S zw3M~Q#UBpu@mlnW2p*)!9zW?Gp)t?4sTpF{E+yee2^(7Wd2k*VZbpcT$S;-Z4ewN~ zwUVQn^}T;yNAn`zd#cDkQQqwr5ewP;Y@YVMlu5(OYb~M6X31nB)+bHTe#&$R4iOXpw6X57%~BN0!DnEjULxP$*r-3i-M~swsHeTLQBU4W0Ke~1 
zf^!oQHyHwQx8?8svTAGXa!cc9ikL_lvM_2OIEHu2reVBr+_PO+JOv& zl|81ON}AjO=fX^c6m|D+oaJ^7cM5tPFFdc!KeFBVXDE80Nw}v@Lu`on;hw` zXyx*w$s;=)W@C2S>F)Sgy|BKrW7k(CI(@Oi!hRm!RJgqqTgFdZ4G9B@TR-P5{rbES zgXx5SQLODZ@}YmNwxlqfn+~ZTT-}|$@22-s46GNjtW*GrGj zQsj;MKTDRchg_eN@@wjXD~;Px%96EB_q17oOUQ$U2X|uv7&b(qLg$|vLs~+tw z#a&!_!g9LLjAkd(y?AHM4u$fa+dxnfsIMnT^ml$vD0yo6K|C%}W8HsqSRlH6_q^#4 zTM#(vHVf_8^{)598KaQ@@mD?}{b@}E6**#>9|^6t8S^(kQ-gV+=+?9IY(iGj}UJCUITIJea;+MXONu%NACBrAw3MYMPWX zQutAgJ?^}g^o%f#?!NXKFtw?^&kA8*&WT#7-fh&tN&IG!idFaW3oDL=TPlb^oP(%|w4$a6dofW@Tmbv( z7860bqYgxY;d1#}aUeiDsC=;F$ad|1mGhg4FW(QX$9FfA!G}A0Hd)=Z&w9UXiY6a< z9^wLRuPxFpc4NpN0FynoLtl!vQnzEUG2;MeB;!dU(f9vi0i^CxvOLu?#Nn>$sMKhI ze_F~i!+kjvn>rPlzPvC{mVKtQP`zc-nyVJW=mmM9QC1aaE>*IW7 zd%|}Vm5Ukz*64%-Ir06{=(QfCBumhrE}LITGvPfA`toY?_<{>Ib6HTGGq3X(?LI5v z2(x7qlP*siK^py58H$*rA~u3c?lon7oT*p(?kEcPa}`LFLgeL*3h+R4c=>j{6Pm+c z@!$3jL*1u%o+Y^+B_f(7o8Xbs!;&@iANB*xuxzFB8#B6~sAJz(YP_F006@mgHDBcW5Lv!bPxLyAR(# zKdUZ|dbZX*;!S7e(@2yeJ;r7<6uulO|9A6zHn(46J9PSkE32!R<$KOk$xO`{b6u(O z&n+_7+I0CO>@m+y5z-iS(<5lFI}E#_pEzOKkzy4>OVVUmb+9cP8IUH=dTFM zk}e-C;w=c$I%d!?gul=NaTWp!?W7zJeywkcS@qosJE{D}v1CzQ-f9iHQklQmkNQtM zsdMUes9gyTfusMq_zcObE>m0*Zd$W8A=25Dn|s~)d@!2M+#)*lJ8;>GdR)C-`5G}9 zgsv|A{Rh{*PsCrk8b8lIffsAb%E{OKq$l*Wq>E!4VO@UZ&sSuufN6k<;Atq6?-NjP zF-*dqi9*()dyM%jU`zV+ODHE*+yFO>pXqzlmS!vV`?bfBH#Rxe~Kb+g`MFAuHYqBuk;m{ z?F^M)&fG^_tW14({ow)IaJ@8r?;P*xr`g$auPC%BCnb3>hqXbfBBVut%M)(?I;$%K z%7?=c^6#F|PAsO<0@2N$^e}jRVQXylSj%Or(Xuk_A~OZe%}=^tSaSzTs#rPT^3G&Py0Um{^bIKisXf!U zC{!be!U{*M>2k6dJX7_u=L7EP5bF1hdkRInr$`#ugB$nslSD;?S^78JxL$_NOMchQ zk(TSNI}PO%naorMMGxGIeDRH-$p>#=oK z@{9H&ZX}}uEg5j}`o}j0lLg$0(ifKAkbx7kxy>|$iGJ!33+^rv8NZ{@-fYP$;LU~` zy1fPcb5HbvhZH1{R~w`*u3|2E}5hWPTecX)*^oMKB2MZE~Qvny|xX zZWbO67GAodIr&!+yfT}E{Qnp8-;=?`g!k(P&LkgYnjJp_UMbIgM6>;rBqXhtD8VY> zB?wWM8nN2a-Yy5X?;~6-eEFM>dNUpspikxEY&Q0^dzCYY*(M;W=2e z4LYVkX2*mk|H`gWP+lGY|8zX^z`E!lLEhe`*I+zO!6=EoX~7|%BhY!OzBI@r?={8nhV=)0x;X7$5gmPp-m&c7e6S`j#bi=A zC&4|j35788yDw{kt{JY*=j>PcuRVo(=ZLB0MwYj!y#y|g+s>!??(rC0d&_4lf1_e4 
z@8;Hs6AK$#f-1)~#Os87H}57QzKYurN;%8v>HMr0RdKjy$gIcrv%U6BV2RdmrRsJC zu)7w_F^unO5WTNqJnzwm1w$)MI%YS8S5I+Yq~d+ z`Cbv3l*3+T65Tzp2Di|TSW#K-zY~6z%5l-u#n-wqJs&95sWQ##o`K8{uG(56h#&FT z=uzgfCBE)veb;YD_2KoG=kw+liroSfqXc*B>}O=UA}QY8UyHHAxt0SBHRguwulC*I z+L%P*3i{_n4;;?6{f~Q%?!RrmI{qY+9wew{5Mx@v;F|(3qvn66@Q?dH>khv+peEl8 z#MI~X!=k*g zSJ(|cV-J;1d1eceH7^E-K4^^aXBvWDiXR0da%AVm#BXiPxjh_Tv|CxRpADF&-ha(r zDRQzZMjeC1QfKAbv_8x;jvxx6bb zN|5`Oz8tF={-q?fuS>c*TWikrPeEv9NuljFnMhTNU1cu;AU9(L*%P?QL{>C!J0#qU znAQm!&BEA3U>%YR1ri2JTZ-pg_TNl+wui7maH`3)Fh4q_9b8AVb-(}U`ND+7S3Vv6 z`!f`sCVzreEt3D`-KY z@7bYkAuJ7vl9C=mXIfX3CSChTO^k#*Z7x7-~90`orsFAWa^6~v* z?8_3=NHwNDWi2*NzBg_O)R4VO%@znkj2+r~v1db-I3>lD(dMBYD0D#&AJPgoeLibm zKK8j_W6t{nCISsdo0Q`8AZN>CWk_H>GBNMW)W3cngdAs%0Pz}x>mh#7(tYW};Dm?7 zzNErqn=&XN)L^Z$!gRgg9aHh7$o28EY`D-lRUA9sSl?Vmr$bw}cY$*&1^3l3i-7xD zKw2Y+C%&Di348gLa-F+l?<cPrB-k6BKH_r2N}aZG-;)_;)z*U5t3RU1 zJbFbQ+RjS%Q(aq^aI<315fN`-Q%rkf(QHaPfVL@(Apu6dReq;H zVTMZIfoE5fp|th0G8E-~JwrKvN~~6YO;vYmIpHMXc{3&4u04Fhdhk56j2Tv$;deby zM!yz$AQIxh+u>vtF!QjT<*6@-0)%x`lF$-A%lS}z*qnpejp^b8rnM?;(m|x~U5n`qbq;*?Vz_D1l>s`G9Z!sG;@|2F zw4YwisJrOr{oU%8=(pLA*hl!EKPAqXAVsB6kB+JkumEYtjvqzO{{S2L;#Rvr-6o_<#>6}Bett|X(=Xx z88hEIZ=8!lAk>PTmi_GvqB?4Ysq}4q3^|XFIBAreG&WR<=$^kW?C~jw{|9?-8CGQ!t_=!;f~Yh|!=_uM zy9ES9x?4&>8blhUyQP)x?v_-MmXek(>8@GtM$b9(&ADd&%>0?}To*r(?S9`C&wA>< zpJ%0TnLmL*euVYHQgQ2P+@9d<;(<1o6PeqQlxj`_&Ofpk$II>f1=E;}6q)RmvL%45 zGK;#+CO|q$?7!1d^_sX_LCRy~xJ+?mj+#G(Qyc5Zkqf8Z-cO408~{b3=mIDVf94S zr_k@Gn1=1+g?J?^y;xhPD?wbG(*@a@O~;P7L}2?B|FDld3Nlu*wR;R#5N2Qn5Vlh1 zG^dImxRwD=Ob4aM%&sb0yF$6{4RW^yNTGBsTAQqTqPt|Xau$vIwKpi?^$f{(U0Doa zbw9F;w+B2I666-`Kqev|LF#Qjll{X80HM+w2^QYLs_mqMr%XZY!xCh!2G-UPUjn+50Latoo)nh)`tL@b;CD^=4*dp-ug z$%jGc_h$&%@Rz<{?i-1h{X%U^2t|ql+3p|ik^kK?JVLygd;|G(39tW|w;MFh^w|y|LwEkK{XeD=IfH#!0ctVW&QM?tSH=adX zRi(Lo5M&u;2@~46f$O^LVFTe$e3hFc@rDz+SJ_oyA&zK(e`|ex{`1TcfZMS_nKV{B z`-{SPL7jtz_F}j?y@uZtovfz-Sq6V0^7^4z_KyVMQ%$}PFw<>$g~t2fb@cC8tr^|- zvE9JMEO+j$2{oK>%T^_bG@M{a%_M=AIs$00rAmS_e`zFGta@CfQ5_*Yl~w@h;b?CX 
zz+BkFwuHQ+oDd!im1@A|9&;{o0fcu%0ZxeAY(mj^wd+} zhIXrI!=2g8c%Y;p#z#ErBSYxmt1PW-Uh_K>eZ1P&1{wRG?6xvxo8?-T!(c$1#hjfl z!P$DOX@?PHEK?u*4DMH#L%zxrw`b%MWreDxZli>j+ReCtn|4dOrx}2hy^J*$2<`%Z z?gYg%20gFp`K&$xd)Jf{XD#;g)C`A3;pK5d<{3UP%SH8EB4&Lrsc?G+S6B^p4`RP2 z@ZHs%9>#f+pF0Zj>ZR5j2-<3$RnOyCgG^Eg*G(r1LBKYV=y4dQ0>WPy(Zzxy)7=(y za|FmuuYC?aQ&HTEe}hOLXpC9BAedUHR3X766WtJC#G)gzl|2$6pq7i(!Dn;S!4u-P zqzif5ox`ez01`s^mWcddaHD^00|9QoY&X$sMKUG+*N9D9&WIT;&DV z-&+=nyVhcM%v97Zf(Z+30ufjFYjx_k7H}@g*CExW^e+cM*3D+$3j?A1dBwex|xTVwX;RU^1`Dks>XC3^qN*CEZOyid~ ziT{(0_?Cg=!9R5BOf6ze!18gy?omypoUha(b9=5M-VvJarigPD`iVR2nV3-^+vBoY z_*ui2g9cBiQQ%?hhN|%74D&&Soypm_ceL5hVio~o1#pD%91G)rq^locsiWPO`M+i> zm@kp;muc4+%aLHl;gNY_cgIQz0GRJJS4pwb=d3_|!A!-{a58?>Iz{P@!c3!Mck#fb z->8Nf`I6zl2Q@NaToGS4+lLG+Mo(8#kt?U<$Qdq)`}L#498#qZt{~s9knZ*bhf#^! zjKMT#mgtD+i;; zE~-5IS+#UvD^Du&ZhapUMy10N7U^F7Lz5N-I+YI= z&)@y>Hk?qh8}$UGOlN}GzR;HAg39gt1W9yzY@>f4>= zRyy+Sz!R~AG9rF2j^n@O;ht!+JNh!zR$+%<&q{61+M+-lyMFZeJViy;F}m?3VvS}o zCR%I_kJaAK_39s!16hhbI0MX+qO%!zV$X_9*7+-O4iulewU;xiKXEOh#GS$q5vS@w zzh#$wHdy0#ZunfQfBpd=tIZ2?=-S%ocC|X{d~d3rBz$lqRpQ5C z@T%(4EFG@yZP)&yFW1bQe;&0D^N2?PhK*j*7@=f_(`pTM{^hIvGCC3kG=;L9ou46^ zVPmYFDf{12KdT?1n!GTCj8L?&-6OcSfe22N$Q)Gx(-T0J6LoAKVtzY!NO3l`yFJ?{ zz5uvVbuV0!6*?R6WHpL2T3}8xY7CzlwHjb?p{{XbEaL$j`(qhR3|A`En)L1_b(L7Z z9TUd}d^FOB&ST%kS`?Lkpb|%I8WE?GWSI*1n9G=tb^HL3C*|=8jrLg}p{U_w5sRE! 
zD$}2pH&c4^sCXuzE5q|5`cE35u%vgkuX|<=7kdU>)NnQ`degkUDyFZs6CeziyEMHo zzha}p)H79{8eaeAXQWBspVUB}4xhI<|aTJp?oWRq9^ z!DX}*k;bPbnTBXJ_rC@RtV{Y?s|qYC64o&Cg+>~y(M?pKca{qPZ~;i#CS(#)J&uD% zHt?+4K>Yjoca}(fTb2D}ttDgJb=@821`d(XE|s0Jj!xc0Cb>SYn#~zyNrS9MANk4Z z%rcr)8M4?nr`IQ3UWZ6t@yVu%HRts`yFOdP(N>|K5KCPv=oe#6DSMvwo-+~WJStsp z<~3^Mq9e$;C7xmD+P+2I%Y$p%T`#M!Yt0gX#X9~0D=~%#7+9!>* ztB=(SVky5b#4Db>a|c~YPkX3{xq@?o<`?e*H+nrkgyC6N|>)9#Ox&c9`1o z?!Ku}*s7!Hw;0?O%eM$<$Dv*J&X>h!93#~v^|i>xlxR zA2%#BP~We_q%~Gk83H9}qChrqITb}6(auEU>cdKZM$4xj4HF*EWJdWHxi1y#wJt*6 z0(Q#7gr~+o?WM9TtvTxMF@pTln_Q{#!Ke8ya3>NOJKv?kCBAjUO>%He4_Pj$10xjq;tO_#_CfN?p)( z2}L()Qd822C~no;pV0k%bo}vr)q^Mv9(}&+e_%Z7SGmYHI3cZvpLlc4<&OXw{wbYxIywIBQ z^k~BK?X*)9rH}PtT8T(Y)RmD;9tQDI+f>W)23=xK6iX?GJnqlLT*c^6lP==ERCzh3 zqxT9z$opyaIhh>T&c3l=VGZ1;u)WA_?0RSr@^nQ8IURVL0W)l*xJ z?}kjClpEDsyOJmmuE^GU+M$x*#yMDEw(cEY@H}5lH=_?++IA%y?0=4HoAC`DwQW+W z9nbb^s_;(Oah7Pccy(G*>`MiqrcgtkCZwWs{?2kocbOi@=l3#9a0R>`DPi&b*Req! z8d=fZdf{p-8EhHdUtiEEH+qq&8HeZf%C>XDGGHOphOgNHl)Uy0iy#ig_O+qLz4y~P zYYLaEXGRq^2SeHr#gg59(?`K8KL zQ>TCex0(xc9JRaadi61hQrNn?a)pc8a{{*+_q%o`p#!37=`B%$It^ysIGyF<m^U-&ESKKKDP-SxPdTAWbv%)RZpUls`*`12}FnUXjMLd&`zopHLsA zYy61_qk9x~6+SEWRF_6STiqk?kUge<3}b<-sW@1Hpvc|#?vqcWN!+AOXeB?I7y$)d z!IXX(7vEDw57&6gdNYK)$-dU!Msxp%_dUwu7uYKNnyFt>4HwkFIx1?UbqCug-A zG`Lm|)qbDI6TigGU6L&q|G3ocL&R0WbI&dCB(|cXDyULvaD|vR%k_MX`i#F!JDkI_ zpYDS%p7{RIZ3n`mvelFmyCo@hh6(KH0{$X~S0!!RTC*Z@hi)s+!!4>elz%qa6x-FC zeN({@PIW{;Xvw!C}%5VHpTxuw*zxN!eJX7DA_lbPkcHDkXg|lbY+0|#*R!aff zw>lD^6K_8~I?QRAaAzGrU*=h&g zo5tyQy&7i0>EU$b?**BHQflDgXw|1Yi}t#Nc6&?m4_UAU&YPl)!4+lOq6`JT1EOc% ztn{+gzX!xsItuc`XlSjl9PvN1z_egpLz#2&+8o9gTIol9E7eN`Ym~Sp?$dz(D#!nJ zIYa#+sh({XJjm4ruG1Rz*A-yHeSAQe?L4&zy8Ax2jD1#Si7TIIeYdmxRF($@g?2~` zARrb#ygw!Z+;H$a$oBBwX(;%iKc%+qo>I(gKB)={Z&Y_PPn`g-yg!CA3Sz^4f*>m8 zej6R|x@_<}C$4Gwm#A!cS8JNtP}#CgzR@q7*r0o&OUMR9sYB=oVY6#rm^}D05+-;d zxt;1QCg9oLO2+sk10MCV+eQN#`5ZDB`CtAsVmJt==T502)Wpk(zs~G#{{m@^p0o;x zj_bt=9(W$bxdH>p1|l%(me1(l1L+}1NKgYnLh_I8vEWW53V0sE{2dk_ko=znqSz!i 
zuDJlLYfqYqrdXLJja(R*w=kl&90)K=FWW~z5dKjdcs(FdK;UOMK@)b=DjQ^s{%IEz z{y0U*Ctc@e0UD}R0Q~-Ex7^h_$hcwKD`qQ4eR&4tKFATyA^Z}KaE&K|#V?xPuHMDz zc|%#zdmEr!wj2-OO9CJ{Y&awb{tNSzWnv^wPxj}^DPh0{wx@pp=0)y}114z?F`LJs zP5kfw6PWuS=%ua@9$0fdU6pD$p<{ToCSPVrrZ=nzjavT|WE`Yg@xc78=1qn}GSxXB zWG&rESFcW%F~|6?INV<-qG41C(0hIeUILI}!9o)b=!P)aqyIk)%0M#VnvZq9Fov~%bx@!q3LqF_z(ymR)xIh zO~w*Bl>bIs4KXFqxROk$IsppG%!TovDD=PO|No(|1gM)(A~>T=;R+6={ModClJ%Un z?7i@k7zr@%H@BgqQ2N9hLL<9pKhst2;(c0s19X=Y_M-+k#UBG4p@)Vmwrh|Wg6ltT zbm$gDvwuUH(**wN1SKdEtD8l*cy=v3i z%(o%qeA6PncD=$LBQunTc8uKn6SaQ5`jyiKzB(}v*(us0x+a}AH#E#~2y@rrm*eXt zw>kR3Go{kPOiH|yOVDn6|K^*wXrXUfoD#bU?K0vpC_ZkLTiLr=Zs42Um1=cF%z2UA zb?Htf(qKZO;~5|T*HK75abPc8j=fXf9!-0GR!&g-Kh5zDRInSS{rPKhr;r-y_vt2u z6&4D9JyP=bI!?Mazg$*ZDH^ZaOmo$Q<$UJA4Ke5Xv zb)Lf(h^%?}`xg-`zq?ULq-7Qk^gZ3WpKoR1WH9)1sdl^0<10yOPuVh53lY9;O2KYQ zca~db5|-c(l|X494rPybiu|A@7kX{^tCC>fLpNx^vT?WFt&KPm+7HBVFntYK;tL0| zSAB7)b#|Bgo#Z>RD{wwM@F+kTY*^5j)nkw0huDz}h9e` zS3YNKv$`YC7;byIXWXIJ5U3~kirSL#JfRwn_pu?A1T4uvYYDH?QlrqjYRrUJ98qDk zLPG;CH33MJrh~*dcl9-rAOA4E%ii(}{E1=7SnjLWgm(l+w9r&d%b40!k&40nH~V=2 zUDvMfDCZYiYS4HWqWgSj<(%&w2`FxXa=oA?BjELe-oV5hw1;HP?_29CRt7)4GXhk7 zjUhO*4676s8ctP`+WfTj^?O6=pa~>OxZ3faqr$*3Jmq}6{2ujdDkEQ_Lt*%AOG@Cg zt&Ptf1)2|6Yu-$^)k`Fmxeg1y>PtpldL8ox@QfW}stqQ2zwt`Knsbz6oV}jc#hC;w z-K+X;*q?oQ<|y`qkbb<&wK&J%8e0QTg#UD7?OidT1V6iionjSPeA$+R(R+$JUFZoVu31?MaQei+HE z5WkSeg;pB#6lhoJN?h#pm7t)0D@^ZU*gJPN4>4v_Ga3%EqSGp34~k<5U@maZPuCbc zyT3Didy)!G6xMkfyeN^R+lw8z1ggCa{#XYq6FQOj56slcX6YBOul+4Vc$2L00;6Zz z@}Iv&PJql&u3LZ?|jKw=Sf7)&CTwjB;Yy0s9Gc_aP)go=H zTPbX=E?mSHfB5(7U9;HWb;d#(7b=qV8td4*z_s7)J?Vd$sg7Af^ab@sA@NgUtQt;m z_asSYDRG&lytLWa4BQ_td&6~q(8XcaR`$}MI*go8EjmtA7Vze%h%}mQ&R|{cdwc-) z%E%ZT3SkGW{f06{9k08$2YqY1+#V~0V@EM}P?jaL$v<=;^Wci>)6Qai=S(p45aQ)5 zR<7>HAMh($7Meuor}D(eyZE*S8qw|!cy$j}o8K)7=@+?ChKSK%%;g^^muF~zX=;2P zkA^C{QO5wcraa!l_&u&Q!~^F2FwT?$ahag10QEplvT@r-6`neu7-XEqKL180CM78; z{&24+jaH+~O>~+UR0Kwm)@!oD;#HbBxXI6!=RD2 zE5#JN8W&?7=o5ZP_c;;JDqN|b*MtQ&`=rf~gpUd&WxT!&;+#$JHo7|87%Ow`*JXOt 
z@bo`d`{aWcs8ZiZYYcGjcA~Tv1k%xwl>7mbjwvdz+8V`9Q7~;vy*m5^<$0D2f-&xI z8grIBFtIurw{E0U< zLt--S{BwsMjb%y(hD~Y0>irRgyUwJ83Kn&UGWXn#e|%Kv_xwFT4>(5s?avS5Pb3c- z))bw%5?W8~R}_lXB^pl``Dns_%-_fYVhS|yDYwTYA@f^Jh4B?c z&6UremuJ%OaV^M}E9bnKY@qf~p&zl$KXY6G^+5K$6Qxe`cXS4Mv)FoStY^ENt-}WC z$6@7JJd$w|Ohj$!GiCU~QuR6TtfXu*5V)(`-t7EAb+e5e03XgwAC$#0ICWuJeA+pu|}0*@msO z5gT|(AvuB`)=s`Uo*3yn$4A8-I?vvU*RpQ?27EJUA}kmvugWaN!$U=NrF#>AA&A9u z*UXXBYku@z!)RsW-uTd>X7)H^8Lw!PBFAPOHs1X21T#mLI)9I6#a4+sOUe#SB?8Y? z7`(OeQO1w2FOdQ1YHt4e{)qWp*Qr@X>)4(~{`mM_5o8SWZ>bl*>eypZdEk0VvMsad z+J>)0f%zx(*!kJuQ{OVfBTc~C8coZS1tvMF zeX`sllpw&~4W$RR(W~c8-gq zq-Jf%6VD+mWviFcIhGZtsjXiw{dayXTuWRMI;*lCU%uvscSnfuDnWg#!xSt+sr%>y zQEgah>0`ag?$a9~Ql!wQE=&ITqq3v?Aif~6E#~Em#O1B^Y>fte%3|&2AXS0H`ZKBW zODE8M!kFQ?`)q|fVARf&>S6B7m=+HC_9O=#_+iv@0G0ferltoqUW`IgAT#8-VpdQw2L(t;RiF|z3fNrctmw3IP(CP zQB~Y*H3e7f(oldF9wsJXgJ)OQS(rEbLk@A7>+u`e3pd_-ORSsobZ+nzy;SQxBsZEk zOdj3-?7WUls6CI-+5u~ zJf$zaVX- zDQm+Nes&_SWaaagz9@ReXP2|A*t4e}m>;njt7ik>y6bygWbVrfCf);U+?V8Cp1jDX!OLO*;O5tjPjTICQ0@q5afiyF8VKCAhy?f3B5FFV+O+k63! 
zGW78f(gKXBA9lMvMgS$&ziyYHi;P-Qm?T8}vHgM3vk`CE1t>^QR=>^gMu8rcu&jGf zDJoXc?m=0_$CD-YOU>wq&zWY8o7LmWpxUD)&te6km&ZmDTkKBKva%t9ITyWJLZ9DB zY@bk06UE-XG0!K-UV0tn>O2;M>~5z$v~(KvROPP>kYA>`reZ_UmKSUet;}t2$6WTx zDM|Butz(cXuXNJ-puWMX@Bdm6TLc>&xd&ve?@q`_nPe}FPsDQrAE?= zqBty?0W3G(=FV*dy?`YNB{luY#1u{TtAoC`%!#0#!#8FNKpn_})wlv?hsdk`Se-2U zc#%uhszU+;^SyVAZJ~gui1-{ZF!vY^$1xT`$Cy&k8wjvclxKCc8?_ab^L1X7h-oU8 zxgB-56ehHg=7gn6sLxFCIyOGbV4zREoUU*YTE41AId0({x0Q>vVb;sIfAogOD{eaR z(=(&P#>y6@(%=}D51=s46?7UI1{6>FK4Lz91o;Dp;F=WPG=7hJ%tV!4M(@+gqbDk3 zZ^Qr@(E}HV!OHb9&WmK8{Gf;r4W-&nH*>4D2Mu* zA`hrV2yA%>_ou3R#mYUEHkZ47soeg67#<8*aae)IE;?tDI4DYkcstQ&bV%K(C^MUsuF@vm4+#$RI0q zPJ0eN!CA7pwzr$~dAC_p8v=_T!(e1=QFhuc4WyS3bBTYq@(GJf%T>I?3fRNTHXsK0$=V9(5HPXjX`<~`FcT=*Pnx(Ig z*^U)k%=69Z2}G9Ie2`JqRqNlJEgmXxCh5Ggx_ih5Yo*p?SouU$Rg%;9c&y%t>3=1{ zP;~p1J=ZacRc5NG&mn*!mJfLifzD@V+_NyCRfiI?q}zSQzc zrx)Ax+h}fmomt>^T)a>6Vj5W)IcrDuZTN>Ri-JLrA=f$0E_UvenS%!M_)Kx zxMXRw@?{HRt6ny0=N$k>f%CICmDo4ZYki*->+S+VR#_po|BLwuthK|KN9f<)t!zEO z(qPGi&3wx;2v(mdT29JYj%FKsI%=b`x@mI34D3h3_OZ2TI_Nv|GFRNY@}<_9dmZWe z&p@SB2Q@B!&JUf9Ok0+OaKD-c6NS5s$?02_;`Q-J5}jT#zun3=tEfC;Ke(DAr$;CW zG{$Ydl#p=9;z5(YY@0o1(Nm-U2_s%gwtd}87c`!?tZ>3c=&Mof1CV8ZEtw&i*U_c3**ywrQzo3R_(M__N63OpB4W%U0AQ--3hwEnovG#uXgbzu1!EkPO^Z=aSG;%Ubh z4%GC}{#HDo-kh$%XBF7vE%7f89yW)!2l*&WPTpBQUgdaf(s(gnfr5@MnQC*-1lz3_ z&c&U3h&Y4Q^Xz~-OH>!oieHYgSq;#C!$JVOkyom)%4g9(-vlmxCosfJMY~qdF0hn| zR$g0QcPmv8mQDYPB^O5%IUhHj1o+p|3Z%P3^8V>JSh1F5PTtP89 zz3O(Hp%Gf_Ezg}p3Y2en>1k~F20at6%(159*zGWJVyhC zyuLx}cfA*XLFMxtL@G7WSFVS88;6G)#{y&Mb?w+golE{m(pqGoWkw~2*3so^aJCz*n`=LK_L@FWKW8-Lt1Z-7Kw6?iz-phCS!Co+Y~@GJ zjKEN#e6|J0pA+VFUEkj@b8_ODzDSes4)8g1vlAar>73Z?`fzlU6|R*e|I}G-@EvkU zS7#+ZD}Nh80b(l??7CdR1)b~hB(Ijvz533iGBF7grPmYXrth_PaaQH}Ol7|BgFxH- z$)7iT#Y466pxFY*&owMa-g54KdbT}NiQc2Ue&pVNG@@oPktd-O()GFh?vu(fsr%de zpHgIVif-191`Fk08PicJuPavCp6~E zpCIU;l@algulk#cDllD-<>+Rc-L+H@(h5hF(3g}Ao-|aV;j_$5P3-4t)N!T~9uePL zlIQE3WwA7ai))0}Ig-yaCZcR+5R!mfX&##Pr41@9?*!N!7cFDoj0pncT!=*S9STaz 
z>*aY5eT+dvh%+fzYjnZPR>7$|Pk$#GKE>c7q6?b7`u5#{LR}fX7yWfN-Zay5^3{&= zdH=7>1+ivVr03zAVyUH5yhFrTQ7_ef;f1JDtsDR|D7G~$X{EFQsX8et9tH-&tl-!$ zUz^cCdxSpou6bDuuW6Ga?-F{d{*Mb_hKf|A&KY1mQ-`Uv{NjUna(g>0fdp3wJjr_RMoG}WaERoE`-(p8oD_Z#5k;;J9)r!)n5p6R4-7h8ne{&ZSQ^v zb1t5$C(P9riDK5`Oetp`Tmcc?`t^4*d4`YgM)-)6AywKa3(%AvFCNgfNE`*~jdUH! z*s<6WrRl!)i^Cng>caYznmS4bkJ0xILsPv%5lya67HRpbGBsoCHx%%!KEgF-X{P#% zAhsh-rm$r0%q!xqFB#`$+h28)x!sA!ejARYS)BDB64{=4=1xF+*ZKlU^~Fp8O;bYK zYudECK}CC%j#6=jU*k9?XFWv~YK1Xl7!^p3pT~#=!l*~m6j6Hd2&d2jaJ@>wgM z;e?4`Rvokeb59Yzk8g z7||R{pU^2WU>sh=S11EMu5ZqQ7wN$BD{Fw&pDgI%mI!^@2S~o%%ZBja{}|Q|3?PlH zZA$~t^DtJyNN!JyX_LVNQYg@A0dVGdF#ie^0ejzqiq&tun8N~(y7@tj0uL0f_YH<% zYJ^z;{2VvYV1|{H56ulI1WC_7)E0cj6I+T< z$3!_n3%=zyD~*K>;A1|(UVvmkxR1s2c-r}qpd0t!y5WCg^NxW)J7?&DUUI09pb})O zdIU4he;X>C4+)zHcnUww&T^9V?@+Fpof8*9DFAeOAnp;kfu8@vLKu=z>;eCg0wjn8 zB8W9%gb0hv+6}JnMmZ%%%?!RJ^nR_a+yAc&|L1&3vFGp|!v}Xoupaw@|C6o1Nd8w= zBbb1!y@4RPaMT^}oYfO@)Q_TGX(CFG?J)kn4p1CP!HXZq-)hNC>WR*X^lCslA#jL) zbpCgx02=v7!zl$9LOrP{T3Oepu&Czr5HIeZ#S59aHhN+Rk~gV*SG_I!~h zmjf@#4h$f<*%N3@aC%5H7T$vyUZnSd3g#>7*8WePr}hvq41O#iY`EIfoCNi4EO?D8 zF6&)dZ+7sak2El+e`_2Lw#+po3?0m9CzuyhN}t4ov0g3pPuwhSAw;A2b(Q5v1$6F3 z`X7}>phAbJTKo0?9een8g5rlKywFBMi*oya7Uh2y<$ojvQkVZ59G_z>@1A(p1<1;& z?QFAooeADzlS72 z__}WqfZ=;=>|XrP?qtWHP^SUU{p`T@5VPiNsV{{HHjti?LE*{u%XPacXX|pWhjsTW zW%%l`_L?dG_326)rR&k=qbG(Rv^J#k_7m7lUzX~NIBfmG9nO)Lx0h9`bHBbkemm!V zcL&^@ICnUv=Nv*rL{u=A#;C@kUGG5OdHQ={A(C1KZ}V3b8c4p&+Uw9ej@MX|-FZkH z@b#;dWIX7eV05rLNT=Ia+r5x|ps%e>+!adXe|33o!Phi6v`=u*^^Tk7uc1>91nNDu zQCdr?wHs+@UqYwD4qTMHvluP^`)F%&yvdbIv)VEc*!u*UEA7rFMqT%t!Y zF7IRV+y3ql_h~fgO&l*bMgzB^qVb=tW;`rQR&9Ae#Hs&hs_yMtR{YMze*eO>^Q@zV z0KMbYdO=C$Tl_C9Z9JY=7kObD3^m}w_J`<#C*jE=7thZbKu{JnPFW>;^Fp%HS9Zn`0 z36qc|nS8_~kS0>X6!88g%8T&w1%8^=*93N7BsV#70ll9ROwNC%2w0m>(v2pXT#rXR z)A(JUg1c&bX;a@K2II5(1w0RI8!gt+LQATd^&kSX>HyM1q0mz^^A62%V{}Bc!Q=8w z^CT!@s==A{>afIfekoqB>w7F?RW3U$8*Ak1WFdU_>U=xHYOJ` z(^QK!h%Fzyh`|_a;^K4!zpKV%BYXSB2MuWzh0=7pCj7gnQXM?y~e9mml25 
z&Rd?GsQ#koS)f^uN95@%rVOwXFRtl!wrXb6HLG5K`-3@wJv8I8LL%U|gK=)WJymNf zhGu>GWG*9Iu&v5s>>w2{tVpwJND#|)r9UlgGaMaEwaqlmJv;3%Gi!xy^_!d9O{xgc zA0htei<#^qD*dP42W|@ZWmCmEeEjE|6}2w=G*p@l3SH6^qqq zp@ybZ&jxHU^LlflLZQ-30f#{uC0yu=TfFfatlThtn@l{5T%@kweS%Wceo|(SXbV`J zL}B2q_%f<3^-UZF7KLtv+jqD-${8<1c+QTt3Not9CE^+!H~hdwMv}q9F^MRH3@Z6% zPGoL7-Y+8cn%(ODu!t(Ehm#5Ep6l<-eHm|d=gXE$TAowQd69zC4JulMUP&p}YS-Dl z-KrX^%w>A{$?JV^fN|jK;J2}$r7~U32`zJ3a$IN%uj7!bMEZ)fRuZRGzBKV zr}vf1FLoo8QS0X2C3{WSR<7ot)T`iqU%&A5G8m3tOedvhveZv z3HIO!0lP&Maikt~*dA4=+GvEfy#$l|$HyYU#(}?r23`+*^2YlFLjt^e#>+;EIEmIb zzbd6AXQ+oJYi*70dTLmJi^3|u#?rtPK2#-taiVtDLCQw?qo(Jsk`s!ibZQ{#F&;72 z={W<0nFId`)8Kq2xGN!&%w+Lye9Te6#Yb`>oUh^Bio~*RbcpJ1bcQbTZN9uwO!S+3 z3jqi5O^K?6oSo$p*Lr~q|L`Oi;MMjTonCLI+>OEM9P1=2U2N>2ru{~M^iAO#3zoPz zbwB{j6LmR&&M*wcXOW!ez0{p=>9A7qZ{{LHQ59~E!6p6^6OSt@*d_%8YWjBR*hW-J zJW8~dk6aIfC?kajFr^CB{dR;67}-(K#I*;r<M^WMK|$dib1|Sg3mHwFzN2-jTcs37HE8O*CUv* zJUN1+`pvZFi)8N48K=b$%&#!MOU;LGkcvD*96F_y{Ww=EA4&Qqz<6QlL04A~cH>*& zU+&t51kLR-#E<=3IscGNDNWZq?w?#!2fEx5Nc}+aFHDjGX83O2LY*bl#=GO|+6YF2)C9+o7jLk!@+9YwPUKUxFUt&Z zn3v+aOq(1M?WsQl0#KMUlZVZ0Nno>P0U=?e0>3E2N^5@A5f~Eu&I3mf@r~!ju7B8X zpZ5joPGkMpWg&EQ>2YS`Euk(Ef0FoJ>_m{wR{W}93_*?y61RW#3tcl2aasQ;n01`*g_3tYA4P7PH9Yl_;%@g?3u53g}R?7=--pE;@v8t>85SSLt$|N z>_XMJul_?cwBdw10Wk}L?Lu!wyw917T{Ep`n|Xf=8h^)eCllTL-5E*|7x_S;*2ci9 zhSreas{0qFaW|>yV+k<^p|0af>SclhhlfWu886US0c!qNn7UFg4Qjct_QeDQ%Petp zL|?n3uQQw!LB=gW{&_bO8Y*A9xQ5K+}fWWO=O?w(e8NI zgsI_&(v@m#(~gFaxhl%Eu&K9@Y+N`APLi9rd1N2Sq@cDb(aVc`xg<{B*XzHEb(`{! 
zr$jplN4^f;H6IKhUAS*LekX|>>&rt-6XhPKCA=|#@SvrWyYvG)TFXBY9OpTexg~cw zGlLiG16F`>UBj^nFb_}?|NWqhc$TMUw`i~w?W#+_mkBvzz7w?^E27uSeb089@?>W2 z_UBL!sWHg|lJf?g&T@sr$?+tzv0Vjlb;ge8^I|x^MvD9;be8g0%(zh1y@U-r%4wG= zg94RJGk?QGSAzYNCkF-w9>oUQ5@Q^N6PnNF$fu3cPtw8SXygd>WJz|6ggbsOdE4{y z*6K97X|yl}6wON@q_3rN;uwp)<2qtnp#C64co9@ZK25Tbz-}HD7Dt&M0;cG5Is8_{ zE8V{I0oX=roXFr3isyd{&c_@{Jwebu*Z<_-QGW@x?QT#GNK30n-15xT8lb27_zGt} z$m+GYzYf~_1pRcYoJBS>tued5FvH)@h66}gD9VTzpGk95Gp3y=T1^L7QWyUwA^)~G z{47U>L5_4HCLl~K!Dc1n)&I##r^ok3$E$k*&lfpa7#muk9shTq-i0|$`rb+tX?Pp8 zag%Z=g_3@Z?|eBUg}pH5&<}6f;`CNBBM82)c4Wr?3u)X`Xd%jhj%(Byn?gVN9M`FY zt}iMCufCp|*hGA>4T2iRFgJ?Nu`zq)P-JU`le| z`Cd2wky`nbGpsvk1Z*!CtY?dZZ2vr7~a74F7NN1h#UrJzN_Z3^{>-LW$J#!uAwo=&TsugSJt>h|X_%&W0O}=9iuYH34;c*|8 z$4ZLx!3roqz)Y^vX?$zDHdR+~b$_f8IMRvV#`HXk`rmr2=jAMdIA+8iSqtsR^zryb z@8L~cLB$M8v@)T7G_9&&cl>!f?7R_?D{aIUYr(Ckd*|CNG7DXyS`Ik!Cwy41Kup~uYib1 z^y+klEJgPU%vd^4F)PzztaxwAzJG1^eG&3G+at@X<##tnI11#4?S`PjQ+-CnFIQqD zhIRU1d^E>r>K$d8udiI0->~X7(t(&c0@z@GuReN{M5jTugc9uV8S*W`>-~PALvH5V;mpR6kGi(UimO5Q{64E;pn9k(NMjj&6@gU*S>z7;|PZa}p`zI<)9nUz1A}Hy<(xzhuOvT9H0`RHVwlM@aVUr(ejk-eWXAjnf9eJxPfLrl# z*(>IcbTrLc2}KY(nc$`rEvrKMhcHp0qj95h_j4zDz2-(;G{phH1_rL61N(#u)55z< zM?S7`-*}uaqu^NH=k&CPN^roBG(bCe%w6m`u}X2h^{+p_30MjjJd=|2jARM9_M1Bg zu82_pFzFs=YqV>#EiJMsyegnD25+Ou2ZbfQ(obIk=hwJWNbY0B+EBk3v+*kPVVW<_ z&&1Mf7rM;L?ukrQn94Kv4Qh9mZ=@O<-QSFVmdF8j(hvzS4=^d8bzt<2w^)k%P&i?t zz5)IoxQ=1{C;eqP!h>0(*_s0NF0@-Xgly^a>a?4=idjKLsjTvN*qi;IXhX*R{{kqD zU~e{=d2|5;M-D`68d&Wp7*;UOB6!|6aGD)resYv=7VBMIoE|<=3Sc67Aj&SE%3n5P z_2-Cb0fwTq>He&PS0!J$o(}H_-6;HV`rJ&AaU+IsK2N>}JoV)e^GsEl= z;ZUg#Wlx&>QNjm*e?C|b2<`C2V{OCsfDU!YhhiK@O&(a!Gh=JwoI{W-SvRUvEmZ&9 z?YZ;i?%|BhFFoBjHwr%5*@r&v;&AWJXZlT;DuRfGa8;#@7Z`FCy;RE1@rT26o1 z5N~}5U>(ZRIaYIJLk*h|HFzvOth3dTy7)6sX~Q!jEX;jid^jW{%dL*adXsa5Q1HD; z2!4U?%X`%LgCh#KX*>S)X{~nZSUVBG+@gz={R0PqzhF3lX4VK8=X0b8S1(sm zgb|mq=Ge?wJp2CA9B=9^S=A&FJ*ZpB);%D!p%pU8Z!pW?m%RaJ@GXP`SU5C7y-kIF zx&X+c4(a56;@?VG2+9YRzv;4PB+^R{SpIrF2=}moyaO%q&3|T97a$Txge0s0NSi}< 
zoViV<&jQ}EytlG>GoimFj&=ZSPymL{3f0pNeBnIl1{6B~2|4Y5%MYewX&i$PYVu+MN?t%FB&aCIPyCIy- z069-PPk@plyS-!sDJ={sB#@|~0p9Kds%Ue@zeoeniBzb-O{V(tLmHq>Qmu&)aJ+`E z3GXKWeNP|WFVbBY4gN7ZFh<#*S33TxL19EZK>4t=y*tPce<8R$4YY&2Y0MAT+1`LC z!xPaYISCnJ@<{M-Y}APO|KJUb11A+U1iOj;yj;P<{}7bj{WmsY8>m0Six&G4=yjh! zz`XzLAM6YW9plg+`k+5}lY@$8;*P$Rp&NS-hWZ=0CO?SaRZ>0xEVskpOS`FHhQ|3x z{f>8TutK@jF&>ZI?^0r3C*g;rldSkZP1Q>DVmG+nd<{(+b18rf2FMAj3mFCfv{a*C z9qmPq_98%1%y~|kr<4 z5v#+%b?p4GjVn#c0#25AopUFt;^aDN38{Nk(%Z$E($UXEMMbG3f^echHtW)w6B-~2 zC*uFX-dhJ%*@b(<1}dmXH%Rve6r{UL>23+>?iK~ylF|)Q(y{3VrP*|YgaVrm>4tZ0 zpXZ$U&NttC=9_uv{qH#A3=FgHec$U|YhBm6u3s#+@~U`Y>^`@FgN(in_a5koM1gc# z@z37RT6;J+!XkFl8aFE&oi-O^R4ObiE&nK)6lmv0tY9!3g@mX6MJCJq61w~Ivl(;= zk0ib)Sy%&JpYI$ksOyF%8z%tI%Ktfb&mn<3jR7ddqQ7?M8=XMMrr!53B9+%$ zN~|(aZl~}#M>$+NM-9e4`6625@34HqDi7CMa5hQ^OBT{|+YS8!P*O6pE|#=Nw)m<7 z>t8GhHWHQfwrM&AB|4-H0Z%~mA($^Rq9x%y&SVB(uqEYaCyrA8NhU?GP(5Izo)>{n z)6Pn4&a6vTdSj$3Oc8$M3xCl|$SW=UA>xW>z$>HC^**;=k}VXE4puD*-S2Ma-QOK0_XV z$E975Qt_M?$A<{rjMc+K>Eehl~zG?+iZ`e|oSB>`eKZY#P`L;(&#wDRA|u0*NI zZJR~85RG+NPVMN{n*B%^%T@l_Fl%0iF|zq-D%uI7Ud^Y4nSFJv*XjHRUctA$he%o+HiF?(v0yNjSzX?Db8)F>_Qc22!MEzWt9W+3EyP0|}5)EXo3ush=K zOACIF0JK3tUBnKXVbt)jO*A9CxmAxfy_M$HhM%JQG$ZG2(LV}>=UeMS&Gbc^~DONsWJ-FS?ZHh!v?qs;9~f99#*ki>H# zS`NR|G6`Qf%Uk2Rz}0@LMDrcCl$=oP3ACg&<8fME_uXQk5fl{5{F^8nh3x~ue=OfB zEh&lmOA6Z?-y;URO|P>S5e0ea(QdXsbeIdxw15vsbFWnKeZPCiu>x%IW3|uF26R!mT0K(gG%n% zUa7rX?9-%Ce$vXDseyZrw%ow^uY)Lw(>T*WQx{KpFr8oRw)UeUM`zxJpip`ZXe0Oe z*Y5SLEBe0+oKdyd{tNNk$1JbX+PzsFbGqDp13r|aI#2^yq^vESkN@2+)gz7}DAE0p zFaTMSJ7hch5^E`*qms=Y&E=bXNB8ak?cGZ(R4!2Do}D{)?wf#5xde~uAFGtfB#H4g z6Le$TwD^Snk~Hj7qpUdHyLOU1W-hG&`eFrzLh=RncXEiun|6NsDU}*FQhJ64${tI_ z(K^?7OVTF{>178hYJhrC4Mj3kW}_?DU#><^8xenDv$uGu7UINLpq7^!nD4SMzw@ee!@5sM!X(Ab zhGByvcg%XW=51c8u(PNW0?vxCj{S^f8@b2Yk~vv9u1QUIJKK+3(YHzU0d*- zg&7i^QdUG7`8P{jGd0RDlHA^^mLK8{mR$d~G$nEavYax~Hc#?xKE6}r!gjZIUh|&n zVITdaIiOn>=stoA?Izz4hx)q&(V@~wl7Sl`EO4wJu7JF@%rvJ^06^u-9-y^ 
z08YRPCoigz8>tA&^~{rc_8m1-ufYn9R>ryKB%J^Fcsn`f@~7XBkT{%rZT*bGe(T0lU*#UcuJKj!^UmZ#q*Y^5L3K;AytUnF-_~J z&->hrcm3!NipTAg_0a~hD$L=+6;``mj9OH!>C1o8O>|y}IS{C`q0DhM(Ij46lQ!c^ z_^KAFEwfltqhKjfyL#k=pb{|a%IvXl5eqSE!xGi3)nuvs7Fo8`PBaVpFk}jtI*&t9 zrh@k<6B}DbM)vV`+xuMM)`q{#B>3?xCHY;|a*Z(3il*+|-7e7Tx>-_po+`NR)2=G- zzA^m!s?1?N`&VX3xhF(|1V-!70Ru zJTq#FRa<(7^+x;!FTqYtD4w?DZf@j0!t8pb8w(M=yuIV(6*DPVp?B^wV(YFC&Jy2Olf zgZ{*;oV1`sca!{p4qOxbUgN5=bS*f-VmQNJDLvGt5y|Kx8G z_IshS$0Aw1!w|R7q?5ZK)<&^2W5KU37=G1xV3ql~$EmBBCM! zJ`_PNP|&4EQKGJfJ(WqxdA_)8el)jB1ur!|f4#z+bTtoYuF*Sp=2;cYQmGxfCZTP9 z8>_O;P65#rg?-EXc!H0L5Ioj~p z?YGK?tfP;Z9wGpKiHmM4`)N z{E$HL)cV|)CBd6e%VaDAHIFfe;_>hIKdn?hx0hemdDBGnVd|e09;ZB3tHo=lFi?0= z0`IVoI*EP$=Vi6;wH;$}QC--TWvpSUE6`{;KN?ds37fC7R4q{7@=Hz{dAm7Y&|nnj z43A?@XgV5|_k*#uh_(ViRlV17;jRkISkm)I`84Vy4L5r_TBGo3F)2^AX5Zv%t-CnP zB>^=+U=Eo;aWOsari0WJ&jZV6I~B&!64%GCB$!Xi`SP%`N7*|{DC6z*8i`7%I8Gi@ zM^6_XQZvz>7cclfg(dLW7Fz5MMYB{Y6a?4xM{zg4vU6{JT)EX0tR>Yk#dFb{26TZ2 z+_duwX-JGGc`Izf5)_*;Uz`w-bkV9JzF8MlS-G#BP}zE3cIl5=E7e+D+GAw~wRyj8 zc5uLhyWci4i!VH zlXF)L(>gJ=#f0yl-!wF^QN%}$`XNTudEGag1SZOSs}kt;uWv!bmzf%hqEFGQ_fV(v z6%DK!ke(*;1Xj>$GyQ zUqCBK>QHfisncvIT~HRm(JkPS>98@{ENBlf>rntEM^lgfISH_&IuBGb2y(U8f@-AdEDBfzm!U@EFOR#ghR+R*A|3M7sj4_ zhno1*Fl2^|Z^tm<5oFDRxN^_g+(Scvx=?6pmM*Kg9odGR427qk<1^ES>dh zLrB($KJI-Mr_W6(jPPAM@dsVN^kQ^lUZKSd9v8V#L^`=kl)Q3++2|C9nnayQ)zc4W zgxzi~xF_8v2Bs)qhY3RZGvCh3z}@|lmInPDef>|4cnkCj3y=10zSWBV`D^Hks&g+M z3m%oHDAP!wD+l;BH0p(y_FMFRFU4!@!=dWK7eA~WKO=n356~iv9PB{fFvezP7mw80 z4yf#VE}iW!tCWgqdv4;n4$^_M$$LhWZHoB2c8KljhC191!h7w zEEU^K@sviGlpvlKV=L0Td}rEL405RVy#5xZ88unO#6ol#kL5Vu;Fp))EmLq3q$O1J z*yFmGYTV;4@JN#+8KpaV(uKN;pO1exvGSB5ec2b2&7i=;^Wj0`^I+opmpvm-7c)cZoK1#fEuox{~qP2?TFe9&yoX`?o@P_?b-DG_ccmd4m@a%lHGX)%0uTzz8D1pg}J4W!-f z3=gkATyXE~X@5cimC+E!dNH#X{kM4l@}a7c?l~e~mDd`a)~0>uV*s3$kb-SSM6j#f zsWNyb&LbM@8gKjI@ZeUFy}A136PbthX}AemT^^Xuo(FTcp$O0xjRV-v&cqPo8U70f z(-ZfR3k zye=V2tm%a(kfP-e<>o}4^-43u_nx3nG&=fVOT>ZIaOCV0xn#mK``Co^r~eNams+mf 
z*ON)HGp|w1AVUGOpt&YT_{e;H5AL*4S@phP#HSDeAC*14se*;V?+-~21JzvD)LcwN zpnV`L^JQJ1&E*H9XGXpwy$&?ifi`5lpD8~bPd7XMaPKl1W0$Ykr+S=H%|w?(OBDJ| zJ+6?lf4&3+m2l{3D7o82=kMLQmTW@{q=5IY+qF&GLa(#tr-84vIl5jj8jsm&^P1a+ z?ZIja#8=mw5i-2e4;-5Ur^`axGAUBoX3r~?RbD8*JyZPCd$M5gnU-^h31br(909W?T7|#V59z7!+ z&UyiXyq1fH6hti7-NqBL-77V0-6iCpp^+fqa<(h=#q{hZgyp{dI8mzKNW;#qBE8|f z+LRziUv3*P}v9P!QdYi%9`Nuy)Uth#@7o_QsKFl^ir{1TitIFBTT9#Irw9aoM(^BEvX-n zv$RxuZeG@{{_xw*XiZ{NEUb*uPogyq=i#%i3i+U^QE)cxxoj8!Ha*g99^i{} zL9abx2;fku0P-S)GQLQyfXcr4INpT3@-2NxbR6Id2D5BUl_lh5`00dtu00G8boeFW z2^#j-ewV6{OR|^r6agM+({CX3lkHSX~<&KkDb zI_J2PDBlM5hc(x|P@|ien51V%)vN>anGVMW+=H zd?tWUk8He4=E3}ByD#TIw$(u25y&I%&DM$=TpyV+svH?);!een8`ql;r5`Lu3R`hJ z_xU4weto)l>`%RECE-IE3v>4zNcI*MNbO`_SfI+XM%}I3{R&{`@G8r(l@nXMI8^j- zI)|YQVK(iGIe*7tn29t1jD(ovw|~Nr2LOvKfJ1ZNY;fJmc{gxr`H8_`vf2Ae`g7Ce zZX;jx?f?dn`w8KqD#TkZcpDfPApZJsB0Vss$XOZ+}(4x(1<#1Zjw8A8oA3&kNh zYK^rCkGmxRc$rfJ7Iyg2dc4lxjpiadnmfXSy`$RErqN$~yZA_JAAc0LW6)LVci*l6tl?d}5AA8906y_oBzaV?bVl-t)6 zKmk}XemVNR`QN7(yLf>juS4rKeI+lF7gPiaRr88FhSn0A=(QjA_NgdflFduTt|$H4 z1v+8$Rh;soBS4vpf|89DUy2E^Mz95D*)#MR7!81<8qatOiZb6Z7Bez zU_?~L0@No3wNw~X9Ri>whAkJH%-uRl$|P~sBbWu(>)DZ{i%5(FF{c$x>Aj!l(@ax) z6f53G>zUmkPsFTV^gT^89ZJ`aXVqsKNa2p`8c!?(aL9uJ?nR!KcHXhGrl zd~UIV<#`gW&`_?aHvsZ{h9a2ZDZq+HfncW3=d%8F?Ar*YukXo>xd|Q|`kdMcAmnVf zMWujdAr#ejpvLwmseGESe@m6kj7n>-luk9)Kv&DV9OG;TLaO<_=F{|LVrt?Yrg%s6 zLigoGDI~T60bZ8d)j5EpBxo9VN=o-&F-nWys^%%k&Q_g9*5~Ut)==2~w9ry4)pI0J zlI?#Z2TLXN0-GteA}DI_KjpoLhJqb1-+0oKuSh3%Q+_YNl*{(7)RHP{B2zs502%WW z&7qiWUK7Rd53x8p*s}`am>2nDMxn)G5V3qQJ%c(2(anh>0)VvO1R&7Sk!gVcwpO-+ z8fjvX7!zeQYRoQ_zx^n1G%P+6SYjPJ#aj`kW>0F1n4_4(!s8}BF~K0=f=EfsY8#Q9 z@JAl5W%%=XoL#WyYMpQ@16DK|k~{mtVEfIz0EJ)ut>(Fs%?=y53%tV#VwcD^N!uRE zYdrY;qK};2LNiuLV%$~;IF;Eq?FTP=(4+RY-dxB(_xQE2kN^AkUyw74mYqsMvPA(Y22w~v7p7V}c@@G9|+FUtF2KAdu zuqlQlxZBhir$MR=r}J-RX}6Kjf43-_lSk_KPx#XY}c?1%X7Zf6<6I zvdhL}RFNPj$1`k)sXur^*Q8TG@6Os;K_3f1FIUn4!r*Vi%Z{cMb*IJ;a7Xl4pBX~5 z**gU)#zMHp=#Z_O7oqPhK`D4qvkxPtB?`ztF6%b}iFi(y*%zbGiY~f%$Ce7`Om@_f 
z`9^pCtZ`85oV<)*#)cdEo#!s7+Pq3_wMpH2Iudpmot9!9PzGST>%OB>?b<__r&8ik z?5X=22;nusERG?K!ibECSei^b!1RCly- zRXAQU*`~3zWc;^>H-+*V2afYDGG5YpP5y**W?4T5d4o*WgqEbVcqv*7mFAJL5YJYv zBxkEvC=4OKyyWDZfYD+(Q5a`(%_Cf|lqm)~5ZUBbXMrQkJW(mPo+!*c z)h0pj#+XtR`423BKBaNE%sz;pWEuQN#x*KNRz0v#2K0o`29Np$J!vx>%(@m3zCF^&e5rCxi zq|lh3dyT;cfXd7nmtVhI3$9v2c0E9Q%{cX~fcJ@wC4e|p-3%@h8+K^Gh!;EE2H9=8 zgv{2gCfB$0Nc+{tm%X9u0B8xHt9QgB4)#?xohpkeRBI#{Yz3dC zR1hwcAcbe6m<+p}sb_oaK(xjH)`qJ$Kykhd4toRX=3?vM>Tm|#RwoCrlk$Bk>_5qEvq34QLyMO=1|Dx1HEPO*9KW9a+O1$)`dXaa zXaF>_=&E(qPQ82#0IAl<_g+tQqwlpa(XHgsj{anADn1e;DKd&&U&1G19KBJ@jbN| z02O!Hozuo=)g!dkm&v>VJHoMN-ypiO`I_K&m}C{$-07t0?Twl zSawm^!|(jgtFIXW1S)I`Mo#>}6PztywzP-dEq7SU=vXMlg@2~)*^;Gri&CoB#FYK$ z7E&*g7hfh3jnzo9JIYmPs%SJ z$d=e~?EHn}bD$>D#izJhQv^LGaKUX2vEIMIhu?g7b~nVu991VZVLLgUY_Vxj9TuhG zdv)dr;JEueYPhQa3Y#F}k^sP=vm>f7^!GP_IyE|_*W{rr7t4@&s()!6Sy6-z2-Wp* zV2b3SXbkDH@pJ7(@z?IjfUbhm)~c;+yg*W{XL-&m3w?N=vm?@tato`@bt|5f&qE78 z+`v+B*#Zk06SbgVme;?{BUUMNDX{1!?Chdd>a3_0>e^RiC^xGDv^#~LPnB1$ zKxp3`3%!amfSI5yfBj&Eka?>xtB;6Ky(Y7}o(3DL@M|!QzruQrRr{so{So9@sTSM9X6ij5-BV#s#7 z-gK3XVGYM+O)?~|Ei-qmv;}rB2is}su9SEDhwK``1=J#T0maB z`KxhtV|F#fxMk`cSbO0S1koZ{XhwTZ9ea(rE)&>uMX46aq(JF86g2pa&O;}`oK8f# zp_A^ud?)CC(^Q%h-Zu@{X?9OCv^%yXQbb{?7J=XcO>{;Q2}J5A#FT6_0q+4)fV8|c z`!`6b00zRC7wxTcVIs)PJT=ck+k_wO^$0(*yR`O*ATdRG&VWsYfvbRJTD0o4gVX>S zTtLi%P1Oh3pw4L0@J48Y&o%$(3KgKZ1Jlb=aoJ4;5lOOVcEQvX!Q>rZg;7>qCD`^ZS44EumVSmJRaUeN#vJxV7&tB7;U5A z%LH-Z!5*)gh_J9I&H7?T{3QZ;!2$pFbi6>-Y@x}M-Ex$s-en{D1_DIt4>s~LbY`w=UZj1OU>utQ!JoX z;Bb`0jsQ2;r-t3;m=^$d{l$I3Q<}o&9}AY)Wg>7>o@{3!U=f$eIFT|Vr)o5} z1z=*oR|hkbdB_7>JEH81#<$l=q(69sq(O|6N#?RGY)lpO){wSC7bW}Y3fdO20PoSw z9KT%g0J=+MYPOoUylttO60rAUPRnvmqT0MLu{sm5DW8}FU&Rz8eEVC#e#OvYXPbC&!oWF|y&HO#9Xd|G4DGKZd z>cXzT5mpB9cC+URWCEmSPmnoI2E7!7+6knK*4lLrnI=t0G7ge4(Vc-TdSqs1K3~kx zt+k5-x8yyys4ordVFJ&ZYr)qx({^v^AyH}R<<#KO`Mu(-LCN(G5-W_;aR91Hv>;o4 zX2@kZDiMrM@~z0EQOXO4IzGey=G=mD9GoZO7*)S@@Ep$6Cv&QGBV+pFa12G;QG zaXl^NQ5| 
z!OK0k6l_~QnG}xH3fp^RhzyZGj4=@gh-743e?!60$v%k1s#%))F&C>j9Ao$U$43ty)P)7BS(!Xt z%T55qaXKEqUJC$wivq!QOPEiyT}pk!ZD&>zr=KVLyqnyAT)iTkBQf0i%d>e3+rV%?<4G7X0-+v=2O2;WuZ)9j7Ktf6 z4_7(wV&oms2LSq(eEnlHM1kh)xifJD8Pcin1FIlq_Z~44CvXP^ z!WK^uX28_mLjJr&%;kffiU?g}>;1REj>!DJ_=ww)?_}qiG(1& zx;B>(D8Ov2Po0o9HbPY3M#5Zoe|l*}>^gNo~7OG~YCN=s^(WK$ctF{*8{cQcrF(B%6Qo&jC zkp;94UPcDzXJ7*np~rk-Jaeb{FtWMgCSfsGeySAv#-c8kMOWj;j~_Ok&z!$5>hVY; z6B8;bDpO=y@@|{3VC;aBQMvc(Fz0?6%Z@uJi_Hj9pZo~}8@+kzG;=waw2$IjEm%O} zz>9KE0dgNF#E%Th1BrTc6X%>a1P3N01DW00^Vp~Z@#+U z2JYk%4e-i%O>J^fR6%8!j|O-^F15V*Rfhox!PTdPf^yrRXb};h0XNl$G?4rNeV=Ds z@BH%}w&#^rv1tn#udG}gO9q9N;WIDA(HvRo=8M+cvPQq1lr$X?_95BHcr7Fb8T?XaL|dcy#e$PF9r$5<3GTFSD!DvpYX) zb9bn#k2P6-3a}0}kF$xs*5en4aC_t=y|O(|l(3o4>U6g{{y>>IFtZC!VC@@e|Ac+z z)E>jj7%!OFA@r|uqm(yzwtl4N*F)Dq;L-HU{a$*AnYs9L&-~P|X1&a@*;#a_U9B@?}jdGtR4{Sd-S8Wj!~|jb>{nB?n)zr} z%ZNETbi1d{TS)a2+Qj8s6%NsTWmsW~K5_4=O6R9JQ*!u;>D2Sl-y3!>x96JPmPB~C z`~LQ4e>d&U?)m+3+F*D!x=&@eUpXq!ls4Do zDSmOfO~s;9n~)UKy(KI>Qs4N>yUModlp2)w$u$*>?jb|)S7$k74g#vpqz>+!`eNC;F>y`VK*BNKUB3&j8Tf^rEb_cf((?FLX$t^wRyYzt%g50d*dg zqD&bp*~;2EyqafcxT&Se?lx!K@(lVqg-54UuO4D}&7<%o7#%CjWj9^1I?k9@PBxqm z-G#jWrPZbY=MAH$INF%bG06sWx1SMbr4)PJ{@ImUux*n=dCO4 ziL=GqnNnG@^VgqmL*!D^l}&`sEle1d8;S*{8e6aGEt~t3PAdELDl{fb^_a+8FO%^L z!c%IaEpfW?#lm_`+sqevJDl33b=F^PzZ@gzDM_i#kAbO*Tex~)nuvXs)^Hr_ z6`s7#o9k;VgktkU;g~bpJQY>oe%=nCaeL>6hCR?X3U)cwqBM z+H5J6mfh@1hSzhc&gW_%@`7JNfnYG5H3NgJr1Ky;$hn$=-)*J5YUeCW;P#*hRys9Z zE3m%x)=5S((B1AGJhDx07o|wLZ3D_g{qUmgPM61rSYYn67nYrYAET=t0hgW|m zM>O}@pZbbAt@N|_F8C}0L_zaB1Jz`%zOyGHEplZ$$-i35pl(%BHi`ss#wL-L@HsoUw?(k=_{QG6FT zl;*>@AYXzbt2M>e7KBJeRa6=c2!@Indx{+T?)j}4fw!0LL=ls4SQ=C4>Okyx!5EK3 zITq!O>$S$Y9NyW7^=q#V6zy88dY|m9L8ugUA(8IKGYM1tjXCH=jn$Z_)b*e02x3F^ zThsnnZD~>evjdYPeqX0BN~zb$EQ@}8N}*;F8I*;eB2vCx&euCjCr=cvW5Nd?lvCB* z&|atSLFBpYA~hjw1}~&ruGaO@;3G$|Cdd}=q^W}X z-C2eRIP49X1Q!YRdaulXH?%at*u9lb-y4^bmW!$h%hHs3%L+B$Mg+ZgVFfBQXAMkC zu1p7Y_~Dl7E@Rb{I!(s=VzjGnyi}<&=2Ggm66xNpClTY8f6Bwigd}o#%f^C!48OR| 
zynie6S;Y)7VK8I`3uBry^c4_!$EI60lrHSPS|u=2ktq)0*K9vggPd;7XiN4!+1Q-0 z!2D(3a@Dl(GyI1hCTO6r#p}KGM2Rw!a8C zka;;>Fjh5kvK=-YWnG1_u%2zGg3Dr{BKaqe(V(@1G5+n7tnOd>Ia-ds*V6W<36JFB zM;N!y?5>Ips}rTOwN7p;dx(mW4))1@nd)G3okUJH7XM*lFxkle>J@qH+=^)u9UUu? z;XM~TBB50{g}1S_B&8YPaobH|`q8L|L ziDqmXCLF<}p}F$a<^pkVv^>h#phZQUjK#2eTho?p@;9+|i_>`p z{x!Kj$Ll}I`%q2ZLo;p0ba zY9T%WVv@j4_{hgGu?-NLNY8{;9)s-g3YnqahVcSM@_W#`Wsb+9m3+&9Dw0D6biZ_j zuAHE9NvwID2aEnTqu$XYN=i=_&EV-A!CGCXnAPGfN@8Bqge;_KDN#)5Fl&jzeL<>* z1XjH-8FWLAF61{mYYibB#2uW4rrE>shrIA|f{md^Cp#0JFr^Nw?S8{5!U%bYz0PTh z&=OOVZ?}czaQY3X#3(_Oq}0WHpuGE_5UI#55;VI!(At7{mTFaF#qEVR}vmD6gu@=|<3+~EXZFzQ&P1S=n~KkzW~ z77&N%r3rg$9IOnHXMf-rBwomp@G0>MUlERD4d=*FRXeEB6*gY_p%JHm2~G?lJzY!4xA)^L-1MwqU(zAd4e- zio@WXb2kI~Pn=f70E0Nh-5VCEXsfVo1XU59dZ{Pf3Xd(L2_@){7n3Caa$}G2dV3gV zU=Ky`!nxNuzNMv5#yc9geB}ln)29?Wk>SXy#1n8qY=?hOJ;>r8P<_FL^SCj_h5T4L zqb%Y~9cX&H!^G=&1XYN%8?Va!nO{+BEs}iLl%$$l!WT*$Wv~|PTnHn-jKcrD@XS7G z3&=??KPFxh2gg34z}`$SWo?j(*fphyA2JkmXlJUBN8B^NbOnJC)42el#%mbqQ-f7v zkT+u^r6gb|o`PUz(}F{aFff9&Ew_rPrkDhMuo?+Z9Z+~FI@ zq603{<1b$^D}&=9PEaHPpl1FjBTe?+cO-#tT41;J7<54+S){punRSKdHZQRI6!`@v z$um;J5n#?7fC7G2xC3zuPXHrhkNN-AvH%<&BQ)@2@eUtZ!oc(jBB@6u0KYfUg4+Q$ z)F72S4Vwm4aSEh!>!Yt@5G!`jMj{ih#0Je3Uga|nA0chqS+V=5~4#_fiz|Sac##0{PvWE|0|L?+_Ar8U=Op|!z z(F2Wp`Ufh~yErP;;qS2hRji)-pJDqy88-ZFCY%4j0{qX+`+t;#|L@GaJI3c3_iI1e z&ghk8pb99_NIYjq{h^rctc5M(g{&tHrE-8Y3ouE*3OOWV0iANNFJhhD3`C83JGf_r zcY|UB%Ds=sj_TQG4P-|Zxd-X}t1SK+G)rxm{1g=KeFH^8E_6Y9Z~;_gH}dWZB>Aty zdJD3rXPreF`9D|rpK(FXmu=xI_uVV{-ssOXis^IyG5Jxqs)sVz{mTPXr3MXjy{a?` zuQpV?o$Fc3>b*}(CDQ5{v@6Xg{Wyx0At{pShsfy)I!OqT1N?sgcI3z|IO%%J&Lgvt zA$Tr)OHrl^LNsT<0|aBI~jM7x9;zJ_o|v$rV0t z!)>6A-od9={)Modp~S(Rpk06;EX@{5r@;_5FqG3^T6GgB0+MArH_pczGuKnXqXmlm zWTKoMPU&LxxjTm}*(_`0urcmR<=v|&7i4qD#YY~F1!C6_VuEx?!++4=^hV_?$q8F z>Bd(hSS|iFWM6Da-1^Qr2xv50wpCiBh#2!#3PzBuwg>vlZmzUxa(T^u$H6vAQL83a z=>QlnkK|e&V}3dhS;c~yNhpuyC{jNf7qIB;!;2!0R0B+~(mpnv1T?Bbs7O2j^F(~3 zswQ-%KwMr_lSJ3bB&yk~X|tQdbINY_gpaQaOPn00u&&p{XrC}NQXQuoDUOxf=Q~c_ 
z95j;7`AY0ED5o?Uu@+2JC7WV2(yx56dj2nG*D>LDWm8alj?Bgf*zArM|L%N)A-X7VRi4`IYwuR+D6<^v?yjNvS$qYp!bC*E z82un)@Tck_=5Yu$f?TC0f4$b)^!&$6Xob~+7t1z@fChZ$<`DYklp2oN!4W(#CT}yA z9kkLDBS`!W!Qmg{ADI>v!B)FJGL~BY7G#NCq0J!Y{15{|gu_Kz(ccL2rVX@gKUnqC z$K=+8*l851Dn#t^vpP$$jBatA3!y#72_ z0z{@)!*#v(ihy3LvG?Yw^HC!7;+vYWBYYN>_W2Nx$|vbUr~|AbC~K#>%bb=OKgKGY zsne=cNy^1YyJdyD7wyQlm= z9wVF@xu_e^zdt1h6mO4L`@4|%-(%WV8IE4Z#xz$m?=BvEZhQwdYx2?NTjs8JJ5R>$ zW0?pIo_oo9t$S`a@FD#9UVuBdyI3pezB0T#0qrYPDG*`5ue+B1)wsCHK(2v!9!a$J zqoh%}qQN&VxE?7XRN{JKZGm|GjKb)tU#S}K04tWA&z4Ym@=xg-PBJx?bUxEkrlVNB z+15W6uV7(Niol)h!ZIvRDG9IY5HX}iZ-?J%T=yPcs0!BjZ}PC1XgN~qip_?#la(lc zvxX1}K7zUi&~gg9vd9vcE{~4T62B7HGI>^Yn3rw3q{$(G4-fvuZE(lvsmU`5EiA`K zMd0zMEEqfw6MXF~#ikP17Q?HRg|W-oq}7v59a@}NE%n5*$wlNbm%*K{4zs8T48;!i zE@#AA@2rtHzIxC_uaGTmP~j*#`4DM~NR}E(JrGtPjWR<$_cZZ~8`jfyofO@yN>@MR z>Zss|390UrON+?Td-Wjil@eSWAhpamnxor^pLK0}s+zLPWGM#MianJV3uD^aNj zgxMeT#-3;I9d*V@Hg>`4iYwi}Cz5AebY!mfomH>!>$sK18$Tca;wj^qnCjnt(ihyT zT*Oy(jvE=HhaU%-Vt$pg{xSQ@pY_fS>{bLvSb(=}(&ad}{r-@*Fl&k>!&9qlq@J?V z`-))f6?Id-%%&7@TJGQlpgs`_r1FkL*~mNOzWxX6Zeyrfb{~7zN9a=K$+ywcUfCMs>3)(+!sVaHs5O0&@%4tSw9mr|KRyBetIF!ngEp6;MG3Uw@ z(#V`VPxy$6{N5gHcfbUSXuO?|qqL8V2G)X~=|TBz>{APe&YmY=t@=q!Kp`J#eFCmK zU3Dji*&r07&V^Nn9UxLpJ|uB4#To4luX(%w?p%E5{@(t=m=*b{GE00^R=6)>O(88I z@1i>Lj(>a9&&Aq2ct?| zlpm%efR@VE^|#2P?Dt_ZN0hWxtC70f)mmI3#=O!IZ*N!QbYtHS(OIfzzL4ln8;cFc zwNgBV*lRcHPqNPCM7;1)b6OiJNZ3dY4sV_u#M%dBZ;S71Kms5U3gOSITIMhZX5Hpt zsOM)hydgvDbyf)0Gy?1eo-R_ zqOHn!QYXzT{D(gD1<4ZTBJUn=@UUPelL0jukA0LtLmfd@#Dp5>=4LpvG!DGZZO&Xy ztWWd8g{_0t3qA zYYcaSg|oE?1|NgXqiBARakw`&rg2|TNprTpKMa-nmt#qkD$rRqh{#cnZ4S%9qbI_oO z*+6{Thz&C88UEpojBS^}^2O#lQ#Pd;pZv+|1`OiWc1KyM5mvR}8?#Y%vJI*Bx@vW% zN|-(VgXYyD+;{z-AHpKC3xrVV#ayM?q6LF$rQdiRp|@2n)V`6PShFtv{kCKCwI-+-l!9{sY6>=PJxytoN1n(_mvLEsV9nwYqdZ=292* zX)J}NGg@~M(n;FPmKY=#hwlqca=G@9Hs6vlu6gVSBw${uTipVQ9EUabRv$&!HbF5d zki%+w2QqEjRVCKHf>YRH6oeg4Vjf?Ok2~|_y~-@=IH zNUD8}Cz`P|cyqjgsn!b9zMNb8z|;%9YpttV=C*5J1xif9hNIW*TKzK)gQ^>&wk)s6 
z-}`6$8$%W|H9QB@xw_;cvi%bDQ+}!*o-yL5-!ZU!6-j)q54kxIvIrB+cj*zCzPVjQ zC0W1X)V+iw#@GRwCPg&O8F@@1Mjn$633vYTkjTLve)5bi@L*ao&tgQqUcr+#ac_P3 z%a^YC>-jv_SoQ*vtXY0_3GUK(I(=f_bQhKCR==OCGJC;FiWW%MO#hTt(Y>+=`Eq)f z*WwUrve}g2DjVDztKo)t?gakQnZr{d?d+2XcfaF(LN;-1PKSG}-~N^X!Pf&uIlyXc zd9fm;=?Kxj*5UZ&Q}0wIGP#&S&D6XMINh6Li?v?oF;~{p{{hNK7hy)(lnr6aANm<4 zR~BY-_5#)!BbEzCRzw+9_%2|w=~^FOZLBaI>@a7B@!~MY!|2H@_ZFc3CIi#D;gv9J zpwyEgT1njbzu0@ruqfNFZ&VPZZb}4{R#c=_y0Jh>k#14xl#~`lK_v`AT0pu+Qic>1 zgrR$AP`ZSnbFT}$pXdL+&pwX5KkZ|Gc)!ez_sn%(XRfu*wSMcj@^ycU@goWh!7!b- zZ`VQ;*zNOv82Y@QAHRKAYrom&=6P`?R+?+3V+-@FYdklT>+wsa85%LgKL#OEIR`8< z$xl*yJl_Q?)PK9dwjcg1^j<{j&Hjf4|IDmi3Nn+s#%u2R=UlZ7`K8|P!#eYZx6Y=0PmSy+Jw<;5Ul^PtHV>y9EkR65L&$ty;BQ4i z{O3@K&$TE*ZKC=QYtDH*W8@}Ccq;^oDc*O3sdU}A+A16kJYY8}*1KHUzKyeoRE zL-}XafBAVWuEKJ?hkK-o5~RAZGEXDZ;5=%>gg=e%7A%9HO#7*;+yk-DMQ}pa@vOKaB8qRpLZ7 zOp+_ix{Y_}Js$lL`NB2^cqnI#NDrB2TWG9joiYcq7*JI30GSv&g*tq_#I9#Sj(qp$ zcn%ExZ-Ksm9#Hu@NN(ec*lW4Y_a6A+M+mJ*^8L7KuXO^471VEJO+%_9HJ@V1(_HZ^5y4aR}@CZJVUi2XTZ*rM7Qh3SV zx_=hTFpi_rOlN{%y3Q1~A%Jz>1s0^*^nGgNC65V^LhmSqBXVvh@E!6&rw7i%e4$-g zZsaA%IQsV!U664cg~>1H|DVbFKa=(U&z)6$P~^t?N4)2NwSHuM*z=~``Xz-#lk*xr z3SpmLblg)&QNDGa`$b^byQ^FZ=Pqz^@?O_E=gH)mhbyb>osnja{S~`2xKX-fU7pLr z@}snFV`tZ3@1V+UFdMQk z1RTSspoX7+rV>03Kj(*^`vgO!QJ-LoL;(U~OiTLl9NA-d=J4}4LsmWH=O^Lk>8FEu zB&tpqwzcE?o61K9)SZ3$^alK#f|H;bem+I^r zyvns|cpk^V_VgHj12pE_`R$~)kYYm1j;?P0BcKB{5&NML3$RaUB%X^S?%FXwLFic;HBZT{ zV|Ys>qMSV1Rj3HNrBzwUm2qSGnO8*O7K0oG8lNp5=NRk<$o(^u7d+QHGme>^tISp9 za*Hp&+L>ElC2-xzy!EchT=}yXq;sR+j?WG!hsHlaJ@;7mG$bUY>lv#JSpEH+x!Zd5 z@wu5-&6`}HvGOSzi>vkgos=$}W zOvD&wC28;o3^+$WqYNsqr)<8P_!uX#y8n?{ui`=JaE!QmncI))2y4dam%kED89~7f^q&R&2 zo~dPAF%jdMm+X20n~$C@_#*pL+3$qJ7;26wKVr5~Ej*4FrUhoD3vqEajeDLOR`(NF zUSq!uvFNghkJ2i~NG|hCzN3-uVc#e$dA6*$9#|lDQT2&7=3sL#8&&SsYFA|Gnj)3I z+w;k%F-E3}Wq!TMaznD$sfAx#n#=SekNk_c0Q6m0l$xaWN3bYgVNu3^OH$zx406Gu zcuhu6uvQ&twYquUP4M0+5W28qySMUdV^+o{NZGq4+OEQG!AdjRpjj_+P)%F7Qch&J zV|X8X-&$cEvLAlcu9T^pr9Wx7r&lnmD&`0r)2EXHdZ?LNrAZ{;o3J< 
zarMd97H&hi$xcm?L9du1YMsRM2{buN!ZZ;>+xGMeA%!qE9pjReFgKS#v^?y#5WpwC zd8=C2q_nt*aeZ%@LSMwbgE=R925Xcz!)0AEOPyDhnn6?FFVh;sU)AI>NycBPQI_3Y zjdm-QpF7yc>|;~b@(SIGd@5UIgIV_?>MEtW_HkJ;C2g@$3H>Heh4L9g+YVuK{FUZ? ztZAf3n^WFuNuJ`Ym>TEXwYxQrg}>c%2jI-xWu0EQb0W<}L%h5WMcT!Um3)xMnC=vNVm2y1yc)-^=SwN^iJur%+nQi;gm*qa z_cxW%ijz*0dy0x>5XuD+Rt4v1PwVkEoTCgn5Jb^@59m)uu*)?+H{gS*bH}rkhN%OY z&qQs;^6VJC5hCTQUi+I)^#;<3-+=uXB;G~}x}X@|j9p7~funGwI^Nw>2SJ`k&Fc*E z1AE5h6D@pY16rkE?#3L*-fM7-h-jaq{y$L+Ag7JNpoI^;>3^UP0<*H*&<8BO8W|?o zE-R~1lZzWtVp`~uU;t06tDtXFRiyq>8hmp9VgU|EEN46lS)$ucTM649HI(i4|L}s| zAz%H+EE=JhtYP0CeW zDTF7f98j`dH7mgzBHD@P2qCpXiwM^y2&xGtHq{OJuxjtIAYl@7yeUz<&{bb+E|*0j zNsm1E0v>b>SS+~{rNRlhmbp)p-a*%)uz8Cw(b2>QRRV1Rk@NV^JHZPwoZG>Pgz$r> z&k2+0B8*RW8CnbH+(S~Jnr?Jb!6@V+hE^#+sA}-`Gw}LB9%-i6tJfYg?Un6T_^qIv z2n^1`C?ywvxe4ORR5ejlCRkc~(%_rN@UDPzkA%MNGPKuBlg$r)QE%qAPaLqHr`oG) z!$b)soR2DT-61JPfan;qTMk2g75H1Gb}F;bk=bmoH!R z@_$n*GhR`2cbW>ed-y~gX}|#*F#XjrGqWyE=c(b2Tk*|i-j8pSB%ngUd2I0||2tQ? z)xdfcCT2(%JS!lLCLfzY=8B9Ui6MeR$x~HeQd#|54WWVTEgCqpONZM7@8}T7Y2Tk( zKW*sNec-H*DN7oliVuOM|0>ItY2yAX!nbQIV?9%&G>I(T;v7ElrXeyy6UZCiivcgW>khXmpOdW<=-b!9^F>tUS{cB zk$DUia(_S#$yVROSEIz#&5lKiTep5QY>Wvt{9?Ge*rq|?g=~JG29iS=5EN1!Kjaz4 zA@5VMToX7g_%!Vt(&1AMH+76k*HxwZl0Ou+YhQqD%HbD5f@kmucG-}p!~Z^AUJXg0 zkA&Af3Z`yra%o>!As>B(bX$>-i8>;92=^RqJv3R=sQt9vG_q1rkP)vHu^f#b9Ce(X zA0=THszA@3Y!tfoX1CTI7J_Z`0bafT1h0f}*4pvjEMTr^j1?<*bH@@k%@a*zeHu?5 zii^{_eh=$IEPlSM?hMGQ@$lqdOxy^d%rKp@)<}9p0u`SJj+Z;* z;SoTg8yL3G-(kmAS9m!26T4)dG4gYmOMTtT zwBh{^_9kbrrppg!j`nX8LDlWa@@pIYE*>kx<}NOKsqfq`se;@mki0nh6=mZ!vhVQK ztYt^zf@1ie(;`FgjmY8DN&THVcGPl$vXze}*^m=v@0~l_(X*FEf1+$+jZd$J?e~9u zy@p}&nwdU{Z%2#h2FIggky;_!w0=5VBJY`zN1=VcOb`cRe6!Fw*yspP#>H0}ay$s- zb{jTe;lne$aSX2nG-8}F=h0g`weF9^?=^np`yjV=CbF1vd=VKmB>@w2#KZOP(~BGf zTR*isRma$yq}@4ZF5;^oy)U|YQ1s?Vk`+>4W^XKW{gLigvY%=6 z#jIb_zYN*5(licUI>Kw0*_(!p7cy(*DKlkF_7yW)ht<$;o<0Pb62f`wJ^9(Aj_l%J zk$4cs>qJq}?+wk3O-+Qr`ai_e_MedTZb~&))aF+M!M~Lq7HX`A4v{S+P|`7~Sif1w 
z((0ss$>AC{2>nUv<*E2)OX9EE)m|)C^~G_SuGh$gagO$9EXCk{rBkhGf)$N!RZ!md zN_ri6DHZr&BtAwQiq{tUC%_6yZ* zcD!B?k(b@ZnATYOZ@8~aY9?MM3>R~n6dWHbMrOVQw)Ai=e+4eS8mdASdJniR6xFO0 zP_Akuqm|2h?|8rk2rh=z;U?HVW^u3EwH)QSq2|hJQ8)~jYULopS$0Fm@&vN=DL{%z z%--at7Im2oTVEVbtLn|`4(=`N*(;Gsn#blJe-2Hty?u$FOOC<&z&jvf-RJ*pd#ra! z6UymLG)Ib%mwpvO8sde9N@nlaLl&y}28-=*`K?Gp4OPnGrOXE}wqjF7xRvujI$7f9 zQeyDhXssiye}7NB>f0L6URDt=2KrJ3mnwyfEg4xcE~4OTY(}G?F8j;e@+g=kKa);r z=gYlk)k<;Hq`HnM>cCG=&1)R1=Uas^tKF0ZYePxy{Y7q(q;!|kgQ5g30dE?@bUGST|MIMUcLoT0ZuE*zu1MucAh`jdd^;%F%< zw#}_J!gqDAb;PRWnXX=iT?*-M?oZBVImGJN@{?X?)Z-L=FhXtQa(zMXg~1+F>bs#Q1=_{GkdqsjjY8` zuv3xyHANL%QcP`Xj1VewoM=kvxX?I_&&`%#oS$u_fxhkTRIsEVI9mh8?J>X41YR@| z?u1Bp30mkbW7L~gRVknc$c)0*ov^XWsgY#}g}*VR@SJvJXx+chF~Dy-sUr1}f2+{5@dQ*|-&tSy zu%(&*sNv6u!tf#=;pn|k5O&6%8gD~3x+0e(&yGvCE zxlk34>kH0tqA>*uO&S1$gE}x5oYShf^@ffU%BE!Md>I!@aL4eLq+fqQ>ajHuxmI2p}r_U#pKwY}- zJY!?1Eq)Iui(P9E_*SE|Hu>^b7k5{MFXt^|Q?`$k-Lq$nJ&-M9JYqkESG?m8} zE?->eFX{xtdadmSr6n)}io36TW}Z!WGEM?@BfbBl$Hz>f+zxlhsIi~L%Za8afjOXD zkTB;}(h>f)6lv@uwone$DW_8+jXKKRan2Lr+H#-cIY}@VnPOxmACKQGQzog+Kcg#r zR>YwxR!qIu$TMeepzEdS!-sNdBfwEK*Ygo6wbX-ZKKh*WVKF?MK9tWivRFEp&YRJoHY#b5~Z98$O@ zf#qJHG0f6KJ7++9^uBqZ+O3+Q#G&()R*oJv8GDf!3MBKib9O`F#=Ag~HlNXU@jOw@ zN(8UI^Jq1mW}eZIo&D(Lv%NtVLD1+yK}kxFN7%wSKO&Ykefx@(vPCq;&K&X_#+RyG z7xaKZM5QxTxghPi6S57FBjAAVKQUS^iim(eVX}fS543wgg4ntu_^n;ZL|Fa-J0^gV zwfK2{)NnZvdH{n3KQ>x&dpemx@(IdgeLe%q()T5H2d$ov^Hg@&L&~QD-{Fsuxy7=j zp1i7kE7ctqAm3nC;?~SO@016fTFW+%UjO*k=DHP7ZjRUHleDK+K*{TJ$P2X2^Q2Jv z<^v2Of>4oM!k$MC-Pq+4Xl3DQ#1JTbW3uj8aU8^DF4HY#2~*Q+Fu65WvGnMlKUJgrDQ)z7-;okEi#TV1apO~m zVg)YYU;|GoBU{FchpHK6Bc~1L)-$4##Mf`sTrNTz3Mm3hr_6piT=1ro-tgtiKUUuP zAkh?O&6r>iU08rIKwQW-um$2!w(Ptt78suCy&wwS1r|_ zFR&9FFEM%%s$Uhzw3c)7h5DTm8Ye>d4AIlTHWIcq|NhL7lmJgmb4gQrAqjf&w%sGF zjUXKjO)}K@y}%FwC)$2Vr2?3g3?04^dh9OOFSJUez{F+rCU~v9+GBE}#6@L7J=uF^ z3obRaz)DC$bFok2@&En4e9BsrA60BCm)?BB_~NG%6apRAsV*_?LjpHauvAT6=uC_D zBu)y;j)`CYVF@C1S`PQ2~BoyNp6=AbUJmf5zdoC1UtFxfOu#^tp|H8EE?!-ZGu`Q#l>cOpMwxTj|y|&H+l9 
zi~Y?4RRuOGS6c;>yUoNmfM~^%$lK`@F#Rx#JHORcZAP{`a-6?`#(It$w?Ra=IwHFL z+CH8#yYJr>Yo2;1VJmLp{+s)9q0LhX5&Xt1z8NHSKmnXD@~gkXvt+ONAyRXv1+Krg#2bj<&2Sj2=?7|3UFFMXyZuM~lglPQC<7;*%pA*PfyU&P-?2CjRTj_H zUZr1rI;c+-saS&b?-ZT6pmw&h6JnUl?5p-%g)G`+ou*ofb{>H%&xVUG`^?mlCOy2r zj*J}J4n2lniReTM4)j)7S1Cn-aKG2}2CvMJ)B6k*KaqVMc(9%V3GN^Zc>M#3SiRcz{X0Khd;ZKW{>+{* z`vh3b|7`dB(|;7g@>i_Bm)aXp5|~0BN7Te9SS#OR9uva)jh+G5+@_+XoJ+I#9B|kK z987YSC%ac}6+1@o=-zumnD(!JjzaV^39`ub@E0q6LhfI)q*@Fye%O%y{BM>MUd`2| zBXK{DJdA`n4tv1%>LBh>j^%4YBBVdn3Gk(FPn^-;_GDKO^Jq9w0Qq$*m^@PYdgA|@ zM`z*RHwb<^BF{C$b0XTI#s5C1hnPqAB0Z6RQ@|`ql*zGstA$UNx|`~e!>^Dv;A9M!pND>ur7M?=#OJZ(^^V`O#8hS5MI_@p}E0UB+#>W68 zeSt5tl%o=Ihir16=$eVxaj|Sh^DWReBQp|v7B1lBKAj^Bm(h7W`vr1aWOR3dswDR8 zEpSa9!=pr2K}Q4q+tUvei&byS>2O|6qakr=#Ngf>COC%7oDhBpDTd5vPeqDICg=Js z(hONY{EQjO-b0m7a8_1_0{@LVwL?@jaO%)Ld@)I1wys&B7g?6QCWtr@11q0CtXcW4 zT5s7EQ0ropfeVvxTb#xmIqis$Eq5x4Z(HyF(r`upSCeG%!cy&$qH;_xoihtOO{`T- z4T|WJjLe@q;gWfuSAO5|>0hfH+9~pA(6imPj#;mp?~XV0e7+zuC4ZaEe!sj?n=SNp zICnOgRhZ_YSLwOcM=WsPpAHiH>H4$ftj71zpg$P5LAzFjgTT&Bz zH|l%VbLovgXy2EGB|yx^A2tp3v&k#OH+8ECCY2dQES*P|vrR%Ad7nsv-wvwvnQS#^ zmsxeT_`Wwx_Szh!-dpd!$}Hn`o&9BKmuaNoB#Wt@6&a7g*Sm2`RoYScrd_v!v$*s_ zvVX#M^(U;1Sx(m4uawg+%1&G9Zv4LVvcha;WZSGCUlWZg=c`)mATD`bfA*>Dd4eW} z$#FsjtI>iR?8~C@h;09A1a8AE`8nKO{;K8J#t8lxp5MJ6s?_6cw$}FCMmRkAI7IbF zSGf6l_me$cG`aVm2NZDMnk&ELaDU`l@Qi?(Zoa8jZn;>I6NZnLC&GBgjTI#$;k40N z!eze5WHc}C4VPKP`wP)U9*K(l-D3s!$|;dkmXEJSj)_aCPz+FEx?AbB)q-uSqRZ>8 zqt!3kEV>4-0)gcD%l6y&Q5SeQRAV(R&M~gnhsNDS zug++!y;rb$a>JW)Ei4x<8BzFGu3Tx1K3gWm9CIj8M+Xsi@PMnGW?8OChE);QluaVG z$-0BFb9JrPDavCZu>`BsVAqwNXt&t$Ub#wALmNF#;a5Gq5|v}Bfn{Dx4kz~KLfOnX zR>(Zbe0_?CVK(P0vLXKzBF^-%C{~NvdCDAnE_z9hKdfY0=?3pe8P?V7Gp_^A?T5?v;2Ku56O6EjIZxf`&~+W!3;YiUx2;7R1HM<4+MTp8o^ZvGUvdaBaYeA(8C zX`s{G3UMqPek>BA!UY=fibwwZmR4FdK$WvcechQrD);#sX#6R2g`CO`><=|47O08e1&v2gtpy!QhpR(lRC zc<5GB3`Vv5cso+kAcDrrxaB6*jK61H6xax%hO z#~B?90n?@(=qT}d&_9TN>wy0g`||6c_z!s9DRuX$n>G1vR(DkIU4ZXt8R-F0& zn9G_&sr&TZ#9NY5&~}?hDChY(iPAUzrX1S!GPdkIJ83=3QWto3P}9y)?~c3~Y@%Qi 
zI2gq;o56oSUMn@sgid5UaN4*9lh_;D6Tnmb)A6VOAm|M*@b9i)dlxr&t0wWDpcYdILNH99sO%zFAmr=Agvvz*%eGw4z=^2envufbs8OkHVt6~yBSwP4R{LeDk6 zJf?;oDpg0ChzSd8fnP)MafZ`m6Q@&ire@Z79>RM7Ev_;NtnfJI;XcQi zI|c4Zmtps*27=$!Uw}fgJ6(<6&Pp@iitSXPvjAVBQo`h zGWYZaYRXODtW}j~{{q_3E+7vSGN~b07^xcBFMwOd99%5A4c&%Z`pJrcFH`DI^>Ls+ z?YH#x`9Z6lZc_Vl7J~oz=B;)>Q;mw%C_`?s#=d)NvHZ^AT$)gwAb-=TE|fJ7nBFX5 zcP#swJ=?F?7|xoC_f)yq_i_i^VftY^DM#VtNaGua0b`$Z=rhb5NaS7EtHqhI@83;0 z*?tD<&Be|8Ub&HGKvxU2w^7~-N=iXej*P@zuMB%(A9$~IYft@yxl@E8*z_-R<8bw* z@ox?+7DGyY0OcJ|!(_}S8OPtZfp?ZO`@jfUI^{osa&#=vfPJY>GBt?-F_z0> zOZgnm=0Aaha1ED4#8c{bGxj`i4yB3m#I)<-4xTFA7q))a>^v)bI|qo1wZKmyv)pf; zS5%uW7Vouk$1}QE%x7moL_&}#1W`B9*106&JPFtwnS)(*U;&-6WXJk>a@f2fX)}B6 z4sMEhd(jxqF~4ei{55c^(f9ar5@Spzew$^$Mov4~(i{or#Xzc`*XO^$Q&!|yD4~R`>NQ73M8_kmx&*wrsZg`!qTX~?E?VIQ zDXqft&lQQK@?^vvxNijBC^Xu2!RkQTL`S%aRopop6q{p9(T~KJ6UT`tey*M!GcyKq z({^=LDbtb}xM4O%37Ee&b`-d7o0<|Z2OC@Q{f_@RkZQMDQ+iKE80sZ{Ol15Q3m_WJ z{2%ZN-KQuUO_5keuc&u@aDTqY7(qnIk}c2by;|soQSuY+ECz~C=7Xt3e^UWR?^<#Z z3-F6BZ+K282CUXG+J8Ur*;pKEOpx(?J+cf=+AJWP<&>5wr6IvIkSybFgXm*VoJW}h zbWV#Zn#K|q=kpJD7BChO$Hfbj*L2(23Gj{Wdt3GwX1t_iwOb1Dp1f^;$)pr+;%Ul> zl{2X^tKW!NW7nFe{L9>95=I;`ywskNBpB-C$!k-9=-BzQ1|Oi0#6Vp{Z9pKB>Nc|S zFHi_0Yj?J;K&3ctvbi%{+sN|3XMG-fbxh3RvoH_zdthJKarAPGQ&e1WbDbf$JFtGr zKchwLb5&*bG|pUogi;k4+d{m?2VyYa3?g=ex+W4^y*x7l#F)hC$F@iJJLh(9VhkS_ zow70+3{NlJQ59A?8jc64$rgA))Ii?-8o7WY|D_TqJ`=#|(`KDl;o3hHovNCU3;e$X zMc*Q^FC@l7`X{exW@_AVe#j?4=ndZT-fcT3|tk4GRC5g5$3A&7 z?PLHK8~{-7fO()qdaA;ijm_!zx5MrI0s|q_pwv;#J{s#~1A0^s_{^njAt2OvZ6Si$ z1|TTkUO)EHLyWoW-D4Z)4?P3?Ibau9@;2>3gJf{MOEEa`U5U%q&C_=hSiYg8h>=ZTlJdvpMu#|0FTa}cNYRdp?^2O4Dbv=7oF9$8lEw=!+AcE>#yyu- zWAxZBMnT(V!6KfKd(76(8n0q6tZwzU3O!gRwjHdxQYrUsc&TYTq{213U(+(}pPBK> zw92`kd@g(FtUBJECH2RN^SO1_Uly< zhjy+*5y-smiR5MQC?f`p|0j2)Bkv9e@n;tumQJsr#Z1@8p>A#tql_)aseuE@E#D8C zXRBx<(u&#}Rk&Snt1(3;2PdLX?!UiNd+)XOqLY5piu=rdef`g#y~y_cMG^Oj-<^SJ zg|SImw0f4qFWnicY|8T=H6_Vg%Vgqkv3ZL1#+k+?dkl_uD(yEaT$zSV0uKD;bDK)L zl8q?Icf3FLEb20@I9*5MnTVa2=5wurF7F9_CIX+!ItcGVkx3Fn(%?Kqud(ISuEYmy 
zSj@DfWNIhH*X>@BW3HP1_(&($C>y;b6@obN3*l@^(`GDz)q*Xn(BPdp*S z@LBxUT;EuQ-Zv{fl)InSqFzREcz^$f2d+e9Z8kKo(SL$$#qiTd_ABeto8t>_7Nl9l zAK&iOSJsXcGt>}wvn)&So!`9?tRU1;nY`2GX<}bdwraRzw3j=Z)9|wrK$zX?PvU)EkB(jDw++wP% zRi%6?hjHtLbG!DxRj=|aiJ|K$lAimpJW9Z+6u3FNt(9r6#bvJdkKbR4yv$RQHK;<9 zo8X}LN4oQ1J+lO()M-@U8O+?GY!Onl&voGHzSB=Pp*LN!P-VICZpihgP=Hu#wl7~~ zN$gefpN|4eajYkNhPMlZS^Ki*Fot<|xH&RfYoztOdLI;Re-3Y)4Zco?oX3e;ZY|W@`(08(NcO>w{ln^S-ma!Ja40es0BfA@nbx2o4561cq z4y$}mOj;aV2qwdi#+qlpr58@7q4mY&N7ur8J6G9%f=+S>lS@$9LEZXn6XlmPsiY#7 zE!$#NS2JEIhxhM9S(kOqCY=oqNXLemMXzmS-<&Zoin<`s5$oijvNL72{WWUc_Lnv+nlHMxZt1#&~Y_kO0bcYk)Kkq8V}0GSi~p0N~hQ?{Vn zAv7GA4H_YTSTY#ZBrz)MS4fJv;1u44Zkd}8E^M04j=E3%(Y^35-b$ZbmJm0dJ0dcQ! zRBpU;k@ql-OFSw~(^zCpy!oqToh3VF#^{XziN9@uuqW<#%r6mE#lG7#pKFC_V^H(e zzBHEi&Vw+1*EOjN*CH+Os_Ju3y>k`FpMQ59UCwAp8*#kzmGVbHMx&_Y8*3En=~+>B zHmN!ZrQdqd6?60Lzt(no=0aMRL~Uzxx(4~j1Z`-F4@#F<_lg%px>@ukDy^8a+`7`u zlIN+cQ@xCJHm`?!WQFbFs#}`^A0NEj(-f3%-=sFCiSRsV)0e7TdaR|>62Tc`RiTZk z3^;Q}nEf}ojK!0lpNG+7MAyzeYDWaiA%00%4eAq9;J_Hhn^sQ(tEPq)=q@nLUc5nv#VjKm0;LEKy1Wj)TNg2z`Lhm z-okq)XrRbq?p#ztYpHw4?5Q{U{5GlTF`RwBB13z-BGea9OdpfHe}7@0UAw`*qaE|w zR&d7m!@Lm2Uc^&hJEHs^hf3Vty45oa%H_Fn9yxnr+0||l&30I`&*Bdcg4Ugyh>J`4 zTIbv~E_t5yZr>*Jl(&V8!1}sC|A2K&tt*&Uat*aCw1-PU0(u1 zwfrP34#2GZed4mnFP@5-88t)W>h>60#k>7CC3RQ&@WhJ(+~uMqO$#5@zt`U2 z_4;M$-Gbe35KBc%C`y|8e>jy!H&MG-31h2GM)oeG-lS*kncR~fg{&jclG~r6huYgT z7gE0r(|}cym)fu%h>GQ*=6C`gg!zT-4v9nW9mL-{2)Q+iZxaT4csf7vXfkACZ<~n^ z0V&L-nWx=w#Wv~ep?!i{1a~}L9G^gblLX??BH)0Rm)mdnD0@7NuH5&?yGBr~z(vrU4a(NlG{;GP7QK{3IA@9&y8qJUEJ@K%yuF%NMhC%Z*~R zY`Vus?v*54Ih>DWCh?1=@4nlha7#1SZ05FPh&h{s3N%4!Oa&irRN(UPuO|$qKUHx= z$?%h;F3x&?ckA2Q+X7||2Y_D8Rxf*_%O@^`STmBZq`{H}xNsb~y+DTk9YZwge{-=v zQgC%}as7@pn{}DmS<+0kIpd=CE3+|zJ&V2n7W?641t7zR&a?k)g=xBHB|+Qr+my^q zZrPF+=AcLeP&FFaI#8X9$zX`xv3M&4!BP}D&)frufZ@Yo=49^NDkWNeF8X&<5PP}@ zwysJ>SU?MSUeZ50Ul78de;M~LNd7AyPYiyPa#atXVIHqFloSgY>ZdIjuy=7%_Qlcv zOv8VH%hJKtaH$uaig7PZb=xG-bAcCez4gEc7AVo&C3_}e)CFgIt(e*ka^%XR& 
ztW(Mf{s|>`;g@F}%y5>F3u}4QTpkRe(%+d)5DaAt9|Od^M>b7z%PM%mTfi(?ADg@O zKUM`=bZ2h?hjJ)5l%+pb*et4&rkM@IY2K|8` z-EJzgS($A0_%$O2L5d&4A46`Dy&uZXj-G%jsBYZ$>$@Jw^|wU?BZZMjh8j(v zo=*H#+;Qc&y+oifKV8&k6ih+>~)mkVazc*GO&(9Y&-pR_mO&^eb)d22Ub5{9}H95EuEc zpN3QpZ>sx+tk=9EjnIW=h9!bgu73w;r6XM7Q?5#HJGdN5O3%k6M8JZ~*(L&a?``a~ zmm#$oC|16Oqwk$-t(vG`>@-J0%ocv>0TXL6`&#vB{aH5~PBsGAw2gTb$?G`g*9E6V z-M&7Y`0Jg+AaUX24@f!#M5w=J%oNfhVW()>uG;*Wt!Ktnow(x(8VxvwlOL1Ux&FZH z{?~r-p}Bm`z4m3WM~1brq0e%-&z$rgl5%p7cQg=XqbIY4d&MAa64=7+!r~{^J@NZG z(y@QR`o={r|F~!K%Y>cX1C_kBA41=^!vC_POO@@ zz!OmzHFy!`-(2mylR!E@WUE{;Tz*H2b*45jUv&fz4MHQ4M3jpsCikOI({Z_t?tb5% zOTORm8)G8dnDkziG9XJ2nDeFqzo#1n;1A0(lz6C7tl{o5w|UAstU~(%0*R(R>0qEk zvAQEaS8$J=gQ6Q9Y`tKKg(R-K-~eAR2Iv24D#}m&EiE>BDOf^q0nQ z7Dta!IUv5?_KC#D@OIgLQlz9a5=xHXjS;pV&IVv$2E1~EoT8Ic^Ark7kX_u?zBR_M+)RdWkQMcF%PB^c{A6dieE;c^*ybiQ1V z6L%4x>j7VKFL+*A7Udmf2&gu-M~ixmK5P{|tfznwo+<);h1gCgBnTpF1|NrMZ$>ZD zNj{>y0LvlwHdzagW!}vXYEByi4y>g25@=l$u=xv060|$0YC&W29>_s z0RHAY73Y=-5$@TZoEV?oWqHkvd$*nwQECB7Ho3=_B}lfAu<7;+5`Tgq$y}$ZOt7od z6G%@y>c{9MFiVmf$MWhvqHJ6zb--=R$DC#1|75;p@#VJ{#CR{;Row_$da)%;+ z2E-56?WzG^78H#~=^9!cI3;4Ol*hYhd6}2IthywC|K4QwOs$Wx9ilpsz99H>i2}Y# z3&@~?BNKhuq(*nIVsrdW_?3rp5Yx(9=gYlOGu>u|o73&d z9;@y0LUx0qJDy)AMIou6;v((^y;0j6{z{7jENJ6yfIdpxj=9_7C+mpsrlFtFXZj;- zVEaw2G5$`W>P)6xdtVCeUiV`U$SZ{oJ#7W8E$xe!gxw3vKJ<;;LFye2by3Rf-i11H z-EdGJgn|>CvH27Jr}N%L4OTJ7 z^F9p1V48-n4Hx)sJ0^IHpl@Axcv^SW$mPRQdmTRa+WhwinVtT+gbmh@k<=BK)nSlv z5IoPFyJH0))q4=Gw)G6KF8HF2%{N_WHxtOWam5I$XEjHg*5uC_5y^`@x!v2WcUh0i z;pw|u^7LHf3{X9N&uVXN;hkP%>}`;*Z#Tb+>fytMxeM|T3L%bx1Weple!tRE zf0A2IsFOFpjL*?2)x*70w!ey?YGkWNzy|W|O&Er2z*YQpkBQYhRzfhU|K8o*1%0{rr za5@sP)P~|RqkcrIjof(_-vg-;RKa37PG!iY*DwqWOQ3P> z^FFkel_e|KepM{ZsS@420Q=0hJLV9K1cp%3@!X7sNj7B5(ypF!M} zGpkXHLY-G6&j;`m8Bi*5+4iHfH(IaQCIc!HV%ZyBy_k5%F5jqx5v~XIr}0S^&+{hM z|2HJ-+=bGy+Mvb9m&70`gx9#z0MeV6O%S^Szz^yg+buWZc<;+aO1R8tSM0cA$(u9r z*Ygy+W7_8DJ6`0tWNRZ)^uxrpe^Z)TD}qM~Ek7tk@K~z8@|piqKQ8g$nF3_%2m%6*1Z5O;m|cjRJACBqzep$kIMa?qgnQ{g 
za!oDUUS@BGSiIzv;Jp8cE@0syXzu~A69tHIw!b}lV|-}~5=*!sYRU3L*g9f}E+Y9J z8>>v0QOV#s3$s*dy%r>Z8}lCraiD4cSJ70Lf}5h8RgLBeB$mwIwXRu0p!Fj@JI%xX z7y^re%--Tji18<=zK*lChGH?L`Y*DHvIT?^4A_+^Ln=dM1!FO`v>z;h zsK}^H+mPy?D^MKNWyE`?M8z{-XKP_vpMEW_v4;V))~UD>S*Zko>C2{W>PKu}W*2OWZ@;ZFG<6d3$>bM%CG^^2SK`AW9x+#V#lAL(OI zOJkn39*6GKzp^a|I;l+n=(@9Ts+vP)CSwLBFH7Xc7@I||_2lRe_||!t8kdv4a(%zn zxSt~>b(E*b#)I2P2;lVN& z&aClIG;nCZiu@OCk*MK)1WqwxpfdY)xsDOpcAObFxZ-31&_ElC`l zdfO$nG5G7JbVvG6Qcwx%U>2)E!YKItAwfjkHkltfTg;;3!7vvdr$OHTx3f-gs9G`z zdGD}M5n})!sP?Na_(LH{?8aZ5&l?$gs5$QX6NFeX6!?CV(@NNszG7Q4lnP`Rv9#}x z3&M;qDT1-^-`d0h05n&Z0@w+!lVeiVCk3ue>Ofg`_MO|w=Q!RDxVJiQTb>W-!z8H;vPfWvzr9?Chkoiub_8VFs>Nabn-Mhr-G zXY?aXHH=Zc3Zl8usMY}OY)Q}EwtAOvv2w@G8??8$8+q@QaPS88n7%?y}y1W>;0 zA1KIUb-o+DgBIM2D)8-j{9a?F=4){gGoRsOHB4xS0npRNN2#7(e&bDOO{Pp}{MDAgHg>erSN;k6q8tS$p-*c|<=9 zZRH-(D+iLz(iep=R_xEe`1+JZ6p-Kvyjn3Vp=Eqj5zVdjx?|pBWsl9H+sDScf9;Ev zq73>-hZf|%2k%B7u-&cOwl_#+#=TT13H_3)q|e@EMz9mM7G;fNB{JKPa`QcIvQ;55 zE*IvcXUpQ5_J6f^oexc(-CL20V8Ma37C~&OpaMccKtRMvWhFPOl^T$ zHetL927(Y~n6go1i3|fF>}d>BM%ekC$I$lIzu^7y@*$t{jNJFR@B5tVI@iTg3x{)0 zOqn<_H_hppAL+5H9i-D4h7n-8gTNf~h-=%1tcUdhI8ocqS@76!`Fv>-MOFJ%ddwB; zrnw(Eb@LL`jOB~(m6qWEZZ^;Lp???i`xENHv!q|G(3GIgDr&lljfN8{m0KRYCWLKR z3f#y6FU7?ngG99)4pa2pRa&xXl%LB+2YH_xhwZX_=i8w%yk>5}_FA|I`5pZtZKGtY zu2f+!N|-YA`o|e#;Q_g3RBT=i|IDo4y4%EOiS!M;=?sBa{yH^iLZeTO!^1e$ybzB$ zP+=%NQW8p%9}F2vUp%XRQT##4tlkTJ(ez?z_4ns3)<*=z`caoxo=axm8olEjPCrOFpRG0;+ci;QA@vEg`Na*)iH3lVULYKtcC^PZi zN|a4Znt#~TH;heP77|`ht<)91?!*CULPG;$D*`9eD$umk)J~;}u_lLk1L%L<`qCm9 z0bU~HB`5k}?fUoezocm=oqzs5S&0s(aim3sK*VzM6jw{&0X3Bfl3aNKnH`sBFSo&^ zd3mrxVWF+$uwiSE25*>6wcR(Z8KZuk(+Y%OQd7kb$-aJTG~t$9ArjvAK$vMplD@GV zy29`n)~k?I+$R8gQE9+ZH0g8#CHfWG2s_@xwccWb!r7WIlzPtD%bf9zlbz_(lw`WQ z^ebDC@L@RadN$#ftJ&=@bBoA=xcc9VnX(pM)I^b#ZgibXM>9F(4*AsTL~OF{+?1r6 z%=Yc84n*%IQ@YBX$|=>9p~dF(9>a&Md9 zyIs%k=NQQ`)a~!C@r(5rPYY{(`dK0muPa)NQ8<~Gi|R@FC%x&+$xRR)3VILy@6l0W`-!VImDR~uJ0w4WS3pF|F0?8535)yxiJ zc=HFCH?HC`t}RTv8s{b{$`f}g07=Lbs`V57ej1k9y%=u2Se6NTuvbn)RZnIg3lf)` 
z17OPP*I(Q8d)?(dK`)AvlZ^3VR;U@;)qhDgJWA})O%WQ~X&#;ct21JzzXHZgFynQ&{2pEY}%!hojhr%pfGdPJfa8eqFWviI8RFc}RsleaPBy+K;MNo|P} zqS?#cC+&=d$-m_|MVks!FnjX%lHZ~*yanU%q$}>GSbfT7DBB`u*0%{c9d6jb{2`SH z>af`T`0%RM5DX>`%iFAV{y@-5kcf?+4P~=cQ>c2HSo&craJhZ**oU#~A<3fcvCDa| zTm$*W5VRcP6agDoEZS(l0PZMu5GHbDmu&4Xt`QA$M~pMDR|*pnU%x?RB+W~8MZ11i zP**@2`Ir!zuwwssBO!EcTpw>wUPseYPAjNeScUI1-L0HoKRxfDTCSzkS`x|YXlO|3 z`=%c(8p0w?gquV@aj2GQNy*Phbv5xu_++FjV~VIgX?^UQ-2&N0+Tr{x(IF^>0EcCz z@?g?%pJ>FxkwtMf+8=+*IW*o7 z5d^VKK7Zd_0((_z(c!aG$j{8pAe;D`gH-+;X8*bsZY{*!E+Re&XV8w4*_uBgM)*rd zzKiV%lvtVz!xTWr*#|#+Bpb*LoptN)BM@=F^ z%5_rL0QC3-iLml32B=)R0b_r)d(YleHt+paHu+z4tS*cp=nx%>IdE~ohCLW_k>omW zTUqeEZ*spVSxl)+v1rFNWC~tCS#N)PNjkzF%$7|pfwbhzgV|C-+v`|a zr$Mt+e@BO*Vp-wtuGak3{Bn3*2*>g~!AgXgk8F41mw_XSoO-51~n3|^}9O*BO{ z^5jouLP^q^ssiecGpM3W!6Xp37|rzNUj@NNQD83T+cKAlP5^u|0ZcVPs&K(f#GwIY zY+mA+%zy5l<&X`OXJ>TN!}#Uw;a(oJC!#4IKsX}n_UR^M%0}RDpq91K7(9Cj3RrJD zzS-CUbA|pF{W}`y9jBSbW$}ZtV)~f}t7);E+0gNHr|Hj=SwNK_%r~@a8-PrtsOJ}4 zeNgW)r`NYsww9vKmW}@j{XM`|84%4V7c+n$S_d+x2;d~>Ioe_jy#mPxq5-Nru4qx^ z1HyfrVA5PU{cQLTGHnLOl zMD0Pte=?h000qN9)>IoV13)0EUVfYYJ=b_hrE}I2epr)5doZ^_;nFB%03?%=xK4uf z8=k6wF+xU4!LPWsqM4d=(>3jbej5&;_c*=X57R(poU0wkj;oqX#7=iiiklRy63_b4 zrJ#Z}ToYf+yKsnWv;xYj6p?49* zo^7R7rakGq*y`2M{RyOZ0b{g`)Zz?1(QjKY=e(c<25c?C5L`7z-;8-w%U0Y6GmDIs z#CwYj&P*j+n&5+v47anfJ|6TX{vM89AH0-BTx{c z1H_>-?u|y~o-fTkC-??Dr;=ADZ!`K`BbC>&b+fQiazl@M-2hjcfI0E%rB02{9ciJ0 z6O+;f?Or;rjyS^BNM0$`dIM+d!5 z85na!?ftbHxpV@!;mO5;}g8`6ELej32QA4{n0G{hVU zpobNiR&9#F;CHavO6cCiL4XIUNf|qSE9sEECZQCF;3wAu03^lzVi1@10vVzxv)A%|J0BFsvcX_jpQ(Z_LSO?i7Iy-EA<$nf~Y5`ml6P1TsA`$XE zM2JOnf;Is55XKuk@Q{BV&yov8VnN&>i;xFegu20-PTC?^L=Z@-8Aq1~9r8kRAv( zPnrPY>9b>{$(QIZjG>+0afxPwKjmcr$|T$!b~B_ydd1; zAAV_H99(YpXFr%<2#5k_&=~Wx6}MHTOGBpT4o8y${XyQvMQ-2j_TLZpA6m9S-a%)M z1xW4#5K}?mX+OZ2e_j6qx&B(2=TX-0PXWf&yCB<*Yd5>X#dUXvku#TV(M!E{`GMIdiGZCn$Qrl lxvw+D`uN|?|0~17 Date: Tue, 29 Jul 2025 13:32:46 -0400 Subject: [PATCH 022/224] [Doc] update Contributing page's testing section (#18272) Signed-off-by: 
David Xia --- docs/contributing/README.md | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/docs/contributing/README.md b/docs/contributing/README.md index e3ae5055b9988..5a2a70d57e85f 100644 --- a/docs/contributing/README.md +++ b/docs/contributing/README.md @@ -26,6 +26,8 @@ See . ## Developing +--8<-- "docs/getting_started/installation/python_env_setup.inc.md" + Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source][build-from-source] documentation for details. @@ -42,7 +44,7 @@ For an optimized workflow when iterating on C++/CUDA kernels, see the [Increment Install MkDocs along with the [plugins](https://github.com/vllm-project/vllm/blob/main/mkdocs.yaml) used in the vLLM documentation, as well as required dependencies: ```bash -pip install -r requirements/docs.txt +uv pip install -r requirements/docs.txt ``` !!! note @@ -98,13 +100,14 @@ For additional features and advanced configurations, refer to the official [MkDo ??? console "Commands" ```bash - pip install -r requirements/common.txt -r requirements/dev.txt + # These commands are only for Nvidia CUDA platforms. + uv pip install -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto # Linting, formatting and static type checking - pre-commit install --hook-type pre-commit --hook-type commit-msg + pre-commit install # You can manually run pre-commit with - pre-commit run --all-files + pre-commit run --all-files --show-diff-on-failure # To manually run something from CI that does not run # locally by default, you can run: @@ -122,6 +125,10 @@ For additional features and advanced configurations, refer to the official [MkDo Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment. +!!! 
note "Install python3-dev if Python.h is missing" + If any of the above commands fails with `Python.h: No such file or directory`, install + `python3-dev` with `sudo apt install python3-dev`. + !!! note Currently, the repository is not fully checked by `mypy`. @@ -153,7 +160,7 @@ Using `-s` with `git commit` will automatically add this header. !!! tip You can enable automatic sign-off via your IDE: - + - **PyCharm**: Click on the `Show Commit Options` icon to the right of the `Commit and Push...` button in the `Commit` window. It will bring up a `git` window where you can modify the `Author` and enable `Sign-off commit`. - **VSCode**: Open the [Settings editor](https://code.visualstudio.com/docs/configure/settings) From a33ea28b1be64f0b57e9eb90389dd36715c60ecb Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 29 Jul 2025 15:51:58 -0400 Subject: [PATCH 023/224] Add `flashinfer_python` to CUDA wheel requirements (#21389) Signed-off-by: mgoin --- docker/Dockerfile | 4 +++- requirements/cuda.txt | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index b87401c593572..0cd2cfad66fdd 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -386,6 +386,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist # Install FlashInfer from source ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" +# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt +# We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel. ARG FLASHINFER_GIT_REF="v0.2.9rc2" RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' . /etc/environment @@ -408,7 +410,7 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ python3 -m flashinfer.aot TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \ - uv pip install --system --no-build-isolation . 
+ uv pip install --system --no-build-isolation --force-reinstall --no-deps . popd rm -rf flashinfer BASH diff --git a/requirements/cuda.txt b/requirements/cuda.txt index c1273b224eabf..5557c868acafa 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -12,3 +12,5 @@ torchaudio==2.7.1 torchvision==0.22.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version # https://github.com/facebookresearch/xformers/releases/tag/v0.0.31 xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 +# FlashInfer should be updated together with the Dockerfile +flashinfer_python==0.2.9rc2 \ No newline at end of file From a1873db23dd597930a7e4731a53314ace92baf49 Mon Sep 17 00:00:00 2001 From: Doug Smith Date: Tue, 29 Jul 2025 17:45:19 -0400 Subject: [PATCH 024/224] docker: docker-aware precompiled wheel support (#21127) Signed-off-by: dougbtv --- docker/Dockerfile | 26 +++++++++++++-------- setup.py | 58 +++++++++++++++++++++++++++++++++++------------ vllm/envs.py | 11 +++++++-- 3 files changed, 68 insertions(+), 27 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 0cd2cfad66fdd..75b5ab0230c87 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -209,16 +209,7 @@ ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_S3_NO_CREDENTIALS=0 # Flag to control whether to use pre-built vLLM wheels -ARG VLLM_USE_PRECOMPILED -# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed -ENV VLLM_USE_PRECOMPILED="" -RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \ - export VLLM_USE_PRECOMPILED=1 && \ - echo "Using precompiled wheels"; \ - else \ - unset VLLM_USE_PRECOMPILED && \ - echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \ - fi +ARG VLLM_USE_PRECOMPILED="" # if USE_SCCACHE is set, use sccache to speed up compilation RUN --mount=type=cache,target=/root/.cache/uv \ 
@@ -235,6 +226,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ && export SCCACHE_IDLE_TIMEOUT=0 \ && export CMAKE_BUILD_TYPE=Release \ + && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \ + && export VLLM_DOCKER_BUILD_CONTEXT=1 \ && sccache --show-stats \ && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \ && sccache --show-stats; \ @@ -248,9 +241,22 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ # Clean any existing CMake artifacts rm -rf .deps && \ mkdir -p .deps && \ + export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \ + export VLLM_DOCKER_BUILD_CONTEXT=1 && \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ fi +# When using precompiled wheels, keep only the newest manylinux1 wheel and delete others +RUN if [ "$VLLM_USE_PRECOMPILED" = "1" ]; then \ + echo "Cleaning up extra wheels in dist/..." && \ + # Identify the most recent manylinux1_x86_64 wheel + KEEP_WHEEL=$(ls -t dist/*manylinux1_x86_64.whl 2>/dev/null | head -n1) && \ + if [ -n "$KEEP_WHEEL" ]; then \ + echo "Keeping wheel: $KEEP_WHEEL"; \ + find dist/ -type f -name "*.whl" ! -path "${KEEP_WHEEL}" -delete; \ + fi; \ + fi + # Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py # sync the default value with .buildkite/check-wheel-size.py diff --git a/setup.py b/setup.py index d46e678e7aa40..58e5833f16ae1 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,7 @@ import json import logging import os import re +import shutil import subprocess import sys from pathlib import Path @@ -297,6 +298,10 @@ class repackage_wheel(build_ext): ]).decode("utf-8") upstream_main_commit = json.loads(resp_json)["sha"] + # In Docker build context, .git may be immutable or missing. 
+ if envs.VLLM_DOCKER_BUILD_CONTEXT: + return upstream_main_commit + # Check if the upstream_main_commit exists in the local repo try: subprocess.check_output( @@ -357,19 +362,48 @@ class repackage_wheel(build_ext): # create a temporary directory to store the wheel temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") wheel_path = os.path.join(temp_dir, wheel_filename) - print(f"Downloading wheel from {wheel_location} to {wheel_path}") - from urllib.request import urlretrieve - try: urlretrieve(wheel_location, filename=wheel_path) except Exception as e: from setuptools.errors import SetupError - raise SetupError( f"Failed to get vLLM wheel from {wheel_location}") from e + # During a docker build: determine correct filename, copy wheel. + if envs.VLLM_DOCKER_BUILD_CONTEXT: + dist_dir = "/workspace/dist" + os.makedirs(dist_dir, exist_ok=True) + # Determine correct wheel filename from METADATA + with zipfile.ZipFile(wheel_path, "r") as z: + metadata_file = next( + (n for n in z.namelist() + if n.endswith(".dist-info/METADATA")), + None, + ) + if not metadata_file: + raise RuntimeError( + "Could not find METADATA in precompiled wheel.") + metadata = z.read(metadata_file).decode() + version_line = next((line for line in metadata.splitlines() + if line.startswith("Version: ")), None) + if not version_line: + raise RuntimeError( + "Could not determine version from METADATA.") + version = version_line.split(": ")[1].strip() + + # Build correct filename using internal version + arch_tag = "cp38-abi3-manylinux1_x86_64" + corrected_wheel_name = f"vllm-{version}-{arch_tag}.whl" + final_wheel_path = os.path.join(dist_dir, corrected_wheel_name) + + print(f"Docker build context detected, copying precompiled wheel " + f"({version}) to {final_wheel_path}") + shutil.copy2(wheel_path, final_wheel_path) + return + + # Unzip the wheel when not in Docker context with zipfile.ZipFile(wheel_path) as wheel: files_to_copy = [ "vllm/_C.abi3.so", @@ -378,15 +412,9 @@ class 
repackage_wheel(build_ext): "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", "vllm/cumem_allocator.abi3.so", - # "vllm/_version.py", # not available in nightly wheels yet ] - file_members = list( filter(lambda x: x.filename in files_to_copy, wheel.filelist)) - - # vllm_flash_attn python code: - # Regex from - # `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)` compiled_regex = re.compile( r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") file_members += list( @@ -403,11 +431,8 @@ class repackage_wheel(build_ext): package_data[package_name] = [] wheel.extract(file) - if file_name.endswith(".py"): - # python files shouldn't be added to package_data - continue - - package_data[package_name].append(file_name) + if not file_name.endswith(".py"): + package_data[package_name].append(file_name) def _no_device() -> bool: @@ -415,6 +440,9 @@ def _no_device() -> bool: def _is_cuda() -> bool: + # Allow forced CUDA in Docker/precompiled builds, even without torch.cuda + if envs.VLLM_USE_PRECOMPILED and envs.VLLM_DOCKER_BUILD_CONTEXT: + return True has_cuda = torch.version.cuda is not None return (VLLM_TARGET_DEVICE == "cuda" and has_cuda and not (_is_neuron() or _is_tpu())) diff --git a/vllm/envs.py b/vllm/envs.py index fcfad4eec1621..9b6d8c8be242a 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -68,6 +68,7 @@ if TYPE_CHECKING: MAX_JOBS: Optional[str] = None NVCC_THREADS: Optional[str] = None VLLM_USE_PRECOMPILED: bool = False + VLLM_DOCKER_BUILD_CONTEXT: bool = False VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False VLLM_NO_DEPRECATION_WARNING: bool = False VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False @@ -222,8 +223,14 @@ environment_variables: dict[str, Callable[[], Any]] = { # If set, vllm will use precompiled binaries (*.so) "VLLM_USE_PRECOMPILED": - lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool( - os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), + lambda: 
os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in + ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), + + # Used to mark that setup.py is running in a Docker build context, + # in order to force the use of precompiled binaries. + "VLLM_DOCKER_BUILD_CONTEXT": + lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in + ("1", "true"), # Whether to force using nightly wheel in python build. # This is used for testing the nightly wheel in python build. From 176bbce1db0ba81230c396cb46cf4035a16d2c66 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Tue, 29 Jul 2025 17:56:29 -0400 Subject: [PATCH 025/224] Revert "[AMD][CI/Build] Fix the AMD issue caused by inappropriate of symbol exposure (#21647)" (#21850) Signed-off-by: Gregory Shtrasberg --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 664fb6a0ee9f0..ea56b8451f228 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -243,6 +243,7 @@ set(VLLM_EXT_SRC "csrc/sampler.cu" "csrc/cuda_view.cu" "csrc/quantization/gptq/q_gemm.cu" + "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/fp8/common.cu" "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu" "csrc/quantization/gguf/gguf_kernel.cu" @@ -296,8 +297,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" "csrc/cutlass_extensions/common.cpp" "csrc/attention/mla/cutlass_mla_entry.cu" - "csrc/quantization/fp8/per_token_group_quant.cu" - "csrc/quantization/compressed_tensors/int8_quant_kernels.cu") + "csrc/quantization/fp8/per_token_group_quant.cu") set_gencode_flags_for_srcs( SRCS "${VLLM_EXT_SRC}" From 9266d980480c8da52a0c29960e9086128e19d664 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Tue, 29 Jul 2025 16:34:19 -0700 Subject: [PATCH 026/224] [BugFix] Fix interleaved 
sliding window not set for Gemma3n (#21863) Signed-off-by: Yong Hoon Shin --- vllm/config.py | 9 +++++++-- vllm/model_executor/models/gemma3n.py | 9 +++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 86c3b9eae64cb..1dfc746e2002d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -723,11 +723,16 @@ class ModelConfig: ) # Workaround for Gemma 2 which uses interleaved sliding window - # attention, but it's not specified in its config. TODO: remove this - # when Gemma 2 is fixed in Transformers. + # attention, but it's not specified in its config. + # TODO: remove this when Gemma 2 config updated in HuggingFace. if self.hf_text_config.model_type == "gemma2": self.hf_text_config.sliding_window_pattern = 2 + # TODO: remove this when Gemma 3n config updated in HuggingFace. + if self.hf_text_config.model_type == "gemma3n_text": + # 4 sliding window attention followed by 1 full attention + self.hf_text_config.sliding_window_pattern = "LLLLG" + sliding_window = getattr(self.hf_text_config, "sliding_window", None) sliding_window_pattern = getattr(self.hf_text_config, "sliding_window_pattern", None) diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index 7d163320e0d6a..168665cc29655 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -297,8 +297,13 @@ class Gemma3nAttention(nn.Module): has_weight=False) layer_idx = extract_layer_index(prefix) - if config.layer_types[layer_idx] == "sliding_attention": - self.sliding_window = config.sliding_window + + is_sliding_window = ( + getattr(config, "interleaved_sliding_window", None) is not None + and config.layer_types[layer_idx] == "sliding_attention") + + if is_sliding_window: + self.sliding_window = config.interleaved_sliding_window rope_theta = config.rope_local_base_freq rope_scaling = {"rope_type": "default"} else: From 0d0cc9e15001b18997207fc86af6810500d587d9 Mon Sep 17 00:00:00 2001 
From: Simon Mo Date: Tue, 29 Jul 2025 17:11:50 -0700 Subject: [PATCH 027/224] [ci] add b200 test placeholder (#21866) Signed-off-by: simon-mo --- .buildkite/test-pipeline.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6cda800b6477d..f95f038840dd2 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -643,6 +643,17 @@ steps: - python3 examples/offline_inference/audio_language.py --model-type whisper - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl +- label: Blackwell Test + working_dir: "/vllm-workspace/" + gpu: b200 + # optional: true + source_file_dependencies: + - csrc/ + - vllm/ + commands: + - nvidia-smi + - python3 examples/offline_inference/basic/chat.py + ##### 1 GPU test ##### ##### multi gpus test ##### From 452b2a3180f5003a0253de1ed369c278a6abdbe2 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Tue, 29 Jul 2025 18:03:27 -0700 Subject: [PATCH 028/224] [ci] mark blackwell test optional for now (#21878) --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f95f038840dd2..2bf0b6fd9a169 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -646,7 +646,7 @@ steps: - label: Blackwell Test working_dir: "/vllm-workspace/" gpu: b200 - # optional: true + optional: true source_file_dependencies: - csrc/ - vllm/ From 0e36abf9931baa070609376debb4fb3772f4a3fe Mon Sep 17 00:00:00 2001 From: milesial Date: Tue, 29 Jul 2025 18:16:25 -0700 Subject: [PATCH 029/224] [Bugfix] Correct max tokens for non-contiguous embeds (#21798) Signed-off-by: Alexandre Milesi <30204471+milesial@users.noreply.github.com> Co-authored-by: Alexandre Milesi <30204471+milesial@users.noreply.github.com> --- vllm/multimodal/profiling.py | 31 ++++++++++++++++++++++++++++--- vllm/multimodal/registry.py | 2 +- 2 files 
changed, 29 insertions(+), 4 deletions(-) diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 7f6fb47a21fa6..d96803b643ff2 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -180,11 +180,14 @@ class MultiModalProfiler(Generic[_I]): def _get_mm_num_tokens( self, mm_inputs: MultiModalInputs, + mm_embeddings_only: bool = True, ) -> Mapping[str, int]: placeholders_by_modality = mm_inputs["mm_placeholders"] return { - modality: sum(item.get_num_embeds() for item in placeholders) + modality: + sum(item.get_num_embeds() if mm_embeddings_only else item.length + for item in placeholders) for modality, placeholders in placeholders_by_modality.items() } @@ -253,10 +256,11 @@ class MultiModalProfiler(Generic[_I]): multi_modal_placeholders=mm_inputs["mm_placeholders"], ) - def get_mm_max_tokens( + def _get_mm_max_tokens( self, seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, + mm_embeddings_only: bool = True, ) -> Mapping[str, int]: if mm_counts is None: mm_counts = self.get_mm_limits() @@ -285,4 +289,25 @@ class MultiModalProfiler(Generic[_I]): return max_tokens_per_item mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts) - return self._get_mm_num_tokens(mm_inputs) + return self._get_mm_num_tokens(mm_inputs, + mm_embeddings_only=mm_embeddings_only) + + def get_mm_max_contiguous_tokens( + self, + seq_len: int, + mm_counts: Optional[Mapping[str, int]] = None, + ): + """ + Returns the maximum length of the multimodal (image placeholders+text) + tokens, including any break/text tokens in-between image embeddings. + + [IMG] [IMG] [IMG] [IMG] [IMG] [IMG] + Returns 9, even when the number of image embeddings is 6. + + This is important to take into account when profiling and + initializing the encoder cache size. 
+ """ + + return self._get_mm_max_tokens(seq_len, + mm_counts, + mm_embeddings_only=False) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index c44fcacd246c4..bfa391829d290 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -129,7 +129,7 @@ class MultiModalRegistry: seq_len = model_config.max_model_len mm_limits = self.get_mm_limits_per_prompt(model_config) - return profiler.get_mm_max_tokens( + return profiler.get_mm_max_contiguous_tokens( seq_len, { modality: 1 From 555e7225bcb9cdf9b037ce064e48987dbc3e13a0 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 29 Jul 2025 18:45:29 -0700 Subject: [PATCH 030/224] [v1][attention] Support Hybrid Allocator + FlashInfer (#21412) Signed-off-by: Chen Zhang --- tests/v1/attention/test_attention_backends.py | 19 ++++++----- tests/v1/spec_decode/test_eagle.py | 1 + tests/v1/worker/test_gpu_model_runner.py | 3 +- vllm/config.py | 32 ++++++++++++++----- vllm/v1/attention/backends/cpu_attn.py | 4 +-- vllm/v1/attention/backends/flash_attn.py | 4 +-- vllm/v1/attention/backends/flashinfer.py | 18 ++++------- vllm/v1/attention/backends/flex_attention.py | 4 +-- vllm/v1/attention/backends/mamba_attn.py | 4 +-- vllm/v1/attention/backends/mla/common.py | 4 ++- vllm/v1/attention/backends/mla/flashmla.py | 7 ++-- .../attention/backends/mla/rocm_aiter_mla.py | 7 ++-- vllm/v1/attention/backends/rocm_aiter_fa.py | 4 +-- vllm/v1/attention/backends/triton_attn.py | 4 +-- vllm/v1/attention/backends/utils.py | 14 +++++--- vllm/v1/worker/gpu_model_runner.py | 13 +++++--- 16 files changed, 85 insertions(+), 57 deletions(-) diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py index 9bd0b99798d77..f197cbb7bbba0 100644 --- a/tests/v1/attention/test_attention_backends.py +++ b/tests/v1/attention/test_attention_backends.py @@ -198,7 +198,8 @@ class MockAttentionLayer: def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec, - 
vllm_config, device: torch.device, + layer_names: list[str], vllm_config, + device: torch.device, common_attn_metadata: CommonAttentionMetadata, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, @@ -211,31 +212,33 @@ def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec, if backend == _Backend.FLASHINFER_VLLM_V1: import unittest.mock - from vllm.v1.attention.backends.flashinfer import PerLayerParameters + from vllm.v1.attention.backends.utils import PerLayerParameters - def mock_get_per_layer_parameters(vllm_config, impl_cls): + def mock_get_per_layer_parameters(vllm_config, layer_names, impl_cls): # Return mock parameters for a single layer head_size = vllm_config.model_config.get_head_size() return { - "mock_layer": + layer_name: PerLayerParameters( window_left=-1, # No sliding window logits_soft_cap=0.0, # No soft cap sm_scale=1.0 / (head_size**0.5) # Standard scale ) + for layer_name in layer_names } with unittest.mock.patch( 'vllm.v1.attention.backends.flashinfer.get_per_layer_parameters', mock_get_per_layer_parameters): - builder = builder_cls(kv_cache_spec, vllm_config, device) + builder = builder_cls(kv_cache_spec, layer_names, vllm_config, + device) attn_metadata = builder.build( common_prefix_len=0, common_attn_metadata=common_attn_metadata, ) else: # Build metadata - builder = builder_cls(kv_cache_spec, vllm_config, device) + builder = builder_cls(kv_cache_spec, layer_names, vllm_config, device) attn_metadata = builder.build( common_prefix_len=0, common_attn_metadata=common_attn_metadata, @@ -427,8 +430,8 @@ def test_backend_correctness(batch_spec_name: str, model: str): set_kv_cache_layout("HND") backend_output = run_attention_backend(backend_name, kv_cache_spec, - vllm_config, device, - common_attn_metadata, + ["placeholder"], vllm_config, + device, common_attn_metadata, query_vllm, key_vllm, value_vllm, kv_cache_for_backend) diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 
da7e5e2c467dc..a126c7c943ed0 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -305,6 +305,7 @@ def test_propose(num_speculative_tokens): _Backend.FLASH_ATTN_VLLM_V1) attn_metadata_builder = attn_metadata_builder_cls( kv_cache_spec=create_standard_kv_cache_spec(proposer.vllm_config), + layer_names=proposer.attn_layer_names, vllm_config=proposer.vllm_config, device=device, ) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index e14fbe1e47ecf..231dfcbb68848 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -745,7 +745,8 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch): layer_4 = "model.layers.4.mixer" layer_5 = "model.layers.5.mixer" - with set_current_vllm_config(vllm_config): + with set_current_vllm_config(vllm_config), monkeypatch.context() as m: + m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") hf_config = vllm_config.model_config.hf_config fwd_context = {} for key in [layer_0, layer_1]: diff --git a/vllm/config.py b/vllm/config.py index 1dfc746e2002d..8e8c1198833c2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -740,8 +740,8 @@ class ModelConfig: isinstance(sliding_window, list)) if not self.disable_sliding_window and has_interleaved_attention: - if (backend := - envs.VLLM_ATTENTION_BACKEND) in ("XFORMERS", "FLASHINFER"): + if not envs.VLLM_USE_V1 and (backend := envs.VLLM_ATTENTION_BACKEND + ) in ("XFORMERS", "FLASHINFER"): sliding_window_len_min = get_min_sliding_window( self.hf_text_config.sliding_window) @@ -5065,13 +5065,29 @@ def assert_hashable(text): T = TypeVar("T") -def get_layers_from_vllm_config(vllm_config: VllmConfig, - layer_type: type[T]) -> dict[str, T]: +def get_layers_from_vllm_config( + vllm_config: VllmConfig, + layer_type: type[T], + layer_names: Optional[list[str]] = None) -> dict[str, T]: + """ + Get layers from the vLLM config. + + Args: + vllm_config: The vLLM config. 
+ layer_type: The type of the layer to get. + layer_names: The names of the layers to get. If None, return all layers. + """ + + if layer_names is None: + layer_names = list( + vllm_config.compilation_config.static_forward_context.keys()) + + forward_context = vllm_config.compilation_config.static_forward_context + return { - layer_name: layer - for layer_name, layer in - vllm_config.compilation_config.static_forward_context.items() - if isinstance(layer, layer_type) + layer_name: forward_context[layer_name] + for layer_name in layer_names + if isinstance(forward_context[layer_name], layer_type) } diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index 3b6d753863d07..9ed46331863c9 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -315,8 +315,8 @@ class TorchSDPAMetadata(AttentionMetadata): class TorchSDPAMetadataBuilderV1(AttentionMetadataBuilder[TorchSDPAMetadata]): - def __init__(self, kv_cache_spec: AttentionSpec, vllm_config: VllmConfig, - device: torch.device) -> None: + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], + vllm_config: VllmConfig, device: torch.device) -> None: self.kv_cache_spec = kv_cache_spec self.vllm_config = vllm_config self.scheduler_config = vllm_config.scheduler_config diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 7c8a5e056fea5..4c2a6c6b985b2 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -148,8 +148,8 @@ class FlashAttentionMetadataBuilder( AttentionMetadataBuilder[FlashAttentionMetadata]): full_cudagraph_supported: ClassVar[bool] = get_flash_attn_version() == 3 - def __init__(self, kv_cache_spec: AttentionSpec, vllm_config: VllmConfig, - device: torch.device): + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], + vllm_config: VllmConfig, device: torch.device): self.vllm_config = 
vllm_config self.model_config = vllm_config.model_config self.parallel_config = vllm_config.parallel_config diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 775780807eae2..27552f0e7c1ef 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -21,10 +21,9 @@ from vllm.platforms import current_platform from vllm.utils import cdiv from vllm.v1.attention.backends.flash_attn import use_cascade_attention from vllm.v1.attention.backends.utils import ( - AttentionMetadataBuilder, CommonAttentionMetadata, PerLayerParameters, - get_kv_cache_layout, get_per_layer_parameters, - infer_global_hyperparameters, reorder_batch_to_split_decodes_and_prefills, - split_decodes_and_prefills) + AttentionMetadataBuilder, CommonAttentionMetadata, get_kv_cache_layout, + get_per_layer_parameters, infer_global_hyperparameters, + reorder_batch_to_split_decodes_and_prefills, split_decodes_and_prefills) from vllm.v1.kv_cache_interface import AttentionSpec if TYPE_CHECKING: @@ -219,8 +218,8 @@ class FlashInferMetadata: class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): - def __init__(self, kv_cache_spec: AttentionSpec, vllm_config: VllmConfig, - device: torch.device): + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], + vllm_config: VllmConfig, device: torch.device): self.device = device self._workspace_buffer = None self._prefill_wrapper = None # Wrapper for prefill/append @@ -228,7 +227,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self._cascade_wrapper = None # Wrapper for cascade attention # Global hyperparameters shared by all attention layers - self.global_hyperparameters: Optional[PerLayerParameters] = None + self.global_hyperparameters = infer_global_hyperparameters( + get_per_layer_parameters(vllm_config, layer_names, FlashInferImpl)) self.vllm_config = vllm_config self.cache_config = 
vllm_config.cache_config @@ -283,10 +283,6 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): def _plan(self, num_prefills: int, num_decodes: int, attn_metadata: FlashInferMetadata): - if self.global_hyperparameters is None: - self.global_hyperparameters = infer_global_hyperparameters( - get_per_layer_parameters(self.vllm_config, FlashInferImpl)) - if attn_metadata.use_cascade: attn_metadata.cascade_wrapper = self._get_cascade_wrapper() attn_metadata.cascade_wrapper.plan( diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index ad63f92cd88a7..bb0d890c7754d 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -258,8 +258,8 @@ class FlexAttentionMetadata: class FlexAttentionMetadataBuilder( AttentionMetadataBuilder[FlexAttentionMetadata]): - def __init__(self, kv_cache_spec: AttentionSpec, vllm_config: VllmConfig, - device: torch.device): + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], + vllm_config: VllmConfig, device: torch.device): self.model_config = vllm_config.model_config self.parallel_config = vllm_config.parallel_config self.cache_config = vllm_config.cache_config diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py index dca5de46c0653..8b702e28d67c0 100644 --- a/vllm/v1/attention/backends/mamba_attn.py +++ b/vllm/v1/attention/backends/mamba_attn.py @@ -87,8 +87,8 @@ class Mamba2AttentionMetadata: class Mamba2AttentionMetadataBuilder( AttentionMetadataBuilder[Mamba2AttentionMetadata]): - def __init__(self, kv_cache_spec: AttentionSpec, vllm_config: VllmConfig, - device: torch.device): + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], + vllm_config: VllmConfig, device: torch.device): assert isinstance(kv_cache_spec, MambaSpec) self.kv_cache_spec = kv_cache_spec self.chunk_size = 
vllm_config.model_config.get_mamba_chunk_size() diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index cf17d93302395..0095d75217856 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -406,6 +406,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): def __init__(self, kv_cache_spec: AttentionSpec, + layer_names: list[str], vllm_config: VllmConfig, device: torch.device, metadata_cls: Optional[type[M]] = None): @@ -471,7 +472,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): BatchPrefillWithRaggedKVCacheWrapper] = [] self._global_hyperparameters = infer_global_hyperparameters( - get_per_layer_parameters(vllm_config, MLACommonImpl)) + get_per_layer_parameters(vllm_config, layer_names, + MLACommonImpl)) if self._use_cudnn_prefill: self.cudnn_workspace = torch.empty( diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index d3e5300dbbd6b..39463b9c06164 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -56,9 +56,10 @@ class FlashMLAMetadata(MLACommonMetadata[FlashMLADecodeMetadata]): class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): full_cudagraph_supported: ClassVar[bool] = True # Decode-only - def __init__(self, kv_cache_spec: AttentionSpec, vllm_config: VllmConfig, - device: torch.device): - super().__init__(kv_cache_spec, vllm_config, device, FlashMLAMetadata) + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], + vllm_config: VllmConfig, device: torch.device): + super().__init__(kv_cache_spec, layer_names, vllm_config, device, + FlashMLAMetadata) self.compilation_config = vllm_config.compilation_config self.num_q_heads = vllm_config.model_config.get_num_attention_heads( diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 
834c234558350..5c5891f035ae2 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -66,9 +66,10 @@ class AiterMLAMetadata(MLACommonMetadata[AiterMLADecodeMetadata]): class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): full_cudagraph_supported: ClassVar[bool] = True # decode only - def __init__(self, kv_cache_spec: AttentionSpec, vllm_config: VllmConfig, - device: torch.device): - super().__init__(kv_cache_spec, vllm_config, device, AiterMLAMetadata) + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], + vllm_config: VllmConfig, device: torch.device): + super().__init__(kv_cache_spec, layer_names, vllm_config, device, + AiterMLAMetadata) assert self.kv_cache_spec.block_size == 1, "AITER MLA" \ "only supports block size 1." diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 85a5dc8c91c13..dd10b7f02730a 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -231,8 +231,8 @@ class AiterFlashAttentionMetadataBuilder( AttentionMetadataBuilder[AiterFlashAttentionMetadata]): full_cudagraph_supported: ClassVar[bool] = True - def __init__(self, kv_cache_spec: AttentionSpec, vllm_config: VllmConfig, - device: torch.device): + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], + vllm_config: VllmConfig, device: torch.device): self.vllm_config = vllm_config self.model_config = vllm_config.model_config self.parallel_config = vllm_config.parallel_config diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 83471ca51b73f..195fbd3b1b9c4 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -59,8 +59,8 @@ class TritonAttentionMetadataBuilder( AttentionMetadataBuilder[TritonAttentionMetadata]): full_cudagraph_supported: ClassVar[bool] = True 
- def __init__(self, kv_cache_spec: AttentionSpec, vllm_config: VllmConfig, - device: torch.device): + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], + vllm_config: VllmConfig, device: torch.device): self.device = device self.block_size = kv_cache_spec.block_size self.kv_cache_spec = kv_cache_spec diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index b13362f8a8d8d..d1599ba10b618 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -70,8 +70,8 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]): full_cudagraph_supported: ClassVar[bool] = False @abstractmethod - def __init__(self, kv_cache_spec: AttentionSpec, vllm_config: VllmConfig, - device: torch.device): + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], + vllm_config: VllmConfig, device: torch.device): self.kv_cache_spec = kv_cache_spec @abstractmethod @@ -164,14 +164,14 @@ class PerLayerParameters: def get_per_layer_parameters( - vllm_config: VllmConfig, + vllm_config: VllmConfig, layer_names: list[str], cls_: type['AttentionImpl']) -> dict[str, PerLayerParameters]: """ - Scan all attention layers and determine some hyperparameters + Scan layers in `layer_names` and determine some hyperparameters to use during `plan`. """ - layers = get_layers_from_vllm_config(vllm_config, Attention) + layers = get_layers_from_vllm_config(vllm_config, Attention, layer_names) per_layer_params: dict[str, PerLayerParameters] = {} for key, layer in layers.items(): @@ -208,6 +208,10 @@ def infer_global_hyperparameters( param_sets = list(per_layer_params.values()) global_params = param_sets[0] for params in param_sets: + if params.window_left != global_params.window_left: + raise ValueError( + "Window left is not the same for all layers. 
One potential fix " + "is to set disable_sliding_window=True") assert params == global_params, ( "FlashInfer backend currently only supports models in which all " "layers share the same values for the following hyperparameters: " diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 84ad582c9c9de..3befb6adf2753 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2521,7 +2521,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): elapsed_time, cuda_graph_size / (1 << 30)) def _initialize_single_attn_backend( - self, kv_cache_spec: KVCacheSpec + self, kv_cache_spec: KVCacheSpec, layer_names: list[str] ) -> tuple[AttentionBackend, AttentionMetadataBuilder]: if isinstance(kv_cache_spec, AttentionSpec): attn_backend_i = get_attn_backend( @@ -2551,6 +2551,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): attn_metadata_builder_i = attn_backend_i.get_builder_cls()( kv_cache_spec, + layer_names, self.vllm_config, self.device, ) @@ -2574,8 +2575,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): kv_cache_config.kv_cache_groups): kv_cache_spec = kv_cache_group_spec.kv_cache_spec - attn_backend_i, attn_metadata_builder_i = \ - self._initialize_single_attn_backend(kv_cache_spec) + attn_backend_i, attn_metadata_builder_i = ( + self._initialize_single_attn_backend( + kv_cache_spec, kv_cache_group_spec.layer_names)) self.attn_backends.append(attn_backend_i) self.attn_metadata_builders.append(attn_metadata_builder_i) @@ -2606,8 +2608,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): assert len(attn_specs) == len(attn_layers), \ "All or none of the layers are expected to be encoder-only" - attn_backend, attn_metadata_builder = \ - self._initialize_single_attn_backend(attn_specs[0]) + attn_backend, attn_metadata_builder = ( + self._initialize_single_attn_backend(attn_specs[0], + attn_layers.keys())) 
self.attn_backends.append(attn_backend) self.attn_metadata_builders.append(attn_metadata_builder) self.is_encoder_only_model = True From ba5c5e5404d2d3fdee02e163fc75a44bd960935f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 30 Jul 2025 03:45:08 +0100 Subject: [PATCH 031/224] [Docs] Switch to better markdown linting pre-commit hook (#21851) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/nightly-benchmarks/README.md | 5 + .../nightly-benchmarks/nightly-annotation.md | 19 ++-- .../nightly-descriptions.md | 34 +++---- .../performance-benchmarks-descriptions.md | 1 + .github/PULL_REQUEST_TEMPLATE.md | 4 +- .markdownlint.yaml | 13 +++ .pre-commit-config.yaml | 7 +- README.md | 7 ++ RELEASE.md | 5 +- benchmarks/README.md | 99 +++++++++++-------- benchmarks/auto_tune/README.md | 8 +- benchmarks/kernels/deepgemm/README.md | 4 +- csrc/quantization/cutlass_w8a8/Epilogues.md | 5 +- docs/cli/README.md | 4 +- docs/configuration/tpu.md | 15 ++- docs/contributing/ci/failures.md | 8 +- .../contributing/ci/update_pytorch_version.md | 4 +- docs/contributing/deprecation_policy.md | 6 +- docs/contributing/profiling.md | 4 +- docs/contributing/vulnerability_management.md | 6 +- docs/deployment/frameworks/anything-llm.md | 12 +-- docs/deployment/frameworks/chatbox.md | 10 +- docs/deployment/frameworks/dify.md | 10 +- docs/deployment/frameworks/haystack.md | 2 - .../retrieval_augmented_generation.md | 1 + .../integrations/production-stack.md | 9 +- docs/deployment/k8s.md | 2 +- docs/design/metrics.md | 4 +- docs/design/p2p_nccl_connector.md | 4 +- docs/design/prefix_caching.md | 11 ++- docs/design/torch_compile.md | 6 +- docs/features/compatibility_matrix.md | 6 +- docs/features/lora.md | 2 + docs/features/multimodal_inputs.md | 2 + docs/features/quantization/auto_round.md | 2 +- docs/features/quantization/int4.md | 4 +- .../quantization/quantized_kvcache.md | 1 + 
docs/features/quantization/quark.md | 1 + docs/features/quantization/torchao.md | 1 + docs/getting_started/installation/cpu.md | 6 +- .../installation/intel_gaudi.md | 8 +- docs/models/hardware_supported_models/tpu.md | 5 +- docs/models/supported_models.md | 14 +-- docs/serving/distributed_serving.md | 2 +- docs/serving/expert_parallel_deployment.md | 3 +- docs/serving/openai_compatible_server.md | 1 + docs/usage/security.md | 32 +++--- docs/usage/v1_guide.md | 10 +- .../disaggregated-prefill-v1/README.md | 2 +- .../offline_inference/openai_batch/README.md | 8 +- examples/others/lmcache/README.md | 4 + examples/others/logging_configuration.md | 6 +- pyproject.toml | 10 -- tools/ep_kernels/README.md | 9 +- vllm/plugins/lora_resolvers/README.md | 3 +- 55 files changed, 273 insertions(+), 198 deletions(-) create mode 100644 .markdownlint.yaml diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index ae42f70077cec..fcde284efea98 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -28,6 +28,7 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performanc ## Trigger the benchmark Performance benchmark will be triggered when: + - A PR being merged into vllm. - Every commit for those PRs with `perf-benchmarks` label AND `ready` label. @@ -38,6 +39,7 @@ bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh ``` Runtime environment variables: + - `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0. - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file). - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file). @@ -46,12 +48,14 @@ Runtime environment variables: - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string. 
Nightly benchmark will be triggered when: + - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label. ## Performance benchmark details See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead. +> ### Latency test Here is an example of one test inside `latency-tests.json`: @@ -149,6 +153,7 @@ Here is an example using the script to compare result_a and result_b without det Here is an example using the script to compare result_a and result_b with detail test name. `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json` + | | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio | |---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------| | 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 | diff --git a/.buildkite/nightly-benchmarks/nightly-annotation.md b/.buildkite/nightly-benchmarks/nightly-annotation.md index ef11c040057c8..466def07b6f1f 100644 --- a/.buildkite/nightly-benchmarks/nightly-annotation.md +++ b/.buildkite/nightly-benchmarks/nightly-annotation.md @@ -1,3 +1,4 @@ +# Nightly benchmark annotation ## Description @@ -13,15 +14,15 @@ Please download the visualization scripts in the post - Find the docker we use in `benchmarking pipeline` - Deploy the docker, and inside the docker: - - Download `nightly-benchmarks.zip`. 
- - In the same folder, run the following code: + - Download `nightly-benchmarks.zip`. + - In the same folder, run the following code: - ```bash - export HF_TOKEN= - apt update - apt install -y git - unzip nightly-benchmarks.zip - VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh - ``` + ```bash + export HF_TOKEN= + apt update + apt install -y git + unzip nightly-benchmarks.zip + VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh + ``` And the results will be inside `./benchmarks/results`. diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index 5f003f42f07c0..8afde017d383e 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -13,25 +13,25 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/ ## Setup - Docker images: - - vLLM: `vllm/vllm-openai:v0.6.2` - - SGLang: `lmsysorg/sglang:v0.3.2-cu121` - - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12` - - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3` - - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.* - - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark. + - vLLM: `vllm/vllm-openai:v0.6.2` + - SGLang: `lmsysorg/sglang:v0.3.2-cu121` + - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12` + - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3` + - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.* + - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark. 
- Hardware - - 8x Nvidia A100 GPUs + - 8x Nvidia A100 GPUs - Workload: - - Dataset - - ShareGPT dataset - - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output) - - Decode-heavy dataset (in average 462 input tokens, 256 output tokens) - - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use. - - Models: llama-3 8B, llama-3 70B. - - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)). - - Average QPS (query per second): 2, 4, 8, 16, 32 and inf. - - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed. - - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better). + - Dataset + - ShareGPT dataset + - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output) + - Decode-heavy dataset (in average 462 input tokens, 256 output tokens) + - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use. + - Models: llama-3 8B, llama-3 70B. + - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)). + - Average QPS (query per second): 2, 4, 8, 16, 32 and inf. + - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed. + - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better). 
## Known issues diff --git a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md index a1f8441ccdac8..8bb16bd3cf373 100644 --- a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md +++ b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md @@ -1,3 +1,4 @@ +# Performance benchmarks descriptions ## Latency tests diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 017ec7ca82da7..d4aceab4472fa 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,4 +1,5 @@ -## Essential Elements of an Effective PR Description Checklist +# Essential Elements of an Effective PR Description Checklist + - [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)". - [ ] The test plan, such as providing test command. - [ ] The test results, such as pasting the results comparison before and after, or e2e results @@ -14,5 +15,4 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE B ## (Optional) Documentation Update - **BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions) diff --git a/.markdownlint.yaml b/.markdownlint.yaml new file mode 100644 index 0000000000000..c86fed9555d62 --- /dev/null +++ b/.markdownlint.yaml @@ -0,0 +1,13 @@ +MD007: + indent: 4 +MD013: false +MD024: + siblings_only: true +MD033: false +MD042: false +MD045: false +MD046: false +MD051: false +MD052: false +MD053: false +MD059: false diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5197820fb4020..045096cb86369 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -35,12 +35,11 @@ repos: exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*' types_or: [c++, cuda] args: [--style=file, --verbose] 
-- repo: https://github.com/jackdewinter/pymarkdown - rev: v0.9.29 +- repo: https://github.com/igorshubovych/markdownlint-cli + rev: v0.45.0 hooks: - - id: pymarkdown + - id: markdownlint-fix exclude: '.*\.inc\.md' - args: [fix] - repo: https://github.com/rhysd/actionlint rev: v1.7.7 hooks: diff --git a/README.md b/README.md index dc2f0afbe3538..5348405b72d2c 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ +