Merge branch 'main' into v1-blocktable-opt

2026-05-04 11:11:19 +08:00 · 2024-12-22 22:16:22 -08:00 · 2024-12-22 22:16:22 -08:00 · 0420fb2c7b
commit 0420fb2c7b
parent ee965c9c69 048fc57a0f
8 changed files with 70 additions and 6 deletions
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@ -0,0 +1,24 @@
+import argparse
+import os
+
+template = """<!DOCTYPE html>
+<html>
+    <body>
+    <h1>Links for vLLM</h1/>
+        <a href="../{wheel_html_escaped}">{wheel}</a><br/>
+    </body>
+</html>
+"""
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--wheel", help="The wheel path.", required=True)
+args = parser.parse_args()
+
+filename = os.path.basename(args.wheel)
+
+with open("index.html", "w") as f:
+    print(f"Generated index.html for {args.wheel}")
+    # cloudfront requires escaping the '+' character
+    f.write(
+        template.format(wheel=filename,
+                        wheel_html_escaped=filename.replace("+", "%2B")))
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@ -65,9 +65,9 @@ steps:
        - VLLM_USAGE_SOURCE
        - HF_TOKEN

-  - block: "Run H100 Benchmark"
-    key: block-h100
-    depends_on: ~
+  #- block: "Run H100 Benchmark"
+    #key: block-h100
+    #depends_on: ~

  - label: "H100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
--- a/.buildkite/upload-wheels.sh
+++ b/.buildkite/upload-wheels.sh
@ -23,6 +23,8 @@ wheel="$new_wheel"
 version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
 echo "Version: $version"

+normal_wheel="$wheel" # Save the original wheel filename
+
 # If the version contains "dev", rename it to v1.0.0.dev for consistency
 if [[ $version == *dev* ]]; then
    suffix="${version##*.}"
@ -32,12 +34,38 @@ if [[ $version == *dev* ]]; then
        new_version="1.0.0.dev"
    fi
    new_wheel="${wheel/$version/$new_version}"
-    mv -- "$wheel" "$new_wheel"
+    # use cp to keep both files in the artifacts directory
+    cp -- "$wheel" "$new_wheel"
    wheel="$new_wheel"
    version="$new_version"
 fi

 # Upload the wheel to S3
+python3 .buildkite/generate_index.py --wheel "$normal_wheel"
+
+# upload both wheels (and the index, unless cu118) for this commit
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+
+if [[ $normal_wheel == *"cu118"* ]]; then
+    # if $normal_wheel matches cu118, do not upload the index.html
+    echo "Skipping index files for cu118 wheels"
+else
+    # only upload index.html for cu12 wheels (default wheels)
+    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
+    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+fi
+
+# upload both wheels (and the index, unless cu118) for nightly
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
+
+if [[ $normal_wheel == *"cu118"* ]]; then
+    # if $normal_wheel matches cu118, do not upload the index.html
+    echo "Skipping index files for cu118 wheels"
+else
+    # only upload index.html for cu12 wheels (default wheels)
+    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
+fi
+
 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@ -74,6 +74,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
+        temperature=0.0,
        top_logprobs=5)
    assert len(chat_completion.choices) == 1

@ -130,6 +131,7 @@ async def test_single_chat_session_audio_base64encoded(
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
+        temperature=0.0,
        top_logprobs=5)
    assert len(chat_completion.choices) == 1

@ -150,6 +152,7 @@ async def test_single_chat_session_audio_base64encoded(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
+        temperature=0.0,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
--- a/tests/entrypoints/openai/test_video.py
+++ b/tests/entrypoints/openai/test_video.py
@ -82,6 +82,7 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
+        temperature=0.0,
        top_logprobs=5)
    assert len(chat_completion.choices) == 1

@ -174,6 +175,7 @@ async def test_single_chat_session_video_base64encoded(
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
+        temperature=0.0,
        top_logprobs=5)
    assert len(chat_completion.choices) == 1

@ -194,6 +196,7 @@ async def test_single_chat_session_video_base64encoded(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
+        temperature=0.0,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@ -83,6 +83,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
+        temperature=0.0,
        top_logprobs=5)
    assert len(chat_completion.choices) == 1

@ -175,6 +176,7 @@ async def test_single_chat_session_image_base64encoded(
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
+        temperature=0.0,
        top_logprobs=5)
    assert len(chat_completion.choices) == 1

@ -195,6 +197,7 @@ async def test_single_chat_session_image_base64encoded(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
+        temperature=0.0,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@ -62,8 +62,9 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):


@pytest.mark.parametrize("tp_size", [4])
+@pytest.mark.parametrize("fully_shard", [True, False])
 def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
-                                         tp_size):
+                                         tp_size, fully_shard):
    """This LoRA model has all supported Mixtral target modules"""

    if torch.cuda.device_count() < tp_size:
@ -82,6 +83,7 @@ def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
        max_loras=4,
        distributed_executor_backend="ray",
        tensor_parallel_size=tp_size,
+        fully_sharded_loras=fully_shard,
        max_lora_rank=32,
    )

--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@ -425,8 +425,9 @@ class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA):
                       if self.base_layer.skip_bias_add else None)
        return output, output_bias

+    # ReplicatedLinear should always be replaced, regardless of the fully
+    # sharded LoRAs setting, because it is, by definition, copied per GPU.
    @classmethod
-    @_not_fully_sharded_can_replace
    def can_replace_layer(
        cls,
        source_layer: nn.Module,