diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py
new file mode 100644
index 0000000000000..8350e2705141e
--- /dev/null
+++ b/.buildkite/generate_index.py
@@ -0,0 +1,24 @@
+import argparse
+import os
+
+template = """<!DOCTYPE html>
+<html>
+  <body>
+    <h1>Links for vLLM</h1>
+    <a href="../{wheel_html_escaped}">{wheel}</a>
+  </body>
+</html>
+"""
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--wheel", help="The wheel path.", required=True)
+args = parser.parse_args()
+
+filename = os.path.basename(args.wheel)
+
+with open("index.html", "w") as f:
+ print(f"Generated index.html for {args.wheel}")
+ # cloudfront requires escaping the '+' character
+ f.write(
+ template.format(wheel=filename,
+ wheel_html_escaped=filename.replace("+", "%2B")))
diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
index 64ba1b32fb074..708e548727cf5 100644
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -65,9 +65,9 @@ steps:
- VLLM_USAGE_SOURCE
- HF_TOKEN
- - block: "Run H100 Benchmark"
- key: block-h100
- depends_on: ~
+ #- block: "Run H100 Benchmark"
+ #key: block-h100
+ #depends_on: ~
- label: "H100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
diff --git a/.buildkite/upload-wheels.sh b/.buildkite/upload-wheels.sh
index 7345dd4e66b29..3c756659a715a 100644
--- a/.buildkite/upload-wheels.sh
+++ b/.buildkite/upload-wheels.sh
@@ -23,6 +23,8 @@ wheel="$new_wheel"
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version: $version"
+normal_wheel="$wheel" # Save the original wheel filename
+
# If the version contains "dev", rename it to v1.0.0.dev for consistency
if [[ $version == *dev* ]]; then
suffix="${version##*.}"
@@ -32,12 +34,38 @@ if [[ $version == *dev* ]]; then
new_version="1.0.0.dev"
fi
new_wheel="${wheel/$version/$new_version}"
- mv -- "$wheel" "$new_wheel"
+ # use cp to keep both files in the artifacts directory
+ cp -- "$wheel" "$new_wheel"
wheel="$new_wheel"
version="$new_version"
fi
# Upload the wheel to S3
+python3 .buildkite/generate_index.py --wheel "$normal_wheel"  # generates index.html for this wheel
+
+# upload the wheels and index for this commit
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+
+if [[ $normal_wheel == *"cu118"* ]]; then
+ # if $normal_wheel matches cu118, do not upload the index.html
+ echo "Skipping index files for cu118 wheels"
+else
+ # only upload index.html for cu12 wheels (default wheels)
+ aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
+ aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+fi
+
+# upload the wheels and index for nightly
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
+
+if [[ $normal_wheel == *"cu118"* ]]; then
+ # if $normal_wheel matches cu118, do not upload the index.html
+ echo "Skipping index files for cu118 wheels"
+else
+ # only upload index.html for cu12 wheels (default wheels)
+ aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
+fi
+
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
\ No newline at end of file
diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py
index 0a29d77e73abc..1116c0da1a6f0 100644
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -74,6 +74,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
messages=messages,
max_completion_tokens=10,
logprobs=True,
+ temperature=0.0,
top_logprobs=5)
assert len(chat_completion.choices) == 1
@@ -130,6 +131,7 @@ async def test_single_chat_session_audio_base64encoded(
messages=messages,
max_completion_tokens=10,
logprobs=True,
+ temperature=0.0,
top_logprobs=5)
assert len(chat_completion.choices) == 1
@@ -150,6 +152,7 @@ async def test_single_chat_session_audio_base64encoded(
model=model_name,
messages=messages,
max_completion_tokens=10,
+ temperature=0.0,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py
index 294b250362699..e73449e406739 100644
--- a/tests/entrypoints/openai/test_video.py
+++ b/tests/entrypoints/openai/test_video.py
@@ -82,6 +82,7 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI,
messages=messages,
max_completion_tokens=10,
logprobs=True,
+ temperature=0.0,
top_logprobs=5)
assert len(chat_completion.choices) == 1
@@ -174,6 +175,7 @@ async def test_single_chat_session_video_base64encoded(
messages=messages,
max_completion_tokens=10,
logprobs=True,
+ temperature=0.0,
top_logprobs=5)
assert len(chat_completion.choices) == 1
@@ -194,6 +196,7 @@ async def test_single_chat_session_video_base64encoded(
model=model_name,
messages=messages,
max_completion_tokens=10,
+ temperature=0.0,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index a0b6edd566561..5f070ba3b12e9 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -83,6 +83,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
messages=messages,
max_completion_tokens=10,
logprobs=True,
+ temperature=0.0,
top_logprobs=5)
assert len(chat_completion.choices) == 1
@@ -175,6 +176,7 @@ async def test_single_chat_session_image_base64encoded(
messages=messages,
max_completion_tokens=10,
logprobs=True,
+ temperature=0.0,
top_logprobs=5)
assert len(chat_completion.choices) == 1
@@ -195,6 +197,7 @@ async def test_single_chat_session_image_base64encoded(
model=model_name,
messages=messages,
max_completion_tokens=10,
+ temperature=0.0,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py
index 150221dfce6ab..797a495201d33 100644
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -62,8 +62,9 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
@pytest.mark.parametrize("tp_size", [4])
+@pytest.mark.parametrize("fully_shard", [True, False])
def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
- tp_size):
+ tp_size, fully_shard):
"""This LoRA model has all supported Mixtral target modules"""
if torch.cuda.device_count() < tp_size:
@@ -82,6 +83,7 @@ def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
max_loras=4,
distributed_executor_backend="ray",
tensor_parallel_size=tp_size,
+ fully_sharded_loras=fully_shard,
max_lora_rank=32,
)
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index a6c93a3d8bfe9..85164c2165a3c 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -425,8 +425,9 @@ class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA):
if self.base_layer.skip_bias_add else None)
return output, output_bias
+ # ReplicatedLinear should always be replaced, regardless of the fully
+ # sharded LoRAs setting, because it is, by definition, copied per GPU.
@classmethod
- @_not_fully_sharded_can_replace
def can_replace_layer(
cls,
source_layer: nn.Module,