Merge branch 'main' into v1-blocktable-opt

Woosuk Kwon 2024-12-22 22:16:22 -08:00
commit 0420fb2c7b
8 changed files with 70 additions and 6 deletions

View File

@@ -0,0 +1,24 @@
import argparse
import os

template = """<!DOCTYPE html>
<html>
    <body>
        <h1>Links for vLLM</h1>
        <a href="../{wheel_html_escaped}">{wheel}</a><br/>
    </body>
</html>
"""

parser = argparse.ArgumentParser()
parser.add_argument("--wheel", help="The wheel path.", required=True)
args = parser.parse_args()

filename = os.path.basename(args.wheel)

with open("index.html", "w") as f:
    print(f"Generated index.html for {args.wheel}")
    # cloudfront requires escaping the '+' character
    f.write(
        template.format(wheel=filename,
                        wheel_html_escaped=filename.replace("+", "%2B")))
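
Note: the `# cloudfront requires escaping the '+' character` comment matters because PEP 440 local version labels (e.g. `+cu118`) put a literal `+` into wheel filenames. A minimal sketch of the escaping the script performs, using a hypothetical filename:

wheel = "vllm-1.0.0.dev+cu118-cp38-abi3-manylinux1_x86_64.whl"  # hypothetical name
href = "../" + wheel.replace("+", "%2B")
print(href)  # ../vllm-1.0.0.dev%2Bcu118-cp38-abi3-manylinux1_x86_64.whl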

View File

@@ -65,9 +65,9 @@ steps:
       - VLLM_USAGE_SOURCE
       - HF_TOKEN
-  - block: "Run H100 Benchmark"
-    key: block-h100
-    depends_on: ~
+  #- block: "Run H100 Benchmark"
+  #  key: block-h100
+  #  depends_on: ~
   - label: "H100"
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"

View File

@@ -23,6 +23,8 @@ wheel="$new_wheel"
 version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
 echo "Version: $version"
 
+normal_wheel="$wheel" # Save the original wheel filename
+
 # If the version contains "dev", rename it to v1.0.0.dev for consistency
 if [[ $version == *dev* ]]; then
     suffix="${version##*.}"
@@ -32,12 +34,38 @@ if [[ $version == *dev* ]]; then
         new_version="1.0.0.dev"
     fi
     new_wheel="${wheel/$version/$new_version}"
-    mv -- "$wheel" "$new_wheel"
+    # use cp to keep both files in the artifacts directory
+    cp -- "$wheel" "$new_wheel"
     wheel="$new_wheel"
     version="$new_version"
 fi
 
 # Upload the wheel to S3
+python3 .buildkite/generate_index.py --wheel "$normal_wheel"
+
+# generate index for this commit
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+
+if [[ $normal_wheel == *"cu118"* ]]; then
+    # if $normal_wheel matches cu118, do not upload the index.html
+    echo "Skipping index files for cu118 wheels"
+else
+    # only upload index.html for cu12 wheels (default wheels)
+    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
+    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+fi
+
+# generate index for nightly
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
+
+if [[ $normal_wheel == *"cu118"* ]]; then
+    # if $normal_wheel matches cu118, do not upload the index.html
+    echo "Skipping index files for cu118 wheels"
+else
+    # only upload index.html for cu12 wheels (default wheels)
+    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
+fi
+
 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
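
For clarity: the rename rewrites only the filename (the switch from `mv` to `cp` keeps both names in the artifacts directory, so the original `normal_wheel` can feed index generation while the renamed copy provides a stable `1.0.0.dev` path). A hedged Python sketch of the effect, with a hypothetical wheel name; the script itself does this with bash parameter expansion:

# Hypothetical CI wheel; version and name are illustrative only.
version = "0.6.5.dev123"
wheel = f"vllm-{version}-cp38-abi3-manylinux1_x86_64.whl"
normal_wheel = wheel  # original filename, kept and uploaded as-is

if "dev" in version:
    # mirrors the bash ${wheel/$version/$new_version} substitution
    new_version = "1.0.0.dev"
    wheel = wheel.replace(version, new_version)

print(normal_wheel)  # vllm-0.6.5.dev123-cp38-abi3-manylinux1_x86_64.whl
print(wheel)         # vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl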

View File

@@ -74,6 +74,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
         messages=messages,
         max_completion_tokens=10,
         logprobs=True,
+        temperature=0.0,
         top_logprobs=5)
 
     assert len(chat_completion.choices) == 1
@@ -130,6 +131,7 @@ async def test_single_chat_session_audio_base64encoded(
         messages=messages,
         max_completion_tokens=10,
         logprobs=True,
+        temperature=0.0,
         top_logprobs=5)
 
     assert len(chat_completion.choices) == 1
@@ -150,6 +152,7 @@ async def test_single_chat_session_audio_base64encoded(
         model=model_name,
         messages=messages,
         max_completion_tokens=10,
+        temperature=0.0,
     )
 
     message = chat_completion.choices[0].message
     assert message.content is not None and len(message.content) >= 0

View File

@@ -82,6 +82,7 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI,
         messages=messages,
         max_completion_tokens=10,
         logprobs=True,
+        temperature=0.0,
         top_logprobs=5)
 
     assert len(chat_completion.choices) == 1
@@ -174,6 +175,7 @@ async def test_single_chat_session_video_base64encoded(
         messages=messages,
         max_completion_tokens=10,
         logprobs=True,
+        temperature=0.0,
         top_logprobs=5)
 
     assert len(chat_completion.choices) == 1
@@ -194,6 +196,7 @@ async def test_single_chat_session_video_base64encoded(
         model=model_name,
         messages=messages,
         max_completion_tokens=10,
+        temperature=0.0,
     )
 
     message = chat_completion.choices[0].message
     assert message.content is not None and len(message.content) >= 0

View File

@@ -83,6 +83,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
         messages=messages,
         max_completion_tokens=10,
         logprobs=True,
+        temperature=0.0,
         top_logprobs=5)
 
     assert len(chat_completion.choices) == 1
@@ -175,6 +176,7 @@ async def test_single_chat_session_image_base64encoded(
         messages=messages,
         max_completion_tokens=10,
         logprobs=True,
+        temperature=0.0,
         top_logprobs=5)
 
     assert len(chat_completion.choices) == 1
@@ -195,6 +197,7 @@ async def test_single_chat_session_image_base64encoded(
         model=model_name,
         messages=messages,
         max_completion_tokens=10,
+        temperature=0.0,
     )
 
     message = chat_completion.choices[0].message
     assert message.content is not None and len(message.content) >= 0

View File

@@ -62,8 +62,9 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
 @pytest.mark.parametrize("tp_size", [4])
+@pytest.mark.parametrize("fully_shard", [True, False])
 def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
-                                         tp_size):
+                                         tp_size, fully_shard):
     """This LoRA model has all supported Mixtral target modules"""
     if torch.cuda.device_count() < tp_size:
@@ -82,6 +83,7 @@ def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
         max_loras=4,
         distributed_executor_backend="ray",
         tensor_parallel_size=tp_size,
+        fully_sharded_loras=fully_shard,
         max_lora_rank=32,
     )
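
Because pytest composes stacked `parametrize` markers as a cross product, the existing `tp_size` axis times the new `fully_shard` axis yields one run with fully sharded LoRA layers and one without. A self-contained illustration (test name and body are illustrative):

import pytest

@pytest.mark.parametrize("tp_size", [4])
@pytest.mark.parametrize("fully_shard", [True, False])
def test_cross_product(tp_size, fully_shard):
    # Stacked parametrize markers multiply: this body runs for
    # (tp_size=4, fully_shard=True) and (tp_size=4, fully_shard=False).
    assert tp_size == 4
    assert isinstance(fully_shard, bool)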

View File

@@ -425,8 +425,9 @@ class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA):
                        if self.base_layer.skip_bias_add else None)
         return output, output_bias
 
+    # ReplicatedLinear should always be replaced, regardless of the fully
+    # sharded LoRAs setting, because it is, by definition, copied per GPU.
     @classmethod
-    @_not_fully_sharded_can_replace
     def can_replace_layer(
         cls,
         source_layer: nn.Module,
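
The dropped `@_not_fully_sharded_can_replace` guard, judging by its name, blocks a layer's replacement whenever fully sharded LoRAs are enabled; removing it means `ReplicatedLinearWithLoRA` is substituted in both modes, which is what the new comment argues for. A hedged sketch of what such a guard presumably looks like (the signature and `lora_config` access are assumptions, not vLLM's actual code):

def _not_fully_sharded_can_replace(can_replace):
    """Wrap can_replace_layer() so it also requires that fully
    sharded LoRAs are disabled (illustrative sketch only)."""

    def dec(*args, **kwargs):
        # Veto the replacement when fully_sharded_loras is enabled.
        allowed = not kwargs["lora_config"].fully_sharded_loras
        return can_replace(*args, **kwargs) and allowed

    return dec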