From 29c748930e0d35a98351a8cf8a093fba4b758114 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sat, 21 Dec 2024 21:08:44 -0800 Subject: [PATCH 1/5] [CI] Fix flaky entrypoint tests (#11403) Signed-off-by: Roger Wang --- tests/entrypoints/openai/test_audio.py | 3 +++ tests/entrypoints/openai/test_video.py | 3 +++ tests/entrypoints/openai/test_vision.py | 3 +++ 3 files changed, 9 insertions(+) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 0a29d77e73abc..1116c0da1a6f0 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -74,6 +74,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, messages=messages, max_completion_tokens=10, logprobs=True, + temperature=0.0, top_logprobs=5) assert len(chat_completion.choices) == 1 @@ -130,6 +131,7 @@ async def test_single_chat_session_audio_base64encoded( messages=messages, max_completion_tokens=10, logprobs=True, + temperature=0.0, top_logprobs=5) assert len(chat_completion.choices) == 1 @@ -150,6 +152,7 @@ async def test_single_chat_session_audio_base64encoded( model=model_name, messages=messages, max_completion_tokens=10, + temperature=0.0, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index 294b250362699..e73449e406739 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -82,6 +82,7 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI, messages=messages, max_completion_tokens=10, logprobs=True, + temperature=0.0, top_logprobs=5) assert len(chat_completion.choices) == 1 @@ -174,6 +175,7 @@ async def test_single_chat_session_video_base64encoded( messages=messages, max_completion_tokens=10, logprobs=True, + temperature=0.0, top_logprobs=5) assert 
len(chat_completion.choices) == 1 @@ -194,6 +196,7 @@ async def test_single_chat_session_video_base64encoded( model=model_name, messages=messages, max_completion_tokens=10, + temperature=0.0, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index a0b6edd566561..5f070ba3b12e9 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -83,6 +83,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, messages=messages, max_completion_tokens=10, logprobs=True, + temperature=0.0, top_logprobs=5) assert len(chat_completion.choices) == 1 @@ -175,6 +176,7 @@ async def test_single_chat_session_image_base64encoded( messages=messages, max_completion_tokens=10, logprobs=True, + temperature=0.0, top_logprobs=5) assert len(chat_completion.choices) == 1 @@ -195,6 +197,7 @@ async def test_single_chat_session_image_base64encoded( model=model_name, messages=messages, max_completion_tokens=10, + temperature=0.0, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 From 4a9139780ad78a648415f07dd7a5a216fb3f96ab Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 21 Dec 2024 23:53:44 -0800 Subject: [PATCH 2/5] [cd][release] add pypi index for every commit and nightly build (#11404) Signed-off-by: youkaichao Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> --- .buildkite/generate_index.py | 24 ++++++++++++++++++++++++ .buildkite/upload-wheels.sh | 16 +++++++++++++++- 2 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 .buildkite/generate_index.py diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py new file mode 100644 index 0000000000000..8350e2705141e --- /dev/null +++ b/.buildkite/generate_index.py @@ -0,0 +1,24 @@ +import argparse +import os + 
+template = """<!DOCTYPE html> +<html> + <body> + <h1>Links for vLLM</h1> + <a href="../{wheel_html_escaped}">{wheel}</a><br/> + </body> +</html> +""" + +parser = argparse.ArgumentParser() +parser.add_argument("--wheel", help="The wheel path.", required=True) +args = parser.parse_args() + +filename = os.path.basename(args.wheel) + +with open("index.html", "w") as f: + print(f"Generated index.html for {args.wheel}") + # cloudfront requires escaping the '+' character + f.write( + template.format(wheel=filename, + wheel_html_escaped=filename.replace("+", "%2B"))) diff --git a/.buildkite/upload-wheels.sh b/.buildkite/upload-wheels.sh index 7345dd4e66b29..0b6d2a1c64c91 100644 --- a/.buildkite/upload-wheels.sh +++ b/.buildkite/upload-wheels.sh @@ -23,6 +23,8 @@ wheel="$new_wheel" version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) echo "Version: $version" +normal_wheel="$wheel" # Save the original wheel filename + # If the version contains "dev", rename it to v1.0.0.dev for consistency if [[ $version == *dev* ]]; then suffix="${version##*.}" @@ -32,12 +34,24 @@ if [[ $version == *dev* ]]; then new_version="1.0.0.dev" fi new_wheel="${wheel/$version/$new_version}" - mv -- "$wheel" "$new_wheel" + # use cp to keep both files in the artifacts directory + cp -- "$wheel" "$new_wheel" wheel="$new_wheel" version="$new_version" fi # Upload the wheel to S3 +python3 .buildkite/generate_index.py --wheel "$normal_wheel" + +# generate index for this commit aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" +aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" +aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" +aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" + +# generate index for nightly aws s3 cp "$wheel" "s3://vllm-wheels/nightly/" +aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" +aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" + aws s3 cp "$wheel" "s3://vllm-wheels/$version/" \ No newline at end of file From 72d9c316d3f6ede485146fe5aabd4e61dbc59069 Mon Sep 17 00:00:00 2001 From:
youkaichao Date: Sun, 22 Dec 2024 00:39:11 -0800 Subject: [PATCH 3/5] [cd][release] fix race conditions (#11407) Signed-off-by: youkaichao --- .buildkite/upload-wheels.sh | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/.buildkite/upload-wheels.sh b/.buildkite/upload-wheels.sh index 0b6d2a1c64c91..3c756659a715a 100644 --- a/.buildkite/upload-wheels.sh +++ b/.buildkite/upload-wheels.sh @@ -46,12 +46,26 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel" # generate index for this commit aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" -aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" -aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" + +if [[ $normal_wheel == *"cu118"* ]]; then + # if $normal_wheel matches cu118, do not upload the index.html + echo "Skipping index files for cu118 wheels" +else + # only upload index.html for cu12 wheels (default wheels) + aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" + aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" +fi # generate index for nightly aws s3 cp "$wheel" "s3://vllm-wheels/nightly/" aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" -aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" + +if [[ $normal_wheel == *"cu118"* ]]; then + # if $normal_wheel matches cu118, do not upload the index.html + echo "Skipping index files for cu118 wheels" +else + # only upload index.html for cu12 wheels (default wheels) + aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" +fi aws s3 cp "$wheel" "s3://vllm-wheels/$version/" \ No newline at end of file From f1d1bf6288abfe051ec4ad891c5a96575e347bfc Mon Sep 17 00:00:00 2001 From: "Jason T. 
Greene" Date: Sun, 22 Dec 2024 09:25:10 -0600 Subject: [PATCH 4/5] [Bugfix] Fix fully sharded LoRAs with Mixtral (#11390) Signed-off-by: Jason Greene --- tests/lora/test_mixtral.py | 4 +++- vllm/lora/layers.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 150221dfce6ab..797a495201d33 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -62,8 +62,9 @@ def test_mixtral_lora(mixtral_lora_files, tp_size): @pytest.mark.parametrize("tp_size", [4]) +@pytest.mark.parametrize("fully_shard", [True, False]) def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules, - tp_size): + tp_size, fully_shard): """This LoRA model has all supported Mixtral target modules""" if torch.cuda.device_count() < tp_size: @@ -82,6 +83,7 @@ def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules, max_loras=4, distributed_executor_backend="ray", tensor_parallel_size=tp_size, + fully_sharded_loras=fully_shard, max_lora_rank=32, ) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index a6c93a3d8bfe9..85164c2165a3c 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -425,8 +425,9 @@ class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA): if self.base_layer.skip_bias_add else None) return output, output_bias + # ReplicatedLinear should always be replaced, regardless of the fully + # sharded LoRAs setting, because it is, by definition, copied per GPU. 
@classmethod - @_not_fully_sharded_can_replace def can_replace_layer( cls, source_layer: nn.Module, From 048fc57a0fb599a3e39bbc9228432b0d1bb9e88d Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Sun, 22 Dec 2024 14:17:43 -0800 Subject: [PATCH 5/5] [CI] Unblock H100 Benchmark (#11419) Signed-off-by: simon-mo --- .buildkite/nightly-benchmarks/benchmark-pipeline.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml index 64ba1b32fb074..708e548727cf5 100644 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -65,9 +65,9 @@ steps: - VLLM_USAGE_SOURCE - HF_TOKEN - - block: "Run H100 Benchmark" - key: block-h100 - depends_on: ~ + #- block: "Run H100 Benchmark" + #key: block-h100 + #depends_on: ~ - label: "H100" # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"