mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-29 04:17:07 +08:00
Merge branch 'main' into v1-blocktable-opt
This commit is contained in:
commit
0420fb2c7b
24
.buildkite/generate_index.py
Normal file
24
.buildkite/generate_index.py
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
"""Write an ``index.html`` page containing a single link to a wheel file.

Usage: ``python3 generate_index.py --wheel <path-to-wheel>``

The page is written to ``index.html`` in the current working directory.
"""
import argparse
import os

# HTML page with one anchor; the href points one directory above the page.
template = """<!DOCTYPE html>
<html>
<body>
<h1>Links for vLLM</h1>
<a href="../{wheel_html_escaped}">{wheel}</a><br/>
</body>
</html>
"""


def render_index(wheel_path):
    """Return the index page HTML for the wheel located at *wheel_path*.

    Only the base name of the path is used. The link text shows the file
    name verbatim, while the ``href`` has every ``+`` replaced by ``%2B``.
    """
    filename = os.path.basename(wheel_path)
    # cloudfront requires escaping the '+' character
    return template.format(wheel=filename,
                           wheel_html_escaped=filename.replace("+", "%2B"))


def main(argv=None):
    """Parse ``--wheel`` from *argv* (default: ``sys.argv``) and write the page."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--wheel", help="The wheel path.", required=True)
    args = parser.parse_args(argv)

    with open("index.html", "w", encoding="utf-8") as f:
        f.write(render_index(args.wheel))
    # Report success only after the file has actually been written.
    print(f"Generated index.html for {args.wheel}")


if __name__ == "__main__":
    main()
|
||||||
@ -65,9 +65,9 @@ steps:
|
|||||||
- VLLM_USAGE_SOURCE
|
- VLLM_USAGE_SOURCE
|
||||||
- HF_TOKEN
|
- HF_TOKEN
|
||||||
|
|
||||||
- block: "Run H100 Benchmark"
|
#- block: "Run H100 Benchmark"
|
||||||
key: block-h100
|
#key: block-h100
|
||||||
depends_on: ~
|
#depends_on: ~
|
||||||
|
|
||||||
- label: "H100"
|
- label: "H100"
|
||||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||||
|
|||||||
@ -23,6 +23,8 @@ wheel="$new_wheel"
|
|||||||
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
||||||
echo "Version: $version"
|
echo "Version: $version"
|
||||||
|
|
||||||
|
normal_wheel="$wheel" # Save the original wheel filename
|
||||||
|
|
||||||
# If the version contains "dev", rename it to v1.0.0.dev for consistency
|
# If the version contains "dev", rename it to v1.0.0.dev for consistency
|
||||||
if [[ $version == *dev* ]]; then
|
if [[ $version == *dev* ]]; then
|
||||||
suffix="${version##*.}"
|
suffix="${version##*.}"
|
||||||
@ -32,12 +34,38 @@ if [[ $version == *dev* ]]; then
|
|||||||
new_version="1.0.0.dev"
|
new_version="1.0.0.dev"
|
||||||
fi
|
fi
|
||||||
new_wheel="${wheel/$version/$new_version}"
|
new_wheel="${wheel/$version/$new_version}"
|
||||||
mv -- "$wheel" "$new_wheel"
|
# use cp to keep both files in the artifacts directory
|
||||||
|
cp -- "$wheel" "$new_wheel"
|
||||||
wheel="$new_wheel"
|
wheel="$new_wheel"
|
||||||
version="$new_version"
|
version="$new_version"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Upload the wheel to S3
|
# Upload the wheel to S3
|
||||||
|
python3 .buildkite/generate_index.py --wheel "$normal_wheel"
|
||||||
|
|
||||||
|
# generate index for this commit
|
||||||
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||||
|
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||||
|
|
||||||
|
if [[ $normal_wheel == *"cu118"* ]]; then
|
||||||
|
# if $normal_wheel matches cu118, do not upload the index.html
|
||||||
|
echo "Skipping index files for cu118 wheels"
|
||||||
|
else
|
||||||
|
# only upload index.html for cu12 wheels (default wheels)
|
||||||
|
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
|
||||||
|
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# generate index for nightly
|
||||||
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
|
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
|
||||||
|
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
|
||||||
|
|
||||||
|
if [[ $normal_wheel == *"cu118"* ]]; then
|
||||||
|
# if $normal_wheel matches cu118, do not upload the index.html
|
||||||
|
echo "Skipping index files for cu118 wheels"
|
||||||
|
else
|
||||||
|
# only upload index.html for cu12 wheels (default wheels)
|
||||||
|
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
|
||||||
|
fi
|
||||||
|
|
||||||
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
|
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
|
||||||
@ -74,6 +74,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
|
|||||||
messages=messages,
|
messages=messages,
|
||||||
max_completion_tokens=10,
|
max_completion_tokens=10,
|
||||||
logprobs=True,
|
logprobs=True,
|
||||||
|
temperature=0.0,
|
||||||
top_logprobs=5)
|
top_logprobs=5)
|
||||||
assert len(chat_completion.choices) == 1
|
assert len(chat_completion.choices) == 1
|
||||||
|
|
||||||
@ -130,6 +131,7 @@ async def test_single_chat_session_audio_base64encoded(
|
|||||||
messages=messages,
|
messages=messages,
|
||||||
max_completion_tokens=10,
|
max_completion_tokens=10,
|
||||||
logprobs=True,
|
logprobs=True,
|
||||||
|
temperature=0.0,
|
||||||
top_logprobs=5)
|
top_logprobs=5)
|
||||||
assert len(chat_completion.choices) == 1
|
assert len(chat_completion.choices) == 1
|
||||||
|
|
||||||
@ -150,6 +152,7 @@ async def test_single_chat_session_audio_base64encoded(
|
|||||||
model=model_name,
|
model=model_name,
|
||||||
messages=messages,
|
messages=messages,
|
||||||
max_completion_tokens=10,
|
max_completion_tokens=10,
|
||||||
|
temperature=0.0,
|
||||||
)
|
)
|
||||||
message = chat_completion.choices[0].message
|
message = chat_completion.choices[0].message
|
||||||
assert message.content is not None and len(message.content) >= 0
|
assert message.content is not None and len(message.content) >= 0
|
||||||
|
|||||||
@ -82,6 +82,7 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI,
|
|||||||
messages=messages,
|
messages=messages,
|
||||||
max_completion_tokens=10,
|
max_completion_tokens=10,
|
||||||
logprobs=True,
|
logprobs=True,
|
||||||
|
temperature=0.0,
|
||||||
top_logprobs=5)
|
top_logprobs=5)
|
||||||
assert len(chat_completion.choices) == 1
|
assert len(chat_completion.choices) == 1
|
||||||
|
|
||||||
@ -174,6 +175,7 @@ async def test_single_chat_session_video_base64encoded(
|
|||||||
messages=messages,
|
messages=messages,
|
||||||
max_completion_tokens=10,
|
max_completion_tokens=10,
|
||||||
logprobs=True,
|
logprobs=True,
|
||||||
|
temperature=0.0,
|
||||||
top_logprobs=5)
|
top_logprobs=5)
|
||||||
assert len(chat_completion.choices) == 1
|
assert len(chat_completion.choices) == 1
|
||||||
|
|
||||||
@ -194,6 +196,7 @@ async def test_single_chat_session_video_base64encoded(
|
|||||||
model=model_name,
|
model=model_name,
|
||||||
messages=messages,
|
messages=messages,
|
||||||
max_completion_tokens=10,
|
max_completion_tokens=10,
|
||||||
|
temperature=0.0,
|
||||||
)
|
)
|
||||||
message = chat_completion.choices[0].message
|
message = chat_completion.choices[0].message
|
||||||
assert message.content is not None and len(message.content) >= 0
|
assert message.content is not None and len(message.content) >= 0
|
||||||
|
|||||||
@ -83,6 +83,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
|
|||||||
messages=messages,
|
messages=messages,
|
||||||
max_completion_tokens=10,
|
max_completion_tokens=10,
|
||||||
logprobs=True,
|
logprobs=True,
|
||||||
|
temperature=0.0,
|
||||||
top_logprobs=5)
|
top_logprobs=5)
|
||||||
assert len(chat_completion.choices) == 1
|
assert len(chat_completion.choices) == 1
|
||||||
|
|
||||||
@ -175,6 +176,7 @@ async def test_single_chat_session_image_base64encoded(
|
|||||||
messages=messages,
|
messages=messages,
|
||||||
max_completion_tokens=10,
|
max_completion_tokens=10,
|
||||||
logprobs=True,
|
logprobs=True,
|
||||||
|
temperature=0.0,
|
||||||
top_logprobs=5)
|
top_logprobs=5)
|
||||||
assert len(chat_completion.choices) == 1
|
assert len(chat_completion.choices) == 1
|
||||||
|
|
||||||
@ -195,6 +197,7 @@ async def test_single_chat_session_image_base64encoded(
|
|||||||
model=model_name,
|
model=model_name,
|
||||||
messages=messages,
|
messages=messages,
|
||||||
max_completion_tokens=10,
|
max_completion_tokens=10,
|
||||||
|
temperature=0.0,
|
||||||
)
|
)
|
||||||
message = chat_completion.choices[0].message
|
message = chat_completion.choices[0].message
|
||||||
assert message.content is not None and len(message.content) >= 0
|
assert message.content is not None and len(message.content) >= 0
|
||||||
|
|||||||
@ -62,8 +62,9 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("tp_size", [4])
|
@pytest.mark.parametrize("tp_size", [4])
|
||||||
|
@pytest.mark.parametrize("fully_shard", [True, False])
|
||||||
def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
|
def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
|
||||||
tp_size):
|
tp_size, fully_shard):
|
||||||
"""This LoRA model has all supported Mixtral target modules"""
|
"""This LoRA model has all supported Mixtral target modules"""
|
||||||
|
|
||||||
if torch.cuda.device_count() < tp_size:
|
if torch.cuda.device_count() < tp_size:
|
||||||
@ -82,6 +83,7 @@ def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
|
|||||||
max_loras=4,
|
max_loras=4,
|
||||||
distributed_executor_backend="ray",
|
distributed_executor_backend="ray",
|
||||||
tensor_parallel_size=tp_size,
|
tensor_parallel_size=tp_size,
|
||||||
|
fully_sharded_loras=fully_shard,
|
||||||
max_lora_rank=32,
|
max_lora_rank=32,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -425,8 +425,9 @@ class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA):
|
|||||||
if self.base_layer.skip_bias_add else None)
|
if self.base_layer.skip_bias_add else None)
|
||||||
return output, output_bias
|
return output, output_bias
|
||||||
|
|
||||||
|
# ReplicatedLinear should always be replaced, regardless of the fully
|
||||||
|
# sharded LoRAs setting, because it is, by definition, copied per GPU.
|
||||||
@classmethod
|
@classmethod
|
||||||
@_not_fully_sharded_can_replace
|
|
||||||
def can_replace_layer(
|
def can_replace_layer(
|
||||||
cls,
|
cls,
|
||||||
source_layer: nn.Module,
|
source_layer: nn.Module,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user