[Core] Enable sharded state loader for V1 engine and enhance test coverage (#25308)
Signed-off-by: pengdrumli <pengdrumli@tencent.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
parent bc76128565
commit 9fc86d2802
@@ -57,10 +57,19 @@ def llama_3p2_1b_files():
 def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
     llm_sharded_writer = LLM(model=input_dir, **kwargs)
+
+    # Check which engine version is being used
+    is_v1_engine = hasattr(llm_sharded_writer.llm_engine, "engine_core")
+
     # Dump worker states to output directory
-    llm_sharded_writer.llm_engine.model_executor.save_sharded_state(
-        path=output_dir)
+    if is_v1_engine:
+        # For V1 engine, we need to use engine_core.save_sharded_state
+        print("Using V1 engine save path")
+        llm_sharded_writer.llm_engine.engine_core.save_sharded_state(
+            path=output_dir)
+    else:
+        # For V0 engine
+        print("Using V0 engine save path")
+        model_executor = llm_sharded_writer.llm_engine.model_executor
+        model_executor.save_sharded_state(path=output_dir)
+
     # Copy metadata files to output directory
     for file in os.listdir(input_dir):
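For context, a minimal standalone sketch of the save path this helper exercises (not part of the diff): the engine_core attribute check mirrors the V1 detection above, while the paths, parallel size, and the suffix filter standing in for weights_patterns are placeholder assumptions.

# Hedged sketch, not part of the diff: save a sharded checkpoint using the
# same V1/V0 branch the test helper takes.
import os
import shutil

from vllm import LLM

input_dir = "/path/to/hf/checkpoint"      # placeholder: a local HF-format model
output_dir = "/path/to/sharded/output"    # placeholder: where shards are written

llm = LLM(model=input_dir, tensor_parallel_size=2, enforce_eager=True)

os.makedirs(output_dir, exist_ok=True)

# V1 exposes save_sharded_state on engine_core, V0 on model_executor.
if hasattr(llm.llm_engine, "engine_core"):
    llm.llm_engine.engine_core.save_sharded_state(path=output_dir)
else:
    llm.llm_engine.model_executor.save_sharded_state(path=output_dir)

# The sharded files hold weights only, so copy config/tokenizer metadata too
# (the suffix filter below is an assumption standing in for weights_patterns).
for name in os.listdir(input_dir):
    src = os.path.join(input_dir, name)
    if os.path.isfile(src) and not name.endswith((".safetensors", ".bin", ".pt")):
        shutil.copy(src, output_dir)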
@@ -91,8 +100,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
     gpu_memory_utilization = 0.8
     input_dir = llama_3p2_1b_files
     ctx = mp.get_context("spawn")
-    # The interface in v1 engine has changed, run in v1 engine will hang.
-    monkeypatch.setenv("VLLM_USE_V1", "0")
 
     # Run in separate processes for memory & CUDA isolation
     with TemporaryDirectory() as output_dir:
@@ -100,7 +107,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
                         args=(input_dir, output_dir, weights_patterns),
                         kwargs=dict(
                             tensor_parallel_size=tp_size,
-                            distributed_executor_backend="mp",
                             gpu_memory_utilization=gpu_memory_utilization,
                             enforce_eager=True,
                         ))
@@ -112,7 +118,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
         p = ctx.Process(target=_run_generate,
                         args=(input_dir, queue),
                         kwargs=dict(
-                            distributed_executor_backend="mp",
                             enable_lora=enable_lora,
                             gpu_memory_utilization=gpu_memory_utilization,
                             tensor_parallel_size=tp_size,
@@ -133,7 +138,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
         p = ctx.Process(target=_run_generate,
                         args=(output_dir, queue),
                         kwargs=dict(
-                            distributed_executor_backend="mp",
                             enable_lora=enable_lora,
                             gpu_memory_utilization=gpu_memory_utilization,
                             tensor_parallel_size=tp_size,
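The three test hunks above all touch the same spawn-and-join pattern. A minimal sketch of that isolation idiom follows (not part of the diff); the worker body and path are placeholders standing in for _run_writer / _run_generate.

# Hedged sketch: the "spawn" process isolation the test relies on, so each
# writer/reader gets a fresh CUDA context and GPU memory is released on exit.
import multiprocessing as mp


def _worker(path, queue):
    # Placeholder for _run_writer / _run_generate; the real workers build an
    # LLM here and push their outputs onto the queue for the parent to compare.
    queue.put(f"finished: {path}")


if __name__ == "__main__":
    ctx = mp.get_context("spawn")
    queue = ctx.Queue()
    p = ctx.Process(target=_worker, args=("/tmp/sharded-out", queue))
    p.start()
    p.join()
    assert p.exitcode == 0
    print(queue.get())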
@@ -1486,12 +1486,6 @@ class EngineArgs:
         #############################################################
         # Unsupported Feature Flags on V1.
 
-        if self.load_format == "sharded_state":
-            _raise_or_fallback(
-                feature_name=f"--load_format {self.load_format}",
-                recommend_to_remove=False)
-            return False
-
         if (self.logits_processor_pattern
                 != EngineArgs.logits_processor_pattern):
             _raise_or_fallback(feature_name="--logits-processor-pattern",
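With this branch removed, load_format="sharded_state" no longer trips the V1 feature gate, which is why the VLLM_USE_V1=0 override could also be dropped from the test above. A minimal loading sketch (not part of the diff), assuming a directory produced by save_sharded_state plus the copied metadata files; the path and parallel size are placeholders.

from vllm import LLM, SamplingParams

# Placeholder path: a directory written by save_sharded_state that also
# contains the model's config/tokenizer files.
llm = LLM(
    model="/path/to/sharded/output",
    load_format="sharded_state",
    tensor_parallel_size=2,
    enforce_eager=True,
)
out = llm.generate(["Hello"], SamplingParams(max_tokens=8))
print(out[0].outputs[0].text)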