[Core] Enable sharded state loader for V1 engine and enhance test coverage (#25308)

Signed-off-by: pengdrumli <pengdrumli@tencent.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
lirong 2025-09-20 21:15:22 +08:00 committed by yewentao256
parent bc76128565
commit 9fc86d2802
2 changed files with 12 additions and 14 deletions

View File

@@ -57,10 +57,19 @@ def llama_3p2_1b_files():
def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
llm_sharded_writer = LLM(model=input_dir, **kwargs)
# Check which engine version is being used
is_v1_engine = hasattr(llm_sharded_writer.llm_engine, "engine_core")
# Dump worker states to output directory
llm_sharded_writer.llm_engine.model_executor.save_sharded_state(
path=output_dir)
if is_v1_engine:
# For V1 engine, we need to use engine_core.save_sharded_state
print("Using V1 engine save path")
llm_sharded_writer.llm_engine.engine_core.save_sharded_state(
path=output_dir)
else:
# For V0 engine
print("Using V0 engine save path")
model_executor = llm_sharded_writer.llm_engine.model_executor
model_executor.save_sharded_state(path=output_dir)
# Copy metadata files to output directory
for file in os.listdir(input_dir):
@@ -91,8 +100,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
gpu_memory_utilization = 0.8
input_dir = llama_3p2_1b_files
ctx = mp.get_context("spawn")
# The interface in v1 engine has changed, run in v1 engine will hang.
monkeypatch.setenv("VLLM_USE_V1", "0")
# Run in separate processes for memory & CUDA isolation
with TemporaryDirectory() as output_dir:
@@ -100,7 +107,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
args=(input_dir, output_dir, weights_patterns),
kwargs=dict(
tensor_parallel_size=tp_size,
distributed_executor_backend="mp",
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=True,
))
@@ -112,7 +118,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
p = ctx.Process(target=_run_generate,
args=(input_dir, queue),
kwargs=dict(
distributed_executor_backend="mp",
enable_lora=enable_lora,
gpu_memory_utilization=gpu_memory_utilization,
tensor_parallel_size=tp_size,
@@ -133,7 +138,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
p = ctx.Process(target=_run_generate,
args=(output_dir, queue),
kwargs=dict(
distributed_executor_backend="mp",
enable_lora=enable_lora,
gpu_memory_utilization=gpu_memory_utilization,
tensor_parallel_size=tp_size,

View File

@@ -1486,12 +1486,6 @@ class EngineArgs:
#############################################################
# Unsupported Feature Flags on V1.
if self.load_format == "sharded_state":
_raise_or_fallback(
feature_name=f"--load_format {self.load_format}",
recommend_to_remove=False)
return False
if (self.logits_processor_pattern
!= EngineArgs.logits_processor_pattern):
_raise_or_fallback(feature_name="--logits-processor-pattern",