Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-15 06:15:00 +08:00)
[CI/Test] improve robustness of test by replacing del with context manager (hf_runner) (#5347)
parent c96fc06747
commit 9fb900f90c
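
The diff below replaces the tests' manual "del hf_model" teardown (which relied on HfRunner.__del__ and the garbage collector) with a context manager. As background, here is a minimal, self-contained sketch of that pattern; DummyRunner and its cleanup helper are hypothetical stand-ins, not vLLM's actual HfRunner or cleanup code.

import gc


def cleanup():
    # Hypothetical stand-in for the tests' cleanup step (GC, freeing GPU memory).
    gc.collect()


class DummyRunner:
    """Hypothetical stand-in for HfRunner, reduced to its lifecycle methods."""

    def __init__(self, model: str, dtype: str = "half"):
        self.model = model  # in the real runner this is a loaded HF model

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Deterministic teardown: runs as soon as the with-block ends,
        # even if the block's body raised an exception.
        del self.model
        cleanup()


# Usage mirrors the updated tests: the model is released at block exit
# instead of whenever the garbage collector gets around to __del__.
with DummyRunner("facebook/opt-125m") as runner:
    pass  # generate_greedy(...) / encode(...) calls would go here
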
@@ -43,9 +43,8 @@ def test_models(
     if backend_by_env_var == "FLASHINFER" and enforce_eager is False:
         pytest.skip("Skipping non-eager test for FlashInferBackend.")
 
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     vllm_model = vllm_runner(model,
                              dtype=dtype,

@@ -40,9 +40,8 @@ def test_models(
     enable_chunked_prefill = True
     max_num_batched_tokens = chunked_prefill_token_size
 
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     vllm_model = vllm_runner(
         model,

@@ -43,9 +43,8 @@ def test_chunked_prefill_recompute(
     enable_chunked_prefill = True
     max_num_batched_tokens = chunked_prefill_token_size
 
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     vllm_model = vllm_runner(
         model,

@@ -82,9 +81,8 @@ def test_preemption(
 ) -> None:
     """By default, recompute preemption is enabled"""
 
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     vllm_model = vllm_runner(
         model,

@@ -137,10 +135,9 @@ def test_swap(
 ) -> None:
     """Use beam search enables swapping."""
     example_prompts = example_prompts[:1]
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
-                                               max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
+                                                   max_tokens)
 
     vllm_model = vllm_runner(
         model,

@@ -354,7 +354,10 @@ class HfRunner:
     def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]:
         return self.model.encode(prompts)
 
-    def __del__(self):
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
         del self.model
         cleanup()
 

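The __enter__/__exit__ pair added above is what lets the tests write "with hf_runner(...) as hf_model:". A with block behaves roughly like try/finally, so the model is released even when generation fails mid-test, which is the robustness gain this commit is after. A small, self-contained sketch of that argument (hypothetical FakeRunner, not the real HfRunner):

class FakeRunner:
    """Hypothetical runner demonstrating exception-safe teardown."""

    def __init__(self):
        self.released = False

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.released = True  # stand-in for `del self.model; cleanup()`
        # Returning None (falsy) lets the exception keep propagating.


runner = FakeRunner()
try:
    with runner:
        raise RuntimeError("simulated failure during generation")
except RuntimeError:
    pass

assert runner.released  # teardown ran despite the exception
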
@@ -42,9 +42,8 @@ def test_models(
     backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
     enforce_eager = backend_by_env_var == "FLASHINFER"
 
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     vllm_model = vllm_runner(
         model,

@@ -45,9 +45,8 @@ def test_models(
     enable_chunked_prefill = True
     max_num_batched_tokens = chunked_prefill_token_size
 
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     vllm_model = vllm_runner(
         model,

@@ -34,9 +34,8 @@ def test_models(
     dtype: str,
     max_tokens: int,
 ) -> None:
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     vllm_model = vllm_runner(model, dtype=dtype)
     vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

@@ -28,9 +28,8 @@ def test_models(
     model: str,
     dtype: str,
 ) -> None:
-    hf_model = hf_runner(model, dtype=dtype, is_embedding_model=True)
-    hf_outputs = hf_model.encode(example_prompts)
-    del hf_model
+    with hf_runner(model, dtype=dtype, is_embedding_model=True) as hf_model:
+        hf_outputs = hf_model.encode(example_prompts)
 
     vllm_model = vllm_runner(model, dtype=dtype)
     vllm_outputs = vllm_model.encode(example_prompts)

@@ -84,11 +84,10 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
     """
     model_id, vlm_config = model_and_config
 
-    hf_model = hf_runner(model_id, dtype=dtype, is_vision_model=True)
-    hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
-                                          max_tokens,
-                                          images=hf_images)
-    del hf_model
+    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
+        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
+                                              max_tokens,
+                                              images=hf_images)
 
     vllm_image_prompts = [
         p.replace("<image>", "<image>" * vlm_config.image_feature_size)

@@ -26,10 +26,9 @@ def test_models(
     num_logprobs: int,
 ) -> None:
     # TODO(sang): Sliding window should be tested separately.
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy_logprobs_limit(
-        example_prompts, max_tokens, num_logprobs)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
 
     vllm_model = vllm_runner(model, dtype=dtype)
     vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,

@@ -34,9 +34,8 @@ def test_models(
     # To pass the small model tests, we need full precision.
     assert dtype == "float"
 
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     vllm_model = vllm_runner(model, dtype=dtype)
     vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

@@ -30,10 +30,9 @@ def test_beam_search_single_input(
     beam_width: int,
 ) -> None:
     example_prompts = example_prompts[:1]
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
-                                               max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
+                                                   max_tokens)
 
     vllm_model = vllm_runner(model, dtype=dtype)
     vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,

@@ -32,12 +32,11 @@ def test_get_prompt_logprobs(
     max_num_batched_tokens = chunked_prefill_token_size
 
     max_tokens = 5
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_logprobs = hf_model.generate_greedy_logprobs(
-        example_prompts,
-        max_tokens=max_tokens,
-    )
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_logprobs = hf_model.generate_greedy_logprobs(
+            example_prompts,
+            max_tokens=max_tokens,
+        )
 
     vllm_model = vllm_runner(
         model,

@@ -116,16 +116,14 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(
 
 def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
                                                 tmp_path):
-    hf_model = hf_runner(model_ref)
-    model_path = tmp_path / (model_ref + ".tensors")
-    max_tokens = 50
-    outputs = hf_model.generate_greedy(prompts, max_tokens=max_tokens)
-    with open_stream(model_path, "wb+") as stream:
-        serializer = TensorSerializer(stream)
-        serializer.write_module(hf_model.model)
-    del hf_model
-    gc.collect()
-    torch.cuda.empty_cache()
+    with hf_runner(model_ref) as hf_model:
+        model_path = tmp_path / (model_ref + ".tensors")
+        max_tokens = 50
+        outputs = hf_model.generate_greedy(prompts, max_tokens=max_tokens)
+        with open_stream(model_path, "wb+") as stream:
+            serializer = TensorSerializer(stream)
+            serializer.write_module(hf_model.model)
+
     loaded_hf_model = vllm_runner(model_ref,
                                   load_format="tensorizer",
                                   model_loader_extra_config=TensorizerConfig(