[Misc] unify variable for LLM instance (#20996)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
parent e6b90a2805
commit d97841078b
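The commit renames the local variable that holds the `LLM` instance from `model` to `llm` across docs, examples, and tests (and the `VllmRunner.model` attribute to `VllmRunner.llm`), so that `model` is left to refer to model names, configs, and `nn.Module` objects. A minimal sketch of the resulting convention, assuming the standard offline-inference API (the model name and prompt below are illustrative, not taken from this diff):

```python
from vllm import LLM, SamplingParams

# "llm" names the vLLM entrypoint object; "model" is reserved for the model id/weights.
llm = LLM(model="facebook/opt-125m")
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.8, max_tokens=16))
print(outputs[0].outputs[0].text)
```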
@@ -14,7 +14,7 @@ For example:
 ```python
 from vllm import LLM

-model = LLM(
+llm = LLM(
     model="cerebras/Cerebras-GPT-1.3B",
     hf_overrides={"architectures": ["GPT2LMHeadModel"]},  # GPT-2
 )
@@ -302,7 +302,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
     return tokenizer.apply_chat_template(chat, tokenize=False)


-model = LLM(
+llm = LLM(
     model=model_id,
     enable_lora=True,
     max_lora_rank=64,
@@ -329,7 +329,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
 }


-outputs = model.generate(
+outputs = llm.generate(
     inputs,
     sampling_params=SamplingParams(
         temperature=0.2,
@@ -86,8 +86,9 @@ Load and run the model in `vllm`:

 ```python
 from vllm import LLM
-model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic")
-result = model.generate("Hello my name is")
+
+llm = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic")
+result = llm.generate("Hello my name is")
 print(result[0].outputs[0].text)
 ```

@@ -125,9 +126,10 @@ In this mode, all Linear modules (except for the final `lm_head`) have their wei

 ```python
 from vllm import LLM
-model = LLM("facebook/opt-125m", quantization="fp8")
+
+llm = LLM("facebook/opt-125m", quantization="fp8")
 # INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB
-result = model.generate("Hello, my name is")
+result = llm.generate("Hello, my name is")
 print(result[0].outputs[0].text)
 ```

@@ -108,7 +108,8 @@ After quantization, you can load and run the model in vLLM:

 ```python
 from vllm import LLM
-model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")
+
+llm = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")
 ```

 To evaluate accuracy, you can use `lm_eval`:
@@ -114,7 +114,8 @@ After quantization, you can load and run the model in vLLM:

 ```python
 from vllm import LLM
-model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
+
+llm = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
 ```

 To evaluate accuracy, you can use `lm_eval`:
@@ -174,10 +174,10 @@ You can change the output dimensions of embedding models that support Matryoshka
 ```python
 from vllm import LLM, PoolingParams

-model = LLM(model="jinaai/jina-embeddings-v3",
+llm = LLM(model="jinaai/jina-embeddings-v3",
           task="embed",
           trust_remote_code=True)
-outputs = model.embed(["Follow the white rabbit."],
+outputs = llm.embed(["Follow the white rabbit."],
                     pooling_params=PoolingParams(dimensions=32))
 print(outputs[0].outputs)
 ```
@@ -28,10 +28,10 @@ def main(args: Namespace):

     # Create an LLM.
     # You should pass task="classify" for classification models
-    model = LLM(**vars(args))
+    llm = LLM(**vars(args))

     # Generate logits. The output is a list of ClassificationRequestOutputs.
-    outputs = model.classify(prompts)
+    outputs = llm.classify(prompts)

     # Print the outputs.
     print("\nGenerated Outputs:\n" + "-" * 60)
@@ -31,10 +31,10 @@ def main(args: Namespace):

     # Create an LLM.
     # You should pass task="embed" for embedding models
-    model = LLM(**vars(args))
+    llm = LLM(**vars(args))

     # Generate embedding. The output is a list of EmbeddingRequestOutputs.
-    outputs = model.embed(prompts)
+    outputs = llm.embed(prompts)

     # Print the outputs.
     print("\nGenerated Outputs:\n" + "-" * 60)
@@ -27,10 +27,10 @@ def main(args: Namespace):

     # Create an LLM.
     # You should pass task="score" for cross-encoder models
-    model = LLM(**vars(args))
+    llm = LLM(**vars(args))

     # Generate scores. The output is a list of ScoringRequestOutputs.
-    outputs = model.score(text_1, texts_2)
+    outputs = llm.score(text_1, texts_2)

     # Print the outputs.
     print("\nGenerated Outputs:\n" + "-" * 60)
@@ -30,11 +30,11 @@ def main(args: Namespace):

     # Create an LLM.
     # You should pass task="embed" for embedding models
-    model = LLM(**vars(args))
+    llm = LLM(**vars(args))

     # Generate embedding. The output is a list of EmbeddingRequestOutputs.
     # Only text matching task is supported for now. See #16120
-    outputs = model.embed(prompts)
+    outputs = llm.embed(prompts)

     # Print the outputs.
     print("\nGenerated Outputs:")
@@ -30,10 +30,10 @@ def main(args: Namespace):

     # Create an LLM.
     # You should pass task="embed" for embedding models
-    model = LLM(**vars(args))
+    llm = LLM(**vars(args))

     # Generate embedding. The output is a list of EmbeddingRequestOutputs.
-    outputs = model.embed(prompts, pooling_params=PoolingParams(dimensions=32))
+    outputs = llm.embed(prompts, pooling_params=PoolingParams(dimensions=32))

     # Print the outputs.
     print("\nGenerated Outputs:")
@@ -25,7 +25,7 @@ def config_buckets():
     os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"


-def initialize_model():
+def initialize_llm():
     """Create an LLM with speculative decoding."""
     return LLM(
         model="openlm-research/open_llama_7b",
@@ -43,9 +43,9 @@ def initialize_model():
     )


-def process_requests(model: LLM, sampling_params: SamplingParams):
+def process_requests(llm: LLM, sampling_params: SamplingParams):
     """Generate texts from prompts and print them."""
-    outputs = model.generate(prompts, sampling_params)
+    outputs = llm.generate(prompts, sampling_params)
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text
@@ -53,12 +53,12 @@ def process_requests(model: LLM, sampling_params: SamplingParams):


 def main():
-    """Main function that sets up the model and processes prompts."""
+    """Main function that sets up the llm and processes prompts."""
     config_buckets()
-    model = initialize_model()
+    llm = initialize_llm()
     # Create a sampling params object.
     sampling_params = SamplingParams(max_tokens=100, top_k=1)
-    process_requests(model, sampling_params)
+    process_requests(llm, sampling_params)


 if __name__ == "__main__":
@@ -140,7 +140,7 @@ datamodule_config = {
 class PrithviMAE:
     def __init__(self):
         print("Initializing PrithviMAE model")
-        self.model = LLM(
+        self.llm = LLM(
             model=os.path.join(os.path.dirname(__file__), "./model"),
             skip_tokenizer_init=True,
             dtype="float32",
@@ -158,7 +158,7 @@ class PrithviMAE:

         prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data}

-        outputs = self.model.encode(prompt, use_tqdm=False)
+        outputs = self.llm.encode(prompt, use_tqdm=False)
         print("################ Inference done (it took seconds) ##############")

         return outputs[0].outputs.data
@@ -17,13 +17,13 @@ model_name = "Qwen/Qwen3-Reranker-0.6B"
 # Models converted offline using this method can not only be more efficient
 # and support the vllm score API, but also make the init parameters more
 # concise, for example.
-# model = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")
+# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")

 # If you want to load the official original version, the init parameters are
 # as follows.


-def get_model() -> LLM:
+def get_llm() -> LLM:
     """Initializes and returns the LLM model for Qwen3-Reranker."""
     return LLM(
         model=model_name,
@@ -77,8 +77,8 @@ def main() -> None:
     ]
     documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents]

-    model = get_model()
-    outputs = model.score(queries, documents)
+    llm = get_llm()
+    outputs = llm.score(queries, documents)

     print("-" * 30)
     print([output.outputs.score for output in outputs])
@@ -236,13 +236,13 @@ def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
     monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')

     with vllm_runner('facebook/opt-125m', enforce_eager=True) as vllm_model:
-        if isinstance(vllm_model.model.llm_engine, LLMEngineV1):
+        if isinstance(vllm_model.llm.llm_engine, LLMEngineV1):
             v1_test_failed_model_execution(vllm_model)


 def v1_test_failed_model_execution(vllm_model):

-    engine = vllm_model.model.llm_engine
+    engine = vllm_model.llm.llm_engine
     mocked_execute_model = Mock(
         side_effect=RuntimeError("Mocked Critical Error"))
     engine.engine_core.engine_core.model_executor.execute_model =\
@@ -81,7 +81,7 @@ def test_chunked_prefill_recompute(
             disable_log_stats=False,
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+        assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
                 < ARTIFICIAL_PREEMPTION_MAX_CNT)

     for i in range(len(example_prompts)):
@@ -118,10 +118,10 @@ def test_preemption(
             distributed_executor_backend=distributed_executor_backend,
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+        assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
                 < ARTIFICIAL_PREEMPTION_MAX_CNT)
         total_preemption = (
-            vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
+            vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption)

     check_outputs_equal(
         outputs_0_lst=hf_outputs,
@@ -174,12 +174,12 @@ def test_preemption_infeasible(
     ) as vllm_model:
         sampling_params = SamplingParams(max_tokens=max_tokens,
                                          ignore_eos=True)
-        req_outputs = vllm_model.model.generate(
+        req_outputs = vllm_model.llm.generate(
             example_prompts,
             sampling_params=sampling_params,
         )

-        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+        assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
                 < ARTIFICIAL_PREEMPTION_MAX_CNT)

         # Verify the request is ignored and not hang.
@@ -784,7 +784,7 @@ class VllmRunner:
         enforce_eager: Optional[bool] = False,
         **kwargs,
     ) -> None:
-        self.model = LLM(
+        self.llm = LLM(
             model=model_name,
             task=task,
             tokenizer=tokenizer_name,
@@ -854,7 +854,7 @@ class VllmRunner:
                                  videos=videos,
                                  audios=audios)

-        req_outputs = self.model.generate(inputs,
+        req_outputs = self.llm.generate(inputs,
                                           sampling_params=sampling_params,
                                           **kwargs)

@@ -902,7 +902,7 @@ class VllmRunner:
                                  videos=videos,
                                  audios=audios)

-        req_outputs = self.model.generate(inputs,
+        req_outputs = self.llm.generate(inputs,
                                           sampling_params=sampling_params,
                                           **kwargs)

@@ -924,7 +924,7 @@ class VllmRunner:
        '''

        assert sampling_params.logprobs is not None
-        req_outputs = self.model.generate(encoder_decoder_prompts,
+        req_outputs = self.llm.generate(encoder_decoder_prompts,
                                           sampling_params=sampling_params)
        toks_str_logsprobs_prompt_logprobs = (
            self._final_steps_generate_w_logprobs(req_outputs))
@@ -1018,7 +1018,7 @@ class VllmRunner:
                                  videos=videos,
                                  audios=audios)

-        outputs = self.model.beam_search(
+        outputs = self.llm.beam_search(
             inputs,
             BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
         returned_outputs = []
@@ -1029,7 +1029,7 @@ class VllmRunner:
         return returned_outputs

     def classify(self, prompts: list[str]) -> list[list[float]]:
-        req_outputs = self.model.classify(prompts)
+        req_outputs = self.llm.classify(prompts)
         return [req_output.outputs.probs for req_output in req_outputs]

     def embed(self,
@@ -1044,11 +1044,11 @@ class VllmRunner:
                                  videos=videos,
                                  audios=audios)

-        req_outputs = self.model.embed(inputs, *args, **kwargs)
+        req_outputs = self.llm.embed(inputs, *args, **kwargs)
         return [req_output.outputs.embedding for req_output in req_outputs]

     def encode(self, prompts: list[str]) -> list[list[float]]:
-        req_outputs = self.model.encode(prompts)
+        req_outputs = self.llm.encode(prompts)
         return [req_output.outputs.data for req_output in req_outputs]

     def score(
@@ -1058,18 +1058,18 @@ class VllmRunner:
         *args,
         **kwargs,
     ) -> list[float]:
-        req_outputs = self.model.score(text_1, text_2, *args, **kwargs)
+        req_outputs = self.llm.score(text_1, text_2, *args, **kwargs)
         return [req_output.outputs.score for req_output in req_outputs]

     def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
-        executor = self.model.llm_engine.model_executor
+        executor = self.llm.llm_engine.model_executor
         return executor.apply_model(func)

     def __enter__(self):
         return self

     def __exit__(self, exc_type, exc_value, traceback):
-        del self.model
+        del self.llm
         cleanup_dist_env_and_memory()


@@ -37,7 +37,7 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
                         num_scheduler_steps=num_scheduler_steps,
                         enable_chunked_prefill=enable_chunked_prefill,
                         enforce_eager=enforce_eager)
-    engine: LLMEngine = runner.model.llm_engine
+    engine: LLMEngine = runner.llm.llm_engine

     # In multi-step + chunked-prefill there is no separate single prompt step.
     # What is scheduled will run for num_scheduler_steps always.
@@ -28,7 +28,7 @@ def vllm_model(vllm_runner):
 def test_stop_reason(vllm_model, example_prompts):
     tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL)
     stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR)
-    llm = vllm_model.model
+    llm = vllm_model.llm

     # test stop token
     outputs = llm.generate(example_prompts,
@@ -101,42 +101,42 @@ def _stop_token_id(llm):
 def test_stop_strings():
     # If V0, must set enforce_eager=False since we use
     # async output processing below.
-    vllm_model = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)
+    llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)

     if envs.VLLM_USE_V1:
-        _stop_basic(vllm_model)
+        _stop_basic(llm)
     else:
-        _set_async_mode(vllm_model, True)
-        _stop_basic(vllm_model)
+        _set_async_mode(llm, True)
+        _stop_basic(llm)

-        _set_async_mode(vllm_model, False)
-        _stop_basic(vllm_model)
+        _set_async_mode(llm, False)
+        _stop_basic(llm)

     if envs.VLLM_USE_V1:
-        _stop_multi_tokens(vllm_model)
+        _stop_multi_tokens(llm)
     else:
-        _set_async_mode(vllm_model, True)
-        _stop_multi_tokens(vllm_model)
+        _set_async_mode(llm, True)
+        _stop_multi_tokens(llm)

-        _set_async_mode(vllm_model, False)
-        _stop_multi_tokens(vllm_model)
+        _set_async_mode(llm, False)
+        _stop_multi_tokens(llm)

     if envs.VLLM_USE_V1:
-        _stop_partial_token(vllm_model)
+        _stop_partial_token(llm)
     else:
-        _set_async_mode(vllm_model, True)
-        _stop_partial_token(vllm_model)
+        _set_async_mode(llm, True)
+        _stop_partial_token(llm)

-        _set_async_mode(vllm_model, False)
-        _stop_partial_token(vllm_model)
+        _set_async_mode(llm, False)
+        _stop_partial_token(llm)

     if envs.VLLM_USE_V1:
         # FIXME: this does not respect include_in_output=False
-        # _stop_token_id(vllm_model)
+        # _stop_token_id(llm)
         pass
     else:
-        _set_async_mode(vllm_model, True)
-        _stop_token_id(vllm_model)
+        _set_async_mode(llm, True)
+        _stop_token_id(llm)

-        _set_async_mode(vllm_model, False)
-        _stop_token_id(vllm_model)
+        _set_async_mode(llm, False)
+        _stop_token_id(llm)
@@ -186,7 +186,7 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
     model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
     tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))

-    loaded_vllm_model = LLM(model=model_ref,
+    loaded_llm = LLM(model=model_ref,
                             load_format="tensorizer",
                             enable_lora=True,
                             enforce_eager=True,
@@ -198,13 +198,13 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
     tc_as_dict = tensorizer_config.to_serializable()

     print("lora adapter created")
-    assert do_sample(loaded_vllm_model,
+    assert do_sample(loaded_llm,
                      sql_lora_files,
                      tensorizer_config_dict=tc_as_dict,
                      lora_id=0) == EXPECTED_NO_LORA_OUTPUT

     print("lora 1")
-    assert do_sample(loaded_vllm_model,
+    assert do_sample(loaded_llm,
                      sql_lora_files,
                      tensorizer_config_dict=tc_as_dict,
                      lora_id=1) == EXPECTED_LORA_OUTPUT
@@ -41,7 +41,7 @@ def test_metric_counter_prompt_tokens(
                      dtype=dtype,
                      disable_log_stats=False,
                      gpu_memory_utilization=0.4) as vllm_model:
-        tokenizer = vllm_model.model.get_tokenizer()
+        tokenizer = vllm_model.llm.get_tokenizer()
         prompt_token_counts = [
             len(tokenizer.encode(p)) for p in example_prompts
         ]
@@ -53,7 +53,7 @@ def test_metric_counter_prompt_tokens(
         vllm_prompt_token_count = sum(prompt_token_counts)

         _ = vllm_model.generate_greedy(example_prompts, max_tokens)
-        stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
+        stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
         metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
             **stat_logger.labels)._value.get()

@@ -77,8 +77,8 @@ def test_metric_counter_generation_tokens(
                      disable_log_stats=False,
                      gpu_memory_utilization=0.4) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        tokenizer = vllm_model.model.get_tokenizer()
-        stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
+        tokenizer = vllm_model.llm.get_tokenizer()
+        stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
         metric_count = stat_logger.metrics.counter_generation_tokens.labels(
             **stat_logger.labels)._value.get()
         vllm_generation_count = 0
@@ -113,8 +113,8 @@ def test_metric_counter_generation_tokens_multi_step(
             disable_async_output_proc=disable_async_output_proc,
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        tokenizer = vllm_model.model.get_tokenizer()
-        stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
+        tokenizer = vllm_model.llm.get_tokenizer()
+        stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
         metric_count = stat_logger.metrics.counter_generation_tokens.labels(
             **stat_logger.labels)._value.get()
         vllm_generation_count = 0
@@ -145,7 +145,7 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
                      disable_log_stats=False,
                      gpu_memory_utilization=0.3,
                      served_model_name=served_model_name) as vllm_model:
-        stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
+        stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
         metrics_tag_content = stat_logger.labels["model_name"]

         if envs.VLLM_CI_USE_S3:
@@ -32,8 +32,8 @@ def test_model_loading_with_params(vllm_runner):
         output = vllm_model.embed("Write a short story about a robot that"
                                   " dreams for the first time.\n")

-        model_config = vllm_model.model.llm_engine.model_config
-        model_tokenizer = vllm_model.model.llm_engine.tokenizer
+        model_config = vllm_model.llm.llm_engine.model_config
+        model_tokenizer = vllm_model.llm.llm_engine.tokenizer

         # asserts on the bert model config file
         assert model_config.encoder_config["max_seq_length"] == 512
@@ -70,8 +70,8 @@ def test_roberta_model_loading_with_params(vllm_runner):
         output = vllm_model.embed("Write a short story about a robot that"
                                   " dreams for the first time.\n")

-        model_config = vllm_model.model.llm_engine.model_config
-        model_tokenizer = vllm_model.model.llm_engine.tokenizer
+        model_config = vllm_model.llm.llm_engine.model_config
+        model_tokenizer = vllm_model.llm.llm_engine.tokenizer

         # asserts on the bert model config file
         assert model_config.encoder_config["max_seq_length"] == 512
@@ -108,7 +108,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner):
         output = vllm_model.embed("Write a short story about a robot that"
                                   " dreams for the first time.\n")

-        model_tokenizer = vllm_model.model.llm_engine.tokenizer
+        model_tokenizer = vllm_model.llm.llm_engine.tokenizer
         assert model_tokenizer.tokenizer_id == model_name

     def check_model(model):
@@ -274,7 +274,7 @@ def test_models_preemption_recompute(
     Tests that outputs are identical with and w/o preemptions (recompute).
     """
     with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
-        scheduler = vllm_model.model.llm_engine.scheduler[0]
+        scheduler = vllm_model.llm.llm_engine.scheduler[0]
         scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
         preempt_vllm_outputs = vllm_model.generate_greedy(
             example_prompts, max_tokens)
@@ -238,7 +238,7 @@ def test_mistral_symbolic_languages(vllm_runner, model: str,
                      load_format="mistral") as vllm_model:
         for prompt in SYMBOLIC_LANG_PROMPTS:
             msg = {"role": "user", "content": prompt}
-            outputs = vllm_model.model.chat([msg],
+            outputs = vllm_model.llm.chat([msg],
                                             sampling_params=SAMPLING_PARAMS)
             assert "�" not in outputs[0].outputs[0].text.strip()

@@ -253,11 +253,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
                      load_format="mistral") as vllm_model:

         msgs = copy.deepcopy(MSGS)
-        outputs = vllm_model.model.chat(msgs,
+        outputs = vllm_model.llm.chat(msgs,
                                         tools=TOOLS,
                                         sampling_params=SAMPLING_PARAMS)

-        tokenizer = vllm_model.model.get_tokenizer()
+        tokenizer = vllm_model.llm.get_tokenizer()
         tool_parser = MistralToolParser(tokenizer)

         model_output = outputs[0].outputs[0].text.strip()
@@ -308,7 +308,7 @@ def test_mistral_guided_decoding(
             f"Give an example JSON for an employee profile that "
             f"fits this schema: {SAMPLE_JSON_SCHEMA}"
         }]
-        outputs = vllm_model.model.chat(messages, sampling_params=params)
+        outputs = vllm_model.llm.chat(messages, sampling_params=params)

         generated_text = outputs[0].outputs[0].text
         json_response = json.loads(generated_text)
@@ -30,7 +30,7 @@ class VllmMtebEncoder(mteb.Encoder):

     def __init__(self, vllm_model):
         super().__init__()
-        self.model = vllm_model
+        self.llm = vllm_model
         self.rng = np.random.default_rng(seed=42)

     def encode(
@@ -43,7 +43,7 @@ class VllmMtebEncoder(mteb.Encoder):
         # issues by randomizing the order.
         r = self.rng.permutation(len(sentences))
         sentences = [sentences[i] for i in r]
-        outputs = self.model.embed(sentences, use_tqdm=False)
+        outputs = self.llm.embed(sentences, use_tqdm=False)
         embeds = np.array(outputs)
         embeds = embeds[np.argsort(r)]
         return embeds
@@ -61,7 +61,7 @@ class VllmMtebEncoder(mteb.Encoder):
         queries = [s[0] for s in sentences]
         corpus = [s[1] for s in sentences]

-        outputs = self.model.score(queries,
+        outputs = self.llm.score(queries,
                                    corpus,
                                    truncate_prompt_tokens=-1,
                                    use_tqdm=False)
@@ -178,11 +178,11 @@ def mteb_test_embed_models(hf_runner,

         if model_info.architecture:
             assert (model_info.architecture
-                    in vllm_model.model.llm_engine.model_config.architectures)
+                    in vllm_model.llm.llm_engine.model_config.architectures)

         vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
                                               MTEB_EMBED_TASKS)
-        vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
+        vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype

     with hf_runner(model_info.name,
                    is_sentence_transformer=True,
@@ -284,7 +284,7 @@ def mteb_test_rerank_models(hf_runner,
                      max_num_seqs=8,
                      **vllm_extra_kwargs) as vllm_model:

-        model_config = vllm_model.model.llm_engine.model_config
+        model_config = vllm_model.llm.llm_engine.model_config

         if model_info.architecture:
             assert (model_info.architecture in model_config.architectures)
@@ -120,7 +120,7 @@ def test_gritlm_offline_embedding(vllm_runner):
             task="embed",
             max_model_len=MAX_MODEL_LEN,
     ) as vllm_model:
-        llm = vllm_model.model
+        llm = vllm_model.llm

         d_rep = run_llm_encode(
             llm,
@@ -167,7 +167,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
             task="generate",
             max_model_len=MAX_MODEL_LEN,
     ) as vllm_model:
-        llm = vllm_model.model
+        llm = vllm_model.llm

         sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
         outputs = llm.generate(input, sampling_params=sampling_params)
@@ -87,10 +87,10 @@ def test_matryoshka(
                      task="embed",
                      dtype=dtype,
                      max_model_len=None) as vllm_model:
-        assert vllm_model.model.llm_engine.model_config.is_matryoshka
+        assert vllm_model.llm.llm_engine.model_config.is_matryoshka

         matryoshka_dimensions = (
-            vllm_model.model.llm_engine.model_config.matryoshka_dimensions)
+            vllm_model.llm.llm_engine.model_config.matryoshka_dimensions)
         assert matryoshka_dimensions is not None

         if dimensions not in matryoshka_dimensions:
@@ -23,7 +23,7 @@ max_model_len = int(original_max_position_embeddings * factor)
 def test_default(model_info, vllm_runner):
     with vllm_runner(model_info.name, task="embed",
                      max_model_len=None) as vllm_model:
-        model_config = vllm_model.model.llm_engine.model_config
+        model_config = vllm_model.llm.llm_engine.model_config
         if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
             # For nomic-embed-text-v2-moe the length is set to 512
             # by sentence_bert_config.json.
@@ -38,7 +38,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
     # set max_model_len <= 512
     with vllm_runner(model_info.name, task="embed",
                      max_model_len=256) as vllm_model:
-        model_config = vllm_model.model.llm_engine.model_config
+        model_config = vllm_model.llm.llm_engine.model_config
         assert model_config.max_model_len == 256

     # set 512 < max_model_len <= 2048
@@ -52,7 +52,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
     else:
         with vllm_runner(model_info.name, task="embed",
                          max_model_len=1024) as vllm_model:
-            model_config = vllm_model.model.llm_engine.model_config
+            model_config = vllm_model.llm.llm_engine.model_config
             assert model_config.max_model_len == 1024


@@ -28,7 +28,7 @@ def test_smaller_truncation_size(vllm_runner,

     with vllm_runner(model_name, task="embed",
                      max_model_len=max_model_len) as vllm_model:
-        vllm_output = vllm_model.model.encode(
+        vllm_output = vllm_model.llm.encode(
             input_str, truncate_prompt_tokens=truncate_prompt_tokens)

         prompt_tokens = vllm_output[0].prompt_token_ids
@@ -43,7 +43,7 @@ def test_max_truncation_size(vllm_runner,

     with vllm_runner(model_name, task="embed",
                      max_model_len=max_model_len) as vllm_model:
-        vllm_output = vllm_model.model.encode(
+        vllm_output = vllm_model.llm.encode(
             input_str, truncate_prompt_tokens=truncate_prompt_tokens)

         prompt_tokens = vllm_output[0].prompt_token_ids
@@ -61,7 +61,7 @@ def test_bigger_truncation_size(vllm_runner,
             model_name, task="embed",
             max_model_len=max_model_len) as vllm_model:

-        llm_output = vllm_model.model.encode(
+        llm_output = vllm_model.llm.encode(
             input_str, truncate_prompt_tokens=truncate_prompt_tokens)

         assert llm_output == f"""truncate_prompt_tokens value
@@ -180,8 +180,7 @@ def test_chat(
     ) as vllm_model:
         outputs = []
         for msg in MSGS:
-            output = vllm_model.model.chat(msg,
-                                           sampling_params=SAMPLING_PARAMS)
+            output = vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS)

             outputs.extend(output)

@@ -217,7 +216,7 @@ def test_multi_modal_placeholders(vllm_runner, prompt,
             max_model_len=8192,
             limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
     ) as vllm_model:
-        outputs = vllm_model.model.generate(prompt)
+        outputs = vllm_model.llm.generate(prompt)

         assert len(outputs) == 1, f"{len(outputs)=}"
         output: RequestOutput = outputs[0]
@@ -106,7 +106,7 @@ def run_test(
             tensor_parallel_size=tensor_parallel_size,
             distributed_executor_backend=distributed_executor_backend,
     ) as vllm_model:
-        llm = vllm_model.model
+        llm = vllm_model.llm

         sampling_params = SamplingParams(
             temperature=0,
@@ -85,7 +85,7 @@ def run_test(
                      enforce_eager=enforce_eager,
                      task=task,
                      **vllm_runner_kwargs_) as vllm_model:
-        tokenizer = vllm_model.model.get_tokenizer()
+        tokenizer = vllm_model.llm.get_tokenizer()

         vllm_kwargs: dict[str, Any] = {}
         if get_stop_token_ids is not None:
@@ -96,7 +96,7 @@ def _run_test(
                      dtype=dtype,
                      enforce_eager=True,
                      max_model_len=8192) as vllm_model:
-        tokenizer = vllm_model.model.get_tokenizer()
+        tokenizer = vllm_model.llm.get_tokenizer()
         texts = [
             # this is necessary because vllm_model.embed will not apply any
             # templating to the prompt, and therefore lacks an image_pad
@@ -56,7 +56,7 @@ def vllm_reranker(
             mm_processor_kwargs=mm_processor_kwargs,
             limit_mm_per_prompt=limit_mm_per_prompt,
     ) as vllm_model:
-        outputs = vllm_model.model.score(query, documents)
+        outputs = vllm_model.llm.score(query, documents)

     return [output.outputs.score for output in outputs]

@@ -45,7 +45,7 @@ EXPECTED_STRS_MAP = {
                     reason="fp8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_name", MODELS)
 def test_models(example_prompts, model_name) -> None:
-    model = LLM(
+    llm = LLM(
         model=model_name,
         max_model_len=MAX_MODEL_LEN,
         trust_remote_code=True,
@@ -68,9 +68,9 @@ def test_models(example_prompts, model_name) -> None:
     # Note: these need to be run 1 at a time due to numerical precision,
     # since the expected strs were generated this way.
     for prompt in formatted_prompts:
-        outputs = model.generate(prompt, params)
+        outputs = llm.generate(prompt, params)
         generations.append(outputs[0].outputs[0].text)
-    del model
+    del llm

     print(model_name, generations)
     expected_strs = EXPECTED_STRS_MAP[model_name]
@@ -46,7 +46,7 @@ EXPECTED_STRS_MAP = {
                     reason="modelopt_fp4 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_name", MODELS)
 def test_models(example_prompts, model_name) -> None:
-    model = LLM(
+    llm = LLM(
         model=model_name,
         max_model_len=MAX_MODEL_LEN,
         trust_remote_code=True,
@@ -69,9 +69,9 @@ def test_models(example_prompts, model_name) -> None:
     # Note: these need to be run 1 at a time due to numerical precision,
     # since the expected strs were generated this way.
     for prompt in formatted_prompts:
-        outputs = model.generate(prompt, params)
+        outputs = llm.generate(prompt, params)
         generations.append(outputs[0].outputs[0].text)
-    del model
+    del llm

     print(model_name, generations)
     expected_strs = EXPECTED_STRS_MAP[model_name]
@@ -25,25 +25,25 @@ MODEL_LEN_LEN = [
 @pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN)
 def test_disable_sliding_window(model_len_len, ):
     model, sliding_len, full_len = model_len_len
-    vllm_disabled_model = LLM(model, disable_sliding_window=True)
-    vllm_disabled_model.generate("Hi my name is")
-    model_config = vllm_disabled_model.llm_engine.model_config
+    disabled_llm = LLM(model, disable_sliding_window=True)
+    disabled_llm.generate("Hi my name is")
+    model_config = disabled_llm.llm_engine.model_config
     assert model_config.max_model_len == sliding_len, (
         "Max len expected to equal sliding_len of %s, but got %s", sliding_len,
         model_config.max_model_len)

-    del vllm_disabled_model
+    del disabled_llm
     cleanup_dist_env_and_memory()

-    vllm_enabled_model = LLM(model,
+    enabled_llm = LLM(model,
                              enforce_eager=True,
                              disable_sliding_window=False,
                              enable_prefix_caching=False)
-    vllm_enabled_model.generate("Hi my name is")
-    model_config = vllm_enabled_model.llm_engine.model_config
+    enabled_llm.generate("Hi my name is")
+    model_config = enabled_llm.llm_engine.model_config
     assert model_config.max_model_len == full_len, (
         "Max len expected to equal full_len of %s, but got %s", full_len,
         model_config.max_model_len)

-    del vllm_enabled_model
+    del enabled_llm
     cleanup_dist_env_and_memory()
@@ -93,7 +93,7 @@ def test_mixed_requests(
         # Run all the promopts
         greedy_params = SamplingParams(temperature=0.0,
                                        max_tokens=max_tokens)
-        req_outputs = vllm_model.model.generate(example_prompts,
+        req_outputs = vllm_model.llm.generate(example_prompts,
                                                 greedy_params)

         # Verify number of cached tokens
@@ -161,7 +161,7 @@ def test_fully_cached_prefill_needs_uncached_token(model):
         max_num_batched_tokens=max_num_batched_tokens,
         max_num_seqs=max_num_batched_tokens,
     )
-    engine: LLMEngine = runner.model.llm_engine
+    engine: LLMEngine = runner.llm.llm_engine

     scheduler: Scheduler = SchedulerProxy(engine.scheduler[0])  # type: ignore
     engine.scheduler[0] = scheduler
@@ -39,7 +39,7 @@ def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool,
     linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else (
         GPTQLinearMethod)

-    for name, submodule in (vllm_model.model.llm_engine.model_executor.
+    for name, submodule in (vllm_model.llm.llm_engine.model_executor.
                             driver_worker.model_runner.model.named_modules()):
         if name == "lm_head":
             assert isinstance(submodule.quant_method, linear_method_cls)
@@ -107,11 +107,11 @@ def test_quark_fp8_parity(vllm_runner):
     }
     with (vllm_runner(quark_model_id, **llm_kwargs) as
           quark_handle, vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle):
-        quark_model = (quark_handle.model.llm_engine.model_executor.
+        quark_model = (quark_handle.llm.llm_engine.model_executor.
                        driver_worker.model_runner.model)
         quark_state_dict = quark_model.state_dict()

-        fp8_model = (fp8_handle.model.llm_engine.model_executor.driver_worker.
+        fp8_model = (fp8_handle.llm.llm_engine.model_executor.driver_worker.
                      model_runner.model)
         fp8_state_dict = fp8_model.state_dict()

@@ -111,7 +111,7 @@ def test_custom_quant(vllm_runner, model, monkeypatch):
                      quantization="custom_quant",
                      enforce_eager=True) as llm:

-        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
+        model = llm.llm.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
         layer = model.model.layers[0]
         qkv_proj = layer.self_attn.qkv_proj

@@ -36,7 +36,7 @@ def test_ignore_eos(
                                      ignore_eos=True)

         for prompt in example_prompts:
-            ignore_eos_output = vllm_model.model.generate(
+            ignore_eos_output = vllm_model.llm.generate(
                 prompt, sampling_params=sampling_params)
             output_length = len(ignore_eos_output[0].outputs[0].token_ids)
             assert output_length == max_tokens
@@ -26,7 +26,7 @@ def test_logits_processor_force_generate(
     dtype: str,
 ) -> None:
     with vllm_runner(model, dtype=dtype) as vllm_model:
-        tokenizer = vllm_model.model.get_tokenizer()
+        tokenizer = vllm_model.llm.get_tokenizer()
         repeat_times = 2
         enforced_answers = " vLLM"
         vllm_token_ids = tokenizer.encode(enforced_answers,
@@ -45,13 +45,13 @@ def test_logits_processor_force_generate(
         )

         # test logits_processors when prompt_logprobs is not None
-        vllm_model.model._add_request(
+        vllm_model.llm._add_request(
             example_prompts[0],
             params=params_with_logprobs,
         )

         # test prompt_logprobs is not None
-        vllm_model.model._add_request(
+        vllm_model.llm._add_request(
             example_prompts[1],
             params=SamplingParams(
                 prompt_logprobs=3,
@@ -60,11 +60,11 @@ def test_logits_processor_force_generate(
         )

         # test grouped requests
-        vllm_model.model._add_request(
+        vllm_model.llm._add_request(
             example_prompts[2],
             params=SamplingParams(max_tokens=max_tokens),
         )

-        outputs = vllm_model.model._run_engine(use_tqdm=False)
+        outputs = vllm_model.llm._run_engine(use_tqdm=False)

         assert outputs[0].outputs[0].text == enforced_answers * repeat_times
@@ -64,7 +64,7 @@ def test_get_prompt_logprobs(
                                              prompt_logprobs=num_top_logprobs,
                                              temperature=0.0,
                                              detokenize=detokenize)
-        vllm_results = vllm_model.model.generate(
+        vllm_results = vllm_model.llm.generate(
             example_prompts, sampling_params=vllm_sampling_params)

         # Test whether logprobs are included in the results.
@@ -174,7 +174,7 @@ def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int,
                                                     logprobs=None,
                                                     temperature=0.0,
                                                     detokenize=detokenize)
-        results_logprobs_none = vllm_model.model.generate(
+        results_logprobs_none = vllm_model.llm.generate(
             example_prompts, sampling_params=sampling_params_logprobs_none)

         for i in range(len(results_logprobs_none)):
@@ -20,7 +20,7 @@ def v1(run_with_both_engines):


 def _generate(
-    model: LLM,
+    llm: LLM,
     prompt: str,
     num_prompt_tokens: int,
     temperature: float = 0,
@@ -32,7 +32,7 @@ def _generate(
     )

     # [([output_token_ids, ], [output_text, ]), ]
-    output = model.generate([prompt], sampling_params=sampling_params)
+    output = llm.generate([prompt], sampling_params=sampling_params)

     output_token_ids = output[0][0][0][num_prompt_tokens:]
     # [0] first (and only) request output
@@ -66,10 +66,10 @@ class TestOneTokenBadWord:
         assert self.target_token_id not in output_token_ids

     def _generate(self,
-                  model: LLM,
+                  llm: LLM,
                   bad_words: Optional[list[str]] = None) -> list[int]:
         return _generate(
-            model=model,
+            llm=llm,
             prompt=self.PROMPT,
             num_prompt_tokens=self.num_prompt_tokens,
             bad_words=bad_words,
@@ -156,10 +156,10 @@ class TestTwoTokenBadWord:
                 or (self.neighbour_token_id2 in output_token_ids))

     def _generate(self,
-                  model: LLM,
+                  llm: LLM,
                   bad_words: Optional[list[str]] = None) -> list[int]:
         return _generate(
-            model=model,
+            llm=llm,
             prompt=self.PROMPT,
             num_prompt_tokens=self.num_prompt_tokens,
             bad_words=bad_words,
@@ -49,7 +49,7 @@ def test_random_sample_with_seed(
     sampling_params_seed_2 = copy.deepcopy(sampling_params)
     sampling_params_seed_2.seed = 200

-    llm = vllm_model.model
+    llm = vllm_model.llm

     for prompt in example_prompts:
         for params in (
@@ -393,7 +393,7 @@ def test_decode_prompt_logprobs_chunked_prefill(
                                              logprobs=5,
                                              prompt_logprobs=5,
                                              temperature=0.0)
-        vllm_results = vllm_model.model.generate(
+        vllm_results = vllm_model.llm.generate(
             example_prompts, sampling_params=vllm_sampling_params)

         for idx, result in enumerate(vllm_results):
@@ -14,7 +14,7 @@ PROMPT = "Hello my name is Robert and I"


 @pytest.fixture(scope="module")
-def model() -> LLM:
+def llm() -> LLM:
     return LLM(MODEL,
                enforce_eager=True,
                enable_prefix_caching=True,
@@ -24,16 +24,16 @@ def model() -> LLM:
                block_size=16)


-def test_concurrent_partial_prefill(model):
-    outputs = model.generate([PROMPT] * 3)
+def test_concurrent_partial_prefill(llm):
+    outputs = llm.generate([PROMPT] * 3)
     assert len(outputs) == 3
     for output in outputs:
         assert len(output.outputs) == 1


-def test_prefix_cache_stats_is_recorded(model):
+def test_prefix_cache_stats_is_recorded(llm):
     # 17 tokens will make sure first 16 tokens are cached in a block
     input_tokens = {"prompt_token_ids": [101] * 17}
-    _ = model.generate([input_tokens])
-    outputs = model.generate([input_tokens])
+    _ = llm.generate([input_tokens])
+    outputs = llm.generate([input_tokens])
     assert outputs[0].num_cached_tokens == 16
@@ -112,9 +112,9 @@ def test_compatibility_with_skip_tokenizer_init(
         example_prompts,
         structured_outputs=True,
     )
-    model: LLM = vllm_model_skip_tokenizer_init.model
+    llm: LLM = vllm_model_skip_tokenizer_init.llm
     with pytest.raises(ValueError):
-        _ = model.generate(example_prompts, sampling_params_list)
+        _ = llm.generate(example_prompts, sampling_params_list)


 def test_parallel_sampling(vllm_model, example_prompts) -> None:
@@ -125,8 +125,8 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
         example_prompt: test fixture providing prompts for testing.
     """
     sampling_params_list, n_list = _get_test_sampling_params(example_prompts)
-    model: LLM = vllm_model.model
-    outputs = model.generate(example_prompts, sampling_params_list)
+    llm: LLM = vllm_model.llm
+    outputs = llm.generate(example_prompts, sampling_params_list)

     # Validate each request response
     for out, n in zip(outputs, n_list):
@@ -166,10 +166,10 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
             speculative_config=speculative_config,
             disable_log_stats=False,
     ) as vllm_model:
-        model: LLM = vllm_model.model
+        llm: LLM = vllm_model.llm
         sampling_params = SamplingParams(temperature=0.0,
                                          max_tokens=max_tokens)
-        outputs = model.generate(example_prompts, sampling_params)
+        outputs = llm.generate(example_prompts, sampling_params)

         n_prompts = len(example_prompts)
         assert len(outputs) == n_prompts
@@ -180,7 +180,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
             total_tokens += len(out.outputs[0].token_ids)
         assert total_tokens == max_tokens * n_prompts

-        metrics = model.get_metrics()
+        metrics = llm.get_metrics()

         def find_metric(name) -> list[Metric]:
             found = []
@@ -112,7 +112,7 @@ def _run_and_validate(
     max_tokens: int,
     do_apc: bool,
 ) -> None:
-    vllm_results = vllm_model.model.generate(
+    vllm_results = vllm_model.llm.generate(
         test_prompts, sampling_params=vllm_sampling_params)

     for vllm_result, hf_logprob, hf_output, logprob_prompt_logprob in zip(
@@ -288,7 +288,7 @@ def test_get_logprobs_and_prompt_logprobs(
     """
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")
-        do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching
+        do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
         if do_apc and (temperature < 2.0
                        or batch_logprobs_composition != SAMPLE_PROMPT):
             # Skip some test-cases to save time.
@@ -378,7 +378,7 @@ def test_none_logprobs(vllm_model, example_prompts,
         prompt_logprobs=None,
         temperature=0.0,
     )
-    results_logprobs_none = vllm_model.model.generate(
+    results_logprobs_none = vllm_model.llm.generate(
         example_prompts,
         sampling_params=sampling_params_logprobs_none,
     )
@@ -408,7 +408,7 @@ def test_zero_logprobs(vllm_model, example_prompts,
                                             logprobs=0,
                                             prompt_logprobs=0,
                                             temperature=0.0)
-    results_logprobs_zero = vllm_model.model.generate(
+    results_logprobs_zero = vllm_model.llm.generate(
         example_prompts, sampling_params=sampling_params_logprobs_zero)

     for i in range(len(results_logprobs_zero)):
@@ -14,30 +14,30 @@ PROMPT = "Hello my name is Robert and I"


 @pytest.fixture(scope="module")
-def model() -> LLM:
+def llm() -> LLM:
     # Disable prefix caching so that we can test prompt logprobs.
     # TODO remove this after https://github.com/vllm-project/vllm/pull/13949
     # is merged
     return LLM(MODEL, enforce_eager=True, enable_prefix_caching=False)


-def test_n_gt_1(model):
+def test_n_gt_1(llm):
     """ParallelSampling is supported."""

     params = SamplingParams(n=3)
-    outputs = model.generate(PROMPT, params)
+    outputs = llm.generate(PROMPT, params)
     assert len(outputs[0].outputs) == 3


-def test_best_of(model):
+def test_best_of(llm):
     """Raise a ValueError since best_of is deprecated."""

     params = SamplingParams(n=2, best_of=3)
     with pytest.raises(ValueError):
-        _ = model.generate(PROMPT, params)
+        _ = llm.generate(PROMPT, params)


-def test_penalties(model):
+def test_penalties(llm):
     """Check that we do not get errors if applied."""

     params = SamplingParams(
@@ -49,18 +49,18 @@ def test_penalties(model):
         top_p=0.5,
         top_k=3,
     )
-    _ = model.generate(PROMPT, params)
+    _ = llm.generate(PROMPT, params)


-def test_stop(model):
+def test_stop(llm):
     """Check that we respect the stop words."""

-    output = model.generate(PROMPT, SamplingParams(temperature=0))
+    output = llm.generate(PROMPT, SamplingParams(temperature=0))
     split_text = output[0].outputs[0].text.split()

     STOP_IDX = 5
     params = SamplingParams(temperature=0, stop=split_text[STOP_IDX])
-    output = model.generate(PROMPT, params)
+    output = llm.generate(PROMPT, params)
     new_split_text = output[0].outputs[0].text.split()

     # Output should not contain the stop word.
@@ -69,40 +69,40 @@ def test_stop(model):
     params = SamplingParams(temperature=0,
                             stop=split_text[STOP_IDX],
                             include_stop_str_in_output=True)
-    output = model.generate(PROMPT, params)
+    output = llm.generate(PROMPT, params)
     new_split_text = output[0].outputs[0].text.split()

     # Output should contain the stop word.
     assert len(new_split_text) == STOP_IDX + 1


-def test_stop_token_ids(model):
+def test_stop_token_ids(llm):
     """Check that we respect the stop token ids."""

-    output = model.generate(PROMPT, SamplingParams(temperature=0))
+    output = llm.generate(PROMPT, SamplingParams(temperature=0))

     stop_token_id_0 = output[0].outputs[0].token_ids[5]
     stop_token_id_1 = output[0].outputs[0].token_ids[6]

     stop_token_ids = [stop_token_id_1, stop_token_id_0]
     params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids)
-    output = model.generate(PROMPT, params)
+    output = llm.generate(PROMPT, params)
     assert output[0].outputs[0].token_ids[-1] == stop_token_id_0

     stop_token_ids = [stop_token_id_0, stop_token_id_1]
     params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids)
-    output = model.generate(PROMPT, params)
+    output = llm.generate(PROMPT, params)
     assert output[0].outputs[0].token_ids[-1] == stop_token_id_0


-def test_detokenize_false(model):
+def test_detokenize_false(llm):
     """Check that detokenize=False option works."""

-    output = model.generate(PROMPT, SamplingParams(detokenize=False))
+    output = llm.generate(PROMPT, SamplingParams(detokenize=False))
     assert len(output[0].outputs[0].token_ids) > 0
     assert len(output[0].outputs[0].text) == 0

-    output = model.generate(
+    output = llm.generate(
         PROMPT, SamplingParams(detokenize=False, logprobs=3,
                                prompt_logprobs=3))
     assert len(output[0].outputs[0].token_ids) > 0
@@ -118,28 +118,28 @@ def test_detokenize_false(model):
     assert all(lp.decoded_token is None for lp in logprobs.values())


-def test_bad_words(model):
+def test_bad_words(llm):
     """Check that we respect bad words."""

-    output = model.generate(PROMPT, SamplingParams(temperature=0))
+    output = llm.generate(PROMPT, SamplingParams(temperature=0))
     split_text = output[0].outputs[0].text.split()

     bad_words_1 = " ".join(split_text[:2])
     params = SamplingParams(temperature=0, bad_words=[bad_words_1])
-    output = model.generate(PROMPT, params)
+    output = llm.generate(PROMPT, params)
     new_text = output[0].outputs[0].text
     assert bad_words_1 not in new_text

     bad_words_2 = new_text.split()[-1]
     params = SamplingParams(temperature=0,
                             bad_words=[bad_words_1, bad_words_2])
-    output = model.generate(PROMPT, params)
+    output = llm.generate(PROMPT, params)
     new_text = output[0].outputs[0].text
     assert bad_words_1 not in new_text
     assert bad_words_2 not in new_text


-def test_logits_processor(model):
+def test_logits_processor(llm):
     """Check that we reject logits processor."""

     # This sample logits processor gives infinite score to the i-th token,
@@ -150,47 +150,45 @@ def test_logits_processor(model):
         return logits

     with pytest.raises(ValueError):
-        _ = model.generate(PROMPT,
-                           SamplingParams(logits_processors=[pick_ith]))
+        _ = llm.generate(PROMPT, SamplingParams(logits_processors=[pick_ith]))


-def test_allowed_token_ids(model):
+def test_allowed_token_ids(llm):
     """Check that we can use allowed_token_ids."""

     TOKEN_ID = 10
     allowed_token_ids = [TOKEN_ID]
-    output = model.generate(
-        PROMPT, SamplingParams(allowed_token_ids=allowed_token_ids))
+    output = llm.generate(PROMPT,
+                          SamplingParams(allowed_token_ids=allowed_token_ids))
     assert output[0].outputs[0].token_ids[-1] == TOKEN_ID

     # Reject empty allowed_token_ids.
     with pytest.raises(ValueError):
-        _ = model.generate(PROMPT, SamplingParams(allowed_token_ids=[]))
+        _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[]))

     # Reject negative token id.
     with pytest.raises(ValueError):
-        _ = model.generate(PROMPT, SamplingParams(allowed_token_ids=[-1]))
+        _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[-1]))

     # Reject out of vocabulary.
     with pytest.raises(ValueError):
-        _ = model.generate(PROMPT,
-                           SamplingParams(allowed_token_ids=[10000000]))
+        _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000]))


-def test_priority(model):
+def test_priority(llm):
     """Check that we reject requests with priority."""

     # Reject all allowed token ids
     with pytest.raises(ValueError):
-        _ = model.generate(PROMPT, priority=[1])
+        _ = llm.generate(PROMPT, priority=[1])


-def test_seed(model):
+def test_seed(llm):
     """Check that seed impacts randomness."""

-    out_1 = model.generate(PROMPT, SamplingParams(seed=42))
-    out_2 = model.generate(PROMPT, SamplingParams(seed=42))
-    out_3 = model.generate(PROMPT, SamplingParams(seed=43))
+    out_1 = llm.generate(PROMPT, SamplingParams(seed=42))
+    out_2 = llm.generate(PROMPT, SamplingParams(seed=42))
+    out_3 = llm.generate(PROMPT, SamplingParams(seed=43))

     assert out_1[0].outputs[0].text == out_2[0].outputs[0].text
     assert out_1[0].outputs[0].text != out_3[0].outputs[0].text
@@ -106,9 +106,9 @@ def test_v1_llm_by_default(monkeypatch):
         m.delenv("VLLM_USE_V1")

         # Should default to V1 for supported config.
-        model = LLM(MODEL, enforce_eager=True, enable_lora=True)
-        print(model.generate("Hello my name is"))
-        assert hasattr(model.llm_engine, "engine_core")
+        llm = LLM(MODEL, enforce_eager=True, enable_lora=True)
+        print(llm.generate("Hello my name is"))
+        assert hasattr(llm.llm_engine, "engine_core")
         m.delenv("VLLM_USE_V1")

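Taken together, these hunks rename the test-side handle for the `LLM` instance from `model` to `llm`. A minimal sketch of the resulting usage pattern, assuming an illustrative model name rather than the fixtures and `MODEL` constants the tests themselves use:

```python
from vllm import LLM, SamplingParams

# A minimal sketch, not a test from the repository: the model name below is
# an illustrative assumption, and the pytest fixture plumbing is omitted.
llm = LLM("facebook/opt-125m", enforce_eager=True)

params = SamplingParams(temperature=0.0, max_tokens=8)
outputs = llm.generate("Hello my name is", params)

# One RequestOutput per prompt; each holds at least one completion.
assert len(outputs) == 1
print(outputs[0].outputs[0].text)

# The underlying engine stays reachable through the renamed handle,
# e.g. llm.llm_engine, mirroring accesses such as
# vllm_model.llm.llm_engine.cache_config in the tests above.
```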