# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# Example client for a vLLM server's /inference/v1/generate endpoint.
# Prompts are tokenized client-side, and with detokenize=False the server
# returns raw token ids that are decoded locally. Requires a running vLLM
# server that exposes this endpoint.

import httpx
from transformers import AutoTokenizer
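# Endpoint, API key, and model; adjust to your deployment. The dummy key is a
# placeholder for servers launched without --api-key; replace it if your
# server enforces a real one.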
GEN_ENDPOINT = "http://localhost:8000/inference/v1/generate"
DUMMY_API_KEY = "empty"
MODEL_NAME = "Qwen/Qwen3-0.6B"
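# Reusable HTTP client with a generous timeout for long generations.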
transport = httpx.HTTPTransport()
headers = {"Authorization": f"Bearer {DUMMY_API_KEY}"}
client = httpx.Client(
    transport=transport,
    base_url=GEN_ENDPOINT,
    timeout=600,
    headers=headers,
)
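# Chat-style conversation; the model's chat template renders it to token ids.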
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "How many countries are in the EU?"},
]


def main(client):
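    # Tokenize client-side with the model's chat template;
    # enable_thinking=False disables Qwen3's thinking-mode preamble.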
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    token_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,
    )
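    # detokenize=False asks the server to skip detokenization and return the
    # generated token ids instead of text.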
    payload = {
        "model": MODEL_NAME,
        "token_ids": token_ids,
        "sampling_params": {"max_tokens": 24, "temperature": 0.2, "detokenize": False},
        "stream": False,
    }
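    # Send the request and fail fast on any non-2xx response.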
    resp = client.post(GEN_ENDPOINT, json=payload)
    resp.raise_for_status()
    data = resp.json()
    print(data)
    print("-" * 50)
    print("Token generation results:")
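    # Decode the returned token ids locally with the same tokenizer.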
    res = tokenizer.decode(data["choices"][0]["token_ids"])
    print(res)
    print("-" * 50)


if __name__ == "__main__":
    main(client)