# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# Example client for a vLLM server's /inference/v1/generate endpoint.
# Prompts are tokenized client-side, and with detokenize=False the server
# returns raw token ids that are decoded locally. Requires a running vLLM
# server that exposes this endpoint.

import httpx
from transformers import AutoTokenizer
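# Endpoint, API key, and model; adjust to your deployment. The dummy key is a
# placeholder for servers launched without --api-key; replace it if your
# server enforces a real one.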
GEN_ENDPOINT = "http://localhost:8000/inference/v1/generate"
DUMMY_API_KEY = "empty"
MODEL_NAME = "Qwen/Qwen3-0.6B"
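# Reusable HTTP client with a generous timeout for long generations.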
transport = httpx.HTTPTransport()
headers = {"Authorization": f"Bearer {DUMMY_API_KEY}"}
client = httpx.Client(
    transport=transport,
    base_url=GEN_ENDPOINT,
    timeout=600,
    headers=headers,
)
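# Chat-style conversation; the model's chat template renders it to token ids.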
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "How many countries are in the EU?"},
]


def main(client):
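    # Tokenize client-side with the model's chat template;
    # enable_thinking=False disables Qwen3's thinking-mode preamble.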
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    token_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,
    )
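    # detokenize=False asks the server to skip detokenization and return the
    # generated token ids instead of text.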
    payload = {
        "model": MODEL_NAME,
        "token_ids": token_ids,
        "sampling_params": {"max_tokens": 24, "temperature": 0.2, "detokenize": False},
        "stream": False,
    }
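    # Send the request and fail fast on any non-2xx response.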
    resp = client.post(GEN_ENDPOINT, json=payload)
    resp.raise_for_status()
    data = resp.json()
    print(data)
    print("-" * 50)
    print("Token generation results:")
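    # Decode the returned token ids locally with the same tokenizer.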
    res = tokenizer.decode(data["choices"][0]["token_ids"])
    print(res)
    print("-" * 50)


if __name__ == "__main__":
    main(client)