diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py
index bf8fb7dc521c..efa1aa5b0369 100644
--- a/examples/offline_inference/mistral-small.py
+++ b/examples/offline_inference/mistral-small.py
@@ -90,8 +90,9 @@ def run_simple_demo(args: argparse.Namespace):
         },
     ]
     outputs = llm.chat(messages, sampling_params=sampling_params)
-
+    print("-" * 50)
     print(outputs[0].outputs[0].text)
+    print("-" * 50)
 
 
 def run_advanced_demo(args: argparse.Namespace):
@@ -162,7 +163,9 @@ def run_advanced_demo(args: argparse.Namespace):
     ]
 
     outputs = llm.chat(messages=messages, sampling_params=sampling_params)
+    print("-" * 50)
     print(outputs[0].outputs[0].text)
+    print("-" * 50)
 
 
 def main():
diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py
index 4b0d115e6609..de409740292a 100644
--- a/examples/offline_inference/multilora_inference.py
+++ b/examples/offline_inference/multilora_inference.py
@@ -61,6 +61,7 @@ def process_requests(engine: LLMEngine,
     """Continuously process a list of prompts and handle the outputs."""
     request_id = 0
 
+    print("-" * 50)
     while test_prompts or engine.has_unfinished_requests():
         if test_prompts:
             prompt, sampling_params, lora_request = test_prompts.pop(0)
@@ -75,6 +76,7 @@ def process_requests(engine: LLMEngine,
         for request_output in request_outputs:
             if request_output.finished:
                 print(request_output)
+                print("-" * 50)
 
 
 def initialize_engine() -> LLMEngine:
diff --git a/examples/offline_inference/neuron.py b/examples/offline_inference/neuron.py
index 517d1bfce95d..5906c7b2c6b3 100644
--- a/examples/offline_inference/neuron.py
+++ b/examples/offline_inference/neuron.py
@@ -12,27 +12,36 @@ prompts = [
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
-# Create an LLM.
-llm = LLM(
-    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-    max_num_seqs=8,
-    # The max_model_len and block_size arguments are required to be same as
-    # max sequence length when targeting neuron device.
-    # Currently, this is a known limitation in continuous batching support
-    # in transformers-neuronx.
-    # TODO(liangfu): Support paged-attention in transformers-neuronx.
-    max_model_len=1024,
-    block_size=1024,
-    # The device can be automatically detected when AWS Neuron SDK is installed.
-    # The device argument can be either unspecified for automated detection,
-    # or explicitly assigned.
-    device="neuron",
-    tensor_parallel_size=2)
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+def main():
+    # Create an LLM.
+    llm = LLM(
+        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        max_num_seqs=8,
+        # The max_model_len and block_size arguments are required to be same as
+        # max sequence length when targeting neuron device.
+        # Currently, this is a known limitation in continuous batching support
+        # in transformers-neuronx.
+        # TODO(liangfu): Support paged-attention in transformers-neuronx.
+        max_model_len=1024,
+        block_size=1024,
+        # ruff: noqa: E501
+        # The device can be automatically detected when AWS Neuron SDK is installed.
+        # The device argument can be either unspecified for automated detection,
+        # or explicitly assigned.
+        device="neuron",
+        tensor_parallel_size=2)
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+    print("-" * 50)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py
index c899a01a0bb9..af21274a3a5b 100644
--- a/examples/offline_inference/neuron_int8_quantization.py
+++ b/examples/offline_inference/neuron_int8_quantization.py
@@ -22,31 +22,40 @@ prompts = [
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
-# Create an LLM.
-llm = LLM(
-    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-    max_num_seqs=8,
-    # The max_model_len and block_size arguments are required to be same as
-    # max sequence length when targeting neuron device.
-    # Currently, this is a known limitation in continuous batching support
-    # in transformers-neuronx.
-    # TODO(liangfu): Support paged-attention in transformers-neuronx.
-    max_model_len=2048,
-    block_size=2048,
-    # The device can be automatically detected when AWS Neuron SDK is installed.
-    # The device argument can be either unspecified for automated detection,
-    # or explicitly assigned.
-    device="neuron",
-    quantization="neuron_quant",
-    override_neuron_config={
-        "cast_logits_dtype": "bfloat16",
-    },
-    tensor_parallel_size=2)
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+def main():
+    # Create an LLM.
+    llm = LLM(
+        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        max_num_seqs=8,
+        # The max_model_len and block_size arguments are required to be same as
+        # max sequence length when targeting neuron device.
+        # Currently, this is a known limitation in continuous batching support
+        # in transformers-neuronx.
+        # TODO(liangfu): Support paged-attention in transformers-neuronx.
+        max_model_len=2048,
+        block_size=2048,
+        # ruff: noqa: E501
+        # The device can be automatically detected when AWS Neuron SDK is installed.
+        # The device argument can be either unspecified for automated detection,
+        # or explicitly assigned.
+        device="neuron",
+        quantization="neuron_quant",
+        override_neuron_config={
+            "cast_logits_dtype": "bfloat16",
+        },
+        tensor_parallel_size=2)
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+    print("-" * 50)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/prefix_caching.py b/examples/offline_inference/prefix_caching.py
index 4c326c417b4d..f0bec387d3a9 100644
--- a/examples/offline_inference/prefix_caching.py
+++ b/examples/offline_inference/prefix_caching.py
@@ -31,55 +31,62 @@ generating_prompts = [prefix + prompt for prompt in prompts]
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.0)
 
-# Create an LLM without prefix caching as a baseline.
-regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
-print("Results without `enable_prefix_caching`")
+def main():
+    # Create an LLM without prefix caching as a baseline.
+    regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
 
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = regular_llm.generate(generating_prompts, sampling_params)
+    print("Results without `enable_prefix_caching`")
 
-regular_generated_texts = []
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    regular_generated_texts.append(generated_text)
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    # ruff: noqa: E501
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = regular_llm.generate(generating_prompts, sampling_params)
 
-print("-" * 80)
+    regular_generated_texts = []
+    # Print the outputs.
+    print("-" * 50)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        regular_generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)
 
-# Destroy the LLM object and free up the GPU memory.
-del regular_llm
-cleanup_dist_env_and_memory()
+    # Destroy the LLM object and free up the GPU memory.
+    del regular_llm
+    cleanup_dist_env_and_memory()
 
-# Create an LLM with prefix caching enabled.
-prefix_cached_llm = LLM(model="facebook/opt-125m",
-                        enable_prefix_caching=True,
-                        gpu_memory_utilization=0.4)
+    # Create an LLM with prefix caching enabled.
+    prefix_cached_llm = LLM(model="facebook/opt-125m",
+                            enable_prefix_caching=True,
+                            gpu_memory_utilization=0.4)
 
-# Warmup so that the shared prompt's KV cache is computed.
-prefix_cached_llm.generate(generating_prompts[0], sampling_params)
+    # Warmup so that the shared prompt's KV cache is computed.
+    prefix_cached_llm.generate(generating_prompts[0], sampling_params)
 
-# Generate with prefix caching.
-outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
+    # Generate with prefix caching.
+    outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
 
-print("Results with `enable_prefix_caching`")
+    print("Results with `enable_prefix_caching`")
 
-cached_generated_texts = []
-# Print the outputs. You should see the same outputs as before.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    cached_generated_texts.append(generated_text)
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    cached_generated_texts = []
+    # Print the outputs. You should see the same outputs as before.
+    print("-" * 50)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        cached_generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)
 
-print("-" * 80)
+    # Compare the results and display the speedup
+    generated_same = all([
+        regular_generated_texts[i] == cached_generated_texts[i]
+        for i in range(len(prompts))
+    ])
+    print(f"Generated answers are the same: {generated_same}")
 
-# Compare the results and display the speedup
-generated_same = all([
-    regular_generated_texts[i] == cached_generated_texts[i]
-    for i in range(len(prompts))
-])
-print(f"Generated answers are the same: {generated_same}")
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/reproduciblity.py b/examples/offline_inference/reproduciblity.py
index d0197bf6d5ba..b2be117d1a0a 100644
--- a/examples/offline_inference/reproduciblity.py
+++ b/examples/offline_inference/reproduciblity.py
@@ -19,8 +19,6 @@ SEED = 42
 # because it is almost impossible to make the scheduling deterministic in the
 # online serving setting.
 
-llm = LLM(model="facebook/opt-125m", seed=SEED)
-
 prompts = [
     "Hello, my name is",
     "The president of the United States is",
@@ -29,8 +27,17 @@ prompts = [
 ]
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
-outputs = llm.generate(prompts, sampling_params)
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+def main():
+    llm = LLM(model="facebook/opt-125m", seed=SEED)
+    outputs = llm.generate(prompts, sampling_params)
+    print("-" * 50)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py
index b0418c092ca3..e0ed0ac49754 100644
--- a/examples/offline_inference/rlhf.py
+++ b/examples/offline_inference/rlhf.py
@@ -85,11 +85,13 @@ sampling_params = SamplingParams(temperature=0)
 
 outputs = ray.get(llm.generate.remote(prompts, sampling_params))
 
+print("-" * 50)
 for output in outputs:
     prompt = output.prompt
     generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, "
+    print(f"Prompt: {prompt!r}\n"
          f"Generated text: {generated_text!r}")
+    print("-" * 50)
 
 # set up the communication between the training process
 # and the inference engine.
@@ -120,8 +122,10 @@ assert all(ray.get(llm.collective_rpc.remote("check_weights_changed")))
 # use the updated model to generate texts, they will be nonsense
 # because the weights are all zeros.
 outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
+print("-" * 50)
 for output in outputs_updated:
     prompt = output.prompt
     generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, "
+    print(f"Prompt: {prompt!r}\n"
          f"Generated text: {generated_text!r}")
+    print("-" * 50)
diff --git a/examples/offline_inference/simple_profiling.py b/examples/offline_inference/simple_profiling.py
index b45954b3bd54..6a8e3a5a3e75 100644
--- a/examples/offline_inference/simple_profiling.py
+++ b/examples/offline_inference/simple_profiling.py
@@ -32,10 +32,12 @@ if __name__ == "__main__":
     llm.stop_profile()
 
     # Print the outputs.
+    print("-" * 50)
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)
 
     # Add a buffer to wait for profiler in the background process
     # (in case MP is on) to finish writing profiling output.
diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py
index 7a57f29a07fa..c6d9e6b47e21 100644
--- a/examples/offline_inference/torchrun_example.py
+++ b/examples/offline_inference/torchrun_example.py
@@ -36,11 +36,13 @@ llm = LLM(
 outputs = llm.generate(prompts, sampling_params)
 
 # all ranks will have the same outputs
+print("-" * 50)
 for output in outputs:
     prompt = output.prompt
     generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, "
+    print(f"Prompt: {prompt!r}\n"
          f"Generated text: {generated_text!r}")
+    print("-" * 50)
 
 """
 Further tips:
diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py
index 956219d30f38..dea717c36082 100644
--- a/examples/offline_inference/tpu.py
+++ b/examples/offline_inference/tpu.py
@@ -16,14 +16,22 @@ N = 1
 # Currently, top-p sampling is disabled. `top_p` should be 1.0.
 sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16)
 
-# Set `enforce_eager=True` to avoid ahead-of-time compilation.
-# In real workloads, `enforace_eager` should be `False`.
-llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
-          max_num_batched_tokens=64,
-          max_num_seqs=4)
-outputs = llm.generate(prompts, sampling_params)
-for output, answer in zip(outputs, answers):
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-    assert generated_text.startswith(answer)
+
+def main():
+    # Set `enforce_eager=True` to avoid ahead-of-time compilation.
+    # In real workloads, `enforce_eager` should be `False`.
+    llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
+              max_num_batched_tokens=64,
+              max_num_seqs=4)
+    outputs = llm.generate(prompts, sampling_params)
+    print("-" * 50)
+    for output, answer in zip(outputs, answers):
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        assert generated_text.startswith(answer)
+        print("-" * 50)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 61d53dda1c47..a944260c2692 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -1089,14 +1089,18 @@ def main(args):
         start_time = time.time()
         outputs = llm.generate(inputs, sampling_params=sampling_params)
         elapsed_time = time.time() - start_time
+        print("-" * 50)
         print("-- generate time = {}".format(elapsed_time))
+        print("-" * 50)
 
     else:
         outputs = llm.generate(inputs, sampling_params=sampling_params)
 
+    print("-" * 50)
     for o in outputs:
         generated_text = o.outputs[0].text
         print(generated_text)
+        print("-" * 50)
 
 
 if __name__ == "__main__":
diff --git a/examples/offline_inference/vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py
index a0b2b44b4e82..8321d3e254a2 100644
--- a/examples/offline_inference/vision_language_embedding.py
+++ b/examples/offline_inference/vision_language_embedding.py
@@ -143,8 +143,10 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
         "multi_modal_data": mm_data,
     })
 
+    print("-" * 50)
     for output in outputs:
         print(output.outputs.embedding)
+        print("-" * 50)
 
 
 def main(args: Namespace):
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index e03ebe485eaa..39465c9b0ce4 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -644,9 +644,11 @@ def run_generate(model, question: str, image_urls: list[str],
         },
         sampling_params=sampling_params)
 
+    print("-" * 50)
     for o in outputs:
         generated_text = o.outputs[0].text
         print(generated_text)
+        print("-" * 50)
 
 
 def run_chat(model: str, question: str, image_urls: list[str],
@@ -687,9 +689,11 @@ def run_chat(model: str, question: str, image_urls: list[str],
         chat_template=req_data.chat_template,
     )
 
+    print("-" * 50)
     for o in outputs:
         generated_text = o.outputs[0].text
         print(generated_text)
+        print("-" * 50)
 
 
 def main(args: Namespace):