From c4e3b12524a8f45f306a7add825651b64b683aab Mon Sep 17 00:00:00 2001
From: Ricardo Decal
Date: Thu, 17 Jul 2025 20:09:19 -0700
Subject: [PATCH] [Docs] Add minimal demo of Ray Data API usage (#21080)

Signed-off-by: Ricardo Decal
---
 docs/serving/offline_inference.md | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/docs/serving/offline_inference.md b/docs/serving/offline_inference.md
index 4ec879e0bc8a5..ddda47690002a 100644
--- a/docs/serving/offline_inference.md
+++ b/docs/serving/offline_inference.md
@@ -30,8 +30,31 @@ This API adds several batteries-included capabilities that simplify large-scale,
 - Automatic sharding, load balancing, and autoscaling distribute work across a Ray cluster with built-in fault tolerance.
 - Continuous batching keeps vLLM replicas saturated and maximizes GPU utilization.
 - Transparent support for tensor and pipeline parallelism enables efficient multi-GPU inference.
+- Reading and writing to most popular file formats and cloud object storage.
+- Scaling up the workload without code changes.
 
-The following example shows how to run batched inference with Ray Data and vLLM:
-
+??? code
+
+    ```python
+    import ray  # Requires ray>=2.44.1
+    from ray.data.llm import vLLMEngineProcessorConfig, build_llm_processor
+
+    config = vLLMEngineProcessorConfig(model_source="unsloth/Llama-3.2-1B-Instruct")
+    processor = build_llm_processor(
+        config,
+        preprocess=lambda row: {
+            "messages": [
+                {"role": "system", "content": "You are a bot that completes unfinished haikus."},
+                {"role": "user", "content": row["item"]},
+            ],
+            "sampling_params": {"temperature": 0.3, "max_tokens": 250},
+        },
+        postprocess=lambda row: {"answer": row["generated_text"]},
+    )
+
+    ds = ray.data.from_items(["An old silent pond..."])
+    ds = processor(ds)
+    ds.write_parquet("local:///tmp/data/")
+    ```
 
 For more information about the Ray Data LLM API, see the [Ray Data LLM documentation](https://docs.ray.io/en/latest/data/working-with-llms.html).
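
A quick way to see the end-to-end result of the example added in this patch is to read the Parquet output back with Ray Data. The snippet below is a minimal sketch, not part of the patch itself: it assumes the example ran as written, that the files landed under `/tmp/data/` on the local node, and that the `answer` column name comes from the `postprocess` lambda in the diff.

```python
import ray  # Requires ray>=2.44.1, same as the example in the patch

# Read the Parquet files written by the example (assumed output path).
ds = ray.data.read_parquet("local:///tmp/data/")

# Print each completed haiku; "answer" is the column produced by the
# postprocess step in the patched documentation example.
for row in ds.take_all():
    print(row["answer"])
```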