# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Example demonstrating how to load model weights from CPU using pt_load_map_location.

This is useful when:
- You want to explicitly load PyTorch checkpoints from CPU
- You need to manage memory usage during model initialization
- You want to map weights from one device to another

The pt_load_map_location parameter works the same as PyTorch's
torch.load(map_location=...) and defaults to "cpu" for most efficient loading.
"""

from vllm import LLM, SamplingParams

# Prompts used by the three examples below (one prompt per example).
prompts = [
    "The advantages of loading weights from CPU include",
    "When should you use CPU weight loading?",
    "Memory management in machine learning is important because",
]

# Shared sampling configuration for every generate() call.
sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=50,
)
def main():
    """Demonstrate pt_load_map_location with three LLM configurations.

    Each example loads facebook/opt-125m with a different way of
    specifying the checkpoint map location, generates a completion for
    one prompt, and prints the prompt/output pair.
    """

    def _show(outputs):
        # Print each prompt alongside its first generated completion.
        # Extracted because the original repeated this loop verbatim in
        # all three examples.
        for output in outputs:
            print(f"Prompt: {output.prompt}")
            print(f"Output: {output.outputs[0].text}")

    # Example 1: Explicitly load weights from CPU (default behavior)
    print("=== Example 1: Loading weights from CPU ===")
    llm = LLM(
        model="facebook/opt-125m",
        pt_load_map_location="cpu",  # Explicitly specify CPU loading
    )
    _show(llm.generate(prompts[:1], sampling_params))

    # Example 2: Using device mapping (useful for multi-GPU setups)
    print("\n=== Example 2: Device mapping example ===")
    # Note: This example shows the syntax, but may not be applicable
    # unless you have multiple CUDA devices available
    try:
        llm_mapped = LLM(
            model="facebook/opt-125m",
            pt_load_map_location={"": "cpu"},  # Alternative syntax for CPU
        )
        _show(llm_mapped.generate(prompts[1:2], sampling_params))
    except Exception as e:
        # Deliberate best-effort: device mapping can legitimately fail on
        # CPU-only hosts, and the example should keep going.
        print(f"Device mapping example failed (this is normal if no CUDA available): {e}")

    # Example 3: Default behavior (pt_load_map_location="cpu" is the default)
    print("\n=== Example 3: Default behavior (CPU loading) ===")
    llm_default = LLM(model="facebook/opt-125m")  # Uses CPU loading by default
    _show(llm_default.generate(prompts[2:3], sampling_params))
# Run the examples only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()