diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md index da87127057dc5..1e468962cc9c5 100644 --- a/docs/source/features/spec_decode.md +++ b/docs/source/features/spec_decode.md @@ -131,7 +131,7 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) llm = LLM( model="meta-llama/Meta-Llama-3.1-70B-Instruct", tensor_parallel_size=4, - speculative_model="ibm-fms/llama3-70b-accelerator", + speculative_model="ibm-ai-platform/llama3-70b-accelerator", speculative_draft_tensor_parallel_size=1, ) outputs = llm.generate(prompts, sampling_params) @@ -149,11 +149,11 @@ limitation will be fixed in a future release. A variety of speculative models of this type are available on HF hub: -- [llama-13b-accelerator](https://huggingface.co/ibm-fms/llama-13b-accelerator) -- [llama3-8b-accelerator](https://huggingface.co/ibm-fms/llama3-8b-accelerator) -- [codellama-34b-accelerator](https://huggingface.co/ibm-fms/codellama-34b-accelerator) -- [llama2-70b-accelerator](https://huggingface.co/ibm-fms/llama2-70b-accelerator) -- [llama3-70b-accelerator](https://huggingface.co/ibm-fms/llama3-70b-accelerator) +- [llama-13b-accelerator](https://huggingface.co/ibm-ai-platform/llama-13b-accelerator) +- [llama3-8b-accelerator](https://huggingface.co/ibm-ai-platform/llama3-8b-accelerator) +- [codellama-34b-accelerator](https://huggingface.co/ibm-ai-platform/codellama-34b-accelerator) +- [llama2-70b-accelerator](https://huggingface.co/ibm-ai-platform/llama2-70b-accelerator) +- [llama3-70b-accelerator](https://huggingface.co/ibm-ai-platform/llama3-70b-accelerator) - [granite-3b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-3b-code-instruct-accelerator) - [granite-8b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-8b-code-instruct-accelerator) - [granite-7b-instruct-accelerator](https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator) diff --git a/examples/offline_inference/mlpspeculator.py b/examples/offline_inference/mlpspeculator.py index 10d9de8cb0de1..f227e71ba79be 100644 --- a/examples/offline_inference/mlpspeculator.py +++ b/examples/offline_inference/mlpspeculator.py @@ -51,7 +51,7 @@ if __name__ == "__main__": # Create an LLM with spec decoding llm = LLM( model="meta-llama/Llama-2-13b-chat-hf", - speculative_model="ibm-fms/llama-13b-accelerator", + speculative_model="ibm-ai-platform/llama-13b-accelerator", ) print("With speculation") diff --git a/tests/models/registry.py b/tests/models/registry.py index 8a0ade4fa2074..7b5032f794623 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -278,7 +278,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { "MedusaModel": _HfExamplesInfo("JackFram/llama-68m", speculative_model="abhigoyal/vllm-medusa-llama-68m-random"), # noqa: E501 "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m", - speculative_model="ibm-fms/llama-160m-accelerator"), # noqa: E501 + speculative_model="ibm-ai-platform/llama-160m-accelerator"), # noqa: E501 } _FALLBACK_MODEL = { diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py index a2b84b90222dc..59beca47acd01 100644 --- a/tests/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/spec_decode/e2e/test_mlp_correctness.py @@ -33,7 +33,7 @@ from .conftest import run_equality_correctness_test MAIN_MODEL = "JackFram/llama-160m" # speculative model -SPEC_MODEL = "ibm-fms/llama-160m-accelerator" +SPEC_MODEL = "ibm-ai-platform/llama-160m-accelerator" # max. number of speculative tokens: this corresponds to # n_predict in the config.json of the speculator model. diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index cf4123a2c2b69..2920427f94f7b 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -64,7 +64,7 @@ class MLPSpeculator(nn.Module): https://arxiv.org/pdf/2404.19124 Trained speculators of this type are available on HF hub at: - https://huggingface.co/ibm-fms and https://huggingface.co/ibm-granite + https://huggingface.co/ibm-ai-platform and https://huggingface.co/ibm-granite """ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: