diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index dfed15d4ace97..595115ab6ff77 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -840,3 +840,8 @@ Key capabilities: The following example shows how to deploy a large model like DeepSeek R1 with Ray Serve LLM: . Learn more about Ray Serve LLM with the official [Ray Serve LLM documentation](https://docs.ray.io/en/latest/serve/llm/serving-llms.html). + +curl http://localhost:8002/v1/rerank -H "Content-Type: application/json" -d '{ + "query": "What is the capital of France?", + "documents": ["The capital of France is Paris.", "The capital of Germany is Berlin."] + }' \ No newline at end of file diff --git a/requirements/cuda.txt b/requirements/cuda.txt index fb30e493f80b3..77242381e124e 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -5,10 +5,4 @@ numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Req numba == 0.61.2; python_version > '3.9' # Dependencies for NVIDIA GPUs -ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1. -torch==2.7.1 -torchaudio==2.7.1 -# These must be updated alongside torch -torchvision==0.22.1 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version -# https://github.com/facebookresearch/xformers/releases/tag/v0.0.31 -xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 \ No newline at end of file +torch==2.8.0 \ No newline at end of file diff --git a/tests/models/quantization/test_mxfp4.py b/tests/models/quantization/test_mxfp4.py index 7b8a334bbc369..425e20f846f1d 100644 --- a/tests/models/quantization/test_mxfp4.py +++ b/tests/models/quantization/test_mxfp4.py @@ -39,3 +39,20 @@ def test_models(example_prompts, model_name) -> None: expected_str = EXPECTED_STRS_MAP[model_name][i] assert expected_str == output_str, ( f"Expected: {expected_str!r}\nvLLM: {output_str!r}") + +curl http://localhost:8002/v1/embeddings \ + -H "Content-Type: application/json" \ + -d '{ + "input": "Query: What is the capital of France? \n\nDocuments: \n1. Paris is the capital city of France.\n2. Berlin is the capital of Germany.\n \n Rank the documents from most to least relevant to the query and provide a relevance score", + "model": "$MODEL", + "encoding_format": "float" + }' + + +curl http://localhost:8002/v1/rerank \ + -H "Content-Type: application/json" \ + -d '{ + "input": "Query: What is the capital of France? \n\nDocuments: \n1. Paris is the capital city of France.\n2. Berlin is the capital of Germany.\n \n Rank the documents from most to least relevant to the query and provide a relevance score", + "prompt": "Query: What is the capital of France? \n\nDocuments: \n1. Paris is the capital city of France.\n2. Berlin is the capital of Germany.\n \n Rank the documents from most to least relevant to the query and provide a relevance score", + "model": "BAAI/bge-reranker-v2-m3" + }'