From 32335c8b34b6a6aa4d3075b47e819a9c3ff652c8 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 26 Sep 2025 21:23:52 -0400 Subject: [PATCH] Add option to restrict media domains (#25783) Signed-off-by: Chenheli Hua Signed-off-by: Russell Bryant Co-authored-by: Chenheli Hua Signed-off-by: simon-mo --- docs/features/multimodal_inputs.md | 4 +++ docs/usage/security.md | 6 ++++ .../entrypoints/openai/test_lora_resolvers.py | 1 + tests/entrypoints/openai/test_serving_chat.py | 1 + tests/multimodal/test_utils.py | 33 ++++++++++++++++++- vllm/config/model.py | 3 ++ vllm/config/speculative.py | 2 ++ vllm/engine/arg_utils.py | 5 +++ vllm/entrypoints/chat_utils.py | 6 ++++ vllm/entrypoints/llm.py | 4 +++ vllm/multimodal/utils.py | 16 +++++++++ 11 files changed, 80 insertions(+), 1 deletion(-) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 7fb0337235005..bcc48e7560462 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -6,6 +6,10 @@ This page teaches you how to pass multi-modal inputs to [multi-modal models][sup We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes, and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests. +!!! tip + When serving multi-modal models, consider setting `--allowed-media-domains` to restrict the domains that vLLM can access, to prevent it from accessing arbitrary endpoints that can potentially be vulnerable to Server-Side Request Forgery (SSRF) attacks. You can provide a list of domains for this arg. For example: `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com` + This restriction is especially important if you run vLLM in a containerized environment where the vLLM pods may have unrestricted access to internal networks. 
+ ## Offline Inference To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]: diff --git a/docs/usage/security.md b/docs/usage/security.md index d54e2bb37ec07..5d85e889c80cc 100644 --- a/docs/usage/security.md +++ b/docs/usage/security.md @@ -60,6 +60,12 @@ Key points from the PyTorch security guide: - Implement proper authentication and authorization for management interfaces - Follow the principle of least privilege for all system components +### 4. **Restrict Domain Access for Media URLs:** + +Restrict the domains that vLLM can access for media URLs by setting +`--allowed-media-domains` to prevent Server-Side Request Forgery (SSRF) attacks. +(e.g. `--allowed-media-domains upload.wikimedia.org github.com www.bogotobogo.com`) + ## Security and Firewalls: Protecting Exposed vLLM Systems While vLLM is designed to allow unsafe network services to be isolated to diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index 9d5ee84a19567..0561158dcf65a 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -45,6 +45,7 @@ class MockModelConfig: logits_processor_pattern: Optional[str] = None diff_sampling_param: Optional[dict] = None allowed_local_media_path: str = "" + allowed_media_domains: Optional[list[str]] = None encoder_config = None generation_config: str = "auto" skip_tokenizer_init: bool = False diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index bfed760822cdb..07f39fe2b9bd0 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -240,6 +240,7 @@ class MockModelConfig: logits_processor_pattern = None diff_sampling_param: Optional[dict] = None allowed_local_media_path: str = "" + allowed_media_domains: Optional[list[str]] = None encoder_config = None generation_config: str = "auto" media_io_kwargs: dict[str, 
dict[str, Any]] = field(default_factory=dict) diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index f6a93bae2afce..d1a7882a4c376 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -66,7 +66,12 @@ async def test_fetch_image_http(image_url: str): @pytest.mark.parametrize("suffix", get_supported_suffixes()) async def test_fetch_image_base64(url_images: dict[str, Image.Image], raw_image_url: str, suffix: str): - connector = MediaConnector() + connector = MediaConnector( + # Domain restriction should not apply to data URLs. + allowed_media_domains=[ + "www.bogotobogo.com", + "github.com", + ]) url_image = url_images[raw_image_url] try: @@ -387,3 +392,29 @@ def test_argsort_mm_positions(case): modality_idxs = argsort_mm_positions(mm_positions) assert modality_idxs == expected_modality_idxs + + +@pytest.mark.asyncio +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +@pytest.mark.parametrize("num_frames", [-1, 32, 1800]) +async def test_allowed_media_domains(video_url: str, num_frames: int): + connector = MediaConnector( + media_io_kwargs={"video": { + "num_frames": num_frames, + }}, + allowed_media_domains=[ + "www.bogotobogo.com", + "github.com", + ]) + + video_sync, metadata_sync = connector.fetch_video(video_url) + video_async, metadata_async = await connector.fetch_video_async(video_url) + assert np.array_equal(video_sync, video_async) + assert metadata_sync == metadata_async + + disallowed_url = "https://upload.wikimedia.org/wikipedia/commons/4/47/PNG_transparency_demonstration_1.png" + with pytest.raises(ValueError): + _, _ = connector.fetch_video(disallowed_url) + + with pytest.raises(ValueError): + _, _ = await connector.fetch_video_async(disallowed_url) diff --git a/vllm/config/model.py b/vllm/config/model.py index da01d6d4480c5..b2b68abd2c1d3 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -137,6 +137,9 @@ class ModelConfig: """Allowing API requests to read local images or videos 
from directories specified by the server file system. This is a security risk. Should only be enabled in trusted environments.""" + allowed_media_domains: Optional[list[str]] = None + """If set, only media URLs that belong to one of these domains + can be used for multi-modal inputs. """ revision: Optional[str] = None """The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.""" diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 5f462442148f8..46a4e42d395ee 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -281,6 +281,8 @@ class SpeculativeConfig: trust_remote_code, allowed_local_media_path=self.target_model_config. allowed_local_media_path, + allowed_media_domains=self.target_model_config. + allowed_media_domains, dtype=self.target_model_config.dtype, seed=self.target_model_config.seed, revision=self.revision, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7b5ed67d0adbb..63047e41a54b1 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -297,6 +297,8 @@ class EngineArgs: tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode trust_remote_code: bool = ModelConfig.trust_remote_code allowed_local_media_path: str = ModelConfig.allowed_local_media_path + allowed_media_domains: Optional[ + list[str]] = ModelConfig.allowed_media_domains download_dir: Optional[str] = LoadConfig.download_dir safetensors_load_strategy: str = LoadConfig.safetensors_load_strategy load_format: Union[str, LoadFormats] = LoadConfig.load_format @@ -531,6 +533,8 @@ class EngineArgs: **model_kwargs["hf_config_path"]) model_group.add_argument("--allowed-local-media-path", **model_kwargs["allowed_local_media_path"]) + model_group.add_argument("--allowed-media-domains", + **model_kwargs["allowed_media_domains"]) model_group.add_argument("--revision", **model_kwargs["revision"]) model_group.add_argument("--code-revision", 
**model_kwargs["code_revision"]) @@ -997,6 +1001,7 @@ class EngineArgs: tokenizer_mode=self.tokenizer_mode, trust_remote_code=self.trust_remote_code, allowed_local_media_path=self.allowed_local_media_path, + allowed_media_domains=self.allowed_media_domains, dtype=self.dtype, seed=self.seed, revision=self.revision, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 358453b1060a3..6b0ed23277d36 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -637,6 +637,10 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): def allowed_local_media_path(self): return self._model_config.allowed_local_media_path + @property + def allowed_media_domains(self): + return self._model_config.allowed_media_domains + @property def mm_registry(self): return MULTIMODAL_REGISTRY @@ -837,6 +841,7 @@ class MultiModalContentParser(BaseMultiModalContentParser): self._connector = MediaConnector( media_io_kwargs=media_io_kwargs, allowed_local_media_path=tracker.allowed_local_media_path, + allowed_media_domains=tracker.allowed_media_domains, ) def parse_image( @@ -921,6 +926,7 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): self._connector = MediaConnector( media_io_kwargs=media_io_kwargs, allowed_local_media_path=tracker.allowed_local_media_path, + allowed_media_domains=tracker.allowed_media_domains, ) def parse_image( diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index dfe535b959179..862f383e4ecb2 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -86,6 +86,8 @@ class LLM: or videos from directories specified by the server file system. This is a security risk. Should only be enabled in trusted environments. + allowed_media_domains: If set, only media URLs that belong to one of + these domains can be used for multi-modal inputs. tensor_parallel_size: The number of GPUs to use for distributed execution with tensor parallelism. dtype: The data type for the model weights and activations. 
Currently, @@ -169,6 +171,7 @@ class LLM: skip_tokenizer_init: bool = False, trust_remote_code: bool = False, allowed_local_media_path: str = "", + allowed_media_domains: Optional[list[str]] = None, tensor_parallel_size: int = 1, dtype: ModelDType = "auto", quantization: Optional[QuantizationMethods] = None, @@ -264,6 +267,7 @@ class LLM: skip_tokenizer_init=skip_tokenizer_init, trust_remote_code=trust_remote_code, allowed_local_media_path=allowed_local_media_path, + allowed_media_domains=allowed_media_domains, tensor_parallel_size=tensor_parallel_size, dtype=dtype, quantization=quantization, diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 9b158267040af..1f1eea6bfee75 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -50,6 +50,7 @@ class MediaConnector: connection: HTTPConnection = global_http_connection, *, allowed_local_media_path: str = "", + allowed_media_domains: Optional[list[str]] = None, ) -> None: """ Args: @@ -82,6 +83,9 @@ class MediaConnector: allowed_local_media_path_ = None self.allowed_local_media_path = allowed_local_media_path_ + if allowed_media_domains is None: + allowed_media_domains = [] + self.allowed_media_domains = allowed_media_domains def _load_data_url( self, @@ -115,6 +119,14 @@ class MediaConnector: return media_io.load_file(filepath) + def _assert_url_in_allowed_media_domains(self, url_spec) -> None: + if self.allowed_media_domains and url_spec.hostname not in \ + self.allowed_media_domains: + raise ValueError( + f"The URL must be from one of the allowed domains: " + f"{self.allowed_media_domains}. 
Input URL domain: " + f"{url_spec.hostname}") + def load_from_url( self, url: str, @@ -125,6 +137,8 @@ class MediaConnector: url_spec = urlparse(url) if url_spec.scheme.startswith("http"): + self._assert_url_in_allowed_media_domains(url_spec) + connection = self.connection data = connection.get_bytes(url, timeout=fetch_timeout) @@ -150,6 +164,8 @@ class MediaConnector: loop = asyncio.get_running_loop() if url_spec.scheme.startswith("http"): + self._assert_url_in_allowed_media_domains(url_spec) + connection = self.connection data = await connection.async_get_bytes(url, timeout=fetch_timeout) future = loop.run_in_executor(global_thread_pool,