diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 345519a07e411..f1575b1b0fc7d 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -3,7 +3,7 @@ import pytest from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig from vllm.sampling_params import SamplingParams -from vllm.v1.core.scheduler_output import (CachedRequestData, NewRequestData, +from vllm.v1.core.scheduler.output import (CachedRequestData, NewRequestData, SchedulerOutput) from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.worker.gpu_model_runner import GPUModelRunner diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index db80e52bf0738..7c62c9a153fb3 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -16,7 +16,7 @@ from vllm.platforms import current_platform from vllm.utils import cdiv if TYPE_CHECKING: - from vllm.v1.core.scheduler_output import SchedulerOutput + from vllm.v1.core.scheduler.output import SchedulerOutput from vllm.v1.worker.gpu_input_batch import InputBatch from vllm.v1.worker.gpu_model_runner import GPUModelRunner diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 14a7bd3535222..d7a08ad4b058b 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -233,7 +233,7 @@ except ImportError: from flash_attn import flash_attn_varlen_func if TYPE_CHECKING: - from vllm.v1.core.scheduler_output import SchedulerOutput + from vllm.v1.core.scheduler.output import SchedulerOutput from vllm.v1.worker.gpu_input_batch import InputBatch from vllm.v1.worker.gpu_model_runner import GPUModelRunner diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index a7e50f8f40ece..ac5dc4e92627c 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -13,7 +13,7 @@ from vllm.logger import init_logger from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager, compute_encoder_budget) from vllm.v1.core.kv_cache_manager import KVCacheManager -from vllm.v1.core.scheduler_output import (CachedRequestData, NewRequestData, +from vllm.v1.core.scheduler.output import (CachedRequestData, NewRequestData, SchedulerOutput) from vllm.v1.engine import (EngineCoreEvent, EngineCoreEventType, EngineCoreOutput, EngineCoreOutputs) diff --git a/vllm/v1/core/scheduler/__init__.py b/vllm/v1/core/scheduler/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/core/scheduler_output.py b/vllm/v1/core/scheduler/output.py similarity index 100% rename from vllm/v1/core/scheduler_output.py rename to vllm/v1/core/scheduler/output.py diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 5a4e67a2dd78f..662edce410f18 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -22,7 +22,7 @@ from vllm.transformers_utils.config import ( from vllm.utils import (get_exception_traceback, resolve_obj_by_qualname, zmq_socket_ctx) from vllm.v1.core.kv_cache_utils import get_kv_cache_configs -from vllm.v1.core.scheduler import SchedulerOutput +from vllm.v1.core.scheduler.output import SchedulerOutput from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType, UtilityOutput) from vllm.v1.engine.mm_input_cache import MMInputCacheServer diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index df7ca70924bf5..dfdfe9a76a8bb 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -44,7 +44,7 @@ from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin if TYPE_CHECKING: import xgrammar as xgr - from vllm.v1.core.scheduler_output import SchedulerOutput + from vllm.v1.core.scheduler.output import SchedulerOutput else: xgr = LazyLoader("xgr", globals(), "xgrammar") diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 5527a105f8670..e69a8c6453fbc 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -28,7 +28,7 @@ from vllm.v1.worker.worker_base import WorkerBase logger = init_logger(__name__) if TYPE_CHECKING: - from vllm.v1.core.scheduler_output import SchedulerOutput + from vllm.v1.core.scheduler.output import SchedulerOutput class Worker(WorkerBase): diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index effcac7e7bdef..bba016271fa07 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -35,7 +35,7 @@ from vllm.v1.utils import bind_kv_cache from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch if TYPE_CHECKING: - from vllm.v1.core.scheduler import SchedulerOutput + from vllm.v1.core.scheduler.output import SchedulerOutput logger = init_logger(__name__) diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index 9f59561192753..df00eedf0d943 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -17,7 +17,7 @@ from vllm.distributed import (ensure_model_parallel_initialized, from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE -from vllm.v1.core.scheduler import SchedulerOutput +from vllm.v1.core.scheduler.output import SchedulerOutput from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheSpec) from vllm.v1.outputs import ModelRunnerOutput