diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 3f280c837be98..3f1d50d558109 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -435,6 +435,18 @@ steps: - pytest -v -s compile/test_full_graph.py - pytest -v -s compile/test_fusions_e2e.py +- label: Cudagraph test + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - tests/v1/cudagraph + - vllm/v1/cudagraph_dispatcher.py + - vllm/config/compilation.py + - vllm/compilation + commands: + - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py + - pytest -v -s v1/cudagraph/test_cudagraph_mode.py + - label: Kernels Core Operation Test # 48min timeout_in_minutes: 75 mirror_hardwares: [amdexperimental] diff --git a/tests/utils.py b/tests/utils.py index e52497cf52a1e..fb7614dd7fbce 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1111,6 +1111,11 @@ def spawn_new_process_for_each_test(f: Callable[_P, None]) -> Callable[_P, None] # `cloudpickle` allows pickling complex functions directly input_bytes = cloudpickle.dumps((f, output_filepath)) + repo_root = str(VLLM_PATH.resolve()) + + env = dict(env or os.environ) + env["PYTHONPATH"] = repo_root + os.pathsep + env.get("PYTHONPATH", "") + cmd = [sys.executable, "-m", f"{module_name}"] returned = subprocess.run( diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py index 02fa27e3f05f7..bb953e5c70c8c 100644 --- a/tests/v1/cudagraph/test_cudagraph_dispatch.py +++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py @@ -34,13 +34,16 @@ class SimpleMLP(nn.Module): def _create_vllm_config( - compilation_config: CompilationConfig, max_num_seqs: int = 8 + compilation_config: CompilationConfig, + max_num_seqs: int = 8, + lora_config: bool = False, ) -> MagicMock: mock_config = MagicMock(spec=VllmConfig) mock_config.compilation_config = compilation_config mock_config.scheduler_config = SchedulerConfig(max_num_seqs=max_num_seqs) mock_config.parallel_config = ParallelConfig() - + if not lora_config: + mock_config.lora_config = None # Mimic the behavior of VllmConfig.__post_init__() if compilation_config.mode == CompilationMode.VLLM_COMPILE: compilation_config.set_splitting_ops_for_v1() @@ -50,19 +53,21 @@ def _create_vllm_config( class TestCudagraphDispatcher: @pytest.mark.parametrize( - "case_id,cudagraph_mode_str,compilation_mode", + "cudagraph_mode_str,compilation_mode,lora_config", [ # Test case 0: Full CG for mixed batches, no separate routine - (0, "FULL", CompilationMode.NONE), + ("FULL", CompilationMode.NONE, False), # Test case 1: Full CG for uniform batches, piecewise for mixed - (1, "FULL_AND_PIECEWISE", CompilationMode.NONE), + ("FULL_AND_PIECEWISE", CompilationMode.NONE, False), # Test case 2: Full CG for uniform batches, no CG for mixed - (2, "FULL_DECODE_ONLY", CompilationMode.NONE), + ("FULL_DECODE_ONLY", CompilationMode.NONE, False), # Test case 3: PIECEWISE for all - (3, "PIECEWISE", CompilationMode.VLLM_COMPILE), + ("PIECEWISE", CompilationMode.VLLM_COMPILE, False), + # Test case 4: PIECEWISE for all, specialize LoRA cases + ("PIECEWISE", CompilationMode.VLLM_COMPILE, True), ], ) - def test_dispatcher(self, cudagraph_mode_str, compilation_mode): + def test_dispatcher(self, cudagraph_mode_str, compilation_mode, lora_config): # Setup dispatcher comp_config = CompilationConfig( cudagraph_mode=cudagraph_mode_str, @@ -70,7 +75,17 @@ class TestCudagraphDispatcher: cudagraph_capture_sizes=[1, 8], ) - config = _create_vllm_config(comp_config, max_num_seqs=8) + config = _create_vllm_config( + comp_config, max_num_seqs=8, lora_config=lora_config + ) + if ( + cudagraph_mode_str == "FULL_AND_PIECEWISE" + and compilation_mode == CompilationMode.NONE + ): + with pytest.raises(AssertionError): + dispatcher = CudagraphDispatcher(config) + return + dispatcher = CudagraphDispatcher(config) dispatcher.initialize_cudagraph_keys( cudagraph_mode=comp_config.cudagraph_mode, uniform_decode_query_len=1 @@ -78,17 +93,24 @@ class TestCudagraphDispatcher: # Verify the key is initialized correctly if cudagraph_mode_str in ["FULL_AND_PIECEWISE", "PIECEWISE"]: - assert len(dispatcher.cudagraph_keys[CUDAGraphMode.PIECEWISE]) == 2 + assert len(dispatcher.cudagraph_keys[CUDAGraphMode.PIECEWISE]) == ( + 4 if lora_config else 2 + ) else: assert len(dispatcher.cudagraph_keys[CUDAGraphMode.PIECEWISE]) == 0 if cudagraph_mode_str not in ["NONE", "PIECEWISE"]: - assert len(dispatcher.cudagraph_keys[CUDAGraphMode.FULL]) == 2 + assert len(dispatcher.cudagraph_keys[CUDAGraphMode.FULL]) == ( + 4 if lora_config else 2 + ) else: assert len(dispatcher.cudagraph_keys[CUDAGraphMode.FULL]) == 0 # Test dispatch logic # 1. non-uniform batch, size in cudagraph size list - desc_full_exact = BatchDescriptor(num_tokens=8, uniform_decode=False) + desc_full_exact = BatchDescriptor( + num_tokens=8, + uniform_decode=False, + ) rt_mode, key = dispatcher.dispatch(desc_full_exact) if cudagraph_mode_str == "FULL": assert rt_mode == CUDAGraphMode.FULL @@ -138,7 +160,6 @@ class TestCUDAGraphWrapper: self.persistent_input_buffer = torch.zeros(1, 10, device="cuda") self.input_tensor = torch.randn(1, 10, device="cuda") - @create_new_process_for_each_test("spawn") def test_capture_and_replay(self): wrapper = CUDAGraphWrapper( self.model, self.vllm_config, runtime_mode=CUDAGraphMode.FULL @@ -192,7 +213,6 @@ class TestCUDAGraphWrapper: eager_output = self.model(self.input_tensor) torch.testing.assert_close(eager_output, output2) - @create_new_process_for_each_test("spawn") def test_bypass_on_mode_mismatch(self): wrapper = CUDAGraphWrapper( self.model, self.vllm_config, runtime_mode=CUDAGraphMode.FULL @@ -216,7 +236,6 @@ class TestCUDAGraphWrapper: mock_forward.assert_called_once() assert not wrapper.concrete_cudagraph_entries - @create_new_process_for_each_test("spawn") def test_bypass_on_mode_none(self): wrapper = CUDAGraphWrapper( self.model, self.vllm_config, runtime_mode=CUDAGraphMode.FULL diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py index 818ae1d7ba677..d6bde16eba36b 100644 --- a/tests/v1/cudagraph/test_cudagraph_mode.py +++ b/tests/v1/cudagraph/test_cudagraph_mode.py @@ -109,9 +109,9 @@ combo_cases_2 = [ @pytest.mark.parametrize( "backend_name,cudagraph_mode,compilation_mode,supported", combo_cases_2 ) -def test_cudagraph_compilation_combo(combo_case): - backend_name, cudagraph_mode, compilation_mode, supported = combo_case - +def test_cudagraph_compilation_combo( + backend_name, cudagraph_mode, compilation_mode, supported +): env_vars = backend_configs[backend_name].env_vars with temporary_environ(env_vars), ExitStack() as stack: