diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
index 027f67ad4db69..e07d6c776bc00 100644
--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@@ -255,8 +255,8 @@ def bench_run(
     torch.cuda.synchronize()
 
     # Timing
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
+    start_event = torch.Event(enable_timing=True)
+    end_event = torch.Event(enable_timing=True)
 
     latencies = []
     for _ in range(num_iters):
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index c99951aa27826..a1af0b8aec3d0 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -185,8 +185,8 @@ def benchmark_config(
         graph.replay()
     torch.cuda.synchronize()
 
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
+    start_event = torch.Event(enable_timing=True)
+    end_event = torch.Event(enable_timing=True)
 
     latencies: list[float] = []
     for i in range(num_iters):
diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
index efa5a7386027e..b8913a217c608 100644
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -105,8 +105,8 @@ def benchmark_permute(
         graph.replay()
     torch.cuda.synchronize()
 
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
+    start_event = torch.Event(enable_timing=True)
+    end_event = torch.Event(enable_timing=True)
 
     latencies: list[float] = []
     for i in range(num_iters):
@@ -241,8 +241,8 @@ def benchmark_unpermute(
         graph.replay()
     torch.cuda.synchronize()
 
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
+    start_event = torch.Event(enable_timing=True)
+    end_event = torch.Event(enable_timing=True)
 
     latencies: list[float] = []
     for i in range(num_iters):
diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py b/benchmarks/kernels/benchmark_per_token_group_quant.py
index bdc1eb733084e..eba4d510258b6 100644
--- a/benchmarks/kernels/benchmark_per_token_group_quant.py
+++ b/benchmarks/kernels/benchmark_per_token_group_quant.py
@@ -30,8 +30,8 @@ def _time_cuda(
         fn()
     torch.cuda.synchronize()
 
-    start = torch.cuda.Event(enable_timing=True)
-    end = torch.cuda.Event(enable_timing=True)
+    start = torch.Event(enable_timing=True)
+    end = torch.Event(enable_timing=True)
 
     start.record()
     for _ in range(bench_iters):
diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
index a5887aafd30d6..de01ff197eab7 100644
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@@ -253,8 +253,8 @@ def benchmark(
         )
     torch.cuda.synchronize()
 
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
+    start_event = torch.Event(enable_timing=True)
+    end_event = torch.Event(enable_timing=True)
 
     # Benchmark
     latencies: list[float] = []
diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
index 29ce18234dfa0..1d0d6fbb9a470 100644
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@@ -127,8 +127,8 @@ def benchmark_decode(
 
     def time_fn(fn, warmup=10, trials=20):
         torch.cuda.synchronize()
-        start = torch.cuda.Event(enable_timing=True)
-        end = torch.cuda.Event(enable_timing=True)
+        start = torch.Event(enable_timing=True)
+        end = torch.Event(enable_timing=True)
         times = []
         for i in range(warmup):
             fn()
diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
index 2a25d03748112..84bde723abf7f 100644
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@@ -139,8 +139,8 @@ def benchmark_prefill(
 
    def time_fn(fn, warmup=10, trials=20):
        torch.cuda.synchronize()
-       start = torch.cuda.Event(enable_timing=True)
-       end = torch.cuda.Event(enable_timing=True)
+       start = torch.Event(enable_timing=True)
+       end = torch.Event(enable_timing=True)
        times = []
        for i in range(warmup):
            fn()
diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
index ab54f81985bc2..b52500c8c5217 100644
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -183,8 +183,8 @@ def benchmark_config(
         run()
     torch.cuda.synchronize()
 
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
+    start_event = torch.Event(enable_timing=True)
+    end_event = torch.Event(enable_timing=True)
 
     latencies: list[float] = []
     for i in range(num_iters):
diff --git a/tests/kernels/attention/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py
index 9b084f2f660b2..c7662223e1ca5 100644
--- a/tests/kernels/attention/test_merge_attn_states.py
+++ b/tests/kernels/attention/test_merge_attn_states.py
@@ -150,8 +150,8 @@ def test_merge_attn_states(
     output_torch = output.clone()
     output_lse_torch = output_lse.clone()
     total_time_torch_kernel = 0
-    start = torch.cuda.Event(enable_timing=True)
-    end = torch.cuda.Event(enable_timing=True)
+    start = torch.Event(enable_timing=True)
+    end = torch.Event(enable_timing=True)
 
     # 0. Run the Torch kernel
     prefix_lse_torch = prefix_lse.clone()
@@ -188,8 +188,8 @@ def test_merge_attn_states(
     output_lse_ref_triton = output_lse.clone()
     total_time_triton_kernel = 0
 
-    start = torch.cuda.Event(enable_timing=True)
-    end = torch.cuda.Event(enable_timing=True)
+    start = torch.Event(enable_timing=True)
+    end = torch.Event(enable_timing=True)
 
     for _ in range(warmup_times):
         merge_attn_states_triton(
diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/worker/cpu_gpu.py
index 646f9d0d75423..0f2ec4a1b41f3 100644
--- a/vllm/v1/kv_offload/worker/cpu_gpu.py
+++ b/vllm/v1/kv_offload/worker/cpu_gpu.py
@@ -68,9 +68,9 @@ class CpuGpuOffloadingHandler(OffloadingHandler):
         self.h2d_stream = torch.cuda.Stream()
 
         # job_id -> transfer cuda event
-        self.transfer_events: dict[int, torch.cuda.Event] = {}
+        self.transfer_events: dict[int, torch.Event] = {}
 
         # list of cuda events available for re-use
-        self.events_pool: list[torch.cuda.Event] = []
+        self.events_pool: list[torch.Event] = []
 
         pin_memory = is_pin_memory_available()
@@ -153,7 +153,7 @@ class CpuGpuOffloadingHandler(OffloadingHandler):
         )
         src_to_dst_tensor = torch.from_numpy(src_to_dst)
 
-        event = self.events_pool.pop() if self.events_pool else torch.cuda.Event()
+        event = self.events_pool.pop() if self.events_pool else torch.Event()
         with torch.cuda.stream(stream):
             for src_tensor, dst_tensor, kv_dim in zip(
                 src_tensors, dst_tensors, self.kv_dim_before_num_blocks
diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py
index 40f011fed1ada..6bfbc32d598fa 100644
--- a/vllm/v1/worker/cpu_model_runner.py
+++ b/vllm/v1/worker/cpu_model_runner.py
@@ -96,14 +96,14 @@ def _torch_cuda_wrapper():
         def __init__(self, *args, **kwargs) -> None:
             pass
 
-    cuda_event = torch.cuda.Event
+    cuda_event = torch.Event
     cuda_stream = torch.cuda.Stream
     try:
-        torch.cuda.Event = _EventPlaceholder
+        torch.Event = _EventPlaceholder
         torch.cuda.Stream = _StreamPlaceholder
         yield
     finally:
-        torch.cuda.Event = cuda_event
+        torch.Event = cuda_event
         torch.cuda.Stream = cuda_stream
 
 
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 7cf6afa3fc371..023b5edb2c340 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -265,7 +265,7 @@ class InputBatch:
         # ids from prior step, if required by current sampling params
         # (e.g. penalties).
         self.sampled_token_ids_cpu: torch.Tensor | None = None
-        self.async_copy_ready_event: torch.cuda.Event | None = None
+        self.async_copy_ready_event: torch.Event | None = None
 
     @property
     def req_ids(self) -> list[str]:
@@ -891,7 +891,7 @@ class InputBatch:
     def set_async_sampled_token_ids(
         self,
         sampled_token_ids_cpu: torch.Tensor,
-        async_copy_ready_event: torch.cuda.Event,
+        async_copy_ready_event: torch.Event,
     ) -> None:
         """
         In async scheduling case, store ref to sampled_token_ids_cpu
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 67f575f92cc6b..506118d2d762b 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -185,7 +185,7 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput):
         self._invalid_req_indices = invalid_req_indices
 
         # Event on the copy stream so we can synchronize the non-blocking copy.
-        self.async_copy_ready_event = torch.cuda.Event()
+        self.async_copy_ready_event = torch.Event()
 
         # Keep a reference to the device tensor to avoid it being
         # deallocated until we finish copying it to the host.
@@ -435,10 +435,10 @@ class GPUModelRunner(
         self.async_output_copy_stream: torch.cuda.Stream | None = None
         # cuda event to synchronize use of reused CPU tensors between steps
        # when async scheduling is enabled.
-        self.prepare_inputs_event: torch.cuda.Event | None = None
+        self.prepare_inputs_event: torch.Event | None = None
         if self.use_async_scheduling:
             self.async_output_copy_stream = torch.cuda.Stream()
-            self.prepare_inputs_event = torch.cuda.Event()
+            self.prepare_inputs_event = torch.Event()
 
         # self.cudagraph_batch_sizes sorts in ascending order.
         if (
@@ -549,7 +549,7 @@ class GPUModelRunner(
 
         # Cached outputs.
         self._draft_token_ids: list[list[int]] | torch.Tensor | None = None
-        self.transfer_event = torch.cuda.Event()
+        self.transfer_event = torch.Event()
         self.sampled_token_ids_pinned_cpu = torch.empty(
             (self.max_num_reqs, 1),
             dtype=torch.int64,
@@ -559,10 +559,10 @@ class GPUModelRunner(
 
         # Pre-allocated tensor for copying valid sampled token counts to CPU,
         # with dedicated stream for overlapping and event for coordination.
-        self.valid_sampled_token_count_event: torch.cuda.Event | None = None
+        self.valid_sampled_token_count_event: torch.Event | None = None
         self.valid_sampled_token_count_copy_stream: torch.cuda.Stream | None = None
         if self.use_async_scheduling and self.num_spec_tokens:
-            self.valid_sampled_token_count_event = torch.cuda.Event()
+            self.valid_sampled_token_count_event = torch.Event()
             self.valid_sampled_token_count_copy_stream = torch.cuda.Stream()
             self.valid_sampled_token_count_cpu = torch.empty(
                 self.max_num_reqs,
diff --git a/vllm/v1/worker/ubatching.py b/vllm/v1/worker/ubatching.py
index 9f16b1e6d03ee..be8326e2fdbc1 100644
--- a/vllm/v1/worker/ubatching.py
+++ b/vllm/v1/worker/ubatching.py
@@ -27,8 +27,8 @@ class UBatchContext:
         ready_barrier: threading.Barrier,
         cpu_wait_event: threading.Event,
         cpu_signal_event: threading.Event,
-        gpu_comm_done_event: torch.cuda.Event,
-        gpu_compute_done_event: torch.cuda.Event,
+        gpu_comm_done_event: torch.Event,
+        gpu_compute_done_event: torch.Event,
         schedule: str = "default",
     ):
         self.id = id
@@ -207,8 +207,8 @@ def make_ubatch_contexts(
     Create a context manager for micro-batching synchronization.
     """
     cpu_events = [threading.Event() for _ in range(num_micro_batches)]
-    gpu_comm_done_events = [torch.cuda.Event() for _ in range(num_micro_batches)]
-    gpu_compute_done_events = [torch.cuda.Event() for _ in range(num_micro_batches)]
+    gpu_comm_done_events = [torch.Event() for _ in range(num_micro_batches)]
+    gpu_compute_done_events = [torch.Event() for _ in range(num_micro_batches)]
 
     assert len(forward_contexts) == 2
 
diff --git a/vllm/v1/worker/xpu_model_runner.py b/vllm/v1/worker/xpu_model_runner.py
index 4f82c18da73aa..30563305853a5 100644
--- a/vllm/v1/worker/xpu_model_runner.py
+++ b/vllm/v1/worker/xpu_model_runner.py
@@ -37,19 +37,12 @@ class XPUModelRunner(GPUModelRunner):
     @contextmanager
     def _torch_cuda_wrapper():
-        class _EventPlaceholder:
-            def __init__(self, *args, **kwargs) -> None:
-                self.record = lambda: None
-                self.synchronize = lambda: None
-
         try:
             # replace cuda APIs with xpu APIs, this should work by default
-            torch.cuda.Event = torch.xpu.Event
             torch.cuda.Stream = torch.xpu.Stream
             torch.cuda.default_stream = torch.xpu.current_stream
             torch.cuda.current_stream = torch.xpu.current_stream
             torch.cuda.stream = torch.xpu.stream
             yield
         finally:
-            # if anything goes wrong, just patch it with a placeholder
-            torch.cuda.Event = _EventPlaceholder
+            pass
 
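Note: `torch.Event` is PyTorch's accelerator-agnostic event API. It dispatches to the
active backend (CUDA, XPU, ...) rather than hard-coding CUDA, which is why the XPU
wrapper above no longer needs to patch `torch.cuda.Event`. Below is a minimal sketch of
the timing pattern these benchmarks share, assuming a CUDA device is present; `time_fn`
here is illustrative, mirroring the `time_fn` helpers in the trtllm benchmarks.

    import torch

    def time_fn(fn, warmup: int = 10, trials: int = 20) -> float:
        """Mean latency of fn() in milliseconds, measured with device events."""
        for _ in range(warmup):
            fn()
        torch.cuda.synchronize()

        # torch.Event(enable_timing=True) records on the current stream,
        # same as torch.cuda.Event, but without naming the backend.
        start = torch.Event(enable_timing=True)
        end = torch.Event(enable_timing=True)

        times = []
        for _ in range(trials):
            start.record()
            fn()
            end.record()
            torch.cuda.synchronize()  # ensure `end` has actually been recorded
            times.append(start.elapsed_time(end))
        return sum(times) / len(times)

Both `record()` and `elapsed_time()` exist on `torch.cuda.Event` with the same
semantics, so on CUDA hosts the swap is behavior-preserving.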