diff --git a/vllm/v1/worker/ubatching.py b/vllm/v1/worker/ubatching.py
index 47a681a6757b5..182a20698fd1f 100644
--- a/vllm/v1/worker/ubatching.py
+++ b/vllm/v1/worker/ubatching.py
@@ -80,6 +80,27 @@ class UBatchContext:
     #  before yielding back to ubatch1 but ensure we wont start the dispatch
     #  until ubatch0-dispatch is done avoiding overlapping dispatches that
     #  might share underlying buffers
+    #
+    # NOTE(lucas): I think the steps need to be ordered as follows:
+    #  ubatch0
+    #   - work
+    #   - dispatch send
+    #   - yield
+    #  ubatch1
+    #   - work
+    #   - yield
+    #  ubatch0
+    #   - dispatch recv
+    #   - gpu record, event0
+    #   - yield
+    #  ubatch1
+    #   - gpu wait, event0
+    #   - dispatch send
+    #   - yield
+    #  ubatch0
+    #   - work
+    #   .....
+    # This ensures that the CUDA event is recorded before it is waited on.
     def gpu_stream_wait(self):
         print("Waiting ubatch %d on %s in stream %s" % (self.id, self.gpu_wait_event, self.stream))
         self.stream.wait_event(self.gpu_wait_event)