diff --git a/tests/v1/worker/test_gpu_block_table.py b/tests/v1/worker/test_gpu_block_table.py
new file mode 100644
index 0000000000000..bb29ea7b1d2ba
--- /dev/null
+++ b/tests/v1/worker/test_gpu_block_table.py
@@ -0,0 +1,61 @@
+import random
+import time
+
+import pytest
+import torch
+
+from vllm.v1.worker.gpu_block_table import BlockTable
+
+MAX_NUM_REQS = 1024
+MAX_MODEL_LEN = 128 * 1024
+BLOCK_SIZE = 16
+MAX_NUM_BLOCKS_PER_REQ = MAX_MODEL_LEN // BLOCK_SIZE
+
+
+@pytest.mark.parametrize("do_wait", [False, True])
+def test_block_table(do_wait: bool):
+    random.seed(0)
+    torch.manual_seed(0)
+    torch.cuda.manual_seed_all(0)
+
+    block_table = BlockTable(
+        max_num_reqs=MAX_NUM_REQS,
+        max_model_len=MAX_MODEL_LEN,
+        max_num_blocks_per_req=MAX_NUM_BLOCKS_PER_REQ,
+        pin_memory=True,
+        device=torch.device(0),
+    )
+
+    # Add two rows and commit them to the GPU block table.
+    num_blocks = random.randint(1, MAX_NUM_BLOCKS_PER_REQ - 1)
+    block_ids = torch.randint(0, MAX_NUM_BLOCKS_PER_REQ, (num_blocks,),
+                              dtype=torch.int32, device="cpu")
+    block_table.add_row(0, block_ids)
+    num_blocks = random.randint(1, MAX_NUM_BLOCKS_PER_REQ - 100)
+    block_ids = torch.randint(0, MAX_NUM_BLOCKS_PER_REQ, (num_blocks,),
+                              dtype=torch.int32, device="cpu")
+    block_table.add_row(1, block_ids)
+    block_table.commit(2)
+
+    # Optionally give any in-flight copies extra time to finish.
+    torch.cuda.synchronize()
+    if do_wait:
+        time.sleep(1)
+
+    # Append 100 blocks to row 1, move it to row 0, and commit again.
+    block_ids = torch.randint(0, MAX_NUM_BLOCKS_PER_REQ, (100,),
+                              dtype=torch.int32, device="cpu")
+    block_table.append_row(1, num_blocks, block_ids)
+    block_table.move_row(1, 0)
+    block_table.commit(2)
+
+    torch.cuda.synchronize()
+    if do_wait:
+        time.sleep(1)
+
+    # The committed GPU row 0 should match row 0 of the CPU block table.
+    torch.testing.assert_close(block_table.block_table[:1].cpu(),
+                               block_table.block_table_cpu[:1])
+
+
+if __name__ == "__main__":
+    test_block_table(do_wait=False)