diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 5dba80fe80806..737bc03a40245 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -236,7 +236,9 @@ class MessageQueue: n_reader, # number of all readers n_local_reader, # number of local readers through shared memory local_reader_ranks: list[int] | None = None, - max_chunk_bytes: int = 1024 * 1024 * 24, # 24MiB + # Default of 24MiB chosen to be large enough to accommodate grammar + # bitmask tensors for large batches (1024 requests). + max_chunk_bytes: int = 1024 * 1024 * 24, max_chunks: int = 10, connect_ip: str | None = None, ): @@ -538,6 +540,10 @@ class MessageQueue: buf[0] = 1 # overflow self.local_socket.send_multipart(all_buffers, copy=False) else: + # Byte 0: 0 + # Bytes 1-2: Count of buffers + # Then each buffer follows, preceded by 4 bytes containing its length: + # [4 byte int L][L bytes of buffer content] ... with self.acquire_write(timeout) as buf: buf[0] = 0 # not overflow offset = 3 diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index 619dcd178a13a..035394f045301 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -165,7 +165,9 @@ class SchedulerOutput: # freed from the encoder cache. free_encoder_mm_hashes: list[str] - # ids of structured outputs requests included in the bitmask, in order. + # ids of structured outputs requests included in the bitmask, in the + # same order as the corresponding stacked rows of the bitmask. + # There may be more than one row per request in the case of speculative decoding. structured_output_request_ids: list[str] # the bitmask for the whole batch grammar_bitmask: "npt.NDArray[np.int32] | None"