diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index f10a8fb8e03cf..2d8594cb8aafa 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -298,8 +298,11 @@ class XGrammarLogitsProcessor: # token_bitmask is a CPU tensor for use with accept_token and # fill_next_token_bitmask so we move it to the device of scores device_type = scores.device.type + dtype = scores.dtype if device_type != "cuda": - scores = scores.to("cpu").unsqueeze(0) + # xgrammar on cpu only supports float32 scores + # see: https://github.com/mlc-ai/xgrammar/blob/c1b64920cad24f44f235778c1c00bb52d57da01a/python/xgrammar/kernels/apply_token_bitmask_inplace_cpu.py#L22 + scores = scores.to("cpu").float().unsqueeze(0) # Note: In this method, if the tensors have different dimensions # on CPU device fails, but on GPU it runs without error. Hence the @@ -307,7 +310,7 @@ class XGrammarLogitsProcessor: xgr.apply_token_bitmask_inplace(scores, self.token_bitmask.to(scores.device)) if device_type != "cuda": - scores = scores.to(device_type).squeeze() + scores = scores.to(dtype).to(device_type).squeeze() return scores