mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-02 05:37:54 +08:00
[BUGFIX] Move scores to float32 in case of running xgrammar on cpu (#12152)
Signed-off-by: Michal Adamczyk <madamczyk@habana.ai>
This commit is contained in:
parent
7a8a48d51e
commit
4e94951bb1
@ -298,8 +298,11 @@ class XGrammarLogitsProcessor:
|
|||||||
# token_bitmask is a CPU tensor for use with accept_token and
|
# token_bitmask is a CPU tensor for use with accept_token and
|
||||||
# fill_next_token_bitmask so we move it to the device of scores
|
# fill_next_token_bitmask so we move it to the device of scores
|
||||||
device_type = scores.device.type
|
device_type = scores.device.type
|
||||||
|
dtype = scores.dtype
|
||||||
if device_type != "cuda":
|
if device_type != "cuda":
|
||||||
scores = scores.to("cpu").unsqueeze(0)
|
# xgrammar's CPU kernel only supports float32 scores
|
||||||
|
# see: https://github.com/mlc-ai/xgrammar/blob/c1b64920cad24f44f235778c1c00bb52d57da01a/python/xgrammar/kernels/apply_token_bitmask_inplace_cpu.py#L22
|
||||||
|
scores = scores.to("cpu").float().unsqueeze(0)
|
||||||
|
|
||||||
# Note: In this method, if the tensors have different dimensions
|
# Note: In this method, if the tensors have different dimensions
|
||||||
# on CPU device fails, but on GPU it runs without error. Hence the
|
# on CPU device fails, but on GPU it runs without error. Hence the
|
||||||
@ -307,7 +310,7 @@ class XGrammarLogitsProcessor:
|
|||||||
xgr.apply_token_bitmask_inplace(scores,
|
xgr.apply_token_bitmask_inplace(scores,
|
||||||
self.token_bitmask.to(scores.device))
|
self.token_bitmask.to(scores.device))
|
||||||
if device_type != "cuda":
|
if device_type != "cuda":
|
||||||
scores = scores.to(device_type).squeeze()
|
scores = scores.to(dtype).to(device_type).squeeze()
|
||||||
|
|
||||||
return scores
|
return scores
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user