mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-24 21:24:34 +08:00
Fix GLM-4.5V-FP8 numerical issue (#22949)
Signed-off-by: qizixi <qizixi@meta.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
parent
3c8a787247
commit
4efd43e9b4
@ -333,6 +333,80 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# GLM-4.5V
|
||||||
|
def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for ``zai-org/GLM-4.5V``.

    Args:
        questions: Questions to ask; one prompt is produced per question.
        modality: Either ``"image"`` or ``"video"``. It selects the
            placeholder tokens inserted before each question and is the
            key used for ``limit_mm_per_prompt``.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and the prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    placeholders = {
        "image": "<|begin_of_image|><|image|><|end_of_image|>",
        "video": "<|begin_of_video|><|video|><|end_of_video|>",
    }
    # Fail fast with a clear message. Without this check an unsupported
    # modality left the placeholder unbound and only surfaced later as an
    # UnboundLocalError while the prompts were being built.
    if modality not in placeholders:
        raise ValueError(
            f"Unsupported modality {modality!r} for GLM-4.5V; "
            f"expected one of {sorted(placeholders)}."
        )
    placeholder = placeholders[modality]

    model_name = "zai-org/GLM-4.5V"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        # Forwarded as-is to the multimodal processor.
        mm_processor_kwargs={
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    prompts = [
        (
            "[gMASK]<sop><|system|>\nYou are a helpful assistant.<|user|>\n"
            f"{placeholder}"
            f"{question}<|assistant|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# GLM-4.5V-FP8
|
||||||
|
def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for ``zai-org/GLM-4.5V-FP8``.

    Args:
        questions: Questions to ask; one prompt is produced per question.
        modality: Either ``"image"`` or ``"video"``. It selects the
            placeholder tokens inserted before each question and is the
            key used for ``limit_mm_per_prompt``.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and the prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    placeholders = {
        "image": "<|begin_of_image|><|image|><|end_of_image|>",
        "video": "<|begin_of_video|><|video|><|end_of_video|>",
    }
    # Fail fast with a clear message. Without this check an unsupported
    # modality left the placeholder unbound and only surfaced later as an
    # UnboundLocalError while the prompts were being built.
    if modality not in placeholders:
        raise ValueError(
            f"Unsupported modality {modality!r} for GLM-4.5V-FP8; "
            f"expected one of {sorted(placeholders)}."
        )
    placeholder = placeholders[modality]

    model_name = "zai-org/GLM-4.5V-FP8"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        # Forwarded as-is to the multimodal processor.
        mm_processor_kwargs={
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    prompts = [
        (
            "[gMASK]<sop><|system|>\nYou are a helpful assistant.<|user|>\n"
            f"{placeholder}"
            f"{question}<|assistant|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
|
||||||
|
|
||||||
|
|
||||||
# H2OVL-Mississippi
|
# H2OVL-Mississippi
|
||||||
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
|
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
|
||||||
assert modality == "image"
|
assert modality == "image"
|
||||||
@ -383,8 +457,8 @@ def run_hyperclovax_seed_vision(
|
|||||||
for question in questions:
|
for question in questions:
|
||||||
if modality == "image":
|
if modality == "image":
|
||||||
"""
|
"""
|
||||||
ocr: List the words in the image in raster order.
|
ocr: List the words in the image in raster order.
|
||||||
Even if the word order feels unnatural for reading,
|
Even if the word order feels unnatural for reading,
|
||||||
the model will handle it as long as it follows raster order.
|
the model will handle it as long as it follows raster order.
|
||||||
e.g. "Naver, CLOVA, bigshane"
|
e.g. "Naver, CLOVA, bigshane"
|
||||||
lens_keywords: List the entity names in the image.
|
lens_keywords: List the entity names in the image.
|
||||||
@ -1448,6 +1522,8 @@ model_example_map = {
|
|||||||
"gemma3n": run_gemma3n,
|
"gemma3n": run_gemma3n,
|
||||||
"glm4v": run_glm4v,
|
"glm4v": run_glm4v,
|
||||||
"glm4_1v": run_glm4_1v,
|
"glm4_1v": run_glm4_1v,
|
||||||
|
"glm4_5v": run_glm4_5v,
|
||||||
|
"glm4_5v_fp8": run_glm4_5v_fp8,
|
||||||
"h2ovl_chat": run_h2ovl,
|
"h2ovl_chat": run_h2ovl,
|
||||||
"hyperclovax_seed_vision": run_hyperclovax_seed_vision,
|
"hyperclovax_seed_vision": run_hyperclovax_seed_vision,
|
||||||
"idefics3": run_idefics3,
|
"idefics3": run_idefics3,
|
||||||
|
|||||||
@ -1064,6 +1064,76 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# GLM-4.5V
|
||||||
|
def load_glm4_5v(question: str, image_urls: list[str]) -> ModelRequestData:
    """Assemble a multi-image request for ``zai-org/GLM-4.5V``.

    The prompt is rendered (not tokenized) with the model processor's chat
    template from a single user turn: one image entry per URL followed by
    the text question. Each URL is then fetched with ``fetch_image``.

    Args:
        question: Text question placed after the image entries.
        image_urls: URLs of the images; their count is used as the
            per-prompt image limit.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the rendered
        prompt, and the fetched images.
    """
    model_name = "zai-org/GLM-4.5V"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    # Single user turn: every image first, then the question text.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    chat_processor = AutoProcessor.from_pretrained(model_name)
    rendered_prompt = chat_processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Images are fetched last, after the prompt has been rendered.
    return ModelRequestData(
        engine_args=engine_args,
        prompt=rendered_prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
|
||||||
|
|
||||||
|
|
||||||
|
# GLM-4.5V-FP8
|
||||||
|
def load_glm4_5v_fp8(question: str, image_urls: list[str]) -> ModelRequestData:
    """Assemble a multi-image request for ``zai-org/GLM-4.5V-FP8``.

    The prompt is rendered (not tokenized) with the model processor's chat
    template from a single user turn: one image entry per URL followed by
    the text question. Each URL is then fetched with ``fetch_image``.

    Args:
        question: Text question placed after the image entries.
        image_urls: URLs of the images; their count is used as the
            per-prompt image limit.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the rendered
        prompt, and the fetched images.
    """
    model_name = "zai-org/GLM-4.5V-FP8"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    # Single user turn: every image first, then the question text.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    chat_processor = AutoProcessor.from_pretrained(model_name)
    rendered_prompt = chat_processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Images are fetched last, after the prompt has been rendered.
    return ModelRequestData(
        engine_args=engine_args,
        prompt=rendered_prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
|
||||||
|
|
||||||
|
|
||||||
model_example_map = {
|
model_example_map = {
|
||||||
"aria": load_aria,
|
"aria": load_aria,
|
||||||
"aya_vision": load_aya_vision,
|
"aya_vision": load_aya_vision,
|
||||||
@ -1096,6 +1166,8 @@ model_example_map = {
|
|||||||
"step3": load_step3,
|
"step3": load_step3,
|
||||||
"tarsier": load_tarsier,
|
"tarsier": load_tarsier,
|
||||||
"tarsier2": load_tarsier2,
|
"tarsier2": load_tarsier2,
|
||||||
|
"glm4_5v": load_glm4_5v,
|
||||||
|
"glm4_5v_fp8": load_glm4_5v_fp8,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -126,7 +126,7 @@ class Glm4vVideoPixelInputs(TensorSchema):
|
|||||||
- ctpp: Number of channels * temporal_patch_size *
|
- ctpp: Number of channels * temporal_patch_size *
|
||||||
patch_size * patch_size
|
patch_size * patch_size
|
||||||
- f: Number of frames
|
- f: Number of frames
|
||||||
- g: Grid dimensions (3 for grid_t which is usually 1 for processed
|
- g: Grid dimensions (3 for grid_t which is usually 1 for processed
|
||||||
video, grid_h, grid_w)
|
video, grid_h, grid_w)
|
||||||
"""
|
"""
|
||||||
type: Literal["pixel_values_videos"] = "pixel_values_videos"
|
type: Literal["pixel_values_videos"] = "pixel_values_videos"
|
||||||
@ -141,7 +141,7 @@ class Glm4vVideoEmbeddingInputs(TensorSchema):
|
|||||||
- p: Number of video patches across all frames
|
- p: Number of video patches across all frames
|
||||||
- h: Hidden size (must match language model backbone)
|
- h: Hidden size (must match language model backbone)
|
||||||
- f: Number of frames
|
- f: Number of frames
|
||||||
- g: Grid dimensions (3 for grid_t which is usually 1 for processed
|
- g: Grid dimensions (3 for grid_t which is usually 1 for processed
|
||||||
video, grid_h, grid_w)
|
video, grid_h, grid_w)
|
||||||
"""
|
"""
|
||||||
type: Literal["video_embeds"] = "video_embeds"
|
type: Literal["video_embeds"] = "video_embeds"
|
||||||
@ -234,7 +234,8 @@ class Glm4vVisionAttention(nn.Module):
|
|||||||
total_num_kv_heads=num_heads,
|
total_num_kv_heads=num_heads,
|
||||||
bias=False,
|
bias=False,
|
||||||
quant_config=quant_config,
|
quant_config=quant_config,
|
||||||
prefix=f"{prefix}.qkv",
|
# Change qkv prefix to align with GLM-4.5V-FP8 quantization config
|
||||||
|
prefix=f"{prefix}.qkv_proj" if quant_config else f"{prefix}.qkv",
|
||||||
)
|
)
|
||||||
self.proj = RowParallelLinear(
|
self.proj = RowParallelLinear(
|
||||||
input_size=projection_size,
|
input_size=projection_size,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user