[Misc] Clean up Kimi-VL (#16833)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung 2025-04-18 20:15:09 +08:00 committed by GitHub
parent 87e067de41
commit aadb656562
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 20 additions and 44 deletions

View File

@@ -376,9 +376,9 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs(
model="moonshotai/Kimi-VL-A3B-Instruct",
max_model_len=4096,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
trust_remote_code=True,
max_model_len=4096,
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(

View File

@@ -331,11 +331,10 @@ def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=4096,
max_num_seqs=4,
tensor_parallel_size=1,
limit_mm_per_prompt={"image": len(image_urls)},
trust_remote_code=True,
)
placeholders = [{"type": "image", "image": url} for url in image_urls]

View File

@@ -56,7 +56,6 @@ from transformers.activations import GELUActivation
from vllm.config import VllmConfig
from vllm.distributed import (get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size)
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
@@ -70,22 +69,20 @@ from vllm.model_executor.models.moonvit import MoonVitPretrainedModel
from vllm.model_executor.models.utils import merge_multimodal_embeddings
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
NestedTensors)
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, NestedTensors)
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement,
PromptUpdate)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
from .utils import is_pp_missing_parameter, maybe_prefix
logger = init_logger(__name__)
# For dummy input only
@dataclass
@@ -143,6 +140,9 @@ class KimiVLProcessingInfo(BaseProcessingInfo):
def get_hf_config(self):
return self.ctx.get_hf_config(KimiVLConfig)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None}
def get_num_image_tokens(
self,
*,
@@ -180,23 +180,6 @@ class KimiVLProcessingInfo(BaseProcessingInfo):
token_width = (width + pad_width) // (kernel_size[1] * patch_size)
return int(token_height * token_width)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
# None means unlimited
return {"image": None}
def get_mm_max_tokens_per_item(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> Mapping[str, int]:
return {
"image":
self.get_num_image_tokens(
image_width=MaxImageTokenMeta.width,
image_height=MaxImageTokenMeta.height,
),
}
@property
def image_token_id(self) -> int:
return self.get_hf_config().media_placeholder_token_id
@@ -204,34 +187,28 @@ class KimiVLProcessingInfo(BaseProcessingInfo):
class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]):
def __init__(self, info: KimiVLProcessingInfo) -> None:
super().__init__(info)
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
num_images = mm_counts.get("image", 0)
self.image_token_id = self.info.image_token_id
self.image_token = self.info.get_tokenizer().decode(
self.image_token_id)
processor = self.info.get_hf_processor()
image_token = processor.image_token
def get_dummy_processor_inputs(
return image_token * num_images
def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> ProcessorInputs:
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
width = MaxImageTokenMeta.width
height = MaxImageTokenMeta.height
mm_data = {
return {
"image":
self._get_dummy_images(width=width,
height=height,
self._get_dummy_images(width=MaxImageTokenMeta.width,
height=MaxImageTokenMeta.height,
num_images=num_images)
}
return ProcessorInputs(
prompt_text=self.image_token * num_images,
mm_data=mm_data,
)
class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):