mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-08 01:14:25 +08:00
[Misc] Clean up Kimi-VL (#16833)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
parent
87e067de41
commit
aadb656562
@ -376,9 +376,9 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
|
|||||||
|
|
||||||
engine_args = EngineArgs(
|
engine_args = EngineArgs(
|
||||||
model="moonshotai/Kimi-VL-A3B-Instruct",
|
model="moonshotai/Kimi-VL-A3B-Instruct",
|
||||||
max_model_len=4096,
|
|
||||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
|
max_model_len=4096,
|
||||||
|
limit_mm_per_prompt={"image": 1},
|
||||||
)
|
)
|
||||||
|
|
||||||
return ModelRequestData(
|
return ModelRequestData(
|
||||||
|
|||||||
@ -331,11 +331,10 @@ def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
|
|||||||
|
|
||||||
engine_args = EngineArgs(
|
engine_args = EngineArgs(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
|
trust_remote_code=True,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=4,
|
max_num_seqs=4,
|
||||||
tensor_parallel_size=1,
|
|
||||||
limit_mm_per_prompt={"image": len(image_urls)},
|
limit_mm_per_prompt={"image": len(image_urls)},
|
||||||
trust_remote_code=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
||||||
|
|||||||
@ -56,7 +56,6 @@ from transformers.activations import GELUActivation
|
|||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
from vllm.distributed import (get_tensor_model_parallel_rank,
|
from vllm.distributed import (get_tensor_model_parallel_rank,
|
||||||
get_tensor_model_parallel_world_size)
|
get_tensor_model_parallel_world_size)
|
||||||
from vllm.logger import init_logger
|
|
||||||
from vllm.model_executor.layers.fused_moe import FusedMoE
|
from vllm.model_executor.layers.fused_moe import FusedMoE
|
||||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||||
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
|
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
|
||||||
@ -70,22 +69,20 @@ from vllm.model_executor.models.moonvit import MoonVitPretrainedModel
|
|||||||
from vllm.model_executor.models.utils import merge_multimodal_embeddings
|
from vllm.model_executor.models.utils import merge_multimodal_embeddings
|
||||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
|
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||||
NestedTensors)
|
MultiModalKwargs, NestedTensors)
|
||||||
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
|
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
|
||||||
MultiModalDataItems)
|
MultiModalDataItems)
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, PromptReplacement,
|
BaseProcessingInfo, PromptReplacement,
|
||||||
PromptUpdate)
|
PromptUpdate)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig
|
from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig
|
||||||
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
|
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
|
||||||
|
|
||||||
from .utils import is_pp_missing_parameter, maybe_prefix
|
from .utils import is_pp_missing_parameter, maybe_prefix
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
# For dummy input only
|
# For dummy input only
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -143,6 +140,9 @@ class KimiVLProcessingInfo(BaseProcessingInfo):
|
|||||||
def get_hf_config(self):
|
def get_hf_config(self):
|
||||||
return self.ctx.get_hf_config(KimiVLConfig)
|
return self.ctx.get_hf_config(KimiVLConfig)
|
||||||
|
|
||||||
|
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||||
|
return {"image": None}
|
||||||
|
|
||||||
def get_num_image_tokens(
|
def get_num_image_tokens(
|
||||||
self,
|
self,
|
||||||
*,
|
*,
|
||||||
@ -180,23 +180,6 @@ class KimiVLProcessingInfo(BaseProcessingInfo):
|
|||||||
token_width = (width + pad_width) // (kernel_size[1] * patch_size)
|
token_width = (width + pad_width) // (kernel_size[1] * patch_size)
|
||||||
return int(token_height * token_width)
|
return int(token_height * token_width)
|
||||||
|
|
||||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
|
||||||
# None means unlimited
|
|
||||||
return {"image": None}
|
|
||||||
|
|
||||||
def get_mm_max_tokens_per_item(
|
|
||||||
self,
|
|
||||||
seq_len: int,
|
|
||||||
mm_counts: Mapping[str, int],
|
|
||||||
) -> Mapping[str, int]:
|
|
||||||
return {
|
|
||||||
"image":
|
|
||||||
self.get_num_image_tokens(
|
|
||||||
image_width=MaxImageTokenMeta.width,
|
|
||||||
image_height=MaxImageTokenMeta.height,
|
|
||||||
),
|
|
||||||
}
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def image_token_id(self) -> int:
|
def image_token_id(self) -> int:
|
||||||
return self.get_hf_config().media_placeholder_token_id
|
return self.get_hf_config().media_placeholder_token_id
|
||||||
@ -204,34 +187,28 @@ class KimiVLProcessingInfo(BaseProcessingInfo):
|
|||||||
|
|
||||||
class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]):
|
class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]):
|
||||||
|
|
||||||
def __init__(self, info: KimiVLProcessingInfo) -> None:
|
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||||
super().__init__(info)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
self.image_token_id = self.info.image_token_id
|
processor = self.info.get_hf_processor()
|
||||||
self.image_token = self.info.get_tokenizer().decode(
|
image_token = processor.image_token
|
||||||
self.image_token_id)
|
|
||||||
|
|
||||||
def get_dummy_processor_inputs(
|
return image_token * num_images
|
||||||
|
|
||||||
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
) -> ProcessorInputs:
|
) -> MultiModalDataDict:
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
width = MaxImageTokenMeta.width
|
return {
|
||||||
height = MaxImageTokenMeta.height
|
|
||||||
mm_data = {
|
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(width=width,
|
self._get_dummy_images(width=MaxImageTokenMeta.width,
|
||||||
height=height,
|
height=MaxImageTokenMeta.height,
|
||||||
num_images=num_images)
|
num_images=num_images)
|
||||||
}
|
}
|
||||||
|
|
||||||
return ProcessorInputs(
|
|
||||||
prompt_text=self.image_token * num_images,
|
|
||||||
mm_data=mm_data,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
|
class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user