* Add Kandinsky5 model support, lite and pro T2V tested to work
* Update kandinsky5.py
* Fix fp8
* Fix fp8_scaled text encoder
* Add transformer_options for attention
* Code cleanup, optimizations, use fp32 for all layers originally at fp32
* ImageToVideo node
* Fix I2V, add necessary latent post-process nodes
* Support text-to-image model
* Support block replace patches (SLG mostly)
* Support official LoRAs
* Don't scale RoPE for the lite model, as that just doesn't work
* Update supported_models.py
* Revert RoPE scaling to the simpler one
* Fix typo
* Handle the latent dim difference for the image model in the VAE instead
* Add node to use different prompts for clip_l and qwen25_7b
* Reduce peak VRAM usage a bit
* Further reduce peak VRAM consumption by chunking the FFN
* Update chunking
* Update memory_usage_factor
* Code cleanup, don't force the fp32 layers as it has minimal effect
* Allow for stronger changes with first-frames normalization. Default values are too weak for any meaningful changes; these should probably be exposed as advanced node options when that's available.
* Add the image model's own chat template, remove the unused image2video template
* Remove hard error in the ReplaceVideoLatentFrames node
* Update kandinsky5.py
* Update supported_models.py
* Fix typos in the prompt template. They have since been fixed in the original repository as well.
* Update ReplaceVideoLatentFrames: add tooltips, make source optional, better handle negative index
* Rename the NormalizeVideoLatentFrames node, for a bit better clarity about what it does
* Fix NormalizeVideoLatentStart node output on no-op
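One of the items above reduces peak VRAM by chunking the FFN; that change lives in the diffusion model code (kandinsky5.py), not in the text encoder file below. As a minimal sketch of the general technique, with an illustrative ffn module and chunk size rather than the actual implementation, the token dimension can be processed in slices so only one slice's intermediate activations are alive at a time:

import torch

def ffn_chunked(ffn: torch.nn.Module, x: torch.Tensor, chunk_size: int = 4096) -> torch.Tensor:
    # Illustrative sketch only. x has shape (batch, tokens, dim). Running the FFN over all
    # tokens at once materializes a (batch, tokens, hidden) intermediate; slicing along the
    # token dimension keeps only a (batch, chunk_size, hidden) intermediate alive at a time,
    # trading a little speed for a lower activation peak. The result is identical because the
    # FFN acts on each token independently.
    if x.shape[1] <= chunk_size:
        return ffn(x)
    return torch.cat([ffn(chunk) for chunk in x.split(chunk_size, dim=1)], dim=1)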
69 lines
4.2 KiB
Python
from comfy import sd1_clip
from .qwen_image import QwenImageTokenizer, QwenImageTEModel
from .llama import Qwen25_7BVLI

class Kandinsky5Tokenizer(QwenImageTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
        # Chat template wrapped around the user prompt for the Qwen2.5-VL text encoder (video variant).
        self.llama_template = "<|im_start|>system\nYou are a prompt engineer. Describe the video in detail.\nDescribe how the camera moves or shakes, describe the zoom and view angle, whether it follows the objects.\nDescribe the location of the video, main characters or objects and their action.\nDescribe the dynamism of the video and presented actions.\nName the visual style of the video: whether it is a professional footage, user generated content, some kind of animation, video game or screen content.\nDescribe the visual effects, postprocessing and transitions if they are presented in the video.\nPay attention to the order of key actions shown in the scene.<|im_end|>\n<|im_start|>user\n{}<|im_end|>"
        # Kandinsky 5 conditions on CLIP-L in addition to Qwen2.5-VL, so a second tokenizer is kept alongside.
        self.clip_l = sd1_clip.SDTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)

    def tokenize_with_weights(self, text: str, return_word_ids=False, **kwargs):
        out = super().tokenize_with_weights(text, return_word_ids, **kwargs)
        out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids, **kwargs)
        return out

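# Illustrative note (not part of the original file): the dict returned by
# tokenize_with_weights() is expected to hold one token stream per text model, roughly
#     {"qwen25_7b": [...], "l": [...]}
# where the first key matches the name="qwen25_7b" passed in Kandinsky5TEModel below and
# "l" is the CLIP-L stream added above; Kandinsky5TEModel.encode_token_weights() reads both.
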
class Kandinsky5TokenizerImage(Kandinsky5Tokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
        # The text-to-image variant only swaps in its own chat template; token handling is inherited unchanged.
        self.llama_template = "<|im_start|>system\nYou are a prompt engineer. Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>"

class Qwen25_7BVLIModel(sd1_clip.SDClipModel):
    def __init__(self, device="cpu", layer="hidden", layer_idx=-1, dtype=None, attention_mask=True, model_options={}):
        # te() stashes the scaled-fp8 dtype under "qwen_scaled_fp8" so it only affects this model;
        # remap it to the generic "scaled_fp8" key that SDClipModel understands.
        llama_scaled_fp8 = model_options.get("qwen_scaled_fp8", None)
        if llama_scaled_fp8 is not None:
            model_options = model_options.copy()
            model_options["scaled_fp8"] = llama_scaled_fp8
        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=Qwen25_7BVLI, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)

class Kandinsky5TEModel(QwenImageTEModel):
    def __init__(self, device="cpu", dtype=None, model_options={}):
        # Skip QwenImageTEModel.__init__ and call its parent directly, so the Qwen model is
        # registered under the "qwen25_7b" name with the Kandinsky-specific wrapper above.
        super(QwenImageTEModel, self).__init__(device=device, dtype=dtype, name="qwen25_7b", clip_model=Qwen25_7BVLIModel, model_options=model_options)
        self.clip_l = sd1_clip.SDClipModel(device=device, dtype=dtype, return_projected_pooled=False, model_options=model_options)

    def encode_token_weights(self, token_weight_pairs):
        cond, p, extra = super().encode_token_weights(token_weight_pairs, template_end=-1)
        l_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs["l"])
        # The Qwen pooled output (p) and the CLIP-L sequence (l_out) are unused: the conditioning
        # sequence comes from Qwen2.5-VL and the pooled vector from CLIP-L.
        return cond, l_pooled, extra

    def set_clip_options(self, options):
        super().set_clip_options(options)
        self.clip_l.set_clip_options(options)

    def reset_clip_options(self):
        super().reset_clip_options()
        self.clip_l.reset_clip_options()

    def load_sd(self, sd):
        # CLIP-L checkpoints are recognized by a key unique to their layout; everything else is
        # routed to the Qwen2.5-VL text encoder.
        if "text_model.encoder.layers.1.mlp.fc1.weight" in sd:
            return self.clip_l.load_sd(sd)
        else:
            return super().load_sd(sd)

def te(dtype_llama=None, llama_scaled_fp8=None):
    # Factory returning a Kandinsky5TEModel subclass with the detected text-encoder dtype and
    # scaled-fp8 setting baked in.
    class Kandinsky5TEModel_(Kandinsky5TEModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
                model_options = model_options.copy()
                model_options["qwen_scaled_fp8"] = llama_scaled_fp8
            if dtype_llama is not None:
                dtype = dtype_llama
            super().__init__(device=device, dtype=dtype, model_options=model_options)
    return Kandinsky5TEModel_
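A rough usage sketch, not part of the original file: the tokenizer and the class returned by te() are meant to be used as a pair. In ComfyUI they are wired up through supported_models.py and the CLIP loading code rather than instantiated by hand; the dtypes and prompt below are illustrative assumptions.

import torch

# Hypothetical direct use of the classes defined above.
tokenizer = Kandinsky5Tokenizer()
TEClass = te(dtype_llama=torch.bfloat16)  # bakes the Qwen2.5-VL dtype into the returned class
text_encoder = TEClass(device="cpu", dtype=torch.float16)
# Weights would still need to be loaded via text_encoder.load_sd(state_dict) before the
# outputs are meaningful.

tokens = tokenizer.tokenize_with_weights("a cat walking through tall grass")
cond, pooled, extra = text_encoder.encode_token_weights(tokens)
# cond:   Qwen2.5-VL hidden states used as the main conditioning sequence
# pooled: CLIP-L pooled output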