mirror of
https://git.datalinker.icu/kijai/ComfyUI-CogVideoXWrapper.git
synced 2025-12-09 21:04:23 +08:00
code cleanup
This commit is contained in:
parent
dccc8bdcb7
commit
5ba9b1d634
@ -1,3 +1,7 @@
|
|||||||
from .nodes import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS
|
from .nodes import NODE_CLASS_MAPPINGS as NODES_CLASS, NODE_DISPLAY_NAME_MAPPINGS as NODES_DISPLAY
|
||||||
|
from .model_loading import NODE_CLASS_MAPPINGS as MODEL_CLASS, NODE_DISPLAY_NAME_MAPPINGS as MODEL_DISPLAY
|
||||||
|
|
||||||
|
NODE_CLASS_MAPPINGS = {**NODES_CLASS, **MODEL_CLASS}
|
||||||
|
NODE_DISPLAY_NAME_MAPPINGS = {**NODES_DISPLAY, **MODEL_DISPLAY}
|
||||||
|
|
||||||
__all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"]
|
__all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"]
|
||||||
@ -378,12 +378,6 @@ class CogVideoXBlock(nn.Module):
|
|||||||
elif fastercache_counter > fastercache_start_step:
|
elif fastercache_counter > fastercache_start_step:
|
||||||
self.cached_hidden_states[-1].copy_(attn_hidden_states.to(fastercache_device))
|
self.cached_hidden_states[-1].copy_(attn_hidden_states.to(fastercache_device))
|
||||||
self.cached_encoder_hidden_states[-1].copy_(attn_encoder_hidden_states.to(fastercache_device))
|
self.cached_encoder_hidden_states[-1].copy_(attn_encoder_hidden_states.to(fastercache_device))
|
||||||
# attention
|
|
||||||
attn_hidden_states, attn_encoder_hidden_states = self.attn1(
|
|
||||||
hidden_states=norm_hidden_states,
|
|
||||||
encoder_hidden_states=norm_encoder_hidden_states,
|
|
||||||
image_rotary_emb=image_rotary_emb,
|
|
||||||
)
|
|
||||||
|
|
||||||
hidden_states = hidden_states + gate_msa * attn_hidden_states
|
hidden_states = hidden_states + gate_msa * attn_hidden_states
|
||||||
encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_encoder_hidden_states
|
encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_encoder_hidden_states
|
||||||
|
|||||||
567
model_loading.py
Normal file
567
model_loading.py
Normal file
@ -0,0 +1,567 @@
|
|||||||
|
import os
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import json
|
||||||
|
import folder_paths
|
||||||
|
import comfy.model_management as mm
|
||||||
|
|
||||||
|
from diffusers.models import AutoencoderKLCogVideoX
|
||||||
|
from diffusers.schedulers import CogVideoXDDIMScheduler
|
||||||
|
from .custom_cogvideox_transformer_3d import CogVideoXTransformer3DModel
|
||||||
|
from .pipeline_cogvideox import CogVideoXPipeline
|
||||||
|
from contextlib import nullcontext
|
||||||
|
|
||||||
|
from .cogvideox_fun.transformer_3d import CogVideoXTransformer3DModel as CogVideoXTransformer3DModelFun
|
||||||
|
from .cogvideox_fun.fun_pab_transformer_3d import CogVideoXTransformer3DModel as CogVideoXTransformer3DModelFunPAB
|
||||||
|
from .cogvideox_fun.autoencoder_magvit import AutoencoderKLCogVideoX as AutoencoderKLCogVideoXFun
|
||||||
|
|
||||||
|
from .cogvideox_fun.pipeline_cogvideox_inpaint import CogVideoX_Fun_Pipeline_Inpaint
|
||||||
|
from .cogvideox_fun.pipeline_cogvideox_control import CogVideoX_Fun_Pipeline_Control
|
||||||
|
|
||||||
|
from .videosys.cogvideox_transformer_3d import CogVideoXTransformer3DModel as CogVideoXTransformer3DModelPAB
|
||||||
|
|
||||||
|
from .utils import check_diffusers_version, remove_specific_blocks, log
|
||||||
|
from comfy.utils import load_torch_file
|
||||||
|
|
||||||
|
script_directory = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
|
class DownloadAndLoadCogVideoModel:
    """ComfyUI node that downloads (when missing) and loads a CogVideoX model.

    The node resolves a local folder for the selected Hugging Face repo,
    downloads the repo when the ``transformer`` subfolder is not present,
    and wires transformer, VAE and DDIM scheduler into the pipeline class
    that matches the model family ("Fun" inpaint/control or the base
    pipeline).
    """

    @classmethod
    def INPUT_TYPES(s):
        """Describe the node's required and optional inputs."""
        return {
            "required": {
                "model": (
                    [
                        "THUDM/CogVideoX-2b",
                        "THUDM/CogVideoX-5b",
                        "THUDM/CogVideoX-5b-I2V",
                        "bertjiazheng/KoolCogVideoX-5b",
                        "kijai/CogVideoX-Fun-2b",
                        "kijai/CogVideoX-Fun-5b",
                        "kijai/CogVideoX-5b-Tora",
                        "alibaba-pai/CogVideoX-Fun-V1.1-2b-InP",
                        "alibaba-pai/CogVideoX-Fun-V1.1-5b-InP",
                        "alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose",
                        "alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose",
                        "feizhengcong/CogvideoX-Interpolation",
                        "NimVideo/cogvideox-2b-img2vid"
                    ],
                ),

            },
            "optional": {
                "precision": (["fp16", "fp32", "bf16"],
                    {"default": "bf16", "tooltip": "official recommendation is that 2b model should be fp16, 5b model should be bf16"}
                ),
                "fp8_transformer": (['disabled', 'enabled', 'fastmode'], {"default": 'disabled', "tooltip": "enabled casts the transformer to torch.float8_e4m3fn, fastmode is only for latest nvidia GPUs and requires torch 2.4.0 and cu124 minimum"}),
                "compile": (["disabled","onediff","torch"], {"tooltip": "compile the model for faster inference, these are advanced options only available on Linux, see readme for more info"}),
                "enable_sequential_cpu_offload": ("BOOLEAN", {"default": False, "tooltip": "significantly reducing memory usage and slows down the inference"}),
                "pab_config": ("PAB_CONFIG", {"default": None}),
                "block_edit": ("TRANSFORMERBLOCKS", {"default": None}),
                "lora": ("COGLORA", {"default": None}),
            }
        }

    RETURN_TYPES = ("COGVIDEOPIPE",)
    RETURN_NAMES = ("cogvideo_pipe", )
    FUNCTION = "loadmodel"
    CATEGORY = "CogVideoWrapper"
    DESCRIPTION = "Downloads and loads the selected CogVideo model from Huggingface to 'ComfyUI/models/CogVideo'"

    def loadmodel(self, model, precision="bf16", fp8_transformer="disabled", compile="disabled", enable_sequential_cpu_offload=False, pab_config=None, block_edit=None, lora=None):
        """Load the selected model and return it wrapped in a pipeline dict.

        Args:
            model: Hugging Face repo id, one of the ``INPUT_TYPES`` choices.
            precision: "bf16", "fp16" or "fp32"; dtype for transformer and VAE.
                Defaults to "bf16" to match the optional input declaration.
            fp8_transformer: "disabled", "enabled" (cast weights to
                ``torch.float8_e4m3fn``) or "fastmode" (additionally run
                ``convert_fp8_linear``).
            compile: "disabled", "torch" or "onediff".
            enable_sequential_cpu_offload: call
                ``pipe.enable_sequential_cpu_offload()`` when true.
            pab_config: when not ``None`` the PAB transformer variant is
                used and the config is forwarded to the pipeline.
            block_edit: forwarded to ``remove_specific_blocks`` when given.
            lora: LoRA description(s). For "Fun" models each entry is read
                as a mapping with ``"path"`` and ``"strength"``; otherwise
                the value is handed to ``load_lora_into_transformer``.

        Returns:
            A 1-tuple holding a dict with the keys ``pipe``, ``dtype``,
            ``base_path``, ``onediff``, ``cpu_offloading``,
            ``scheduler_config`` and ``model_name``.
        """
        check_diffusers_version()

        offload_device = mm.unet_offload_device()
        mm.soft_empty_cache()

        dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision]
        download_path = folder_paths.get_folder_paths("CogVideo")[0]
        repo_folder = model.split("/")[-1]

        if "Fun" in model:
            fun_root = os.path.join(folder_paths.models_dir, "CogVideoX_Fun")
            if "1.1" not in model:
                # The pruned repo carries one subfolder per model size, so it
                # is downloaded into the shared CogVideo folder.
                repo_id = "kijai/CogVideoX-Fun-pruned"
                # Anything that is not "2b" is treated as 5b, mirroring the
                # scheduler-config selection further down.
                fun_folder = "CogVideoX-Fun-2b-InP" if "2b" in model else "CogVideoX-Fun-5b-InP"
                base_path = os.path.join(fun_root, fun_folder) # location of the official model
                if not os.path.exists(base_path):
                    base_path = os.path.join(download_path, fun_folder)
            else:
                repo_id = model
                base_path = os.path.join(fun_root, repo_folder) # location of the official model
                if not os.path.exists(base_path):
                    base_path = os.path.join(download_path, repo_folder)
                download_path = base_path
        else:
            if "2b" in model:
                target_folder = "cogvideox-2b-img2vid" if 'img2vid' in model else "CogVideo2B"
            else:
                target_folder = repo_folder
            base_path = os.path.join(download_path, target_folder)
            download_path = base_path
            repo_id = model

        if "2b" in model:
            scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_2b.json')
        else:
            scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json')

        if not os.path.exists(base_path) or not os.path.exists(os.path.join(base_path, "transformer")):
            log.info(f"Downloading model to: {base_path}")
            from huggingface_hub import snapshot_download

            snapshot_download(
                repo_id=repo_id,
                ignore_patterns=["*text_encoder*", "*tokenizer*"],
                local_dir=download_path,
                local_dir_use_symlinks=False,
            )

        # transformer
        use_pab = pab_config is not None
        if "Fun" in model:
            transformer_cls = CogVideoXTransformer3DModelFunPAB if use_pab else CogVideoXTransformer3DModelFun
        else:
            transformer_cls = CogVideoXTransformer3DModelPAB if use_pab else CogVideoXTransformer3DModel
        transformer = transformer_cls.from_pretrained(base_path, subfolder="transformer")

        transformer = transformer.to(dtype).to(offload_device)

        #LoRAs
        if lora is not None:
            from .lora_utils import merge_lora, load_lora_into_transformer
            if "fun" in model.lower():
                for lora_entry in lora:
                    log.info(f"Merging LoRA weights from {lora_entry['path']} with strength {lora_entry['strength']}")
                    transformer = merge_lora(transformer, lora_entry["path"], lora_entry["strength"])
            else:
                transformer = load_lora_into_transformer(lora, transformer)

        if block_edit is not None:
            transformer = remove_specific_blocks(transformer, block_edit)

        #fp8
        if fp8_transformer == "enabled" or fp8_transformer == "fastmode":
            # Parameters whose name contains one of these keywords keep
            # their current dtype.
            params_to_keep = {"patch_embed", "lora", "pos_embedding"}
            for name, param in transformer.named_parameters():
                if not any(keyword in name for keyword in params_to_keep):
                    param.data = param.data.to(torch.float8_e4m3fn)

            if fp8_transformer == "fastmode":
                from .fp8_optimization import convert_fp8_linear
                convert_fp8_linear(transformer, dtype)

        with open(scheduler_path) as f:
            scheduler_config = json.load(f)
        scheduler = CogVideoXDDIMScheduler.from_config(scheduler_config)

        # VAE
        if "Fun" in model:
            vae = AutoencoderKLCogVideoXFun.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device)
            if "Pose" in model:
                pipe = CogVideoX_Fun_Pipeline_Control(vae, transformer, scheduler, pab_config=pab_config)
            else:
                pipe = CogVideoX_Fun_Pipeline_Inpaint(vae, transformer, scheduler, pab_config=pab_config)
        else:
            vae = AutoencoderKLCogVideoX.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device)
            pipe = CogVideoXPipeline(vae, transformer, scheduler, pab_config=pab_config)
            if "cogvideox-2b-img2vid" in model:
                pipe.input_with_padding = False

        if enable_sequential_cpu_offload:
            pipe.enable_sequential_cpu_offload()

        # compilation
        if compile == "torch":
            torch._dynamo.config.suppress_errors = True
            pipe.transformer.to(memory_format=torch.channels_last)
            # Blocks are compiled one by one instead of compiling the whole
            # transformer.
            for i, block in enumerate(pipe.transformer.transformer_blocks):
                if "CogVideoXBlock" in str(block):
                    pipe.transformer.transformer_blocks[i] = torch.compile(block, fullgraph=False, dynamic=False, backend="inductor")
        elif compile == "onediff":
            from onediffx import compile_pipe
            os.environ['NEXFORT_FX_FORCE_TRITON_SDPA'] = '1'

            pipe = compile_pipe(
                pipe,
                backend="nexfort",
                options= {"mode": "max-optimize:max-autotune:max-autotune", "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": False, "triton.fuse_attention_allow_fp16_reduction": False}},
                ignores=["vae"],
                fuse_qkv_projections=pab_config is None,
            )

        pipeline = {
            "pipe": pipe,
            "dtype": dtype,
            "base_path": base_path,
            "onediff": compile == "onediff",
            "cpu_offloading": enable_sequential_cpu_offload,
            "scheduler_config": scheduler_config,
            "model_name": model
        }

        return (pipeline,)
|
||||||
|
|
||||||
|
class DownloadAndLoadCogVideoGGUFModel:
    """ComfyUI node that downloads (when missing) and loads a GGUF CogVideoX model.

    The transformer is built from a bundled JSON config, cast to fp8, and
    filled from the quantized state dict; the VAE comes from a separate
    ``cogvideox_vae.safetensors`` file.
    """

    @classmethod
    def INPUT_TYPES(s):
        """Describe the node's required and optional inputs."""
        return {
            "required": {
                "model": (
                    [
                        "CogVideoX_5b_GGUF_Q4_0.safetensors",
                        "CogVideoX_5b_I2V_GGUF_Q4_0.safetensors",
                        "CogVideoX_5b_fun_GGUF_Q4_0.safetensors",
                        "CogVideoX_5b_fun_1_1_GGUF_Q4_0.safetensors",
                        "CogVideoX_5b_fun_1_1_Pose_GGUF_Q4_0.safetensors",
                        "CogVideoX_5b_Interpolation_GGUF_Q4_0.safetensors",
                        "CogVideoX_5b_Tora_GGUF_Q4_0.safetensors",

                    ],
                ),
                "vae_precision": (["fp16", "fp32", "bf16"], {"default": "bf16", "tooltip": "VAE dtype"}),
                "fp8_fastmode": ("BOOLEAN", {"default": False, "tooltip": "only supported on 4090 and later GPUs, also requires torch 2.4.0 with cu124 minimum"}),
                "load_device": (["main_device", "offload_device"], {"default": "main_device"}),
                "enable_sequential_cpu_offload": ("BOOLEAN", {"default": False, "tooltip": "significantly reducing memory usage and slows down the inference"}),
            },
            "optional": {
                "pab_config": ("PAB_CONFIG", {"default": None}),
                "block_edit": ("TRANSFORMERBLOCKS", {"default": None}),
                "compile": (["disabled","torch"], {"tooltip": "compile the model for faster inference, these are advanced options only available on Linux, see readme for more info"}),

            }
        }

    RETURN_TYPES = ("COGVIDEOPIPE",)
    RETURN_NAMES = ("cogvideo_pipe", )
    FUNCTION = "loadmodel"
    CATEGORY = "CogVideoWrapper"

    def loadmodel(self, model, vae_precision, fp8_fastmode, load_device, enable_sequential_cpu_offload, pab_config=None, block_edit=None, compile="disabled"):
        """Load the selected GGUF model and return it wrapped in a pipeline dict.

        Args:
            model: file name of the GGUF checkpoint (an ``INPUT_TYPES`` choice).
            vae_precision: "bf16", "fp16" or "fp32"; dtype of the VAE.
            fp8_fastmode: run ``convert_fp8_linear`` on the transformer.
            load_device: "offload_device" moves the transformer to the
                offload device, any other value to the main torch device.
            enable_sequential_cpu_offload: call
                ``pipe.enable_sequential_cpu_offload()`` when true.
            pab_config: when not ``None`` the PAB transformer variant is
                used and the config is forwarded to the pipeline.
            block_edit: forwarded to ``remove_specific_blocks`` when given.
            compile: "torch" compiles every transformer block.

        Returns:
            A 1-tuple holding a dict with the keys ``pipe``, ``dtype``,
            ``base_path``, ``onediff``, ``cpu_offloading``,
            ``scheduler_config`` and ``model_name``.

        Raises:
            ValueError: if ``model`` contains neither "5b" nor "2b", so no
                bundled config can be selected.
        """
        check_diffusers_version()

        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
        mm.soft_empty_cache()

        vae_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[vae_precision]
        download_path = os.path.join(folder_paths.models_dir, 'CogVideo', 'GGUF')
        gguf_path = os.path.join(folder_paths.models_dir, 'diffusion_models', model) # check MinusZone's model path first
        if not os.path.exists(gguf_path):
            gguf_path = os.path.join(download_path, model)
            if not os.path.exists(gguf_path):
                if "I2V" in model or "1_1" in model or "Interpolation" in model or "Tora" in model:
                    repo_id = "Kijai/CogVideoX_GGUF"
                else:
                    repo_id = "MinusZoneAI/ComfyUI-CogVideoX-MZ"
                log.info(f"Downloading model to: {gguf_path}")
                from huggingface_hub import snapshot_download

                snapshot_download(
                    repo_id=repo_id,
                    allow_patterns=[f"*{model}*"],
                    local_dir=download_path,
                    local_dir_use_symlinks=False,
                )

        config_dir = os.path.join(script_directory, 'configs')
        if "5b" in model:
            scheduler_path = os.path.join(config_dir, 'scheduler_config_5b.json')
            transformer_path = os.path.join(config_dir, 'transformer_config_5b.json')
        elif "2b" in model:
            scheduler_path = os.path.join(config_dir, 'scheduler_config_2b.json')
            transformer_path = os.path.join(config_dir, 'transformer_config_2b.json')
        else:
            # Fail with a clear message instead of an unbound-variable error.
            raise ValueError(f"Cannot select a config for '{model}': expected '5b' or '2b' in the file name.")

        with open(transformer_path) as f:
            transformer_config = json.load(f)

        sd = load_torch_file(gguf_path)

        from . import mz_gguf_loader
        import importlib
        importlib.reload(mz_gguf_loader)

        use_pab = pab_config is not None
        with mz_gguf_loader.quantize_lazy_load():
            # The bundled config is shared between variants; only the input
            # channel count and the transformer class differ.
            if "fun" in model:
                transformer_config["in_channels"] = 32 if "Pose" in model else 33
                transformer_cls = CogVideoXTransformer3DModelFunPAB if use_pab else CogVideoXTransformer3DModelFun
            else:
                is_image_conditioned = "I2V" in model or "Interpolation" in model
                transformer_config["in_channels"] = 32 if is_image_conditioned else 16
                transformer_cls = CogVideoXTransformer3DModelPAB if use_pab else CogVideoXTransformer3DModel
            transformer = transformer_cls.from_config(transformer_config)

            if "2b" in model:
                for name, param in transformer.named_parameters():
                    if name != "pos_embedding":
                        param.data = param.data.to(torch.float8_e4m3fn)
                    else:
                        param.data = param.data.to(torch.float16)
            else:
                transformer.to(torch.float8_e4m3fn)

            if block_edit is not None:
                transformer = remove_specific_blocks(transformer, block_edit)

            transformer = mz_gguf_loader.quantize_load_state_dict(transformer, sd, device="cpu")
            if load_device == "offload_device":
                transformer.to(offload_device)
            else:
                transformer.to(device)

        if fp8_fastmode:
            from .fp8_optimization import convert_fp8_linear
            convert_fp8_linear(transformer, vae_dtype)

        if compile == "torch":
            # compilation
            for i, block in enumerate(transformer.transformer_blocks):
                transformer.transformer_blocks[i] = torch.compile(block, fullgraph=False, dynamic=False, backend="inductor")
        with open(scheduler_path) as f:
            scheduler_config = json.load(f)

        scheduler = CogVideoXDDIMScheduler.from_config(scheduler_config, subfolder="scheduler")

        # VAE
        vae_dl_path = os.path.join(folder_paths.models_dir, 'CogVideo', 'VAE')
        vae_path = os.path.join(vae_dl_path, "cogvideox_vae.safetensors")
        if not os.path.exists(vae_path):
            log.info(f"Downloading VAE model to: {vae_path}")
            from huggingface_hub import snapshot_download

            snapshot_download(
                repo_id="Kijai/CogVideoX-Fun-pruned",
                allow_patterns=["*cogvideox_vae.safetensors*"],
                local_dir=vae_dl_path,
                local_dir_use_symlinks=False,
            )
        with open(os.path.join(script_directory, 'configs', 'vae_config.json')) as f:
            vae_config = json.load(f)

        vae_sd = load_torch_file(vae_path)
        vae_cls = AutoencoderKLCogVideoXFun if "fun" in model else AutoencoderKLCogVideoX
        vae = vae_cls.from_config(vae_config).to(vae_dtype).to(offload_device)
        vae.load_state_dict(vae_sd)

        if "fun" in model:
            if "Pose" in model:
                pipe = CogVideoX_Fun_Pipeline_Control(vae, transformer, scheduler, pab_config=pab_config)
            else:
                pipe = CogVideoX_Fun_Pipeline_Inpaint(vae, transformer, scheduler, pab_config=pab_config)
        else:
            pipe = CogVideoXPipeline(vae, transformer, scheduler, pab_config=pab_config)

        if enable_sequential_cpu_offload:
            pipe.enable_sequential_cpu_offload()

        pipeline = {
            "pipe": pipe,
            "dtype": vae_dtype,
            "base_path": model,
            "onediff": False,
            "cpu_offloading": enable_sequential_cpu_offload,
            "scheduler_config": scheduler_config,
            "model_name": model
        }

        return (pipeline,)
|
||||||
|
|
||||||
|
class DownloadAndLoadToraModel:
    """ComfyUI node that downloads (when missing) and loads the Tora add-on weights.

    Two modules are produced: a list of ``MGF`` fuser layers and a
    ``TrajExtractor``; both are moved to the main torch device.
    """

    @classmethod
    def INPUT_TYPES(s):
        """Describe the node's single required input."""
        return {
            "required": {
                "model": (
                    [
                        "kijai/CogVideoX-5b-Tora",
                    ],
                ),
            },
        }

    RETURN_TYPES = ("TORAMODEL",)
    RETURN_NAMES = ("tora_model", )
    FUNCTION = "loadmodel"
    CATEGORY = "CogVideoWrapper"
    DESCRIPTION = "Downloads and loads the the Tora model from Huggingface to 'ComfyUI/models/CogVideo/CogVideoX-5b-Tora'"

    def loadmodel(self, model):
        """Load the fuser list and trajectory extractor for ``model``.

        Args:
            model: Hugging Face repo id to download missing files from.

        Returns:
            A 1-tuple holding a dict with the keys ``fuser_list`` and
            ``traj_extractor``.
        """
        check_diffusers_version()

        device = mm.get_torch_device()
        mm.soft_empty_cache()

        from .tora.traj_module import MGF, TrajExtractor

        # accelerate is optional: with it, modules are created without
        # allocating weights and filled tensor by tensor; without it, a
        # regular load_state_dict is used.
        try:
            from accelerate import init_empty_weights
            from accelerate.utils import set_module_tensor_to_device
            is_accelerate_available = True
        except ImportError:
            is_accelerate_available = False

        download_path = os.path.join(folder_paths.models_dir, 'CogVideo', "CogVideoX-5b-Tora")
        fuser_path = os.path.join(download_path, "fuser", "fuser.safetensors")
        if not os.path.exists(fuser_path):
            log.info(f"Downloading Fuser model to: {fuser_path}")
            from huggingface_hub import snapshot_download

            snapshot_download(
                repo_id=model,
                allow_patterns=["*fuser.safetensors*"],
                local_dir=download_path,
                local_dir_use_symlinks=False,
            )

        hidden_size = 3072
        num_layers = 42

        with (init_empty_weights() if is_accelerate_available else nullcontext()):
            fuser_list = nn.ModuleList([MGF(128, hidden_size) for _ in range(num_layers)])

        fuser_sd = load_torch_file(fuser_path)
        # NOTE(review): the accelerate path materializes the fuser in
        # float16 while the fallback path casts to bfloat16 -- confirm which
        # dtype is intended; behavior is left unchanged here.
        if is_accelerate_available:
            for key in fuser_sd:
                set_module_tensor_to_device(fuser_list, key, dtype=torch.float16, device=device, value=fuser_sd[key])
        else:
            fuser_list.load_state_dict(fuser_sd)
            for module in fuser_list:
                for param in module.parameters():
                    param.data = param.data.to(torch.bfloat16).to(device)
        del fuser_sd

        traj_extractor_path = os.path.join(download_path, "traj_extractor", "traj_extractor.safetensors")
        if not os.path.exists(traj_extractor_path):
            log.info(f"Downloading trajectory extractor model to: {traj_extractor_path}")
            from huggingface_hub import snapshot_download

            # Use the selected repo, matching the fuser download above.
            snapshot_download(
                repo_id=model,
                allow_patterns=["*traj_extractor.safetensors*"],
                local_dir=download_path,
                local_dir_use_symlinks=False,
            )

        with (init_empty_weights() if is_accelerate_available else nullcontext()):
            traj_extractor = TrajExtractor(
                vae_downsize=(4, 8, 8),
                patch_size=2,
                nums_rb=2,
                cin=16,
                channels=[128] * 42,
                sk=True,
                use_conv=False,
            )

        traj_sd = load_torch_file(traj_extractor_path)
        if is_accelerate_available:
            for key in traj_sd:
                set_module_tensor_to_device(traj_extractor, key, dtype=torch.float32, device=device, value=traj_sd[key])
        else:
            traj_extractor.load_state_dict(traj_sd)
            traj_extractor.to(torch.float32).to(device)

        toramodel = {
            "fuser_list": fuser_list,
            "traj_extractor": traj_extractor,
        }

        return (toramodel,)
|
||||||
|
|
||||||
|
class DownloadAndLoadCogVideoControlNet:
    """ComfyUI node that downloads (when missing) and loads a CogVideoX ControlNet."""

    @classmethod
    def INPUT_TYPES(s):
        """Describe the node's single required input."""
        return {
            "required": {
                "model": (
                    [
                        "TheDenk/cogvideox-2b-controlnet-hed-v1",
                        "TheDenk/cogvideox-2b-controlnet-canny-v1",
                        "TheDenk/cogvideox-5b-controlnet-hed-v1",
                        "TheDenk/cogvideox-5b-controlnet-canny-v1"
                    ],
                ),

            },
        }

    RETURN_TYPES = ("COGVIDECONTROLNETMODEL",)
    RETURN_NAMES = ("cogvideo_controlnet", )
    FUNCTION = "loadmodel"
    CATEGORY = "CogVideoWrapper"

    def loadmodel(self, model):
        """Load the ControlNet for ``model``, downloading the repo if needed.

        Args:
            model: Hugging Face repo id; its last path component names the
                local folder under ``models/CogVideo/ControlNet``.

        Returns:
            A 1-tuple holding the loaded ``CogVideoXControlnet``.
        """
        from .cogvideo_controlnet import CogVideoXControlnet

        mm.soft_empty_cache()

        download_path = os.path.join(folder_paths.models_dir, 'CogVideo', 'ControlNet')
        base_path = os.path.join(download_path, (model.split("/")[-1]))

        if not os.path.exists(base_path):
            log.info(f"Downloading model to: {base_path}")
            from huggingface_hub import snapshot_download

            snapshot_download(
                repo_id=model,
                ignore_patterns=["*text_encoder*", "*tokenizer*"],
                local_dir=base_path,
                local_dir_use_symlinks=False,
            )

        controlnet = CogVideoXControlnet.from_pretrained(base_path)

        return (controlnet,)
|
||||||
|
|
||||||
|
# Node registry for this module: node identifier -> implementing class.
NODE_CLASS_MAPPINGS = dict(
    DownloadAndLoadCogVideoModel=DownloadAndLoadCogVideoModel,
    DownloadAndLoadCogVideoGGUFModel=DownloadAndLoadCogVideoGGUFModel,
    DownloadAndLoadCogVideoControlNet=DownloadAndLoadCogVideoControlNet,
    DownloadAndLoadToraModel=DownloadAndLoadToraModel,
)
# Display names, keyed by the same node identifiers as above.
NODE_DISPLAY_NAME_MAPPINGS = dict(
    DownloadAndLoadCogVideoModel="(Down)load CogVideo Model",
    DownloadAndLoadCogVideoGGUFModel="(Down)load CogVideo GGUF Model",
    DownloadAndLoadCogVideoControlNet="(Down)load CogVideo ControlNet",
    DownloadAndLoadToraModel="(Down)load Tora Model",
)
|
||||||
646
nodes.py
646
nodes.py
@ -1,20 +1,9 @@
|
|||||||
import os
|
import os
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
|
||||||
import folder_paths
|
import folder_paths
|
||||||
import comfy.model_management as mm
|
import comfy.model_management as mm
|
||||||
from comfy.utils import ProgressBar, load_torch_file
|
|
||||||
from einops import rearrange
|
from einops import rearrange
|
||||||
import importlib.metadata
|
from contextlib import nullcontext
|
||||||
|
|
||||||
def check_diffusers_version():
    """Ensure diffusers is installed and at least version 0.30.3.

    Versions are compared numerically, component by component. Comparing
    the raw strings would rank "0.100.0" below "0.30.3" and "0.9.0" above
    it.

    Raises:
        AssertionError: if diffusers is missing or older than required.
    """
    import re

    required_version = '0.30.3'

    def release_tuple(text):
        # Leading numeric release segment only: "0.31.0.dev0" -> (0, 31, 0).
        match = re.match(r"\d+(?:\.\d+)*", text)
        if match is None:
            return ()
        return tuple(int(part) for part in match.group(0).split("."))

    try:
        version = importlib.metadata.version('diffusers')
    except importlib.metadata.PackageNotFoundError:
        raise AssertionError("diffusers is not installed.") from None

    if release_tuple(version) < release_tuple(required_version):
        raise AssertionError(f"diffusers version {version} is installed, but version {required_version} or higher is required.")
|
|
||||||
|
|
||||||
from diffusers.schedulers import (
|
from diffusers.schedulers import (
|
||||||
CogVideoXDDIMScheduler,
|
CogVideoXDDIMScheduler,
|
||||||
@ -47,26 +36,13 @@ scheduler_mapping = {
|
|||||||
}
|
}
|
||||||
available_schedulers = list(scheduler_mapping.keys())
|
available_schedulers = list(scheduler_mapping.keys())
|
||||||
|
|
||||||
|
|
||||||
from diffusers.models import AutoencoderKLCogVideoX
|
|
||||||
from .custom_cogvideox_transformer_3d import CogVideoXTransformer3DModel
|
|
||||||
from .pipeline_cogvideox import CogVideoXPipeline
|
|
||||||
from contextlib import nullcontext
|
|
||||||
|
|
||||||
from .cogvideox_fun.transformer_3d import CogVideoXTransformer3DModel as CogVideoXTransformer3DModelFun
|
|
||||||
from .cogvideox_fun.fun_pab_transformer_3d import CogVideoXTransformer3DModel as CogVideoXTransformer3DModelFunPAB
|
|
||||||
from .cogvideox_fun.autoencoder_magvit import AutoencoderKLCogVideoX as AutoencoderKLCogVideoXFun
|
|
||||||
from .cogvideox_fun.utils import get_image_to_video_latent, get_video_to_video_latent, ASPECT_RATIO_512, get_closest_ratio, to_pil
|
from .cogvideox_fun.utils import get_image_to_video_latent, get_video_to_video_latent, ASPECT_RATIO_512, get_closest_ratio, to_pil
|
||||||
from .cogvideox_fun.pipeline_cogvideox_inpaint import CogVideoX_Fun_Pipeline_Inpaint
|
|
||||||
from .cogvideox_fun.pipeline_cogvideox_control import CogVideoX_Fun_Pipeline_Control
|
|
||||||
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import json
|
import json
|
||||||
|
|
||||||
import logging
|
from .utils import log, check_diffusers_version
|
||||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
||||||
log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
script_directory = os.path.dirname(os.path.abspath(__file__))
|
script_directory = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
@ -75,72 +51,8 @@ if not "CogVideo" in folder_paths.folder_names_and_paths:
|
|||||||
if not "cogvideox_loras" in folder_paths.folder_names_and_paths:
|
if not "cogvideox_loras" in folder_paths.folder_names_and_paths:
|
||||||
folder_paths.add_model_folder_path("cogvideox_loras", os.path.join(folder_paths.models_dir, "CogVideo", "loras"))
|
folder_paths.add_model_folder_path("cogvideox_loras", os.path.join(folder_paths.models_dir, "CogVideo", "loras"))
|
||||||
|
|
||||||
class PABConfig:
|
#PAB
|
||||||
def __init__(
|
from .videosys.pab import CogVideoXPABConfig
|
||||||
self,
|
|
||||||
steps: int,
|
|
||||||
cross_broadcast: bool = False,
|
|
||||||
cross_threshold: list = None,
|
|
||||||
cross_range: int = None,
|
|
||||||
spatial_broadcast: bool = False,
|
|
||||||
spatial_threshold: list = None,
|
|
||||||
spatial_range: int = None,
|
|
||||||
temporal_broadcast: bool = False,
|
|
||||||
temporal_threshold: list = None,
|
|
||||||
temporal_range: int = None,
|
|
||||||
mlp_broadcast: bool = False,
|
|
||||||
mlp_spatial_broadcast_config: dict = None,
|
|
||||||
mlp_temporal_broadcast_config: dict = None,
|
|
||||||
):
|
|
||||||
self.steps = steps
|
|
||||||
|
|
||||||
self.cross_broadcast = cross_broadcast
|
|
||||||
self.cross_threshold = cross_threshold
|
|
||||||
self.cross_range = cross_range
|
|
||||||
|
|
||||||
self.spatial_broadcast = spatial_broadcast
|
|
||||||
self.spatial_threshold = spatial_threshold
|
|
||||||
self.spatial_range = spatial_range
|
|
||||||
|
|
||||||
self.temporal_broadcast = temporal_broadcast
|
|
||||||
self.temporal_threshold = temporal_threshold
|
|
||||||
self.temporal_range = temporal_range
|
|
||||||
|
|
||||||
self.mlp_broadcast = mlp_broadcast
|
|
||||||
self.mlp_spatial_broadcast_config = mlp_spatial_broadcast_config
|
|
||||||
self.mlp_temporal_broadcast_config = mlp_temporal_broadcast_config
|
|
||||||
self.mlp_temporal_outputs = {}
|
|
||||||
self.mlp_spatial_outputs = {}
|
|
||||||
|
|
||||||
class CogVideoXPABConfig(PABConfig):
    """`PABConfig` preset carrying the default values used for CogVideoX.

    Only the spatial, temporal and cross broadcast settings are forwarded to
    `PABConfig`; the MLP broadcast options keep the base-class defaults.

    Every ``*_threshold`` argument defaults to ``[100, 850]``. The default is
    spelled ``None`` in the signature and expanded to a fresh list for each
    instance, so instances never share one mutable default list.
    """

    # Value used for any ``*_threshold`` argument that is left as ``None``.
    _DEFAULT_THRESHOLD = (100, 850)

    def __init__(
        self,
        steps: int = 50,
        spatial_broadcast: bool = True,
        spatial_threshold: list = None,
        spatial_range: int = 2,
        temporal_broadcast: bool = False,
        temporal_threshold: list = None,
        temporal_range: int = 4,
        cross_broadcast: bool = False,
        cross_threshold: list = None,
        cross_range: int = 6,
    ):
        def threshold_or_default(threshold):
            # A new list per call keeps instances independent of each other.
            if threshold is None:
                return list(self._DEFAULT_THRESHOLD)
            return threshold

        super().__init__(
            steps=steps,
            spatial_broadcast=spatial_broadcast,
            spatial_threshold=threshold_or_default(spatial_threshold),
            spatial_range=spatial_range,
            temporal_broadcast=temporal_broadcast,
            temporal_threshold=threshold_or_default(temporal_threshold),
            temporal_range=temporal_range,
            cross_broadcast=cross_broadcast,
            cross_threshold=threshold_or_default(cross_threshold),
            cross_range=cross_range,
        )
|
|
||||||
|
|
||||||
from .videosys.cogvideox_transformer_3d import CogVideoXTransformer3DModel as CogVideoXTransformer3DModelPAB
|
|
||||||
|
|
||||||
class CogVideoPABConfig:
|
class CogVideoPABConfig:
|
||||||
@classmethod
|
@classmethod
|
||||||
@ -189,13 +101,7 @@ class CogVideoPABConfig:
|
|||||||
|
|
||||||
return (pab_config, )
|
return (pab_config, )
|
||||||
|
|
||||||
def remove_specific_blocks(model, block_indices_to_remove):
    """Drop the transformer blocks at the given positions from ``model``.

    Args:
        model: Object exposing an iterable ``transformer_blocks`` attribute.
            It is modified in place.
        block_indices_to_remove: Iterable of zero-based block positions to
            drop. Positions that do not exist are ignored.

    Returns:
        The same ``model`` object; its ``transformer_blocks`` attribute is
        replaced by a new ``nn.ModuleList`` holding the remaining blocks in
        their original order.
    """
    import torch.nn as nn

    # Build the lookup once so each block costs an O(1) membership test.
    indices = set(block_indices_to_remove)
    kept_blocks = [
        block
        for index, block in enumerate(model.transformer_blocks)
        if index not in indices
    ]
    model.transformer_blocks = nn.ModuleList(kept_blocks)

    return model
|
|
||||||
|
|
||||||
class CogVideoTransformerEdit:
|
class CogVideoTransformerEdit:
|
||||||
@classmethod
|
@classmethod
|
||||||
@ -250,534 +156,6 @@ class CogVideoLoraSelect:
|
|||||||
print(cog_loras_list)
|
print(cog_loras_list)
|
||||||
return (cog_loras_list,)
|
return (cog_loras_list,)
|
||||||
|
|
||||||
class DownloadAndLoadCogVideoModel:
    """ComfyUI node that downloads (when missing) and loads a CogVideoX pipeline.

    NOTE(review): the nesting of the statements in ``loadmodel`` was
    reconstructed from a listing that had lost its indentation; the places
    where the nesting changes behavior are marked below and should be
    confirmed against the repository.
    """

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "model": (
                    [
                        "THUDM/CogVideoX-2b",
                        "THUDM/CogVideoX-5b",
                        "THUDM/CogVideoX-5b-I2V",
                        "bertjiazheng/KoolCogVideoX-5b",
                        "kijai/CogVideoX-Fun-2b",
                        "kijai/CogVideoX-Fun-5b",
                        "kijai/CogVideoX-5b-Tora",
                        "alibaba-pai/CogVideoX-Fun-V1.1-2b-InP",
                        "alibaba-pai/CogVideoX-Fun-V1.1-5b-InP",
                        "alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose",
                        "alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose",
                        "feizhengcong/CogvideoX-Interpolation",
                        "NimVideo/cogvideox-2b-img2vid"
                    ],
                ),

            },
            "optional": {
                "precision": (["fp16", "fp32", "bf16"],
                    {"default": "bf16", "tooltip": "official recommendation is that 2b model should be fp16, 5b model should be bf16"}
                ),
                "fp8_transformer": (['disabled', 'enabled', 'fastmode'], {"default": 'disabled', "tooltip": "enabled casts the transformer to torch.float8_e4m3fn, fastmode is only for latest nvidia GPUs and requires torch 2.4.0 and cu124 minimum"}),
                "compile": (["disabled","onediff","torch"], {"tooltip": "compile the model for faster inference, these are advanced options only available on Linux, see readme for more info"}),
                "enable_sequential_cpu_offload": ("BOOLEAN", {"default": False, "tooltip": "significantly reducing memory usage and slows down the inference"}),
                "pab_config": ("PAB_CONFIG", {"default": None}),
                "block_edit": ("TRANSFORMERBLOCKS", {"default": None}),
                "lora": ("COGLORA", {"default": None}),
            }
        }

    RETURN_TYPES = ("COGVIDEOPIPE",)
    RETURN_NAMES = ("cogvideo_pipe", )
    FUNCTION = "loadmodel"
    CATEGORY = "CogVideoWrapper"
    DESCRIPTION = "Downloads and loads the selected CogVideo model from Huggingface to 'ComfyUI/models/CogVideo'"

    def loadmodel(self, model, precision="bf16", fp8_transformer="disabled", compile="disabled", enable_sequential_cpu_offload=False, pab_config=None, block_edit=None, lora=None):
        """Download the selected model if needed and assemble its pipeline.

        Args:
            model: Hugging Face repo id chosen from ``INPUT_TYPES``.
            precision: "fp16", "fp32" or "bf16"; dtype of transformer and VAE.
                Defaults to "bf16", matching the default declared in
                ``INPUT_TYPES`` (the input is optional there).
            fp8_transformer: "disabled", "enabled" or "fastmode".
            compile: "disabled", "torch" or "onediff".
            enable_sequential_cpu_offload: Call
                ``pipe.enable_sequential_cpu_offload()`` when true.
            pab_config: Optional PAB config; selects the PAB transformer
                classes and is passed on to the pipeline.
            block_edit: Optional indices handed to ``remove_specific_blocks``.
            lora: Optional LoRA list; entries are read via ``"path"`` and
                ``"strength"`` for Fun models.

        Returns:
            A 1-tuple holding a dict with the keys ``pipe``, ``dtype``,
            ``base_path``, ``onediff``, ``cpu_offloading``,
            ``scheduler_config`` and ``model_name``.
        """
        check_diffusers_version()

        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
        mm.soft_empty_cache()

        dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision]
        download_path = folder_paths.get_folder_paths("CogVideo")[0]

        if "Fun" in model:
            if "1.1" not in model:
                # download_path is left at the shared CogVideo folder here.
                repo_id = "kijai/CogVideoX-Fun-pruned"
                if "2b" in model:
                    base_path = os.path.join(folder_paths.models_dir, "CogVideoX_Fun", "CogVideoX-Fun-2b-InP") # location of the official model
                    if not os.path.exists(base_path):
                        base_path = os.path.join(download_path, "CogVideoX-Fun-2b-InP")
                elif "5b" in model:
                    base_path = os.path.join(folder_paths.models_dir, "CogVideoX_Fun", "CogVideoX-Fun-5b-InP") # location of the official model
                    if not os.path.exists(base_path):
                        base_path = os.path.join(download_path, "CogVideoX-Fun-5b-InP")
            else:
                repo_id = model
                base_path = os.path.join(folder_paths.models_dir, "CogVideoX_Fun", (model.split("/")[-1])) # location of the official model
                if not os.path.exists(base_path):
                    base_path = os.path.join(download_path, (model.split("/")[-1]))
                # NOTE(review): assumed to belong to the 1.1 branch only - confirm.
                download_path = base_path

        elif "2b" in model:
            if 'img2vid' in model:
                base_path = os.path.join(download_path, "cogvideox-2b-img2vid")
            else:
                base_path = os.path.join(download_path, "CogVideo2B")
            download_path = base_path
            repo_id = model
        else:
            base_path = os.path.join(download_path, (model.split("/")[-1]))
            download_path = base_path
            repo_id = model

        if "2b" in model:
            scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_2b.json')
        else:
            scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json')

        # A missing "transformer" subfolder is treated like a missing model.
        if not os.path.exists(base_path) or not os.path.exists(os.path.join(base_path, "transformer")):
            log.info(f"Downloading model to: {base_path}")
            from huggingface_hub import snapshot_download

            snapshot_download(
                repo_id=repo_id,
                ignore_patterns=["*text_encoder*", "*tokenizer*"],
                local_dir=download_path,
                local_dir_use_symlinks=False,
            )

        # transformer
        if "Fun" in model:
            if pab_config is not None:
                transformer = CogVideoXTransformer3DModelFunPAB.from_pretrained(base_path, subfolder="transformer")
            else:
                transformer = CogVideoXTransformer3DModelFun.from_pretrained(base_path, subfolder="transformer")
        else:
            if pab_config is not None:
                transformer = CogVideoXTransformer3DModelPAB.from_pretrained(base_path, subfolder="transformer")
            else:
                transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder="transformer")

        transformer = transformer.to(dtype).to(offload_device)

        #LoRAs
        if lora is not None:
            from .lora_utils import merge_lora, load_lora_into_transformer
            if "fun" in model.lower():
                for l in lora:
                    # Same logger as the rest of this method.
                    log.info(f"Merging LoRA weights from {l['path']} with strength {l['strength']}")
                    transformer = merge_lora(transformer, l["path"], l["strength"])
            else:
                transformer = load_lora_into_transformer(lora, transformer)

        if block_edit is not None:
            transformer = remove_specific_blocks(transformer, block_edit)

        #fp8
        if fp8_transformer == "enabled" or fp8_transformer == "fastmode":
            # Parameters whose name contains one of these keep their dtype.
            params_to_keep = {"patch_embed", "lora", "pos_embedding"}
            for name, param in transformer.named_parameters():
                if not any(keyword in name for keyword in params_to_keep):
                    param.data = param.data.to(torch.float8_e4m3fn)

            if fp8_transformer == "fastmode":
                from .fp8_optimization import convert_fp8_linear
                convert_fp8_linear(transformer, dtype)

        with open(scheduler_path) as f:
            scheduler_config = json.load(f)
        scheduler = CogVideoXDDIMScheduler.from_config(scheduler_config)

        # VAE
        if "Fun" in model:
            vae = AutoencoderKLCogVideoXFun.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device)
            if "Pose" in model:
                pipe = CogVideoX_Fun_Pipeline_Control(vae, transformer, scheduler, pab_config=pab_config)
            else:
                pipe = CogVideoX_Fun_Pipeline_Inpaint(vae, transformer, scheduler, pab_config=pab_config)
        else:
            vae = AutoencoderKLCogVideoX.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device)
            pipe = CogVideoXPipeline(vae, transformer, scheduler, pab_config=pab_config)
            if "cogvideox-2b-img2vid" in model:
                pipe.input_with_padding = False

        if enable_sequential_cpu_offload:
            pipe.enable_sequential_cpu_offload()

        # compilation
        if compile == "torch":
            torch._dynamo.config.suppress_errors = True
            pipe.transformer.to(memory_format=torch.channels_last)
            # Compile block by block instead of the whole transformer.
            for i, block in enumerate(pipe.transformer.transformer_blocks):
                if "CogVideoXBlock" in str(block):
                    pipe.transformer.transformer_blocks[i] = torch.compile(block, fullgraph=False, dynamic=False, backend="inductor")
        elif compile == "onediff":
            from onediffx import compile_pipe
            os.environ['NEXFORT_FX_FORCE_TRITON_SDPA'] = '1'

            pipe = compile_pipe(
                pipe,
                backend="nexfort",
                options= {"mode": "max-optimize:max-autotune:max-autotune", "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": False, "triton.fuse_attention_allow_fp16_reduction": False}},
                ignores=["vae"],
                fuse_qkv_projections=pab_config is None,
            )

        pipeline = {
            "pipe": pipe,
            "dtype": dtype,
            "base_path": base_path,
            "onediff": compile == "onediff",
            "cpu_offloading": enable_sequential_cpu_offload,
            "scheduler_config": scheduler_config,
            "model_name": model
        }

        return (pipeline,)
|
|
||||||
|
|
||||||
class DownloadAndLoadCogVideoGGUFModel:
    """ComfyUI node that downloads (when missing) and loads a GGUF CogVideoX pipeline.

    NOTE(review): the nesting of the statements in ``loadmodel`` (in
    particular how much of it runs inside ``quantize_lazy_load()``) was
    reconstructed from a listing that had lost its indentation - confirm
    against the repository.
    """

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "model": (
                    [
                        "CogVideoX_5b_GGUF_Q4_0.safetensors",
                        "CogVideoX_5b_I2V_GGUF_Q4_0.safetensors",
                        "CogVideoX_5b_fun_GGUF_Q4_0.safetensors",
                        "CogVideoX_5b_fun_1_1_GGUF_Q4_0.safetensors",
                        "CogVideoX_5b_fun_1_1_Pose_GGUF_Q4_0.safetensors",
                        "CogVideoX_5b_Interpolation_GGUF_Q4_0.safetensors",
                        "CogVideoX_5b_Tora_GGUF_Q4_0.safetensors",

                    ],
                ),
                "vae_precision": (["fp16", "fp32", "bf16"], {"default": "bf16", "tooltip": "VAE dtype"}),
                "fp8_fastmode": ("BOOLEAN", {"default": False, "tooltip": "only supported on 4090 and later GPUs, also requires torch 2.4.0 with cu124 minimum"}),
                "load_device": (["main_device", "offload_device"], {"default": "main_device"}),
                "enable_sequential_cpu_offload": ("BOOLEAN", {"default": False, "tooltip": "significantly reducing memory usage and slows down the inference"}),
            },
            "optional": {
                "pab_config": ("PAB_CONFIG", {"default": None}),
                "block_edit": ("TRANSFORMERBLOCKS", {"default": None}),
                "compile": (["disabled","torch"], {"tooltip": "compile the model for faster inference, these are advanced options only available on Linux, see readme for more info"}),

            }
        }

    RETURN_TYPES = ("COGVIDEOPIPE",)
    RETURN_NAMES = ("cogvideo_pipe", )
    FUNCTION = "loadmodel"
    CATEGORY = "CogVideoWrapper"

    @staticmethod
    def _model_size(model):
        """Return "5b" or "2b", whichever appears in ``model`` ("5b" wins).

        Raises:
            ValueError: if the name contains neither marker, because no
                scheduler/transformer config can be selected for it.
        """
        for size in ("5b", "2b"):
            if size in model:
                return size
        raise ValueError(f"Cannot tell the model size (5b or 2b) from: {model}")

    @staticmethod
    def _in_channels(model):
        """Return the transformer ``in_channels`` value for a model file name."""
        if "fun" in model:
            return 32 if "Pose" in model else 33
        if "I2V" in model or "Interpolation" in model:
            return 32
        return 16

    def loadmodel(self, model, vae_precision, fp8_fastmode, load_device, enable_sequential_cpu_offload, pab_config=None, block_edit=None, compile="disabled"):
        """Download the GGUF weights and VAE if needed and assemble the pipeline.

        Args:
            model: File name chosen from ``INPUT_TYPES``.
            vae_precision: "fp16", "fp32" or "bf16"; dtype used for the VAE.
            fp8_fastmode: Run ``convert_fp8_linear`` on the transformer.
            load_device: "offload_device" moves the transformer to the offload
                device; any other value moves it to the main device.
            enable_sequential_cpu_offload: Call
                ``pipe.enable_sequential_cpu_offload()`` when true.
            pab_config: Optional PAB config; selects the PAB transformer
                classes and is passed on to the pipeline.
            block_edit: Optional indices handed to ``remove_specific_blocks``.
            compile: "torch" compiles every transformer block.

        Returns:
            A 1-tuple holding a dict with the keys ``pipe``, ``dtype``,
            ``base_path``, ``onediff``, ``cpu_offloading``,
            ``scheduler_config`` and ``model_name``.

        Raises:
            ValueError: if ``model`` contains neither "5b" nor "2b".
        """
        check_diffusers_version()

        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
        mm.soft_empty_cache()

        # Resolved before any download so an unsupported name fails early.
        size = self._model_size(model)
        scheduler_path = os.path.join(script_directory, 'configs', f'scheduler_config_{size}.json')
        transformer_path = os.path.join(script_directory, 'configs', f'transformer_config_{size}.json')

        vae_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[vae_precision]
        download_path = os.path.join(folder_paths.models_dir, 'CogVideo', 'GGUF')
        gguf_path = os.path.join(folder_paths.models_dir, 'diffusion_models', model) # check MinusZone's model path first
        if not os.path.exists(gguf_path):
            gguf_path = os.path.join(download_path, model)
            if not os.path.exists(gguf_path):
                if "I2V" in model or "1_1" in model or "Interpolation" in model or "Tora" in model:
                    repo_id = "Kijai/CogVideoX_GGUF"
                else:
                    repo_id = "MinusZoneAI/ComfyUI-CogVideoX-MZ"
                log.info(f"Downloading model to: {gguf_path}")
                from huggingface_hub import snapshot_download

                snapshot_download(
                    repo_id=repo_id,
                    allow_patterns=[f"*{model}*"],
                    local_dir=download_path,
                    local_dir_use_symlinks=False,
                )

        with open(transformer_path) as f:
            transformer_config = json.load(f)

        sd = load_torch_file(gguf_path)

        from . import mz_gguf_loader
        import importlib
        importlib.reload(mz_gguf_loader)

        with mz_gguf_loader.quantize_lazy_load():
            transformer_config["in_channels"] = self._in_channels(model)
            if "fun" in model:
                transformer_cls = CogVideoXTransformer3DModelFun if pab_config is None else CogVideoXTransformer3DModelFunPAB
            else:
                transformer_cls = CogVideoXTransformer3DModel if pab_config is None else CogVideoXTransformer3DModelPAB
            transformer = transformer_cls.from_config(transformer_config)

            if "2b" in model:
                # "pos_embedding" is kept at fp16; everything else goes to fp8.
                for name, param in transformer.named_parameters():
                    if name != "pos_embedding":
                        param.data = param.data.to(torch.float8_e4m3fn)
                    else:
                        param.data = param.data.to(torch.float16)
            else:
                transformer.to(torch.float8_e4m3fn)

            if block_edit is not None:
                transformer = remove_specific_blocks(transformer, block_edit)

            transformer = mz_gguf_loader.quantize_load_state_dict(transformer, sd, device="cpu")
            if load_device == "offload_device":
                transformer.to(offload_device)
            else:
                transformer.to(device)

        if fp8_fastmode:
            from .fp8_optimization import convert_fp8_linear
            convert_fp8_linear(transformer, vae_dtype)

        if compile == "torch":
            # compilation
            for i, block in enumerate(transformer.transformer_blocks):
                transformer.transformer_blocks[i] = torch.compile(block, fullgraph=False, dynamic=False, backend="inductor")

        with open(scheduler_path) as f:
            scheduler_config = json.load(f)

        scheduler = CogVideoXDDIMScheduler.from_config(scheduler_config, subfolder="scheduler")

        # VAE
        vae_dl_path = os.path.join(folder_paths.models_dir, 'CogVideo', 'VAE')
        vae_path = os.path.join(vae_dl_path, "cogvideox_vae.safetensors")
        if not os.path.exists(vae_path):
            log.info(f"Downloading VAE model to: {vae_path}")
            from huggingface_hub import snapshot_download

            snapshot_download(
                repo_id="Kijai/CogVideoX-Fun-pruned",
                allow_patterns=["*cogvideox_vae.safetensors*"],
                local_dir=vae_dl_path,
                local_dir_use_symlinks=False,
            )
        with open(os.path.join(script_directory, 'configs', 'vae_config.json')) as f:
            vae_config = json.load(f)

        vae_sd = load_torch_file(vae_path)
        if "fun" in model:
            vae = AutoencoderKLCogVideoXFun.from_config(vae_config).to(vae_dtype).to(offload_device)
            vae.load_state_dict(vae_sd)
            if "Pose" in model:
                pipe = CogVideoX_Fun_Pipeline_Control(vae, transformer, scheduler, pab_config=pab_config)
            else:
                pipe = CogVideoX_Fun_Pipeline_Inpaint(vae, transformer, scheduler, pab_config=pab_config)
        else:
            vae = AutoencoderKLCogVideoX.from_config(vae_config).to(vae_dtype).to(offload_device)
            vae.load_state_dict(vae_sd)
            pipe = CogVideoXPipeline(vae, transformer, scheduler, pab_config=pab_config)

        if enable_sequential_cpu_offload:
            pipe.enable_sequential_cpu_offload()

        pipeline = {
            "pipe": pipe,
            "dtype": vae_dtype,
            "base_path": model,
            "onediff": False,
            "cpu_offloading": enable_sequential_cpu_offload,
            "scheduler_config": scheduler_config,
            "model_name": model
        }

        return (pipeline,)
|
|
||||||
|
|
||||||
class DownloadAndLoadToraModel:
    """ComfyUI node that downloads (when missing) and loads the Tora fuser and trajectory extractor.

    NOTE(review): the nesting of the statements in ``loadmodel`` was
    reconstructed from a listing that had lost its indentation; the dtype
    casts after ``load_state_dict`` are assumed to belong to the
    no-accelerate branches only - confirm against the repository.
    """

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "model": (
                    [
                        "kijai/CogVideoX-5b-Tora",
                    ],
                ),
            },
        }

    RETURN_TYPES = ("TORAMODEL",)
    RETURN_NAMES = ("tora_model", )
    FUNCTION = "loadmodel"
    CATEGORY = "CogVideoWrapper"
    DESCRIPTION = "Downloads and loads the the Tora model from Huggingface to 'ComfyUI/models/CogVideo/CogVideoX-5b-Tora'"

    @staticmethod
    def _download_if_missing(repo_id, file_path, local_dir, pattern, label):
        """Fetch the files matching ``pattern`` from ``repo_id`` unless ``file_path`` exists."""
        if os.path.exists(file_path):
            return
        log.info(f"Downloading {label} model to: {file_path}")
        from huggingface_hub import snapshot_download

        snapshot_download(
            repo_id=repo_id,
            allow_patterns=[pattern],
            local_dir=local_dir,
            local_dir_use_symlinks=False,
        )

    def loadmodel(self, model):
        """Load the Tora modules, downloading their weights first when missing.

        Args:
            model: Hugging Face repo id both weight files are fetched from.

        Returns:
            A 1-tuple holding a dict with the keys ``fuser_list`` and
            ``traj_extractor``.
        """
        check_diffusers_version()

        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
        mm.soft_empty_cache()

        from .tora.traj_module import MGF, TrajExtractor

        # accelerate is optional: without it the modules are built with real
        # weights and filled through load_state_dict instead.
        try:
            from accelerate import init_empty_weights
            from accelerate.utils import set_module_tensor_to_device
            is_accelerate_available = True
        except Exception as error:
            log.info(f"accelerate is not usable ({error}), loading Tora weights without it")
            is_accelerate_available = False

        download_path = os.path.join(folder_paths.models_dir, 'CogVideo', "CogVideoX-5b-Tora")

        fuser_path = os.path.join(download_path, "fuser", "fuser.safetensors")
        self._download_if_missing(model, fuser_path, download_path, "*fuser.safetensors*", "Fuser")

        hidden_size = 3072
        num_layers = 42

        with (init_empty_weights() if is_accelerate_available else nullcontext()):
            fuser_list = nn.ModuleList([MGF(128, hidden_size) for _ in range(num_layers)])

        fuser_sd = load_torch_file(fuser_path)
        if is_accelerate_available:
            # NOTE(review): this path uses float16 while the fallback below
            # uses bfloat16 - kept as found, confirm which one is intended.
            for key, value in fuser_sd.items():
                set_module_tensor_to_device(fuser_list, key, dtype=torch.float16, device=device, value=value)
        else:
            fuser_list.load_state_dict(fuser_sd)
            for module in fuser_list:
                for param in module.parameters():
                    param.data = param.data.to(torch.bfloat16).to(device)
        del fuser_sd

        traj_extractor_path = os.path.join(download_path, "traj_extractor", "traj_extractor.safetensors")
        self._download_if_missing(model, traj_extractor_path, download_path, "*traj_extractor.safetensors*", "trajectory extractor")

        with (init_empty_weights() if is_accelerate_available else nullcontext()):
            traj_extractor = TrajExtractor(
                vae_downsize=(4, 8, 8),
                patch_size=2,
                nums_rb=2,
                cin=16,
                channels=[128] * 42,
                sk=True,
                use_conv=False,
            )

        traj_sd = load_torch_file(traj_extractor_path)
        if is_accelerate_available:
            for key, value in traj_sd.items():
                set_module_tensor_to_device(traj_extractor, key, dtype=torch.float32, device=device, value=value)
        else:
            traj_extractor.load_state_dict(traj_sd)
            traj_extractor.to(torch.float32).to(device)

        toramodel = {
            "fuser_list": fuser_list,
            "traj_extractor": traj_extractor,
        }

        return (toramodel,)
|
|
||||||
|
|
||||||
class DownloadAndLoadCogVideoControlNet:
    """ComfyUI node that downloads (when missing) and loads a CogVideoX ControlNet."""

    RETURN_TYPES = ("COGVIDECONTROLNETMODEL",)
    RETURN_NAMES = ("cogvideo_controlnet", )
    FUNCTION = "loadmodel"
    CATEGORY = "CogVideoWrapper"

    @classmethod
    def INPUT_TYPES(s):
        controlnet_repos = [
            "TheDenk/cogvideox-2b-controlnet-hed-v1",
            "TheDenk/cogvideox-2b-controlnet-canny-v1",
            "TheDenk/cogvideox-5b-controlnet-hed-v1",
            "TheDenk/cogvideox-5b-controlnet-canny-v1"
        ]
        return {"required": {"model": (controlnet_repos,)}}

    def loadmodel(self, model):
        """Return a 1-tuple with the ControlNet loaded from ``model``.

        The weights live in ``<models_dir>/CogVideo/ControlNet/<repo name>``
        and are downloaded from the ``model`` repo when that folder is absent.
        """
        from .cogvideo_controlnet import CogVideoXControlnet

        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
        mm.soft_empty_cache()

        repo_name = model.split("/")[-1]
        download_path = os.path.join(folder_paths.models_dir, 'CogVideo', 'ControlNet')
        base_path = os.path.join(download_path, repo_name)

        already_downloaded = os.path.exists(base_path)
        if not already_downloaded:
            log.info(f"Downloading model to: {base_path}")
            from huggingface_hub import snapshot_download

            download_options = {
                "repo_id": model,
                "ignore_patterns": ["*text_encoder*", "*tokenizer*"],
                "local_dir": base_path,
                "local_dir_use_symlinks": False,
            }
            snapshot_download(**download_options)

        return (CogVideoXControlnet.from_pretrained(base_path),)
|
|
||||||
|
|
||||||
class CogVideoEncodePrompt:
|
class CogVideoEncodePrompt:
|
||||||
@classmethod
|
@classmethod
|
||||||
def INPUT_TYPES(s):
|
def INPUT_TYPES(s):
|
||||||
@ -1179,7 +557,7 @@ class ToraEncodeTrajectory:
|
|||||||
|
|
||||||
video_flow_features = video_flow_features * strength
|
video_flow_features = video_flow_features * strength
|
||||||
|
|
||||||
logging.info(f"video_flow shape: {video_flow.shape}")
|
log.info(f"video_flow shape: {video_flow.shape}")
|
||||||
|
|
||||||
tora = {
|
tora = {
|
||||||
"video_flow_features" : video_flow_features,
|
"video_flow_features" : video_flow_features,
|
||||||
@ -1241,7 +619,7 @@ class ToraEncodeOpticalFlow:
|
|||||||
|
|
||||||
video_flow_features = video_flow_features * strength
|
video_flow_features = video_flow_features * strength
|
||||||
|
|
||||||
logging.info(f"video_flow shape: {video_flow.shape}")
|
log.info(f"video_flow shape: {video_flow.shape}")
|
||||||
|
|
||||||
tora = {
|
tora = {
|
||||||
"video_flow_features" : video_flow_features,
|
"video_flow_features" : video_flow_features,
|
||||||
@ -1529,7 +907,7 @@ class CogVideoXFunSampler:
|
|||||||
|
|
||||||
# Load Sampler
|
# Load Sampler
|
||||||
if context_options is not None and context_options["context_schedule"] == "temporal_tiling":
|
if context_options is not None and context_options["context_schedule"] == "temporal_tiling":
|
||||||
logging.info("Temporal tiling enabled, changing scheduler to CogVideoXDDIM")
|
log.info("Temporal tiling enabled, changing scheduler to CogVideoXDDIM")
|
||||||
scheduler="CogVideoXDDIM"
|
scheduler="CogVideoXDDIM"
|
||||||
scheduler_config = pipeline["scheduler_config"]
|
scheduler_config = pipeline["scheduler_config"]
|
||||||
if scheduler in scheduler_mapping:
|
if scheduler in scheduler_mapping:
|
||||||
@ -1824,7 +1202,7 @@ class CogVideoXFunControlSampler:
|
|||||||
# Load Sampler
|
# Load Sampler
|
||||||
scheduler_config = pipeline["scheduler_config"]
|
scheduler_config = pipeline["scheduler_config"]
|
||||||
if context_options is not None and context_options["context_schedule"] == "temporal_tiling":
|
if context_options is not None and context_options["context_schedule"] == "temporal_tiling":
|
||||||
logging.info("Temporal tiling enabled, changing scheduler to CogVideoXDDIM")
|
log.info("Temporal tiling enabled, changing scheduler to CogVideoXDDIM")
|
||||||
scheduler="CogVideoXDDIM"
|
scheduler="CogVideoXDDIM"
|
||||||
if scheduler in scheduler_mapping:
|
if scheduler in scheduler_mapping:
|
||||||
noise_scheduler = scheduler_mapping[scheduler].from_config(scheduler_config)
|
noise_scheduler = scheduler_mapping[scheduler].from_config(scheduler_config)
|
||||||
@ -1870,7 +1248,6 @@ class CogVideoXFunControlSampler:
|
|||||||
return (pipeline, {"samples": latents})
|
return (pipeline, {"samples": latents})
|
||||||
|
|
||||||
NODE_CLASS_MAPPINGS = {
|
NODE_CLASS_MAPPINGS = {
|
||||||
"DownloadAndLoadCogVideoModel": DownloadAndLoadCogVideoModel,
|
|
||||||
"CogVideoSampler": CogVideoSampler,
|
"CogVideoSampler": CogVideoSampler,
|
||||||
"CogVideoDecode": CogVideoDecode,
|
"CogVideoDecode": CogVideoDecode,
|
||||||
"CogVideoTextEncode": CogVideoTextEncode,
|
"CogVideoTextEncode": CogVideoTextEncode,
|
||||||
@ -1881,21 +1258,17 @@ NODE_CLASS_MAPPINGS = {
|
|||||||
"CogVideoXFunVid2VidSampler": CogVideoXFunVid2VidSampler,
|
"CogVideoXFunVid2VidSampler": CogVideoXFunVid2VidSampler,
|
||||||
"CogVideoXFunControlSampler": CogVideoXFunControlSampler,
|
"CogVideoXFunControlSampler": CogVideoXFunControlSampler,
|
||||||
"CogVideoTextEncodeCombine": CogVideoTextEncodeCombine,
|
"CogVideoTextEncodeCombine": CogVideoTextEncodeCombine,
|
||||||
"DownloadAndLoadCogVideoGGUFModel": DownloadAndLoadCogVideoGGUFModel,
|
|
||||||
"CogVideoPABConfig": CogVideoPABConfig,
|
"CogVideoPABConfig": CogVideoPABConfig,
|
||||||
"CogVideoTransformerEdit": CogVideoTransformerEdit,
|
"CogVideoTransformerEdit": CogVideoTransformerEdit,
|
||||||
"CogVideoControlImageEncode": CogVideoControlImageEncode,
|
"CogVideoControlImageEncode": CogVideoControlImageEncode,
|
||||||
"CogVideoLoraSelect": CogVideoLoraSelect,
|
"CogVideoLoraSelect": CogVideoLoraSelect,
|
||||||
"CogVideoContextOptions": CogVideoContextOptions,
|
"CogVideoContextOptions": CogVideoContextOptions,
|
||||||
"CogVideoControlNet": CogVideoControlNet,
|
"CogVideoControlNet": CogVideoControlNet,
|
||||||
"DownloadAndLoadCogVideoControlNet": DownloadAndLoadCogVideoControlNet,
|
|
||||||
"ToraEncodeTrajectory": ToraEncodeTrajectory,
|
"ToraEncodeTrajectory": ToraEncodeTrajectory,
|
||||||
"ToraEncodeOpticalFlow": ToraEncodeOpticalFlow,
|
"ToraEncodeOpticalFlow": ToraEncodeOpticalFlow,
|
||||||
"DownloadAndLoadToraModel": DownloadAndLoadToraModel,
|
|
||||||
"CogVideoXFasterCache": CogVideoXFasterCache
|
"CogVideoXFasterCache": CogVideoXFasterCache
|
||||||
}
|
}
|
||||||
NODE_DISPLAY_NAME_MAPPINGS = {
|
NODE_DISPLAY_NAME_MAPPINGS = {
|
||||||
"DownloadAndLoadCogVideoModel": "(Down)load CogVideo Model",
|
|
||||||
"CogVideoSampler": "CogVideo Sampler",
|
"CogVideoSampler": "CogVideo Sampler",
|
||||||
"CogVideoDecode": "CogVideo Decode",
|
"CogVideoDecode": "CogVideo Decode",
|
||||||
"CogVideoTextEncode": "CogVideo TextEncode",
|
"CogVideoTextEncode": "CogVideo TextEncode",
|
||||||
@ -1906,15 +1279,12 @@ NODE_DISPLAY_NAME_MAPPINGS = {
|
|||||||
"CogVideoXFunVid2VidSampler": "CogVideoXFun Vid2Vid Sampler",
|
"CogVideoXFunVid2VidSampler": "CogVideoXFun Vid2Vid Sampler",
|
||||||
"CogVideoXFunControlSampler": "CogVideoXFun Control Sampler",
|
"CogVideoXFunControlSampler": "CogVideoXFun Control Sampler",
|
||||||
"CogVideoTextEncodeCombine": "CogVideo TextEncode Combine",
|
"CogVideoTextEncodeCombine": "CogVideo TextEncode Combine",
|
||||||
"DownloadAndLoadCogVideoGGUFModel": "(Down)load CogVideo GGUF Model",
|
|
||||||
"CogVideoPABConfig": "CogVideo PABConfig",
|
"CogVideoPABConfig": "CogVideo PABConfig",
|
||||||
"CogVideoTransformerEdit": "CogVideo TransformerEdit",
|
"CogVideoTransformerEdit": "CogVideo TransformerEdit",
|
||||||
"CogVideoControlImageEncode": "CogVideo Control ImageEncode",
|
"CogVideoControlImageEncode": "CogVideo Control ImageEncode",
|
||||||
"CogVideoLoraSelect": "CogVideo LoraSelect",
|
"CogVideoLoraSelect": "CogVideo LoraSelect",
|
||||||
"CogVideoContextOptions": "CogVideo Context Options",
|
"CogVideoContextOptions": "CogVideo Context Options",
|
||||||
"DownloadAndLoadCogVideoControlNet": "(Down)load CogVideo ControlNet",
|
|
||||||
"ToraEncodeTrajectory": "Tora Encode Trajectory",
|
"ToraEncodeTrajectory": "Tora Encode Trajectory",
|
||||||
"ToraEncodeOpticalFlow": "Tora Encode OpticalFlow",
|
"ToraEncodeOpticalFlow": "Tora Encode OpticalFlow",
|
||||||
"DownloadAndLoadToraModel": "(Down)load Tora Model",
|
|
||||||
"CogVideoXFasterCache": "CogVideoX FasterCache"
|
"CogVideoXFasterCache": "CogVideoX FasterCache"
|
||||||
}
|
}
|
||||||
@ -21,7 +21,7 @@ New features:
|
|||||||
- Initial context windowing with FreeNoise noise shuffling mainly for vid2vid and pose2vid pipelines for longer generations, haven't figured it out for img2vid yet
|
- Initial context windowing with FreeNoise noise shuffling mainly for vid2vid and pose2vid pipelines for longer generations, haven't figured it out for img2vid yet
|
||||||
- GGUF models and tiled encoding for I2V and pose pipelines (thanks to MinusZoneAI)
|
- GGUF models and tiled encoding for I2V and pose pipelines (thanks to MinusZoneAI)
|
||||||
- [sageattention](https://github.com/thu-ml/SageAttention) support (Linux only) for a speed boost, I experienced ~20-30% increase with it, stacks with fp8 fast mode, doesn't need compiling
|
- [sageattention](https://github.com/thu-ml/SageAttention) support (Linux only) for a speed boost, I experienced ~20-30% increase with it, stacks with fp8 fast mode, doesn't need compiling
|
||||||
- Support CogVideoX-Fun 1.1 and it's pose models with additional control strenght and application step settings, this model's input does NOT have to be just dwpose skeletons, just about anything can work
|
- Support CogVideoX-Fun 1.1 and its pose models with additional control strength and application step settings, this model's input does NOT have to be just dwpose skeletons, just about anything can work
|
||||||
- Support LoRAs
|
- Support LoRAs
|
||||||
|
|
||||||
https://github.com/user-attachments/assets/ddeb8f38-a647-42b3-a4b1-c6936f961deb
|
https://github.com/user-attachments/assets/ddeb8f38-a647-42b3-a4b1-c6936f961deb
|
||||||
|
|||||||
22
utils.py
Normal file
22
utils.py
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
import importlib.metadata
|
||||||
|
|
||||||
|
import logging
|
||||||
|
# Configure the root logger when this module is imported: INFO level, records formatted as "time - level - message".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||||
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def check_diffusers_version():
    """Verify that a recent enough ``diffusers`` package is installed.

    The installed version is compared with the minimum required release
    numerically, component by component. Comparing the raw strings would be
    lexicographic, which ranks "0.4.0" above "0.30.3" and "0.100.0" below it.

    Raises:
        AssertionError: If ``diffusers`` is not installed, or if the installed
            version is older than the required one.
    """

    def release_numbers(text):
        """Return the leading numeric release components of *text* as a tuple of ints."""
        numbers = []
        for piece in text.split("."):
            digits = ""
            for char in piece:
                if char not in "0123456789":
                    break
                digits += char
            if not digits:
                break
            numbers.append(int(digits))
            if len(digits) != len(piece):
                # A suffix such as "rc1" or "dev0" ends the numeric release part.
                break
        return tuple(numbers)

    try:
        version = importlib.metadata.version('diffusers')
    except importlib.metadata.PackageNotFoundError:
        raise AssertionError("diffusers is not installed.")

    required_version = '0.30.3'
    if release_numbers(version) < release_numbers(required_version):
        raise AssertionError(f"diffusers version {version} is installed, but version {required_version} or higher is required.")
|
||||||
|
|
||||||
|
def remove_specific_blocks(model, block_indices_to_remove):
    """Drop the transformer blocks at the given positions from *model*.

    Args:
        model: Object exposing a ``transformer_blocks`` sequence of modules.
        block_indices_to_remove: Iterable of zero-based integer positions of
            the blocks to drop. It may be a one-shot iterable.

    Returns:
        The same *model* object, whose ``transformer_blocks`` attribute has
        been replaced by a new ``nn.ModuleList`` holding the remaining blocks
        in their original order.
    """
    import torch.nn as nn

    # Materialize the indices once: membership tests become O(1), and a
    # generator argument is not consumed piecemeal by repeated `in` checks.
    doomed = {int(index) for index in block_indices_to_remove}
    kept = [
        block
        for position, block in enumerate(model.transformer_blocks)
        if position not in doomed
    ]
    model.transformer_blocks = nn.ModuleList(kept)

    return model
|
||||||
@ -67,16 +67,6 @@ class CogVideoXAttnProcessor2_0:
|
|||||||
key = attn.to_k(hidden_states)
|
key = attn.to_k(hidden_states)
|
||||||
value = attn.to_v(hidden_states)
|
value = attn.to_v(hidden_states)
|
||||||
|
|
||||||
# if attn.parallel_manager.sp_size > 1:
|
|
||||||
# assert (
|
|
||||||
# attn.heads % attn.parallel_manager.sp_size == 0
|
|
||||||
# ), f"Number of heads {attn.heads} must be divisible by sequence parallel size {attn.parallel_manager.sp_size}"
|
|
||||||
# attn_heads = attn.heads // attn.parallel_manager.sp_size
|
|
||||||
# query, key, value = map(
|
|
||||||
# lambda x: all_to_all_comm(x, attn.parallel_manager.sp_group, scatter_dim=2, gather_dim=1),
|
|
||||||
# [query, key, value],
|
|
||||||
# )
|
|
||||||
|
|
||||||
attn_heads = attn.heads
|
attn_heads = attn.heads
|
||||||
|
|
||||||
inner_dim = key.shape[-1]
|
inner_dim = key.shape[-1]
|
||||||
@ -111,9 +101,6 @@ class CogVideoXAttnProcessor2_0:
|
|||||||
|
|
||||||
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn_heads * head_dim)
|
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn_heads * head_dim)
|
||||||
|
|
||||||
#if attn.parallel_manager.sp_size > 1:
|
|
||||||
# hidden_states = all_to_all_comm(hidden_states, attn.parallel_manager.sp_group, scatter_dim=1, gather_dim=2)
|
|
||||||
|
|
||||||
# linear proj
|
# linear proj
|
||||||
hidden_states = attn.to_out[0](hidden_states)
|
hidden_states = attn.to_out[0](hidden_states)
|
||||||
# dropout
|
# dropout
|
||||||
|
|||||||
64
videosys/pab.py
Normal file
64
videosys/pab.py
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
from typing import Optional


class PABConfig:
    """Plain settings container for the broadcast options stored below.

    Each of the ``cross``, ``spatial`` and ``temporal`` groups has an on/off
    flag, a threshold and a range. The constructor only stores the values it
    is given; it performs no validation.
    """

    def __init__(
        self,
        steps: int,
        cross_broadcast: bool = False,
        cross_threshold: Optional[list] = None,
        cross_range: Optional[int] = None,
        spatial_broadcast: bool = False,
        spatial_threshold: Optional[list] = None,
        spatial_range: Optional[int] = None,
        temporal_broadcast: bool = False,
        temporal_threshold: Optional[list] = None,
        temporal_range: Optional[int] = None,
        mlp_broadcast: bool = False,
        mlp_spatial_broadcast_config: Optional[dict] = None,
        mlp_temporal_broadcast_config: Optional[dict] = None,
    ):
        """Store every argument on the instance under the same name.

        ``mlp_temporal_outputs`` and ``mlp_spatial_outputs`` always start as
        fresh empty dicts, regardless of the arguments.
        """
        self.steps = steps

        self.cross_broadcast = cross_broadcast
        self.cross_threshold = cross_threshold
        self.cross_range = cross_range

        self.spatial_broadcast = spatial_broadcast
        self.spatial_threshold = spatial_threshold
        self.spatial_range = spatial_range

        self.temporal_broadcast = temporal_broadcast
        self.temporal_threshold = temporal_threshold
        self.temporal_range = temporal_range

        self.mlp_broadcast = mlp_broadcast
        self.mlp_spatial_broadcast_config = mlp_spatial_broadcast_config
        self.mlp_temporal_broadcast_config = mlp_temporal_broadcast_config
        self.mlp_temporal_outputs = {}
        self.mlp_spatial_outputs = {}


class CogVideoXPABConfig(PABConfig):
    """``PABConfig`` with CogVideoX defaults and no MLP broadcast options."""

    # Default threshold for all three groups; copied into a new list per
    # instance so that configurations never share a mutable default.
    _DEFAULT_THRESHOLD = (100, 850)

    def __init__(
        self,
        steps: int = 50,
        spatial_broadcast: bool = True,
        spatial_threshold: Optional[list] = None,
        spatial_range: int = 2,
        temporal_broadcast: bool = False,
        temporal_threshold: Optional[list] = None,
        temporal_range: int = 4,
        cross_broadcast: bool = False,
        cross_threshold: Optional[list] = None,
        cross_range: int = 6,
    ):
        """Forward the settings to ``PABConfig``.

        A threshold left as ``None`` is replaced by a fresh ``[100, 850]``
        list; any other value is passed through unchanged.
        """
        super().__init__(
            steps=steps,
            spatial_broadcast=spatial_broadcast,
            spatial_threshold=self._threshold_or_default(spatial_threshold),
            spatial_range=spatial_range,
            temporal_broadcast=temporal_broadcast,
            temporal_threshold=self._threshold_or_default(temporal_threshold),
            temporal_range=temporal_range,
            cross_broadcast=cross_broadcast,
            cross_threshold=self._threshold_or_default(cross_threshold),
            cross_range=cross_range,
        )

    @classmethod
    def _threshold_or_default(cls, threshold):
        """Return *threshold*, or a new default list when it is ``None``."""
        if threshold is None:
            return list(cls._DEFAULT_THRESHOLD)
        return threshold
|
||||||
Loading…
x
Reference in New Issue
Block a user