From 5ba9b1d6343fa5d6ea8aeac550761db949c3a8e2 Mon Sep 17 00:00:00 2001 From: kijai <40791699+kijai@users.noreply.github.com> Date: Wed, 30 Oct 2024 21:30:03 +0200 Subject: [PATCH] code cleanup --- __init__.py | 6 +- cogvideox_fun/transformer_3d.py | 6 - model_loading.py | 567 +++++++++++++++++++++++ nodes.py | 648 +-------------------------- readme.md | 2 +- utils.py | 22 + videosys/cogvideox_transformer_3d.py | 13 - videosys/pab.py | 64 +++ 8 files changed, 668 insertions(+), 660 deletions(-) create mode 100644 model_loading.py create mode 100644 utils.py create mode 100644 videosys/pab.py diff --git a/__init__.py b/__init__.py index 2e96bd6..a608714 100644 --- a/__init__.py +++ b/__init__.py @@ -1,3 +1,7 @@ -from .nodes import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS +from .nodes import NODE_CLASS_MAPPINGS as NODES_CLASS, NODE_DISPLAY_NAME_MAPPINGS as NODES_DISPLAY +from .model_loading import NODE_CLASS_MAPPINGS as MODEL_CLASS, NODE_DISPLAY_NAME_MAPPINGS as MODEL_DISPLAY + +NODE_CLASS_MAPPINGS = {**NODES_CLASS, **MODEL_CLASS} +NODE_DISPLAY_NAME_MAPPINGS = {**NODES_DISPLAY, **MODEL_DISPLAY} __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"] \ No newline at end of file diff --git a/cogvideox_fun/transformer_3d.py b/cogvideox_fun/transformer_3d.py index 2b57923..83614e2 100644 --- a/cogvideox_fun/transformer_3d.py +++ b/cogvideox_fun/transformer_3d.py @@ -378,12 +378,6 @@ class CogVideoXBlock(nn.Module): elif fastercache_counter > fastercache_start_step: self.cached_hidden_states[-1].copy_(attn_hidden_states.to(fastercache_device)) self.cached_encoder_hidden_states[-1].copy_(attn_encoder_hidden_states.to(fastercache_device)) - # attention - attn_hidden_states, attn_encoder_hidden_states = self.attn1( - hidden_states=norm_hidden_states, - encoder_hidden_states=norm_encoder_hidden_states, - image_rotary_emb=image_rotary_emb, - ) hidden_states = hidden_states + gate_msa * attn_hidden_states encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_encoder_hidden_states diff --git a/model_loading.py b/model_loading.py new file mode 100644 index 0000000..c0cea3f --- /dev/null +++ b/model_loading.py @@ -0,0 +1,567 @@ +import os +import torch +import torch.nn as nn +import json +import folder_paths +import comfy.model_management as mm + +from diffusers.models import AutoencoderKLCogVideoX +from diffusers.schedulers import CogVideoXDDIMScheduler +from .custom_cogvideox_transformer_3d import CogVideoXTransformer3DModel +from .pipeline_cogvideox import CogVideoXPipeline +from contextlib import nullcontext + +from .cogvideox_fun.transformer_3d import CogVideoXTransformer3DModel as CogVideoXTransformer3DModelFun +from .cogvideox_fun.fun_pab_transformer_3d import CogVideoXTransformer3DModel as CogVideoXTransformer3DModelFunPAB +from .cogvideox_fun.autoencoder_magvit import AutoencoderKLCogVideoX as AutoencoderKLCogVideoXFun + +from .cogvideox_fun.pipeline_cogvideox_inpaint import CogVideoX_Fun_Pipeline_Inpaint +from .cogvideox_fun.pipeline_cogvideox_control import CogVideoX_Fun_Pipeline_Control + +from .videosys.cogvideox_transformer_3d import CogVideoXTransformer3DModel as CogVideoXTransformer3DModelPAB + +from .utils import check_diffusers_version, remove_specific_blocks, log +from comfy.utils import load_torch_file + +script_directory = os.path.dirname(os.path.abspath(__file__)) + +class DownloadAndLoadCogVideoModel: + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "model": ( + [ + "THUDM/CogVideoX-2b", + "THUDM/CogVideoX-5b", + "THUDM/CogVideoX-5b-I2V", 
+ "bertjiazheng/KoolCogVideoX-5b", + "kijai/CogVideoX-Fun-2b", + "kijai/CogVideoX-Fun-5b", + "kijai/CogVideoX-5b-Tora", + "alibaba-pai/CogVideoX-Fun-V1.1-2b-InP", + "alibaba-pai/CogVideoX-Fun-V1.1-5b-InP", + "alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose", + "alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose", + "feizhengcong/CogvideoX-Interpolation", + "NimVideo/cogvideox-2b-img2vid" + ], + ), + + }, + "optional": { + "precision": (["fp16", "fp32", "bf16"], + {"default": "bf16", "tooltip": "official recommendation is that 2b model should be fp16, 5b model should be bf16"} + ), + "fp8_transformer": (['disabled', 'enabled', 'fastmode'], {"default": 'disabled', "tooltip": "enabled casts the transformer to torch.float8_e4m3fn, fastmode is only for latest nvidia GPUs and requires torch 2.4.0 and cu124 minimum"}), + "compile": (["disabled","onediff","torch"], {"tooltip": "compile the model for faster inference, these are advanced options only available on Linux, see readme for more info"}), + "enable_sequential_cpu_offload": ("BOOLEAN", {"default": False, "tooltip": "significantly reducing memory usage and slows down the inference"}), + "pab_config": ("PAB_CONFIG", {"default": None}), + "block_edit": ("TRANSFORMERBLOCKS", {"default": None}), + "lora": ("COGLORA", {"default": None}), + } + } + + RETURN_TYPES = ("COGVIDEOPIPE",) + RETURN_NAMES = ("cogvideo_pipe", ) + FUNCTION = "loadmodel" + CATEGORY = "CogVideoWrapper" + DESCRIPTION = "Downloads and loads the selected CogVideo model from Huggingface to 'ComfyUI/models/CogVideo'" + + def loadmodel(self, model, precision, fp8_transformer="disabled", compile="disabled", enable_sequential_cpu_offload=False, pab_config=None, block_edit=None, lora=None): + + check_diffusers_version() + + device = mm.get_torch_device() + offload_device = mm.unet_offload_device() + mm.soft_empty_cache() + + dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision] + download_path = folder_paths.get_folder_paths("CogVideo")[0] + + if "Fun" in model: + if not "1.1" in model: + repo_id = "kijai/CogVideoX-Fun-pruned" + if "2b" in model: + base_path = os.path.join(folder_paths.models_dir, "CogVideoX_Fun", "CogVideoX-Fun-2b-InP") # location of the official model + if not os.path.exists(base_path): + base_path = os.path.join(download_path, "CogVideoX-Fun-2b-InP") + elif "5b" in model: + base_path = os.path.join(folder_paths.models_dir, "CogVideoX_Fun", "CogVideoX-Fun-5b-InP") # location of the official model + if not os.path.exists(base_path): + base_path = os.path.join(download_path, "CogVideoX-Fun-5b-InP") + elif "1.1" in model: + repo_id = model + base_path = os.path.join(folder_paths.models_dir, "CogVideoX_Fun", (model.split("/")[-1])) # location of the official model + if not os.path.exists(base_path): + base_path = os.path.join(download_path, (model.split("/")[-1])) + download_path = base_path + + elif "2b" in model: + if 'img2vid' in model: + base_path = os.path.join(download_path, "cogvideox-2b-img2vid") + download_path = base_path + repo_id = model + else: + base_path = os.path.join(download_path, "CogVideo2B") + download_path = base_path + repo_id = model + else: + base_path = os.path.join(download_path, (model.split("/")[-1])) + download_path = base_path + repo_id = model + + + if "2b" in model: + scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_2b.json') + else: + scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json') + + if not os.path.exists(base_path) or not 
os.path.exists(os.path.join(base_path, "transformer")): + log.info(f"Downloading model to: {base_path}") + from huggingface_hub import snapshot_download + + snapshot_download( + repo_id=repo_id, + ignore_patterns=["*text_encoder*", "*tokenizer*"], + local_dir=download_path, + local_dir_use_symlinks=False, + ) + + # transformer + if "Fun" in model: + if pab_config is not None: + transformer = CogVideoXTransformer3DModelFunPAB.from_pretrained(base_path, subfolder="transformer") + else: + transformer = CogVideoXTransformer3DModelFun.from_pretrained(base_path, subfolder="transformer") + else: + if pab_config is not None: + transformer = CogVideoXTransformer3DModelPAB.from_pretrained(base_path, subfolder="transformer") + else: + transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder="transformer") + + transformer = transformer.to(dtype).to(offload_device) + + #LoRAs + if lora is not None: + from .lora_utils import merge_lora, load_lora_into_transformer + if "fun" in model.lower(): + for l in lora: + log.info(f"Merging LoRA weights from {l['path']} with strength {l['strength']}") + transformer = merge_lora(transformer, l["path"], l["strength"]) + else: + transformer = load_lora_into_transformer(lora, transformer) + + + if block_edit is not None: + transformer = remove_specific_blocks(transformer, block_edit) + + #fp8 + if fp8_transformer == "enabled" or fp8_transformer == "fastmode": + for name, param in transformer.named_parameters(): + params_to_keep = {"patch_embed", "lora", "pos_embedding"} + if not any(keyword in name for keyword in params_to_keep): + param.data = param.data.to(torch.float8_e4m3fn) + + if fp8_transformer == "fastmode": + from .fp8_optimization import convert_fp8_linear + convert_fp8_linear(transformer, dtype) + + with open(scheduler_path) as f: + scheduler_config = json.load(f) + scheduler = CogVideoXDDIMScheduler.from_config(scheduler_config) + + # VAE + if "Fun" in model: + vae = AutoencoderKLCogVideoXFun.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device) + if "Pose" in model: + pipe = CogVideoX_Fun_Pipeline_Control(vae, transformer, scheduler, pab_config=pab_config) + else: + pipe = CogVideoX_Fun_Pipeline_Inpaint(vae, transformer, scheduler, pab_config=pab_config) + else: + vae = AutoencoderKLCogVideoX.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device) + pipe = CogVideoXPipeline(vae, transformer, scheduler, pab_config=pab_config) + if "cogvideox-2b-img2vid" in model: + pipe.input_with_padding = False + + if enable_sequential_cpu_offload: + pipe.enable_sequential_cpu_offload() + + # compilation + if compile == "torch": + torch._dynamo.config.suppress_errors = True + pipe.transformer.to(memory_format=torch.channels_last) + #pipe.transformer = torch.compile(pipe.transformer, mode="default", fullgraph=False, backend="inductor") + for i, block in enumerate(pipe.transformer.transformer_blocks): + if "CogVideoXBlock" in str(block): + pipe.transformer.transformer_blocks[i] = torch.compile(block, fullgraph=False, dynamic=False, backend="inductor") + elif compile == "onediff": + from onediffx import compile_pipe + os.environ['NEXFORT_FX_FORCE_TRITON_SDPA'] = '1' + + pipe = compile_pipe( + pipe, + backend="nexfort", + options= {"mode": "max-optimize:max-autotune:max-autotune", "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": False, "triton.fuse_attention_allow_fp16_reduction": False}}, + ignores=["vae"], + fuse_qkv_projections=True if pab_config is None else False, + ) + + pipeline 
= { + "pipe": pipe, + "dtype": dtype, + "base_path": base_path, + "onediff": compile == "onediff", + "cpu_offloading": enable_sequential_cpu_offload, + "scheduler_config": scheduler_config, + "model_name": model + } + + return (pipeline,) + +class DownloadAndLoadCogVideoGGUFModel: + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "model": ( + [ + "CogVideoX_5b_GGUF_Q4_0.safetensors", + "CogVideoX_5b_I2V_GGUF_Q4_0.safetensors", + "CogVideoX_5b_fun_GGUF_Q4_0.safetensors", + "CogVideoX_5b_fun_1_1_GGUF_Q4_0.safetensors", + "CogVideoX_5b_fun_1_1_Pose_GGUF_Q4_0.safetensors", + "CogVideoX_5b_Interpolation_GGUF_Q4_0.safetensors", + "CogVideoX_5b_Tora_GGUF_Q4_0.safetensors", + + ], + ), + "vae_precision": (["fp16", "fp32", "bf16"], {"default": "bf16", "tooltip": "VAE dtype"}), + "fp8_fastmode": ("BOOLEAN", {"default": False, "tooltip": "only supported on RTX 4090 and later GPUs; also requires at least torch 2.4.0 with cu124"}), + "load_device": (["main_device", "offload_device"], {"default": "main_device"}), + "enable_sequential_cpu_offload": ("BOOLEAN", {"default": False, "tooltip": "significantly reduces memory usage but slows down inference"}), + }, + "optional": { + "pab_config": ("PAB_CONFIG", {"default": None}), + "block_edit": ("TRANSFORMERBLOCKS", {"default": None}), + "compile": (["disabled","torch"], {"tooltip": "compiles the model for faster inference; these are advanced options only available on Linux, see the readme for more info"}), + + } + } + + RETURN_TYPES = ("COGVIDEOPIPE",) + RETURN_NAMES = ("cogvideo_pipe", ) + FUNCTION = "loadmodel" + CATEGORY = "CogVideoWrapper" + + def loadmodel(self, model, vae_precision, fp8_fastmode, load_device, enable_sequential_cpu_offload, pab_config=None, block_edit=None, compile="disabled"): + + check_diffusers_version() + + device = mm.get_torch_device() + offload_device = mm.unet_offload_device() + mm.soft_empty_cache() + + vae_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[vae_precision] + download_path = os.path.join(folder_paths.models_dir, 'CogVideo', 'GGUF') + gguf_path = os.path.join(folder_paths.models_dir, 'diffusion_models', model) # check MinusZone's model path first + if not os.path.exists(gguf_path): + gguf_path = os.path.join(download_path, model) + if not os.path.exists(gguf_path): + if "I2V" in model or "1_1" in model or "Interpolation" in model or "Tora" in model: + repo_id = "Kijai/CogVideoX_GGUF" + else: + repo_id = "MinusZoneAI/ComfyUI-CogVideoX-MZ" + log.info(f"Downloading model to: {gguf_path}") + from huggingface_hub import snapshot_download + + snapshot_download( + repo_id=repo_id, + allow_patterns=[f"*{model}*"], + local_dir=download_path, + local_dir_use_symlinks=False, + ) + + if "5b" in model: + scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json') + transformer_path = os.path.join(script_directory, 'configs', 'transformer_config_5b.json') + elif "2b" in model: + scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_2b.json') + transformer_path = os.path.join(script_directory, 'configs', 'transformer_config_2b.json') + + with open(transformer_path) as f: + transformer_config = json.load(f) + + sd = load_torch_file(gguf_path) + + from . import mz_gguf_loader # the loader module lives at the package root, not inside nodes.py + import importlib + importlib.reload(mz_gguf_loader) + + with mz_gguf_loader.quantize_lazy_load(): + if "fun" in model: + if "Pose" in model: + transformer_config["in_channels"] = 32 + else: + transformer_config["in_channels"] = 33 + if pab_config 
is not None: + transformer = CogVideoXTransformer3DModelFunPAB.from_config(transformer_config) + else: + transformer = CogVideoXTransformer3DModelFun.from_config(transformer_config) + elif "I2V" in model or "Interpolation" in model: + transformer_config["in_channels"] = 32 + if pab_config is not None: + transformer = CogVideoXTransformer3DModelPAB.from_config(transformer_config) + else: + transformer = CogVideoXTransformer3DModel.from_config(transformer_config) + else: + transformer_config["in_channels"] = 16 + if pab_config is not None: + transformer = CogVideoXTransformer3DModelPAB.from_config(transformer_config) + else: + transformer = CogVideoXTransformer3DModel.from_config(transformer_config) + + if "2b" in model: + for name, param in transformer.named_parameters(): + if name != "pos_embedding": + param.data = param.data.to(torch.float8_e4m3fn) + else: + param.data = param.data.to(torch.float16) + else: + transformer.to(torch.float8_e4m3fn) + + if block_edit is not None: + transformer = remove_specific_blocks(transformer, block_edit) + + transformer = mz_gguf_loader.quantize_load_state_dict(transformer, sd, device="cpu") + if load_device == "offload_device": + transformer.to(offload_device) + else: + transformer.to(device) + + + if fp8_fastmode: + from .fp8_optimization import convert_fp8_linear + convert_fp8_linear(transformer, vae_dtype) + + if compile == "torch": + # compilation + for i, block in enumerate(transformer.transformer_blocks): + transformer.transformer_blocks[i] = torch.compile(block, fullgraph=False, dynamic=False, backend="inductor") + with open(scheduler_path) as f: + scheduler_config = json.load(f) + + scheduler = CogVideoXDDIMScheduler.from_config(scheduler_config, subfolder="scheduler") + + # VAE + vae_dl_path = os.path.join(folder_paths.models_dir, 'CogVideo', 'VAE') + vae_path = os.path.join(vae_dl_path, "cogvideox_vae.safetensors") + if not os.path.exists(vae_path): + log.info(f"Downloading VAE model to: {vae_path}") + from huggingface_hub import snapshot_download + + snapshot_download( + repo_id="Kijai/CogVideoX-Fun-pruned", + allow_patterns=["*cogvideox_vae.safetensors*"], + local_dir=vae_dl_path, + local_dir_use_symlinks=False, + ) + with open(os.path.join(script_directory, 'configs', 'vae_config.json')) as f: + vae_config = json.load(f) + + vae_sd = load_torch_file(vae_path) + if "fun" in model: + vae = AutoencoderKLCogVideoXFun.from_config(vae_config).to(vae_dtype).to(offload_device) + vae.load_state_dict(vae_sd) + if "Pose" in model: + pipe = CogVideoX_Fun_Pipeline_Control(vae, transformer, scheduler, pab_config=pab_config) + else: + pipe = CogVideoX_Fun_Pipeline_Inpaint(vae, transformer, scheduler, pab_config=pab_config) + else: + vae = AutoencoderKLCogVideoX.from_config(vae_config).to(vae_dtype).to(offload_device) + vae.load_state_dict(vae_sd) + pipe = CogVideoXPipeline(vae, transformer, scheduler, pab_config=pab_config) + + if enable_sequential_cpu_offload: + pipe.enable_sequential_cpu_offload() + + pipeline = { + "pipe": pipe, + "dtype": vae_dtype, + "base_path": model, + "onediff": False, + "cpu_offloading": enable_sequential_cpu_offload, + "scheduler_config": scheduler_config, + "model_name": model + } + + return (pipeline,) + +class DownloadAndLoadToraModel: + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "model": ( + [ + "kijai/CogVideoX-5b-Tora", + ], + ), + }, + } + + RETURN_TYPES = ("TORAMODEL",) + RETURN_NAMES = ("tora_model", ) + FUNCTION = "loadmodel" + CATEGORY = "CogVideoWrapper" + DESCRIPTION = "Downloads and loads the 
Tora model from Huggingface to 'ComfyUI/models/CogVideo/CogVideoX-5b-Tora'" + + def loadmodel(self, model): + + check_diffusers_version() + + device = mm.get_torch_device() + offload_device = mm.unet_offload_device() + mm.soft_empty_cache() + + download_path = folder_paths.get_folder_paths("CogVideo")[0] + + from .tora.traj_module import MGF + + try: + from accelerate import init_empty_weights + from accelerate.utils import set_module_tensor_to_device + is_accelerate_available = True + except ImportError: + is_accelerate_available = False + pass + + download_path = os.path.join(folder_paths.models_dir, 'CogVideo', "CogVideoX-5b-Tora") + fuser_path = os.path.join(download_path, "fuser", "fuser.safetensors") + if not os.path.exists(fuser_path): + log.info(f"Downloading Fuser model to: {fuser_path}") + from huggingface_hub import snapshot_download + + snapshot_download( + repo_id=model, + allow_patterns=["*fuser.safetensors*"], + local_dir=download_path, + local_dir_use_symlinks=False, + ) + + hidden_size = 3072 + num_layers = 42 + + with (init_empty_weights() if is_accelerate_available else nullcontext()): + fuser_list = nn.ModuleList([MGF(128, hidden_size) for _ in range(num_layers)]) + + fuser_sd = load_torch_file(fuser_path) + if is_accelerate_available: + for key in fuser_sd: + set_module_tensor_to_device(fuser_list, key, dtype=torch.float16, device=device, value=fuser_sd[key]) + else: + fuser_list.load_state_dict(fuser_sd) + for module in fuser_list: + for param in module.parameters(): + param.data = param.data.to(torch.bfloat16).to(device) + del fuser_sd + + traj_extractor_path = os.path.join(download_path, "traj_extractor", "traj_extractor.safetensors") + if not os.path.exists(traj_extractor_path): + log.info(f"Downloading trajectory extractor model to: {traj_extractor_path}") + from huggingface_hub import snapshot_download + + snapshot_download( + repo_id="kijai/CogVideoX-5b-Tora", + allow_patterns=["*traj_extractor.safetensors*"], + local_dir=download_path, + local_dir_use_symlinks=False, + ) + + from .tora.traj_module import TrajExtractor + with (init_empty_weights() if is_accelerate_available else nullcontext()): + traj_extractor = TrajExtractor( + vae_downsize=(4, 8, 8), + patch_size=2, + nums_rb=2, + cin=16, + channels=[128] * 42, + sk=True, + use_conv=False, + ) + + traj_sd = load_torch_file(traj_extractor_path) + if is_accelerate_available: + for key in traj_sd: + set_module_tensor_to_device(traj_extractor, key, dtype=torch.float32, device=device, value=traj_sd[key]) + else: + traj_extractor.load_state_dict(traj_sd) + traj_extractor.to(torch.float32).to(device) + + toramodel = { + "fuser_list": fuser_list, + "traj_extractor": traj_extractor, + } + + return (toramodel,) + +class DownloadAndLoadCogVideoControlNet: + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "model": ( + [ + "TheDenk/cogvideox-2b-controlnet-hed-v1", + "TheDenk/cogvideox-2b-controlnet-canny-v1", + "TheDenk/cogvideox-5b-controlnet-hed-v1", + "TheDenk/cogvideox-5b-controlnet-canny-v1" + ], + ), + + }, + } + + RETURN_TYPES = ("COGVIDECONTROLNETMODEL",) + RETURN_NAMES = ("cogvideo_controlnet", ) + FUNCTION = "loadmodel" + CATEGORY = "CogVideoWrapper" + + def loadmodel(self, model): + from .cogvideo_controlnet import CogVideoXControlnet + + device = mm.get_torch_device() + offload_device = mm.unet_offload_device() + mm.soft_empty_cache() + + + download_path = os.path.join(folder_paths.models_dir, 'CogVideo', 'ControlNet') + base_path = os.path.join(download_path, (model.split("/")[-1])) + + if not 
os.path.exists(base_path): + log.info(f"Downloading model to: {base_path}") + from huggingface_hub import snapshot_download + + snapshot_download( + repo_id=model, + ignore_patterns=["*text_encoder*", "*tokenizer*"], + local_dir=base_path, + local_dir_use_symlinks=False, + ) + + controlnet = CogVideoXControlnet.from_pretrained(base_path) + + return (controlnet,) + +NODE_CLASS_MAPPINGS = { + "DownloadAndLoadCogVideoModel": DownloadAndLoadCogVideoModel, + "DownloadAndLoadCogVideoGGUFModel": DownloadAndLoadCogVideoGGUFModel, + "DownloadAndLoadCogVideoControlNet": DownloadAndLoadCogVideoControlNet, + "DownloadAndLoadToraModel": DownloadAndLoadToraModel, +} +NODE_DISPLAY_NAME_MAPPINGS = { + "DownloadAndLoadCogVideoModel": "(Down)load CogVideo Model", + "DownloadAndLoadCogVideoGGUFModel": "(Down)load CogVideo GGUF Model", + "DownloadAndLoadCogVideoControlNet": "(Down)load CogVideo ControlNet", + "DownloadAndLoadToraModel": "(Down)load Tora Model", + } \ No newline at end of file diff --git a/nodes.py b/nodes.py index 9f84350..3e8507f 100644 --- a/nodes.py +++ b/nodes.py @@ -1,20 +1,9 @@ import os import torch -import torch.nn as nn import folder_paths import comfy.model_management as mm -from comfy.utils import ProgressBar, load_torch_file from einops import rearrange -import importlib.metadata - -def check_diffusers_version(): - try: - version = importlib.metadata.version('diffusers') - required_version = '0.30.3' - if version < required_version: - raise AssertionError(f"diffusers version {version} is installed, but version {required_version} or higher is required.") - except importlib.metadata.PackageNotFoundError: - raise AssertionError("diffusers is not installed.") +from contextlib import nullcontext from diffusers.schedulers import ( CogVideoXDDIMScheduler, @@ -47,26 +36,13 @@ scheduler_mapping = { } available_schedulers = list(scheduler_mapping.keys()) - -from diffusers.models import AutoencoderKLCogVideoX -from .custom_cogvideox_transformer_3d import CogVideoXTransformer3DModel -from .pipeline_cogvideox import CogVideoXPipeline -from contextlib import nullcontext - -from .cogvideox_fun.transformer_3d import CogVideoXTransformer3DModel as CogVideoXTransformer3DModelFun -from .cogvideox_fun.fun_pab_transformer_3d import CogVideoXTransformer3DModel as CogVideoXTransformer3DModelFunPAB -from .cogvideox_fun.autoencoder_magvit import AutoencoderKLCogVideoX as AutoencoderKLCogVideoXFun from .cogvideox_fun.utils import get_image_to_video_latent, get_video_to_video_latent, ASPECT_RATIO_512, get_closest_ratio, to_pil -from .cogvideox_fun.pipeline_cogvideox_inpaint import CogVideoX_Fun_Pipeline_Inpaint -from .cogvideox_fun.pipeline_cogvideox_control import CogVideoX_Fun_Pipeline_Control from PIL import Image import numpy as np import json -import logging -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') -log = logging.getLogger(__name__) +from .utils import log, check_diffusers_version script_directory = os.path.dirname(os.path.abspath(__file__)) @@ -75,72 +51,8 @@ if not "CogVideo" in folder_paths.folder_names_and_paths: if not "cogvideox_loras" in folder_paths.folder_names_and_paths: folder_paths.add_model_folder_path("cogvideox_loras", os.path.join(folder_paths.models_dir, "CogVideo", "loras")) -class PABConfig: - def __init__( - self, - steps: int, - cross_broadcast: bool = False, - cross_threshold: list = None, - cross_range: int = None, - spatial_broadcast: bool = False, - spatial_threshold: list = None, - spatial_range: int = None, - 
temporal_broadcast: bool = False, - temporal_threshold: list = None, - temporal_range: int = None, - mlp_broadcast: bool = False, - mlp_spatial_broadcast_config: dict = None, - mlp_temporal_broadcast_config: dict = None, - ): - self.steps = steps - - self.cross_broadcast = cross_broadcast - self.cross_threshold = cross_threshold - self.cross_range = cross_range - - self.spatial_broadcast = spatial_broadcast - self.spatial_threshold = spatial_threshold - self.spatial_range = spatial_range - - self.temporal_broadcast = temporal_broadcast - self.temporal_threshold = temporal_threshold - self.temporal_range = temporal_range - - self.mlp_broadcast = mlp_broadcast - self.mlp_spatial_broadcast_config = mlp_spatial_broadcast_config - self.mlp_temporal_broadcast_config = mlp_temporal_broadcast_config - self.mlp_temporal_outputs = {} - self.mlp_spatial_outputs = {} - -class CogVideoXPABConfig(PABConfig): - def __init__( - self, - steps: int = 50, - spatial_broadcast: bool = True, - spatial_threshold: list = [100, 850], - spatial_range: int = 2, - temporal_broadcast: bool = False, - temporal_threshold: list = [100, 850], - temporal_range: int = 4, - cross_broadcast: bool = False, - cross_threshold: list = [100, 850], - cross_range: int = 6, - ): - super().__init__( - steps=steps, - spatial_broadcast=spatial_broadcast, - spatial_threshold=spatial_threshold, - spatial_range=spatial_range, - temporal_broadcast=temporal_broadcast, - temporal_threshold=temporal_threshold, - temporal_range=temporal_range, - cross_broadcast=cross_broadcast, - cross_threshold=cross_threshold, - cross_range=cross_range - - ) - -from .videosys.cogvideox_transformer_3d import CogVideoXTransformer3DModel as CogVideoXTransformer3DModelPAB +#PAB +from .videosys.pab import CogVideoXPABConfig class CogVideoPABConfig: @classmethod @@ -189,13 +101,7 @@ class CogVideoPABConfig: return (pab_config, ) -def remove_specific_blocks(model, block_indices_to_remove): - import torch.nn as nn - transformer_blocks = model.transformer_blocks - new_blocks = [block for i, block in enumerate(transformer_blocks) if i not in block_indices_to_remove] - model.transformer_blocks = nn.ModuleList(new_blocks) - - return model + class CogVideoTransformerEdit: @classmethod @@ -250,534 +156,6 @@ class CogVideoLoraSelect: print(cog_loras_list) return (cog_loras_list,) -class DownloadAndLoadCogVideoModel: - @classmethod - def INPUT_TYPES(s): - return { - "required": { - "model": ( - [ - "THUDM/CogVideoX-2b", - "THUDM/CogVideoX-5b", - "THUDM/CogVideoX-5b-I2V", - "bertjiazheng/KoolCogVideoX-5b", - "kijai/CogVideoX-Fun-2b", - "kijai/CogVideoX-Fun-5b", - "kijai/CogVideoX-5b-Tora", - "alibaba-pai/CogVideoX-Fun-V1.1-2b-InP", - "alibaba-pai/CogVideoX-Fun-V1.1-5b-InP", - "alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose", - "alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose", - "feizhengcong/CogvideoX-Interpolation", - "NimVideo/cogvideox-2b-img2vid" - ], - ), - - }, - "optional": { - "precision": (["fp16", "fp32", "bf16"], - {"default": "bf16", "tooltip": "official recommendation is that 2b model should be fp16, 5b model should be bf16"} - ), - "fp8_transformer": (['disabled', 'enabled', 'fastmode'], {"default": 'disabled', "tooltip": "enabled casts the transformer to torch.float8_e4m3fn, fastmode is only for latest nvidia GPUs and requires torch 2.4.0 and cu124 minimum"}), - "compile": (["disabled","onediff","torch"], {"tooltip": "compile the model for faster inference, these are advanced options only available on Linux, see readme for more info"}), - "enable_sequential_cpu_offload": 
("BOOLEAN", {"default": False, "tooltip": "significantly reducing memory usage and slows down the inference"}), - "pab_config": ("PAB_CONFIG", {"default": None}), - "block_edit": ("TRANSFORMERBLOCKS", {"default": None}), - "lora": ("COGLORA", {"default": None}), - } - } - - RETURN_TYPES = ("COGVIDEOPIPE",) - RETURN_NAMES = ("cogvideo_pipe", ) - FUNCTION = "loadmodel" - CATEGORY = "CogVideoWrapper" - DESCRIPTION = "Downloads and loads the selected CogVideo model from Huggingface to 'ComfyUI/models/CogVideo'" - - def loadmodel(self, model, precision, fp8_transformer="disabled", compile="disabled", enable_sequential_cpu_offload=False, pab_config=None, block_edit=None, lora=None): - - check_diffusers_version() - - device = mm.get_torch_device() - offload_device = mm.unet_offload_device() - mm.soft_empty_cache() - - dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision] - download_path = folder_paths.get_folder_paths("CogVideo")[0] - - if "Fun" in model: - if not "1.1" in model: - repo_id = "kijai/CogVideoX-Fun-pruned" - if "2b" in model: - base_path = os.path.join(folder_paths.models_dir, "CogVideoX_Fun", "CogVideoX-Fun-2b-InP") # location of the official model - if not os.path.exists(base_path): - base_path = os.path.join(download_path, "CogVideoX-Fun-2b-InP") - elif "5b" in model: - base_path = os.path.join(folder_paths.models_dir, "CogVideoX_Fun", "CogVideoX-Fun-5b-InP") # location of the official model - if not os.path.exists(base_path): - base_path = os.path.join(download_path, "CogVideoX-Fun-5b-InP") - elif "1.1" in model: - repo_id = model - base_path = os.path.join(folder_paths.models_dir, "CogVideoX_Fun", (model.split("/")[-1])) # location of the official model - if not os.path.exists(base_path): - base_path = os.path.join(download_path, (model.split("/")[-1])) - download_path = base_path - - elif "2b" in model: - if 'img2vid' in model: - base_path = os.path.join(download_path, "cogvideox-2b-img2vid") - download_path = base_path - repo_id = model - else: - base_path = os.path.join(download_path, "CogVideo2B") - download_path = base_path - repo_id = model - else: - base_path = os.path.join(download_path, (model.split("/")[-1])) - download_path = base_path - repo_id = model - - - if "2b" in model: - scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_2b.json') - else: - scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json') - - if not os.path.exists(base_path) or not os.path.exists(os.path.join(base_path, "transformer")): - log.info(f"Downloading model to: {base_path}") - from huggingface_hub import snapshot_download - - snapshot_download( - repo_id=repo_id, - ignore_patterns=["*text_encoder*", "*tokenizer*"], - local_dir=download_path, - local_dir_use_symlinks=False, - ) - - # transformer - if "Fun" in model: - if pab_config is not None: - transformer = CogVideoXTransformer3DModelFunPAB.from_pretrained(base_path, subfolder="transformer") - else: - transformer = CogVideoXTransformer3DModelFun.from_pretrained(base_path, subfolder="transformer") - else: - if pab_config is not None: - transformer = CogVideoXTransformer3DModelPAB.from_pretrained(base_path, subfolder="transformer") - else: - transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder="transformer") - - transformer = transformer.to(dtype).to(offload_device) - - #LoRAs - if lora is not None: - from .lora_utils import merge_lora, load_lora_into_transformer - if "fun" in model.lower(): - for l in lora: - 
logging.info(f"Merging LoRA weights from {l['path']} with strength {l['strength']}") - transformer = merge_lora(transformer, l["path"], l["strength"]) - else: - transformer = load_lora_into_transformer(lora, transformer) - - - if block_edit is not None: - transformer = remove_specific_blocks(transformer, block_edit) - - #fp8 - if fp8_transformer == "enabled" or fp8_transformer == "fastmode": - for name, param in transformer.named_parameters(): - params_to_keep = {"patch_embed", "lora", "pos_embedding"} - if not any(keyword in name for keyword in params_to_keep): - param.data = param.data.to(torch.float8_e4m3fn) - - if fp8_transformer == "fastmode": - from .fp8_optimization import convert_fp8_linear - convert_fp8_linear(transformer, dtype) - - with open(scheduler_path) as f: - scheduler_config = json.load(f) - scheduler = CogVideoXDDIMScheduler.from_config(scheduler_config) - - # VAE - if "Fun" in model: - vae = AutoencoderKLCogVideoXFun.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device) - if "Pose" in model: - pipe = CogVideoX_Fun_Pipeline_Control(vae, transformer, scheduler, pab_config=pab_config) - else: - pipe = CogVideoX_Fun_Pipeline_Inpaint(vae, transformer, scheduler, pab_config=pab_config) - else: - vae = AutoencoderKLCogVideoX.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device) - pipe = CogVideoXPipeline(vae, transformer, scheduler, pab_config=pab_config) - if "cogvideox-2b-img2vid" in model: - pipe.input_with_padding = False - - if enable_sequential_cpu_offload: - pipe.enable_sequential_cpu_offload() - - # compilation - if compile == "torch": - torch._dynamo.config.suppress_errors = True - pipe.transformer.to(memory_format=torch.channels_last) - #pipe.transformer = torch.compile(pipe.transformer, mode="default", fullgraph=False, backend="inductor") - for i, block in enumerate(pipe.transformer.transformer_blocks): - if "CogVideoXBlock" in str(block): - pipe.transformer.transformer_blocks[i] = torch.compile(block, fullgraph=False, dynamic=False, backend="inductor") - elif compile == "onediff": - from onediffx import compile_pipe - os.environ['NEXFORT_FX_FORCE_TRITON_SDPA'] = '1' - - pipe = compile_pipe( - pipe, - backend="nexfort", - options= {"mode": "max-optimize:max-autotune:max-autotune", "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": False, "triton.fuse_attention_allow_fp16_reduction": False}}, - ignores=["vae"], - fuse_qkv_projections=True if pab_config is None else False, - ) - - pipeline = { - "pipe": pipe, - "dtype": dtype, - "base_path": base_path, - "onediff": True if compile == "onediff" else False, - "cpu_offloading": enable_sequential_cpu_offload, - "scheduler_config": scheduler_config, - "model_name": model - } - - return (pipeline,) - -class DownloadAndLoadCogVideoGGUFModel: - @classmethod - def INPUT_TYPES(s): - return { - "required": { - "model": ( - [ - "CogVideoX_5b_GGUF_Q4_0.safetensors", - "CogVideoX_5b_I2V_GGUF_Q4_0.safetensors", - "CogVideoX_5b_fun_GGUF_Q4_0.safetensors", - "CogVideoX_5b_fun_1_1_GGUF_Q4_0.safetensors", - "CogVideoX_5b_fun_1_1_Pose_GGUF_Q4_0.safetensors", - "CogVideoX_5b_Interpolation_GGUF_Q4_0.safetensors", - "CogVideoX_5b_Tora_GGUF_Q4_0.safetensors", - - ], - ), - "vae_precision": (["fp16", "fp32", "bf16"], {"default": "bf16", "tooltip": "VAE dtype"}), - "fp8_fastmode": ("BOOLEAN", {"default": False, "tooltip": "only supported on 4090 and later GPUs, also requires torch 2.4.0 with cu124 minimum"}), - "load_device": (["main_device", "offload_device"], {"default": 
"main_device"}), - "enable_sequential_cpu_offload": ("BOOLEAN", {"default": False, "tooltip": "significantly reducing memory usage and slows down the inference"}), - }, - "optional": { - "pab_config": ("PAB_CONFIG", {"default": None}), - "block_edit": ("TRANSFORMERBLOCKS", {"default": None}), - "compile": (["disabled","torch"], {"tooltip": "compile the model for faster inference, these are advanced options only available on Linux, see readme for more info"}), - - } - } - - RETURN_TYPES = ("COGVIDEOPIPE",) - RETURN_NAMES = ("cogvideo_pipe", ) - FUNCTION = "loadmodel" - CATEGORY = "CogVideoWrapper" - - def loadmodel(self, model, vae_precision, fp8_fastmode, load_device, enable_sequential_cpu_offload, pab_config=None, block_edit=None, compile="disabled"): - - check_diffusers_version() - - device = mm.get_torch_device() - offload_device = mm.unet_offload_device() - mm.soft_empty_cache() - - vae_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[vae_precision] - download_path = os.path.join(folder_paths.models_dir, 'CogVideo', 'GGUF') - gguf_path = os.path.join(folder_paths.models_dir, 'diffusion_models', model) # check MinusZone's model path first - if not os.path.exists(gguf_path): - gguf_path = os.path.join(download_path, model) - if not os.path.exists(gguf_path): - if "I2V" in model or "1_1" in model or "Interpolation" in model or "Tora" in model: - repo_id = "Kijai/CogVideoX_GGUF" - else: - repo_id = "MinusZoneAI/ComfyUI-CogVideoX-MZ" - log.info(f"Downloading model to: {gguf_path}") - from huggingface_hub import snapshot_download - - snapshot_download( - repo_id=repo_id, - allow_patterns=[f"*{model}*"], - local_dir=download_path, - local_dir_use_symlinks=False, - ) - - if "5b" in model: - scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json') - transformer_path = os.path.join(script_directory, 'configs', 'transformer_config_5b.json') - elif "2b" in model: - scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_2b.json') - transformer_path = os.path.join(script_directory, 'configs', 'transformer_config_2b.json') - - with open(transformer_path) as f: - transformer_config = json.load(f) - - sd = load_torch_file(gguf_path) - - from . 
import mz_gguf_loader - import importlib - importlib.reload(mz_gguf_loader) - - with mz_gguf_loader.quantize_lazy_load(): - if "fun" in model: - if "Pose" in model: - transformer_config["in_channels"] = 32 - else: - transformer_config["in_channels"] = 33 - if pab_config is not None: - transformer = CogVideoXTransformer3DModelFunPAB.from_config(transformer_config) - else: - transformer = CogVideoXTransformer3DModelFun.from_config(transformer_config) - elif "I2V" in model or "Interpolation" in model: - transformer_config["in_channels"] = 32 - if pab_config is not None: - transformer = CogVideoXTransformer3DModelPAB.from_config(transformer_config) - else: - transformer = CogVideoXTransformer3DModel.from_config(transformer_config) - else: - transformer_config["in_channels"] = 16 - if pab_config is not None: - transformer = CogVideoXTransformer3DModelPAB.from_config(transformer_config) - else: - transformer = CogVideoXTransformer3DModel.from_config(transformer_config) - - if "2b" in model: - for name, param in transformer.named_parameters(): - if name != "pos_embedding": - param.data = param.data.to(torch.float8_e4m3fn) - else: - param.data = param.data.to(torch.float16) - else: - transformer.to(torch.float8_e4m3fn) - - if block_edit is not None: - transformer = remove_specific_blocks(transformer, block_edit) - - transformer = mz_gguf_loader.quantize_load_state_dict(transformer, sd, device="cpu") - if load_device == "offload_device": - transformer.to(offload_device) - else: - transformer.to(device) - - - if fp8_fastmode: - from .fp8_optimization import convert_fp8_linear - convert_fp8_linear(transformer, vae_dtype) - - if compile == "torch": - # compilation - for i, block in enumerate(transformer.transformer_blocks): - transformer.transformer_blocks[i] = torch.compile(block, fullgraph=False, dynamic=False, backend="inductor") - with open(scheduler_path) as f: - scheduler_config = json.load(f) - - scheduler = CogVideoXDDIMScheduler.from_config(scheduler_config, subfolder="scheduler") - - # VAE - vae_dl_path = os.path.join(folder_paths.models_dir, 'CogVideo', 'VAE') - vae_path = os.path.join(vae_dl_path, "cogvideox_vae.safetensors") - if not os.path.exists(vae_path): - log.info(f"Downloading VAE model to: {vae_path}") - from huggingface_hub import snapshot_download - - snapshot_download( - repo_id="Kijai/CogVideoX-Fun-pruned", - allow_patterns=["*cogvideox_vae.safetensors*"], - local_dir=vae_dl_path, - local_dir_use_symlinks=False, - ) - with open(os.path.join(script_directory, 'configs', 'vae_config.json')) as f: - vae_config = json.load(f) - - vae_sd = load_torch_file(vae_path) - if "fun" in model: - vae = AutoencoderKLCogVideoXFun.from_config(vae_config).to(vae_dtype).to(offload_device) - vae.load_state_dict(vae_sd) - if "Pose" in model: - pipe = CogVideoX_Fun_Pipeline_Control(vae, transformer, scheduler, pab_config=pab_config) - else: - pipe = CogVideoX_Fun_Pipeline_Inpaint(vae, transformer, scheduler, pab_config=pab_config) - else: - vae = AutoencoderKLCogVideoX.from_config(vae_config).to(vae_dtype).to(offload_device) - vae.load_state_dict(vae_sd) - pipe = CogVideoXPipeline(vae, transformer, scheduler, pab_config=pab_config) - - if enable_sequential_cpu_offload: - pipe.enable_sequential_cpu_offload() - - pipeline = { - "pipe": pipe, - "dtype": vae_dtype, - "base_path": model, - "onediff": False, - "cpu_offloading": enable_sequential_cpu_offload, - "scheduler_config": scheduler_config, - "model_name": model - } - - return (pipeline,) - -class DownloadAndLoadToraModel: - @classmethod - def 
INPUT_TYPES(s): - return { - "required": { - "model": ( - [ - "kijai/CogVideoX-5b-Tora", - ], - ), - }, - } - - RETURN_TYPES = ("TORAMODEL",) - RETURN_NAMES = ("tora_model", ) - FUNCTION = "loadmodel" - CATEGORY = "CogVideoWrapper" - DESCRIPTION = "Downloads and loads the the Tora model from Huggingface to 'ComfyUI/models/CogVideo/CogVideoX-5b-Tora'" - - def loadmodel(self, model): - - check_diffusers_version() - - device = mm.get_torch_device() - offload_device = mm.unet_offload_device() - mm.soft_empty_cache() - - download_path = folder_paths.get_folder_paths("CogVideo")[0] - - from .tora.traj_module import MGF - - try: - from accelerate import init_empty_weights - from accelerate.utils import set_module_tensor_to_device - is_accelerate_available = True - except: - is_accelerate_available = False - pass - - download_path = os.path.join(folder_paths.models_dir, 'CogVideo', "CogVideoX-5b-Tora") - fuser_path = os.path.join(download_path, "fuser", "fuser.safetensors") - if not os.path.exists(fuser_path): - log.info(f"Downloading Fuser model to: {fuser_path}") - from huggingface_hub import snapshot_download - - snapshot_download( - repo_id=model, - allow_patterns=["*fuser.safetensors*"], - local_dir=download_path, - local_dir_use_symlinks=False, - ) - - hidden_size = 3072 - num_layers = 42 - - with (init_empty_weights() if is_accelerate_available else nullcontext()): - fuser_list = nn.ModuleList([MGF(128, hidden_size) for _ in range(num_layers)]) - - fuser_sd = load_torch_file(fuser_path) - if is_accelerate_available: - for key in fuser_sd: - set_module_tensor_to_device(fuser_list, key, dtype=torch.float16, device=device, value=fuser_sd[key]) - else: - fuser_list.load_state_dict(fuser_sd) - for module in fuser_list: - for param in module.parameters(): - param.data = param.data.to(torch.bfloat16).to(device) - del fuser_sd - - traj_extractor_path = os.path.join(download_path, "traj_extractor", "traj_extractor.safetensors") - if not os.path.exists(traj_extractor_path): - log.info(f"Downloading trajectory extractor model to: {traj_extractor_path}") - from huggingface_hub import snapshot_download - - snapshot_download( - repo_id="kijai/CogVideoX-5b-Tora", - allow_patterns=["*traj_extractor.safetensors*"], - local_dir=download_path, - local_dir_use_symlinks=False, - ) - - from .tora.traj_module import TrajExtractor - with (init_empty_weights() if is_accelerate_available else nullcontext()): - traj_extractor = TrajExtractor( - vae_downsize=(4, 8, 8), - patch_size=2, - nums_rb=2, - cin=16, - channels=[128] * 42, - sk=True, - use_conv=False, - ) - - traj_sd = load_torch_file(traj_extractor_path) - if is_accelerate_available: - for key in traj_sd: - set_module_tensor_to_device(traj_extractor, key, dtype=torch.float32, device=device, value=traj_sd[key]) - else: - traj_extractor.load_state_dict(traj_sd) - traj_extractor.to(torch.float32).to(device) - - toramodel = { - "fuser_list": fuser_list, - "traj_extractor": traj_extractor, - } - - return (toramodel,) - -class DownloadAndLoadCogVideoControlNet: - @classmethod - def INPUT_TYPES(s): - return { - "required": { - "model": ( - [ - "TheDenk/cogvideox-2b-controlnet-hed-v1", - "TheDenk/cogvideox-2b-controlnet-canny-v1", - "TheDenk/cogvideox-5b-controlnet-hed-v1", - "TheDenk/cogvideox-5b-controlnet-canny-v1" - ], - ), - - }, - } - - RETURN_TYPES = ("COGVIDECONTROLNETMODEL",) - RETURN_NAMES = ("cogvideo_controlnet", ) - FUNCTION = "loadmodel" - CATEGORY = "CogVideoWrapper" - - def loadmodel(self, model): - from .cogvideo_controlnet import CogVideoXControlnet 
- - device = mm.get_torch_device() - offload_device = mm.unet_offload_device() - mm.soft_empty_cache() - - - download_path = os.path.join(folder_paths.models_dir, 'CogVideo', 'ControlNet') - base_path = os.path.join(download_path, (model.split("/")[-1])) - - if not os.path.exists(base_path): - log.info(f"Downloading model to: {base_path}") - from huggingface_hub import snapshot_download - - snapshot_download( - repo_id=model, - ignore_patterns=["*text_encoder*", "*tokenizer*"], - local_dir=base_path, - local_dir_use_symlinks=False, - ) - - controlnet = CogVideoXControlnet.from_pretrained(base_path) - - return (controlnet,) - class CogVideoEncodePrompt: @classmethod def INPUT_TYPES(s): @@ -1179,7 +557,7 @@ class ToraEncodeTrajectory: video_flow_features = video_flow_features * strength - logging.info(f"video_flow shape: {video_flow.shape}") + log.info(f"video_flow shape: {video_flow.shape}") tora = { "video_flow_features" : video_flow_features, @@ -1241,7 +619,7 @@ class ToraEncodeOpticalFlow: video_flow_features = video_flow_features * strength - logging.info(f"video_flow shape: {video_flow.shape}") + log.info(f"video_flow shape: {video_flow.shape}") tora = { "video_flow_features" : video_flow_features, @@ -1529,7 +907,7 @@ class CogVideoXFunSampler: # Load Sampler if context_options is not None and context_options["context_schedule"] == "temporal_tiling": - logging.info("Temporal tiling enabled, changing scheduler to CogVideoXDDIM") + log.info("Temporal tiling enabled, changing scheduler to CogVideoXDDIM") scheduler="CogVideoXDDIM" scheduler_config = pipeline["scheduler_config"] if scheduler in scheduler_mapping: @@ -1824,7 +1202,7 @@ class CogVideoXFunControlSampler: # Load Sampler scheduler_config = pipeline["scheduler_config"] if context_options is not None and context_options["context_schedule"] == "temporal_tiling": - logging.info("Temporal tiling enabled, changing scheduler to CogVideoXDDIM") + log.info("Temporal tiling enabled, changing scheduler to CogVideoXDDIM") scheduler="CogVideoXDDIM" if scheduler in scheduler_mapping: noise_scheduler = scheduler_mapping[scheduler].from_config(scheduler_config) @@ -1870,7 +1248,6 @@ class CogVideoXFunControlSampler: return (pipeline, {"samples": latents}) NODE_CLASS_MAPPINGS = { - "DownloadAndLoadCogVideoModel": DownloadAndLoadCogVideoModel, "CogVideoSampler": CogVideoSampler, "CogVideoDecode": CogVideoDecode, "CogVideoTextEncode": CogVideoTextEncode, @@ -1881,21 +1258,17 @@ NODE_CLASS_MAPPINGS = { "CogVideoXFunVid2VidSampler": CogVideoXFunVid2VidSampler, "CogVideoXFunControlSampler": CogVideoXFunControlSampler, "CogVideoTextEncodeCombine": CogVideoTextEncodeCombine, - "DownloadAndLoadCogVideoGGUFModel": DownloadAndLoadCogVideoGGUFModel, "CogVideoPABConfig": CogVideoPABConfig, "CogVideoTransformerEdit": CogVideoTransformerEdit, "CogVideoControlImageEncode": CogVideoControlImageEncode, "CogVideoLoraSelect": CogVideoLoraSelect, "CogVideoContextOptions": CogVideoContextOptions, "CogVideoControlNet": CogVideoControlNet, - "DownloadAndLoadCogVideoControlNet": DownloadAndLoadCogVideoControlNet, "ToraEncodeTrajectory": ToraEncodeTrajectory, "ToraEncodeOpticalFlow": ToraEncodeOpticalFlow, - "DownloadAndLoadToraModel": DownloadAndLoadToraModel, "CogVideoXFasterCache": CogVideoXFasterCache } NODE_DISPLAY_NAME_MAPPINGS = { - "DownloadAndLoadCogVideoModel": "(Down)load CogVideo Model", "CogVideoSampler": "CogVideo Sampler", "CogVideoDecode": "CogVideo Decode", "CogVideoTextEncode": "CogVideo TextEncode", @@ -1906,15 +1279,12 @@ NODE_DISPLAY_NAME_MAPPINGS 
= { "CogVideoXFunVid2VidSampler": "CogVideoXFun Vid2Vid Sampler", "CogVideoXFunControlSampler": "CogVideoXFun Control Sampler", "CogVideoTextEncodeCombine": "CogVideo TextEncode Combine", - "DownloadAndLoadCogVideoGGUFModel": "(Down)load CogVideo GGUF Model", "CogVideoPABConfig": "CogVideo PABConfig", "CogVideoTransformerEdit": "CogVideo TransformerEdit", "CogVideoControlImageEncode": "CogVideo Control ImageEncode", "CogVideoLoraSelect": "CogVideo LoraSelect", "CogVideoContextOptions": "CogVideo Context Options", - "DownloadAndLoadCogVideoControlNet": "(Down)load CogVideo ControlNet", "ToraEncodeTrajectory": "Tora Encode Trajectory", "ToraEncodeOpticalFlow": "Tora Encode OpticalFlow", - "DownloadAndLoadToraModel": "(Down)load Tora Model", "CogVideoXFasterCache": "CogVideoX FasterCache" } \ No newline at end of file diff --git a/readme.md b/readme.md index 6f6a01b..018e16c 100644 --- a/readme.md +++ b/readme.md @@ -21,7 +21,7 @@ New features: - Initial context windowing with FreeNoise noise shuffling mainly for vid2vid and pose2vid pipelines for longer generations, haven't figured it out for img2vid yet - GGUF models and tiled encoding for I2V and pose pipelines (thanks to MinusZoneAI) - [sageattention](https://github.com/thu-ml/SageAttention) support (Linux only) for a speed boost, I experienced ~20-30% increase with it, stacks with fp8 fast mode, doesn't need compiling -- Support CogVideoX-Fun 1.1 and it's pose models with additional control strenght and application step settings, this model's input does NOT have to be just dwpose skeletons, just about anything can work +- Support CogVideoX-Fun 1.1 and its pose models with additional control strength and application step settings; this model's input does NOT have to be just dwpose skeletons, just about anything can work - Support LoRAs https://github.com/user-attachments/assets/ddeb8f38-a647-42b3-a4b1-c6936f961deb diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..b00dd1d --- /dev/null +++ b/utils.py @@ -0,0 +1,22 @@ +import importlib.metadata + +import logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +log = logging.getLogger(__name__) + +def check_diffusers_version(): + try: + version = importlib.metadata.version('diffusers') + required_version = '0.30.3' + if tuple(int(x) for x in version.split('.') if x.isdigit()) < tuple(int(x) for x in required_version.split('.') if x.isdigit()): # compare release tuples numerically; plain string comparison mis-orders versions such as '0.9.0' vs '0.30.3' + raise AssertionError(f"diffusers version {version} is installed, but version {required_version} or higher is required.") + except importlib.metadata.PackageNotFoundError: + raise AssertionError("diffusers is not installed.") + +def remove_specific_blocks(model, block_indices_to_remove): + import torch.nn as nn + transformer_blocks = model.transformer_blocks + new_blocks = [block for i, block in enumerate(transformer_blocks) if i not in block_indices_to_remove] + model.transformer_blocks = nn.ModuleList(new_blocks) + + return model \ No newline at end of file diff --git a/videosys/cogvideox_transformer_3d.py b/videosys/cogvideox_transformer_3d.py index b0e1aa5..26550a2 100644 --- a/videosys/cogvideox_transformer_3d.py +++ b/videosys/cogvideox_transformer_3d.py @@ -66,16 +66,6 @@ class CogVideoXAttnProcessor2_0: query = attn.to_q(hidden_states) key = attn.to_k(hidden_states) value = attn.to_v(hidden_states) - - # if attn.parallel_manager.sp_size > 1: - # assert ( - # attn.heads % attn.parallel_manager.sp_size == 0 - # ), f"Number of heads {attn.heads} must be divisible by sequence parallel size {attn.parallel_manager.sp_size}" - # attn_heads = attn.heads // 
attn.parallel_manager.sp_size - # query, key, value = map( - # lambda x: all_to_all_comm(x, attn.parallel_manager.sp_group, scatter_dim=2, gather_dim=1), - # [query, key, value], - # ) attn_heads = attn.heads @@ -111,9 +101,6 @@ class CogVideoXAttnProcessor2_0: hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn_heads * head_dim) - #if attn.parallel_manager.sp_size > 1: - # hidden_states = all_to_all_comm(hidden_states, attn.parallel_manager.sp_group, scatter_dim=1, gather_dim=2) - # linear proj hidden_states = attn.to_out[0](hidden_states) # dropout diff --git a/videosys/pab.py b/videosys/pab.py new file mode 100644 index 0000000..007e1b3 --- /dev/null +++ b/videosys/pab.py @@ -0,0 +1,64 @@ +class PABConfig: + def __init__( + self, + steps: int, + cross_broadcast: bool = False, + cross_threshold: list = None, + cross_range: int = None, + spatial_broadcast: bool = False, + spatial_threshold: list = None, + spatial_range: int = None, + temporal_broadcast: bool = False, + temporal_threshold: list = None, + temporal_range: int = None, + mlp_broadcast: bool = False, + mlp_spatial_broadcast_config: dict = None, + mlp_temporal_broadcast_config: dict = None, + ): + self.steps = steps + + self.cross_broadcast = cross_broadcast # broadcast (re-use) cross-attention outputs across nearby diffusion steps + self.cross_threshold = cross_threshold # [min, max] timestep window in which broadcasting is active + self.cross_range = cross_range # number of consecutive steps that re-use a cached output before recomputing + + self.spatial_broadcast = spatial_broadcast # same semantics as the cross_* fields, for spatial attention + self.spatial_threshold = spatial_threshold + self.spatial_range = spatial_range + + self.temporal_broadcast = temporal_broadcast # same semantics, for temporal attention + self.temporal_threshold = temporal_threshold + self.temporal_range = temporal_range + + self.mlp_broadcast = mlp_broadcast + self.mlp_spatial_broadcast_config = mlp_spatial_broadcast_config + self.mlp_temporal_broadcast_config = mlp_temporal_broadcast_config + self.mlp_temporal_outputs = {} + self.mlp_spatial_outputs = {} + +class CogVideoXPABConfig(PABConfig): + def __init__( + self, + steps: int = 50, + spatial_broadcast: bool = True, + spatial_threshold: list = [100, 850], + spatial_range: int = 2, + temporal_broadcast: bool = False, + temporal_threshold: list = [100, 850], + temporal_range: int = 4, + cross_broadcast: bool = False, + cross_threshold: list = [100, 850], + cross_range: int = 6, + ): + super().__init__( + steps=steps, + spatial_broadcast=spatial_broadcast, + spatial_threshold=spatial_threshold, + spatial_range=spatial_range, + temporal_broadcast=temporal_broadcast, + temporal_threshold=temporal_threshold, + temporal_range=temporal_range, + cross_broadcast=cross_broadcast, + cross_threshold=cross_threshold, + cross_range=cross_range + + ) \ No newline at end of file
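
A minimal usage sketch of the relocated PAB config (a hypothetical standalone construction; inside ComfyUI this object is normally produced by the CogVideoPABConfig node and passed to the loaders above as pab_config):

    from videosys.pab import CogVideoXPABConfig
    # re-use cached spatial attention outputs for 2 consecutive steps
    # whenever the current diffusion timestep falls inside [100, 850]
    pab_config = CogVideoXPABConfig(steps=50, spatial_broadcast=True, spatial_threshold=[100, 850], spatial_range=2)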