mirror of
https://git.datalinker.icu/kijai/ComfyUI-CogVideoXWrapper.git
synced 2026-05-29 02:07:08 +08:00
Merge remote-tracking branch 'kijai/main'
This commit is contained in:
commit
645365c03d
18
configs/scheduler_config_2b.json
Normal file
18
configs/scheduler_config_2b.json
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
{
|
||||||
|
"_class_name": "CogVideoXDDIMScheduler",
|
||||||
|
"_diffusers_version": "0.30.0.dev0",
|
||||||
|
"beta_end": 0.012,
|
||||||
|
"beta_schedule": "scaled_linear",
|
||||||
|
"beta_start": 0.00085,
|
||||||
|
"clip_sample": false,
|
||||||
|
"clip_sample_range": 1.0,
|
||||||
|
"num_train_timesteps": 1000,
|
||||||
|
"prediction_type": "v_prediction",
|
||||||
|
"rescale_betas_zero_snr": true,
|
||||||
|
"sample_max_value": 1.0,
|
||||||
|
"set_alpha_to_one": true,
|
||||||
|
"snr_shift_scale": 3.0,
|
||||||
|
"steps_offset": 0,
|
||||||
|
"timestep_spacing": "trailing",
|
||||||
|
"trained_betas": null
|
||||||
|
}
|
||||||
18
configs/scheduler_config_5b.json
Normal file
18
configs/scheduler_config_5b.json
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
{
|
||||||
|
"_class_name": "CogVideoXDDIMScheduler",
|
||||||
|
"_diffusers_version": "0.31.0.dev0",
|
||||||
|
"beta_end": 0.012,
|
||||||
|
"beta_schedule": "scaled_linear",
|
||||||
|
"beta_start": 0.00085,
|
||||||
|
"clip_sample": false,
|
||||||
|
"clip_sample_range": 1.0,
|
||||||
|
"num_train_timesteps": 1000,
|
||||||
|
"prediction_type": "v_prediction",
|
||||||
|
"rescale_betas_zero_snr": true,
|
||||||
|
"sample_max_value": 1.0,
|
||||||
|
"set_alpha_to_one": true,
|
||||||
|
"snr_shift_scale": 1.0,
|
||||||
|
"steps_offset": 0,
|
||||||
|
"timestep_spacing": "trailing",
|
||||||
|
"trained_betas": null
|
||||||
|
}
|
||||||
28
configs/transformer_config_2b.json
Normal file
28
configs/transformer_config_2b.json
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
{
|
||||||
|
"_class_name": "CogVideoXTransformer3DModel",
|
||||||
|
"_diffusers_version": "0.30.0.dev0",
|
||||||
|
"activation_fn": "gelu-approximate",
|
||||||
|
"attention_bias": true,
|
||||||
|
"attention_head_dim": 64,
|
||||||
|
"dropout": 0.0,
|
||||||
|
"flip_sin_to_cos": true,
|
||||||
|
"freq_shift": 0,
|
||||||
|
"in_channels": 16,
|
||||||
|
"max_text_seq_length": 226,
|
||||||
|
"norm_elementwise_affine": true,
|
||||||
|
"norm_eps": 1e-05,
|
||||||
|
"num_attention_heads": 30,
|
||||||
|
"num_layers": 30,
|
||||||
|
"out_channels": 16,
|
||||||
|
"patch_size": 2,
|
||||||
|
"sample_frames": 49,
|
||||||
|
"sample_height": 60,
|
||||||
|
"sample_width": 90,
|
||||||
|
"spatial_interpolation_scale": 1.875,
|
||||||
|
"temporal_compression_ratio": 4,
|
||||||
|
"temporal_interpolation_scale": 1.0,
|
||||||
|
"text_embed_dim": 4096,
|
||||||
|
"time_embed_dim": 512,
|
||||||
|
"timestep_activation_fn": "silu",
|
||||||
|
"use_rotary_positional_embeddings": false
|
||||||
|
}
|
||||||
26
configs/transformer_config_5b.json
Normal file
26
configs/transformer_config_5b.json
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
{
|
||||||
|
"activation_fn": "gelu-approximate",
|
||||||
|
"attention_bias": true,
|
||||||
|
"attention_head_dim": 64,
|
||||||
|
"dropout": 0.0,
|
||||||
|
"flip_sin_to_cos": true,
|
||||||
|
"freq_shift": 0,
|
||||||
|
"in_channels": 16,
|
||||||
|
"max_text_seq_length": 226,
|
||||||
|
"norm_elementwise_affine": true,
|
||||||
|
"norm_eps": 1e-05,
|
||||||
|
"num_attention_heads": 48,
|
||||||
|
"num_layers": 42,
|
||||||
|
"out_channels": 16,
|
||||||
|
"patch_size": 2,
|
||||||
|
"sample_frames": 49,
|
||||||
|
"sample_height": 60,
|
||||||
|
"sample_width": 90,
|
||||||
|
"spatial_interpolation_scale": 1.875,
|
||||||
|
"temporal_compression_ratio": 4,
|
||||||
|
"temporal_interpolation_scale": 1.0,
|
||||||
|
"text_embed_dim": 4096,
|
||||||
|
"time_embed_dim": 512,
|
||||||
|
"timestep_activation_fn": "silu",
|
||||||
|
"use_rotary_positional_embeddings": true
|
||||||
|
}
|
||||||
29
configs/transformer_config_I2V_5b.json
Normal file
29
configs/transformer_config_I2V_5b.json
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
{
|
||||||
|
"_class_name": "CogVideoXTransformer3DModel",
|
||||||
|
"_diffusers_version": "0.31.0.dev0",
|
||||||
|
"activation_fn": "gelu-approximate",
|
||||||
|
"attention_bias": true,
|
||||||
|
"attention_head_dim": 64,
|
||||||
|
"dropout": 0.0,
|
||||||
|
"flip_sin_to_cos": true,
|
||||||
|
"freq_shift": 0,
|
||||||
|
"in_channels": 32,
|
||||||
|
"max_text_seq_length": 226,
|
||||||
|
"norm_elementwise_affine": true,
|
||||||
|
"norm_eps": 1e-05,
|
||||||
|
"num_attention_heads": 48,
|
||||||
|
"num_layers": 42,
|
||||||
|
"out_channels": 16,
|
||||||
|
"patch_size": 2,
|
||||||
|
"sample_frames": 49,
|
||||||
|
"sample_height": 60,
|
||||||
|
"sample_width": 90,
|
||||||
|
"spatial_interpolation_scale": 1.875,
|
||||||
|
"temporal_compression_ratio": 4,
|
||||||
|
"temporal_interpolation_scale": 1.0,
|
||||||
|
"text_embed_dim": 4096,
|
||||||
|
"time_embed_dim": 512,
|
||||||
|
"timestep_activation_fn": "silu",
|
||||||
|
"use_learned_positional_embeddings": true,
|
||||||
|
"use_rotary_positional_embeddings": true
|
||||||
|
}
|
||||||
39
configs/vae_config.json
Normal file
39
configs/vae_config.json
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
{
|
||||||
|
"_class_name": "AutoencoderKLCogVideoX",
|
||||||
|
"_diffusers_version": "0.31.0.dev0",
|
||||||
|
"act_fn": "silu",
|
||||||
|
"block_out_channels": [
|
||||||
|
128,
|
||||||
|
256,
|
||||||
|
256,
|
||||||
|
512
|
||||||
|
],
|
||||||
|
"down_block_types": [
|
||||||
|
"CogVideoXDownBlock3D",
|
||||||
|
"CogVideoXDownBlock3D",
|
||||||
|
"CogVideoXDownBlock3D",
|
||||||
|
"CogVideoXDownBlock3D"
|
||||||
|
],
|
||||||
|
"force_upcast": true,
|
||||||
|
"in_channels": 3,
|
||||||
|
"latent_channels": 16,
|
||||||
|
"latents_mean": null,
|
||||||
|
"latents_std": null,
|
||||||
|
"layers_per_block": 3,
|
||||||
|
"norm_eps": 1e-06,
|
||||||
|
"norm_num_groups": 32,
|
||||||
|
"out_channels": 3,
|
||||||
|
"sample_height": 480,
|
||||||
|
"sample_width": 720,
|
||||||
|
"scaling_factor": 0.7,
|
||||||
|
"shift_factor": null,
|
||||||
|
"temporal_compression_ratio": 4,
|
||||||
|
"up_block_types": [
|
||||||
|
"CogVideoXUpBlock3D",
|
||||||
|
"CogVideoXUpBlock3D",
|
||||||
|
"CogVideoXUpBlock3D",
|
||||||
|
"CogVideoXUpBlock3D"
|
||||||
|
],
|
||||||
|
"use_post_quant_conv": false,
|
||||||
|
"use_quant_conv": false
|
||||||
|
}
|
||||||
188
mz_gguf_loader.py
Normal file
188
mz_gguf_loader.py
Normal file
@ -0,0 +1,188 @@
|
|||||||
|
# https://github.com/MinusZoneAI/ComfyUI-CogVideoX-MZ/blob/9616415220fd09388622f40f6609e4ed81f048a5/mz_gguf_loader.py
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import gc
|
||||||
|
|
||||||
|
|
||||||
|
class quantize_lazy_load():
|
||||||
|
def __init__(self):
|
||||||
|
self.device = None
|
||||||
|
|
||||||
|
def __enter__(self):
|
||||||
|
self.device = torch.device("meta")
|
||||||
|
self.device.__enter__()
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __exit__(self, exc_type, exc_value, traceback):
|
||||||
|
self.device.__exit__(exc_type, exc_value, traceback)
|
||||||
|
|
||||||
|
|
||||||
|
def quantize_load_state_dict(model, state_dict, device="cpu"):
|
||||||
|
Q4_0_qkey = []
|
||||||
|
for key in state_dict.keys():
|
||||||
|
if key.endswith(".Q4_0_qweight"):
|
||||||
|
Q4_0_qkey.append(key.replace(".Q4_0_qweight", ""))
|
||||||
|
|
||||||
|
for name, module in model.named_modules():
|
||||||
|
if name in Q4_0_qkey:
|
||||||
|
q_linear = WQLinear_GGUF.from_linear(
|
||||||
|
linear=module,
|
||||||
|
device=device,
|
||||||
|
qtype="Q4_0",
|
||||||
|
)
|
||||||
|
set_op_by_name(model, name, q_linear)
|
||||||
|
|
||||||
|
model.to_empty(device=device)
|
||||||
|
model.load_state_dict(state_dict, strict=False)
|
||||||
|
model.to(device)
|
||||||
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
def set_op_by_name(layer, name, new_module):
|
||||||
|
levels = name.split(".")
|
||||||
|
if len(levels) > 1:
|
||||||
|
mod_ = layer
|
||||||
|
for l_idx in range(len(levels) - 1):
|
||||||
|
if levels[l_idx].isdigit():
|
||||||
|
mod_ = mod_[int(levels[l_idx])]
|
||||||
|
else:
|
||||||
|
mod_ = getattr(mod_, levels[l_idx])
|
||||||
|
setattr(mod_, levels[-1], new_module)
|
||||||
|
else:
|
||||||
|
setattr(layer, name, new_module)
|
||||||
|
|
||||||
|
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
|
||||||
|
class WQLinear_GGUF(nn.Module):
|
||||||
|
def __init__(
|
||||||
|
self, in_features, out_features, bias, dev, qtype="Q4_0"
|
||||||
|
):
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
self.in_features = in_features
|
||||||
|
self.out_features = out_features
|
||||||
|
self.qtype = qtype
|
||||||
|
|
||||||
|
qweight_shape = quant_shape_to_byte_shape(
|
||||||
|
(out_features, in_features), qtype
|
||||||
|
)
|
||||||
|
self.register_buffer(
|
||||||
|
f"{qtype}_qweight",
|
||||||
|
torch.zeros(
|
||||||
|
qweight_shape,
|
||||||
|
dtype=torch.uint8,
|
||||||
|
device=dev,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
if bias:
|
||||||
|
self.register_buffer(
|
||||||
|
"bias",
|
||||||
|
torch.zeros(
|
||||||
|
(out_features),
|
||||||
|
dtype=torch.float16,
|
||||||
|
device=dev,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.bias = None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_linear(
|
||||||
|
cls, linear,
|
||||||
|
device="cpu",
|
||||||
|
qtype="Q4_0",
|
||||||
|
):
|
||||||
|
q_linear = cls(
|
||||||
|
linear.in_features,
|
||||||
|
linear.out_features,
|
||||||
|
linear.bias is not None,
|
||||||
|
device,
|
||||||
|
qtype=qtype,
|
||||||
|
)
|
||||||
|
return q_linear
|
||||||
|
|
||||||
|
def extra_repr(self) -> str:
|
||||||
|
return (
|
||||||
|
"in_features={}, out_features={}, bias={}, w_bit={}, group_size={}".format(
|
||||||
|
self.in_features,
|
||||||
|
self.out_features,
|
||||||
|
self.bias is not None,
|
||||||
|
self.w_bit,
|
||||||
|
self.group_size,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def forward(self, x):
|
||||||
|
# x = torch.matmul(x, dequantize_blocks_Q4_0(self.qweight))
|
||||||
|
if self.qtype == "Q4_0":
|
||||||
|
x = F.linear(x, dequantize_blocks_Q4_0(
|
||||||
|
self.Q4_0_qweight, x.dtype), self.bias.to(x.dtype) if self.bias is not None else None)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unknown qtype: {self.qtype}")
|
||||||
|
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
def split_block_dims(blocks, *args):
|
||||||
|
n_max = blocks.shape[1]
|
||||||
|
dims = list(args) + [n_max - sum(args)]
|
||||||
|
return torch.split(blocks, dims, dim=1)
|
||||||
|
|
||||||
|
|
||||||
|
def quant_shape_to_byte_shape(shape, qtype) -> tuple[int, ...]:
|
||||||
|
# shape = shape[::-1]
|
||||||
|
block_size, type_size = GGML_QUANT_SIZES[qtype]
|
||||||
|
if shape[-1] % block_size != 0:
|
||||||
|
raise ValueError(
|
||||||
|
f"Quantized tensor row size ({shape[-1]}) is not a multiple of Q4_0 block size ({block_size})")
|
||||||
|
return (*shape[:-1], shape[-1] // block_size * type_size)
|
||||||
|
|
||||||
|
|
||||||
|
def quant_shape_from_byte_shape(shape, qtype) -> tuple[int, ...]:
|
||||||
|
# shape = shape[::-1]
|
||||||
|
block_size, type_size = GGML_QUANT_SIZES[qtype]
|
||||||
|
if shape[-1] % type_size != 0:
|
||||||
|
raise ValueError(
|
||||||
|
f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of Q4_0 type size ({type_size})")
|
||||||
|
return (*shape[:-1], shape[-1] // type_size * block_size)
|
||||||
|
|
||||||
|
|
||||||
|
GGML_QUANT_SIZES = {
|
||||||
|
"Q4_0": (32, 2 + 16),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def dequantize_blocks_Q4_0(data, dtype=torch.float16):
|
||||||
|
block_size, type_size = GGML_QUANT_SIZES["Q4_0"]
|
||||||
|
|
||||||
|
data = data.to(torch.uint8)
|
||||||
|
shape = data.shape
|
||||||
|
|
||||||
|
rows = data.reshape(
|
||||||
|
(-1, data.shape[-1])
|
||||||
|
).view(torch.uint8)
|
||||||
|
|
||||||
|
n_blocks = rows.numel() // type_size
|
||||||
|
blocks = data.reshape((n_blocks, type_size))
|
||||||
|
|
||||||
|
n_blocks = blocks.shape[0]
|
||||||
|
|
||||||
|
d, qs = split_block_dims(blocks, 2)
|
||||||
|
d = d.view(torch.float16)
|
||||||
|
|
||||||
|
qs = qs.reshape((n_blocks, -1, 1, block_size // 2)) >> torch.tensor(
|
||||||
|
[0, 4], device=d.device, dtype=torch.uint8).reshape((1, 1, 2, 1))
|
||||||
|
qs = (qs & 0x0F).reshape((n_blocks, -1)).to(torch.int8) - 8
|
||||||
|
|
||||||
|
out = (d * qs)
|
||||||
|
|
||||||
|
out = out.reshape(quant_shape_from_byte_shape(
|
||||||
|
shape,
|
||||||
|
qtype="Q4_0",
|
||||||
|
)).to(dtype)
|
||||||
|
return out
|
||||||
|
|
||||||
183
nodes.py
183
nodes.py
@ -15,11 +15,13 @@ from .cogvideox_fun.utils import get_image_to_video_latent, get_video_to_video_l
|
|||||||
from .cogvideox_fun.pipeline_cogvideox_inpaint import CogVideoX_Fun_Pipeline_Inpaint
|
from .cogvideox_fun.pipeline_cogvideox_inpaint import CogVideoX_Fun_Pipeline_Inpaint
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import json
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
script_directory = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
class DownloadAndLoadCogVideoModel:
|
class DownloadAndLoadCogVideoModel:
|
||||||
@classmethod
|
@classmethod
|
||||||
@ -65,19 +67,23 @@ class DownloadAndLoadCogVideoModel:
|
|||||||
download_path = os.path.join(folder_paths.models_dir, "CogVideo")
|
download_path = os.path.join(folder_paths.models_dir, "CogVideo")
|
||||||
if "2b" in model:
|
if "2b" in model:
|
||||||
base_path = os.path.join(folder_paths.models_dir, "CogVideoX_Fun", "CogVideoX-Fun-2b-InP") # location of the official model
|
base_path = os.path.join(folder_paths.models_dir, "CogVideoX_Fun", "CogVideoX-Fun-2b-InP") # location of the official model
|
||||||
|
scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_2b.json')
|
||||||
if not os.path.exists(base_path):
|
if not os.path.exists(base_path):
|
||||||
base_path = os.path.join(download_path, "CogVideoX-Fun-2b-InP")
|
base_path = os.path.join(download_path, "CogVideoX-Fun-2b-InP")
|
||||||
elif "5b" in model:
|
elif "5b" in model:
|
||||||
base_path = os.path.join(folder_paths.models_dir, "CogVideoX_Fun", "CogVideoX-Fun-5b-InP") # location of the official model
|
base_path = os.path.join(folder_paths.models_dir, "CogVideoX_Fun", "CogVideoX-Fun-5b-InP") # location of the official model
|
||||||
|
scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json')
|
||||||
if not os.path.exists(base_path):
|
if not os.path.exists(base_path):
|
||||||
base_path = os.path.join(download_path, "CogVideoX-Fun-5b-InP")
|
base_path = os.path.join(download_path, "CogVideoX-Fun-5b-InP")
|
||||||
|
|
||||||
elif "2b" in model:
|
elif "2b" in model:
|
||||||
base_path = os.path.join(folder_paths.models_dir, "CogVideo", "CogVideo2B")
|
base_path = os.path.join(folder_paths.models_dir, "CogVideo", "CogVideo2B")
|
||||||
|
scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_2b.json')
|
||||||
download_path = base_path
|
download_path = base_path
|
||||||
repo_id = model
|
repo_id = model
|
||||||
elif "5b" in model:
|
elif "5b" in model:
|
||||||
base_path = os.path.join(folder_paths.models_dir, "CogVideo", (model.split("/")[-1]))
|
base_path = os.path.join(folder_paths.models_dir, "CogVideo", (model.split("/")[-1]))
|
||||||
|
scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json')
|
||||||
download_path = base_path
|
download_path = base_path
|
||||||
repo_id = model
|
repo_id = model
|
||||||
|
|
||||||
@ -91,7 +97,8 @@ class DownloadAndLoadCogVideoModel:
|
|||||||
local_dir=download_path,
|
local_dir=download_path,
|
||||||
local_dir_use_symlinks=False,
|
local_dir_use_symlinks=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# transformer
|
||||||
if "Fun" in model:
|
if "Fun" in model:
|
||||||
transformer = CogVideoXTransformer3DModelFun.from_pretrained(base_path, subfolder="transformer")
|
transformer = CogVideoXTransformer3DModelFun.from_pretrained(base_path, subfolder="transformer")
|
||||||
else:
|
else:
|
||||||
@ -114,9 +121,12 @@ class DownloadAndLoadCogVideoModel:
|
|||||||
if fp8_transformer == "fastmode":
|
if fp8_transformer == "fastmode":
|
||||||
from .fp8_optimization import convert_fp8_linear
|
from .fp8_optimization import convert_fp8_linear
|
||||||
convert_fp8_linear(transformer, dtype)
|
convert_fp8_linear(transformer, dtype)
|
||||||
|
|
||||||
scheduler = CogVideoXDDIMScheduler.from_pretrained(base_path, subfolder="scheduler")
|
|
||||||
|
|
||||||
|
with open(scheduler_path) as f:
|
||||||
|
scheduler_config = json.load(f)
|
||||||
|
scheduler = CogVideoXDDIMScheduler.from_config(scheduler_config)
|
||||||
|
|
||||||
|
# VAE
|
||||||
if "Fun" in model:
|
if "Fun" in model:
|
||||||
vae = AutoencoderKLCogVideoXFun.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device)
|
vae = AutoencoderKLCogVideoXFun.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device)
|
||||||
pipe = CogVideoX_Fun_Pipeline_Inpaint(vae, transformer, scheduler)
|
pipe = CogVideoX_Fun_Pipeline_Inpaint(vae, transformer, scheduler)
|
||||||
@ -127,6 +137,7 @@ class DownloadAndLoadCogVideoModel:
|
|||||||
if enable_sequential_cpu_offload:
|
if enable_sequential_cpu_offload:
|
||||||
pipe.enable_sequential_cpu_offload()
|
pipe.enable_sequential_cpu_offload()
|
||||||
|
|
||||||
|
# compilation
|
||||||
if compile == "torch":
|
if compile == "torch":
|
||||||
torch._dynamo.config.suppress_errors = True
|
torch._dynamo.config.suppress_errors = True
|
||||||
pipe.transformer.to(memory_format=torch.channels_last)
|
pipe.transformer.to(memory_format=torch.channels_last)
|
||||||
@ -148,7 +159,149 @@ class DownloadAndLoadCogVideoModel:
|
|||||||
"dtype": dtype,
|
"dtype": dtype,
|
||||||
"base_path": base_path,
|
"base_path": base_path,
|
||||||
"onediff": True if compile == "onediff" else False,
|
"onediff": True if compile == "onediff" else False,
|
||||||
"cpu_offloading": enable_sequential_cpu_offload
|
"cpu_offloading": enable_sequential_cpu_offload,
|
||||||
|
"scheduler_config": scheduler_config
|
||||||
|
}
|
||||||
|
|
||||||
|
return (pipeline,)
|
||||||
|
|
||||||
|
class DownloadAndLoadCogVideoGGUFModel:
|
||||||
|
@classmethod
|
||||||
|
def INPUT_TYPES(s):
|
||||||
|
return {
|
||||||
|
"required": {
|
||||||
|
"model": (
|
||||||
|
[
|
||||||
|
"CogVideoX_5b_GGUF_Q4_0.safetensors",
|
||||||
|
"CogVideoX_5b_fun_GGUF_Q4_0.safetensors",
|
||||||
|
],
|
||||||
|
),
|
||||||
|
"vae_precision": (["fp16", "fp32", "bf16"], {"default": "bf16", "tooltip": "VAE dtype"}),
|
||||||
|
"fp8_fastmode": ("BOOLEAN", {"default": False, "tooltip": "only supported on 4090 and later GPUs"}),
|
||||||
|
"compile": (["disabled","onediff","torch"], {"tooltip": "UNTESTED WITH GGUF"}),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
RETURN_TYPES = ("COGVIDEOPIPE",)
|
||||||
|
RETURN_NAMES = ("cogvideo_pipe", )
|
||||||
|
FUNCTION = "loadmodel"
|
||||||
|
CATEGORY = "CogVideoWrapper"
|
||||||
|
|
||||||
|
def loadmodel(self, model, vae_precision, compile, fp8_fastmode):
|
||||||
|
device = mm.get_torch_device()
|
||||||
|
offload_device = mm.unet_offload_device()
|
||||||
|
mm.soft_empty_cache()
|
||||||
|
|
||||||
|
vae_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[vae_precision]
|
||||||
|
download_path = os.path.join(folder_paths.models_dir, 'CogVideo', 'GGUF')
|
||||||
|
gguf_path = os.path.join(folder_paths.models_dir, 'diffusion_models', model) # check MinusZone's model path first
|
||||||
|
if not os.path.exists(gguf_path):
|
||||||
|
gguf_path = os.path.join(download_path, model)
|
||||||
|
if not os.path.exists(gguf_path):
|
||||||
|
log.info(f"Downloading model to: {gguf_path}")
|
||||||
|
from huggingface_hub import snapshot_download
|
||||||
|
|
||||||
|
snapshot_download(
|
||||||
|
repo_id="MinusZoneAI/ComfyUI-CogVideoX-MZ",
|
||||||
|
allow_patterns=[f"*{model}*"],
|
||||||
|
local_dir=download_path,
|
||||||
|
local_dir_use_symlinks=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
with open(os.path.join(script_directory, 'configs', 'transformer_config_5b.json')) as f:
|
||||||
|
transformer_config = json.load(f)
|
||||||
|
sd = load_torch_file(gguf_path)
|
||||||
|
|
||||||
|
from . import mz_gguf_loader
|
||||||
|
import importlib
|
||||||
|
importlib.reload(mz_gguf_loader)
|
||||||
|
|
||||||
|
with mz_gguf_loader.quantize_lazy_load():
|
||||||
|
if "fun" in model:
|
||||||
|
transformer_config["in_channels"] = 33
|
||||||
|
transformer = CogVideoXTransformer3DModelFun.from_config(transformer_config)
|
||||||
|
else:
|
||||||
|
transformer_config["in_channels"] = 16
|
||||||
|
transformer = CogVideoXTransformer3DModel.from_config(transformer_config)
|
||||||
|
|
||||||
|
transformer.to(torch.float8_e4m3fn)
|
||||||
|
transformer = mz_gguf_loader.quantize_load_state_dict(transformer, sd, device="cpu")
|
||||||
|
transformer.to(device)
|
||||||
|
|
||||||
|
# transformer
|
||||||
|
# if fp8_transformer == "fastmode":
|
||||||
|
# if "2b" in model:
|
||||||
|
# for name, param in transformer.named_parameters():
|
||||||
|
# if name != "pos_embedding":
|
||||||
|
# param.data = param.data.to(torch.float8_e4m3fn)
|
||||||
|
# elif "I2V" in model:
|
||||||
|
# for name, param in transformer.named_parameters():
|
||||||
|
# if "patch_embed" not in name:
|
||||||
|
# param.data = param.data.to(torch.float8_e4m3fn)
|
||||||
|
# else:
|
||||||
|
# transformer.to(torch.float8_e4m3fn)
|
||||||
|
|
||||||
|
if fp8_fastmode:
|
||||||
|
from .fp8_optimization import convert_fp8_linear
|
||||||
|
convert_fp8_linear(transformer, vae_dtype)
|
||||||
|
|
||||||
|
scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json')
|
||||||
|
with open(scheduler_path) as f:
|
||||||
|
scheduler_config = json.load(f)
|
||||||
|
|
||||||
|
scheduler = CogVideoXDDIMScheduler.from_config(scheduler_config, subfolder="scheduler")
|
||||||
|
|
||||||
|
# VAE
|
||||||
|
vae_dl_path = os.path.join(folder_paths.models_dir, 'CogVideo', 'VAE')
|
||||||
|
vae_path = os.path.join(vae_dl_path, "cogvideox_vae.safetensors")
|
||||||
|
if not os.path.exists(vae_path):
|
||||||
|
log.info(f"Downloading VAE model to: {vae_path}")
|
||||||
|
from huggingface_hub import snapshot_download
|
||||||
|
|
||||||
|
snapshot_download(
|
||||||
|
repo_id="Kijai/CogVideoX-Fun-pruned",
|
||||||
|
allow_patterns=["*cogvideox_vae.safetensors*"],
|
||||||
|
local_dir=vae_dl_path,
|
||||||
|
local_dir_use_symlinks=False,
|
||||||
|
)
|
||||||
|
with open(os.path.join(script_directory, 'configs', 'vae_config.json')) as f:
|
||||||
|
vae_config = json.load(f)
|
||||||
|
|
||||||
|
vae_sd = load_torch_file(vae_path)
|
||||||
|
if "fun" in model:
|
||||||
|
vae = AutoencoderKLCogVideoXFun.from_config(vae_config).to(vae_dtype).to(offload_device)
|
||||||
|
vae.load_state_dict(vae_sd)
|
||||||
|
pipe = CogVideoX_Fun_Pipeline_Inpaint(vae, transformer, scheduler)
|
||||||
|
else:
|
||||||
|
vae = AutoencoderKLCogVideoX.from_config(vae_config).to(vae_dtype).to(offload_device)
|
||||||
|
vae.load_state_dict(vae_sd)
|
||||||
|
pipe = CogVideoXPipeline(vae, transformer, scheduler)
|
||||||
|
|
||||||
|
# compilation
|
||||||
|
if compile == "torch":
|
||||||
|
torch._dynamo.config.suppress_errors = True
|
||||||
|
pipe.transformer.to(memory_format=torch.channels_last)
|
||||||
|
pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
|
||||||
|
elif compile == "onediff":
|
||||||
|
from onediffx import compile_pipe
|
||||||
|
os.environ['NEXFORT_FX_FORCE_TRITON_SDPA'] = '1'
|
||||||
|
|
||||||
|
pipe = compile_pipe(
|
||||||
|
pipe,
|
||||||
|
backend="nexfort",
|
||||||
|
options= {"mode": "max-optimize:max-autotune:max-autotune", "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": False, "triton.fuse_attention_allow_fp16_reduction": False}},
|
||||||
|
ignores=["vae"],
|
||||||
|
fuse_qkv_projections=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
pipeline = {
|
||||||
|
"pipe": pipe,
|
||||||
|
"dtype": vae_dtype,
|
||||||
|
"base_path": "Fun" if "fun" in model else "sad",
|
||||||
|
"onediff": True if compile == "onediff" else False,
|
||||||
|
"cpu_offloading": False,
|
||||||
|
"scheduler_config": scheduler_config
|
||||||
}
|
}
|
||||||
|
|
||||||
return (pipeline,)
|
return (pipeline,)
|
||||||
@ -523,6 +676,7 @@ class CogVideoDecode:
|
|||||||
latents = 1 / vae.config.scaling_factor * latents
|
latents = 1 / vae.config.scaling_factor * latents
|
||||||
|
|
||||||
frames = vae.decode(latents).sample
|
frames = vae.decode(latents).sample
|
||||||
|
vae.disable_tiling()
|
||||||
if not pipeline["cpu_offloading"]:
|
if not pipeline["cpu_offloading"]:
|
||||||
vae.to(offload_device)
|
vae.to(offload_device)
|
||||||
mm.soft_empty_cache()
|
mm.soft_empty_cache()
|
||||||
@ -597,20 +751,21 @@ class CogVideoXFunSampler:
|
|||||||
original_height = opt_empty_latent["samples"][0].shape[-2] * 8
|
original_height = opt_empty_latent["samples"][0].shape[-2] * 8
|
||||||
|
|
||||||
# Load Sampler
|
# Load Sampler
|
||||||
|
scheduler_config = pipeline["scheduler_config"]
|
||||||
if scheduler == "DPM++":
|
if scheduler == "DPM++":
|
||||||
noise_scheduler = DPMSolverMultistepScheduler.from_pretrained(base_path, subfolder= 'scheduler')
|
noise_scheduler = DPMSolverMultistepScheduler.from_config(scheduler_config)
|
||||||
elif scheduler == "Euler":
|
elif scheduler == "Euler":
|
||||||
noise_scheduler = EulerDiscreteScheduler.from_pretrained(base_path, subfolder= 'scheduler')
|
noise_scheduler = EulerDiscreteScheduler.from_config(scheduler_config)
|
||||||
elif scheduler == "Euler A":
|
elif scheduler == "Euler A":
|
||||||
noise_scheduler = EulerAncestralDiscreteScheduler.from_pretrained(base_path, subfolder= 'scheduler')
|
noise_scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler_config)
|
||||||
elif scheduler == "PNDM":
|
elif scheduler == "PNDM":
|
||||||
noise_scheduler = PNDMScheduler.from_pretrained(base_path, subfolder= 'scheduler')
|
noise_scheduler = PNDMScheduler.from_config(scheduler_config)
|
||||||
elif scheduler == "DDIM":
|
elif scheduler == "DDIM":
|
||||||
noise_scheduler = DDIMScheduler.from_pretrained(base_path, subfolder= 'scheduler')
|
noise_scheduler = DDIMScheduler.from_config(scheduler_config)
|
||||||
elif scheduler == "CogVideoXDDIM":
|
elif scheduler == "CogVideoXDDIM":
|
||||||
noise_scheduler = CogVideoXDDIMScheduler.from_pretrained(base_path, subfolder= 'scheduler')
|
noise_scheduler = CogVideoXDDIMScheduler.from_config(scheduler_config)
|
||||||
elif scheduler == "CogVideoXDPMScheduler":
|
elif scheduler == "CogVideoXDPMScheduler":
|
||||||
noise_scheduler = CogVideoXDPMScheduler.from_pretrained(base_path, subfolder= 'scheduler')
|
noise_scheduler = CogVideoXDPMScheduler.from_config(scheduler_config)
|
||||||
pipe.scheduler = noise_scheduler
|
pipe.scheduler = noise_scheduler
|
||||||
|
|
||||||
#if not pipeline["cpu_offloading"]:
|
#if not pipeline["cpu_offloading"]:
|
||||||
@ -755,7 +910,8 @@ NODE_CLASS_MAPPINGS = {
|
|||||||
"CogVideoImageEncode": CogVideoImageEncode,
|
"CogVideoImageEncode": CogVideoImageEncode,
|
||||||
"CogVideoXFunSampler": CogVideoXFunSampler,
|
"CogVideoXFunSampler": CogVideoXFunSampler,
|
||||||
"CogVideoXFunVid2VidSampler": CogVideoXFunVid2VidSampler,
|
"CogVideoXFunVid2VidSampler": CogVideoXFunVid2VidSampler,
|
||||||
"CogVideoTextEncodeCombine": CogVideoTextEncodeCombine
|
"CogVideoTextEncodeCombine": CogVideoTextEncodeCombine,
|
||||||
|
"DownloadAndLoadCogVideoGGUFModel": DownloadAndLoadCogVideoGGUFModel
|
||||||
}
|
}
|
||||||
NODE_DISPLAY_NAME_MAPPINGS = {
|
NODE_DISPLAY_NAME_MAPPINGS = {
|
||||||
"DownloadAndLoadCogVideoModel": "(Down)load CogVideo Model",
|
"DownloadAndLoadCogVideoModel": "(Down)load CogVideo Model",
|
||||||
@ -766,5 +922,6 @@ NODE_DISPLAY_NAME_MAPPINGS = {
|
|||||||
"CogVideoImageEncode": "CogVideo ImageEncode",
|
"CogVideoImageEncode": "CogVideo ImageEncode",
|
||||||
"CogVideoXFunSampler": "CogVideoXFun Sampler",
|
"CogVideoXFunSampler": "CogVideoXFun Sampler",
|
||||||
"CogVideoXFunVid2VidSampler": "CogVideoXFun Vid2Vid Sampler",
|
"CogVideoXFunVid2VidSampler": "CogVideoXFun Vid2Vid Sampler",
|
||||||
"CogVideoTextEncodeCombine": "CogVideo TextEncode Combine"
|
"CogVideoTextEncodeCombine": "CogVideo TextEncode Combine",
|
||||||
|
"DownloadAndLoadCogVideoGGUFModel": "(Down)load CogVideo GGUF Model"
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user