Add GGUF loader

kijai 2024-09-20 17:02:31 +03:00
parent 0f7f457b82
commit 1e26971695
8 changed files with 515 additions and 13 deletions

18
configs/scheduler_config_2b.json Normal file

@@ -0,0 +1,18 @@
{
"_class_name": "CogVideoXDDIMScheduler",
"_diffusers_version": "0.30.0.dev0",
"beta_end": 0.012,
"beta_schedule": "scaled_linear",
"beta_start": 0.00085,
"clip_sample": false,
"clip_sample_range": 1.0,
"num_train_timesteps": 1000,
"prediction_type": "v_prediction",
"rescale_betas_zero_snr": true,
"sample_max_value": 1.0,
"set_alpha_to_one": true,
"snr_shift_scale": 3.0,
"steps_offset": 0,
"timestep_spacing": "trailing",
"trained_betas": null
}

18
configs/scheduler_config_5b.json Normal file

@@ -0,0 +1,18 @@
{
"_class_name": "CogVideoXDDIMScheduler",
"_diffusers_version": "0.31.0.dev0",
"beta_end": 0.012,
"beta_schedule": "scaled_linear",
"beta_start": 0.00085,
"clip_sample": false,
"clip_sample_range": 1.0,
"num_train_timesteps": 1000,
"prediction_type": "v_prediction",
"rescale_betas_zero_snr": true,
"sample_max_value": 1.0,
"set_alpha_to_one": true,
"snr_shift_scale": 1.0,
"steps_offset": 0,
"timestep_spacing": "trailing",
"trained_betas": null
}

28
configs/transformer_config_2b.json Normal file

@@ -0,0 +1,28 @@
{
"_class_name": "CogVideoXTransformer3DModel",
"_diffusers_version": "0.30.0.dev0",
"activation_fn": "gelu-approximate",
"attention_bias": true,
"attention_head_dim": 64,
"dropout": 0.0,
"flip_sin_to_cos": true,
"freq_shift": 0,
"in_channels": 16,
"max_text_seq_length": 226,
"norm_elementwise_affine": true,
"norm_eps": 1e-05,
"num_attention_heads": 30,
"num_layers": 30,
"out_channels": 16,
"patch_size": 2,
"sample_frames": 49,
"sample_height": 60,
"sample_width": 90,
"spatial_interpolation_scale": 1.875,
"temporal_compression_ratio": 4,
"temporal_interpolation_scale": 1.0,
"text_embed_dim": 4096,
"time_embed_dim": 512,
"timestep_activation_fn": "silu",
"use_rotary_positional_embeddings": false
}

26
configs/transformer_config_5b.json Normal file

@@ -0,0 +1,26 @@
{
"activation_fn": "gelu-approximate",
"attention_bias": true,
"attention_head_dim": 64,
"dropout": 0.0,
"flip_sin_to_cos": true,
"freq_shift": 0,
"in_channels": 16,
"max_text_seq_length": 226,
"norm_elementwise_affine": true,
"norm_eps": 1e-05,
"num_attention_heads": 48,
"num_layers": 42,
"out_channels": 16,
"patch_size": 2,
"sample_frames": 49,
"sample_height": 60,
"sample_width": 90,
"spatial_interpolation_scale": 1.875,
"temporal_compression_ratio": 4,
"temporal_interpolation_scale": 1.0,
"text_embed_dim": 4096,
"time_embed_dim": 512,
"timestep_activation_fn": "silu",
"use_rotary_positional_embeddings": true
}

@@ -0,0 +1,29 @@
{
"_class_name": "CogVideoXTransformer3DModel",
"_diffusers_version": "0.31.0.dev0",
"activation_fn": "gelu-approximate",
"attention_bias": true,
"attention_head_dim": 64,
"dropout": 0.0,
"flip_sin_to_cos": true,
"freq_shift": 0,
"in_channels": 32,
"max_text_seq_length": 226,
"norm_elementwise_affine": true,
"norm_eps": 1e-05,
"num_attention_heads": 48,
"num_layers": 42,
"out_channels": 16,
"patch_size": 2,
"sample_frames": 49,
"sample_height": 60,
"sample_width": 90,
"spatial_interpolation_scale": 1.875,
"temporal_compression_ratio": 4,
"temporal_interpolation_scale": 1.0,
"text_embed_dim": 4096,
"time_embed_dim": 512,
"timestep_activation_fn": "silu",
"use_learned_positional_embeddings": true,
"use_rotary_positional_embeddings": true
}

39
configs/vae_config.json Normal file
@@ -0,0 +1,39 @@
{
"_class_name": "AutoencoderKLCogVideoX",
"_diffusers_version": "0.31.0.dev0",
"act_fn": "silu",
"block_out_channels": [
128,
256,
256,
512
],
"down_block_types": [
"CogVideoXDownBlock3D",
"CogVideoXDownBlock3D",
"CogVideoXDownBlock3D",
"CogVideoXDownBlock3D"
],
"force_upcast": true,
"in_channels": 3,
"latent_channels": 16,
"latents_mean": null,
"latents_std": null,
"layers_per_block": 3,
"norm_eps": 1e-06,
"norm_num_groups": 32,
"out_channels": 3,
"sample_height": 480,
"sample_width": 720,
"scaling_factor": 0.7,
"shift_factor": null,
"temporal_compression_ratio": 4,
"up_block_types": [
"CogVideoXUpBlock3D",
"CogVideoXUpBlock3D",
"CogVideoXUpBlock3D",
"CogVideoXUpBlock3D"
],
"use_post_quant_conv": false,
"use_quant_conv": false
}
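
These configs let each module be built locally instead of pulling a full repo from the Hub. A minimal sketch of the pattern this commit uses throughout (diffusers' from_config plus a separately downloaded state dict; file paths here are illustrative, and safetensors' load_file stands in for ComfyUI's load_torch_file):

import json
import torch
from diffusers import AutoencoderKLCogVideoX
from safetensors.torch import load_file

with open("configs/vae_config.json") as f:
    vae_config = json.load(f)
vae = AutoencoderKLCogVideoX.from_config(vae_config).to(torch.bfloat16)
vae.load_state_dict(load_file("cogvideox_vae.safetensors"))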

188
mz_gguf_loader.py Normal file
@@ -0,0 +1,188 @@
# https://github.com/MinusZoneAI/ComfyUI-CogVideoX-MZ/blob/9616415220fd09388622f40f6609e4ed81f048a5/mz_gguf_loader.py
import torch
import torch.nn as nn
import torch.nn.functional as F


class quantize_lazy_load():
    def __init__(self):
        self.device = None

    def __enter__(self):
        # Create parameters on the meta device so no real memory is
        # allocated until the quantized weights are loaded.
        self.device = torch.device("meta")
        self.device.__enter__()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.device.__exit__(exc_type, exc_value, traceback)

def quantize_load_state_dict(model, state_dict, device="cpu"):
    # Collect the module names that have a Q4_0-packed weight in the
    # checkpoint, then swap the matching nn.Linear modules for WQLinear_GGUF.
    Q4_0_qkey = []
    for key in state_dict.keys():
        if key.endswith(".Q4_0_qweight"):
            Q4_0_qkey.append(key.replace(".Q4_0_qweight", ""))

    for name, module in model.named_modules():
        if name in Q4_0_qkey:
            q_linear = WQLinear_GGUF.from_linear(
                linear=module,
                device=device,
                qtype="Q4_0",
            )
            set_op_by_name(model, name, q_linear)

    # Materialize any remaining meta-device tensors, then load the quantized
    # weights (strict=False: the original fp16 keys are absent on purpose).
    model.to_empty(device=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    return model

def set_op_by_name(layer, name, new_module):
    # Replace the submodule addressed by a dotted path, descending through
    # attributes and integer indices (for nn.ModuleList / nn.Sequential).
    levels = name.split(".")
    if len(levels) > 1:
        mod_ = layer
        for l_idx in range(len(levels) - 1):
            if levels[l_idx].isdigit():
                mod_ = mod_[int(levels[l_idx])]
            else:
                mod_ = getattr(mod_, levels[l_idx])
        setattr(mod_, levels[-1], new_module)
    else:
        setattr(layer, name, new_module)
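# Example: set_op_by_name(model, "transformer_blocks.0.attn1.to_q", q_linear)
# rebinds model.transformer_blocks[0].attn1.to_q (an illustrative path; the
# CogVideoX transformer names its linear layers along these lines).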

class WQLinear_GGUF(nn.Module):
    def __init__(
        self, in_features, out_features, bias, dev, qtype="Q4_0"
    ):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.qtype = qtype
        # The packed weight is stored as raw bytes: each row of in_features
        # weights becomes in_features // 32 blocks of 18 bytes.
        qweight_shape = quant_shape_to_byte_shape(
            (out_features, in_features), qtype
        )
        self.register_buffer(
            f"{qtype}_qweight",
            torch.zeros(
                qweight_shape,
                dtype=torch.uint8,
                device=dev,
            ),
        )
        if bias:
            self.register_buffer(
                "bias",
                torch.zeros(
                    (out_features),
                    dtype=torch.float16,
                    device=dev,
                ),
            )
        else:
            self.bias = None

    @classmethod
    def from_linear(
        cls, linear,
        device="cpu",
        qtype="Q4_0",
    ):
        # Only the shape and bias flag of the source linear are used; the
        # packed bytes are filled in later by load_state_dict.
        q_linear = cls(
            linear.in_features,
            linear.out_features,
            linear.bias is not None,
            device,
            qtype=qtype,
        )
        return q_linear

    def extra_repr(self) -> str:
        return "in_features={}, out_features={}, bias={}, qtype={}".format(
            self.in_features,
            self.out_features,
            self.bias is not None,
            self.qtype,
        )

    @torch.no_grad()
    def forward(self, x):
        # Dequantize the packed weight on the fly and run a standard linear.
        if self.qtype == "Q4_0":
            x = F.linear(x, dequantize_blocks_Q4_0(self.Q4_0_qweight, x.dtype),
                         self.bias.to(x.dtype) if self.bias is not None else None)
        else:
            raise ValueError(f"Unknown qtype: {self.qtype}")
        return x

def split_block_dims(blocks, *args):
    n_max = blocks.shape[1]
    dims = list(args) + [n_max - sum(args)]
    return torch.split(blocks, dims, dim=1)


def quant_shape_to_byte_shape(shape, qtype) -> tuple[int, ...]:
    block_size, type_size = GGML_QUANT_SIZES[qtype]
    if shape[-1] % block_size != 0:
        raise ValueError(
            f"Quantized tensor row size ({shape[-1]}) is not a multiple of Q4_0 block size ({block_size})")
    return (*shape[:-1], shape[-1] // block_size * type_size)


def quant_shape_from_byte_shape(shape, qtype) -> tuple[int, ...]:
    block_size, type_size = GGML_QUANT_SIZES[qtype]
    if shape[-1] % type_size != 0:
        raise ValueError(
            f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of Q4_0 type size ({type_size})")
    return (*shape[:-1], shape[-1] // type_size * block_size)


# GGML Q4_0: blocks of 32 weights, each stored as a 2-byte fp16 scale
# followed by 16 bytes of packed 4-bit quants (18 bytes per block).
GGML_QUANT_SIZES = {
    "Q4_0": (32, 2 + 16),
}
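# Worked example: a row of 4096 weights packs into 4096 // 32 = 128 blocks
# of 18 bytes each, i.e. 2304 bytes, so for a 3072x4096 linear weight:
#   quant_shape_to_byte_shape((3072, 4096), "Q4_0")   -> (3072, 2304)
#   quant_shape_from_byte_shape((3072, 2304), "Q4_0") -> (3072, 4096)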

def dequantize_blocks_Q4_0(data, dtype=torch.float16):
    block_size, type_size = GGML_QUANT_SIZES["Q4_0"]

    data = data.to(torch.uint8)
    shape = data.shape

    n_blocks = data.numel() // type_size
    blocks = data.reshape((n_blocks, type_size))

    # Each 18-byte block: a 2-byte fp16 scale d, then 16 bytes of quants.
    d, qs = split_block_dims(blocks, 2)
    d = d.view(torch.float16)

    # Unpack two 4-bit values per byte (the 16 low nibbles of a block are
    # weights 0-15, the 16 high nibbles are weights 16-31), then shift the
    # unsigned range [0, 15] to the signed range [-8, 7].
    qs = qs.reshape((n_blocks, -1, 1, block_size // 2)) >> torch.tensor(
        [0, 4], device=d.device, dtype=torch.uint8).reshape((1, 1, 2, 1))
    qs = (qs & 0x0F).reshape((n_blocks, -1)).to(torch.int8) - 8

    # w = d * q, reshaped back to the logical (unpacked) weight shape.
    out = (d * qs)
    out = out.reshape(quant_shape_from_byte_shape(
        shape,
        qtype="Q4_0",
    )).to(dtype)
    return out
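
For reference, a minimal usage sketch (not part of the commit) with a toy single-layer model standing in for the transformer; a real Q4_0 checkpoint would supply the packed bytes instead of zeros:

import torch
import torch.nn as nn
from mz_gguf_loader import quantize_lazy_load, quantize_load_state_dict

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(64, 8, bias=False)

    def forward(self, x):
        return self.proj(x)

# One output row of 64 weights packs into 64 // 32 = 2 blocks * 18 bytes = 36 bytes.
sd = {"proj.Q4_0_qweight": torch.zeros(8, 36, dtype=torch.uint8)}

with quantize_lazy_load():              # parameters are created on the meta device
    model = Toy()
model = quantize_load_state_dict(model, sd, device="cpu")  # proj is now WQLinear_GGUF
out = model(torch.randn(1, 64, dtype=torch.float16))       # dequantizes on the fly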

182
nodes.py
@@ -15,11 +15,13 @@ from .cogvideox_fun.utils import get_image_to_video_latent, get_video_to_video_latent
 from .cogvideox_fun.pipeline_cogvideox_inpaint import CogVideoX_Fun_Pipeline_Inpaint
 from PIL import Image
 import numpy as np
+import json
 import logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 log = logging.getLogger(__name__)

 script_directory = os.path.dirname(os.path.abspath(__file__))

 class DownloadAndLoadCogVideoModel:
     @classmethod
@@ -65,19 +67,23 @@ class DownloadAndLoadCogVideoModel:
             download_path = os.path.join(folder_paths.models_dir, "CogVideo")
             if "2b" in model:
                 base_path = os.path.join(folder_paths.models_dir, "CogVideoX_Fun", "CogVideoX-Fun-2b-InP") # location of the official model
+                scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_2b.json')
                 if not os.path.exists(base_path):
                     base_path = os.path.join(download_path, "CogVideoX-Fun-2b-InP")
             elif "5b" in model:
                 base_path = os.path.join(folder_paths.models_dir, "CogVideoX_Fun", "CogVideoX-Fun-5b-InP") # location of the official model
+                scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json')
                 if not os.path.exists(base_path):
                     base_path = os.path.join(download_path, "CogVideoX-Fun-5b-InP")
         elif "2b" in model:
             base_path = os.path.join(folder_paths.models_dir, "CogVideo", "CogVideo2B")
+            scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_2b.json')
             download_path = base_path
             repo_id = model
         elif "5b" in model:
             base_path = os.path.join(folder_paths.models_dir, "CogVideo", (model.split("/")[-1]))
+            scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json')
             download_path = base_path
             repo_id = model
@@ -91,7 +97,8 @@ class DownloadAndLoadCogVideoModel:
                 local_dir=download_path,
                 local_dir_use_symlinks=False,
             )

+        # transformer
         if "Fun" in model:
             transformer = CogVideoXTransformer3DModelFun.from_pretrained(base_path, subfolder="transformer")
         else:
@@ -114,9 +121,12 @@ class DownloadAndLoadCogVideoModel:
         if fp8_transformer == "fastmode":
             from .fp8_optimization import convert_fp8_linear
             convert_fp8_linear(transformer, dtype)

-        scheduler = CogVideoXDDIMScheduler.from_pretrained(base_path, subfolder="scheduler")
+        with open(scheduler_path) as f:
+            scheduler_config = json.load(f)
+        scheduler = CogVideoXDDIMScheduler.from_config(scheduler_config)

+        # VAE
         if "Fun" in model:
             vae = AutoencoderKLCogVideoXFun.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device)
             pipe = CogVideoX_Fun_Pipeline_Inpaint(vae, transformer, scheduler)
@@ -127,6 +137,7 @@ class DownloadAndLoadCogVideoModel:
         if enable_sequential_cpu_offload:
             pipe.enable_sequential_cpu_offload()

+        # compilation
         if compile == "torch":
             torch._dynamo.config.suppress_errors = True
             pipe.transformer.to(memory_format=torch.channels_last)
@@ -148,7 +159,149 @@ class DownloadAndLoadCogVideoModel:
             "dtype": dtype,
             "base_path": base_path,
             "onediff": True if compile == "onediff" else False,
-            "cpu_offloading": enable_sequential_cpu_offload
+            "cpu_offloading": enable_sequential_cpu_offload,
+            "scheduler_config": scheduler_config
         }
         return (pipeline,)
+
+class DownloadAndLoadCogVideoGGUFModel:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "model": (
+                    [
+                        "CogVideoX_5b_GGUF_Q4_0.safetensors",
+                        "CogVideoX_5b_fun_GGUF_Q4_0.safetensors",
+                    ],
+                ),
+                "vae_precision": (["fp16", "fp32", "bf16"], {"default": "bf16", "tooltip": "VAE dtype"}),
+                "fp8_fastmode": ("BOOLEAN", {"default": False, "tooltip": "only supported on 4090 and later GPUs"}),
+                "compile": (["disabled", "onediff", "torch"], {"tooltip": "UNTESTED WITH GGUF"}),
+            },
+        }
+
+    RETURN_TYPES = ("COGVIDEOPIPE",)
+    RETURN_NAMES = ("cogvideo_pipe", )
+    FUNCTION = "loadmodel"
+    CATEGORY = "CogVideoWrapper"
+
+    def loadmodel(self, model, vae_precision, compile, fp8_fastmode):
+        device = mm.get_torch_device()
+        offload_device = mm.unet_offload_device()
+        mm.soft_empty_cache()
+
+        vae_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[vae_precision]
+
+        download_path = os.path.join(folder_paths.models_dir, 'CogVideo', 'GGUF')
+        gguf_path = os.path.join(folder_paths.models_dir, 'diffusion_models', model) # check MinusZone's model path first
+        if not os.path.exists(gguf_path):
+            gguf_path = os.path.join(download_path, model)
+            if not os.path.exists(gguf_path):
+                log.info(f"Downloading model to: {gguf_path}")
+                from huggingface_hub import snapshot_download
+                snapshot_download(
+                    repo_id="MinusZoneAI/ComfyUI-CogVideoX-MZ",
+                    allow_patterns=[f"*{model}*"],
+                    local_dir=download_path,
+                    local_dir_use_symlinks=False,
+                )
+
+        with open(os.path.join(script_directory, 'configs', 'transformer_config_5b.json')) as f:
+            transformer_config = json.load(f)
+
+        sd = load_torch_file(gguf_path)
+
+        from . import mz_gguf_loader
+        import importlib
+        importlib.reload(mz_gguf_loader)
+
+        with mz_gguf_loader.quantize_lazy_load():
+            if "fun" in model:
+                transformer_config["in_channels"] = 33
+                transformer = CogVideoXTransformer3DModelFun.from_config(transformer_config)
+            else:
+                transformer_config["in_channels"] = 16
+                transformer = CogVideoXTransformer3DModel.from_config(transformer_config)
+            transformer.to(torch.float8_e4m3fn)
+            transformer = mz_gguf_loader.quantize_load_state_dict(transformer, sd, device="cpu")
+            transformer.to(device)
+
+        # transformer
+        # if fp8_transformer == "fastmode":
+        #     if "2b" in model:
+        #         for name, param in transformer.named_parameters():
+        #             if name != "pos_embedding":
+        #                 param.data = param.data.to(torch.float8_e4m3fn)
+        #     elif "I2V" in model:
+        #         for name, param in transformer.named_parameters():
+        #             if "patch_embed" not in name:
+        #                 param.data = param.data.to(torch.float8_e4m3fn)
+        #     else:
+        #         transformer.to(torch.float8_e4m3fn)
+
+        if fp8_fastmode:
+            from .fp8_optimization import convert_fp8_linear
+            convert_fp8_linear(transformer, vae_dtype)
+
+        scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json')
+        with open(scheduler_path) as f:
+            scheduler_config = json.load(f)
+        scheduler = CogVideoXDDIMScheduler.from_config(scheduler_config)
+
+        # VAE
+        vae_dl_path = os.path.join(folder_paths.models_dir, 'CogVideo', 'VAE')
+        vae_path = os.path.join(vae_dl_path, "cogvideox_vae.safetensors")
+        if not os.path.exists(vae_path):
+            log.info(f"Downloading VAE model to: {vae_path}")
+            from huggingface_hub import snapshot_download
+            snapshot_download(
+                repo_id="Kijai/CogVideoX-Fun-pruned",
+                allow_patterns=["*cogvideox_vae.safetensors*"],
+                local_dir=vae_dl_path,
+                local_dir_use_symlinks=False,
+            )
+        with open(os.path.join(script_directory, 'configs', 'vae_config.json')) as f:
+            vae_config = json.load(f)
+
+        vae_sd = load_torch_file(vae_path)
+        if "fun" in model:
+            vae = AutoencoderKLCogVideoXFun.from_config(vae_config).to(vae_dtype).to(offload_device)
+            vae.load_state_dict(vae_sd)
+            pipe = CogVideoX_Fun_Pipeline_Inpaint(vae, transformer, scheduler)
+        else:
+            vae = AutoencoderKLCogVideoX.from_config(vae_config).to(vae_dtype).to(offload_device)
+            vae.load_state_dict(vae_sd)
+            pipe = CogVideoXPipeline(vae, transformer, scheduler)
+
+        # compilation
+        if compile == "torch":
+            torch._dynamo.config.suppress_errors = True
+            pipe.transformer.to(memory_format=torch.channels_last)
+            pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
+        elif compile == "onediff":
+            from onediffx import compile_pipe
+            os.environ['NEXFORT_FX_FORCE_TRITON_SDPA'] = '1'
+            pipe = compile_pipe(
+                pipe,
+                backend="nexfort",
+                options={"mode": "max-optimize:max-autotune:max-autotune", "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": False, "triton.fuse_attention_allow_fp16_reduction": False}},
+                ignores=["vae"],
+                fuse_qkv_projections=True,
+            )
+
+        pipeline = {
+            "pipe": pipe,
+            "dtype": vae_dtype,
+            "base_path": "Fun" if "fun" in model else "sad",
+            "onediff": True if compile == "onediff" else False,
+            "cpu_offloading": False,
+            "scheduler_config": scheduler_config
+        }
+        return (pipeline,)
@@ -613,20 +766,21 @@ class CogVideoXFunSampler:
         print(f"Closest size: {width}x{height}")

         # Load Sampler
+        scheduler_config = pipeline["scheduler_config"]
         if scheduler == "DPM++":
-            noise_scheduler = DPMSolverMultistepScheduler.from_pretrained(base_path, subfolder='scheduler')
+            noise_scheduler = DPMSolverMultistepScheduler.from_config(scheduler_config)
         elif scheduler == "Euler":
-            noise_scheduler = EulerDiscreteScheduler.from_pretrained(base_path, subfolder='scheduler')
+            noise_scheduler = EulerDiscreteScheduler.from_config(scheduler_config)
         elif scheduler == "Euler A":
-            noise_scheduler = EulerAncestralDiscreteScheduler.from_pretrained(base_path, subfolder='scheduler')
+            noise_scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler_config)
         elif scheduler == "PNDM":
-            noise_scheduler = PNDMScheduler.from_pretrained(base_path, subfolder='scheduler')
+            noise_scheduler = PNDMScheduler.from_config(scheduler_config)
         elif scheduler == "DDIM":
-            noise_scheduler = DDIMScheduler.from_pretrained(base_path, subfolder='scheduler')
+            noise_scheduler = DDIMScheduler.from_config(scheduler_config)
         elif scheduler == "CogVideoXDDIM":
-            noise_scheduler = CogVideoXDDIMScheduler.from_pretrained(base_path, subfolder='scheduler')
+            noise_scheduler = CogVideoXDDIMScheduler.from_config(scheduler_config)
         elif scheduler == "CogVideoXDPMScheduler":
-            noise_scheduler = CogVideoXDPMScheduler.from_pretrained(base_path, subfolder='scheduler')
+            noise_scheduler = CogVideoXDPMScheduler.from_config(scheduler_config)
         pipe.scheduler = noise_scheduler

         #if not pipeline["cpu_offloading"]:
@@ -784,7 +938,8 @@ NODE_CLASS_MAPPINGS = {
     "CogVideoImageEncode": CogVideoImageEncode,
     "CogVideoXFunSampler": CogVideoXFunSampler,
     "CogVideoXFunVid2VidSampler": CogVideoXFunVid2VidSampler,
-    "CogVideoTextEncodeCombine": CogVideoTextEncodeCombine
+    "CogVideoTextEncodeCombine": CogVideoTextEncodeCombine,
+    "DownloadAndLoadCogVideoGGUFModel": DownloadAndLoadCogVideoGGUFModel
 }
 NODE_DISPLAY_NAME_MAPPINGS = {
     "DownloadAndLoadCogVideoModel": "(Down)load CogVideo Model",
@@ -795,5 +950,6 @@ NODE_DISPLAY_NAME_MAPPINGS = {
     "CogVideoImageEncode": "CogVideo ImageEncode",
     "CogVideoXFunSampler": "CogVideoXFun Sampler",
     "CogVideoXFunVid2VidSampler": "CogVideoXFun Vid2Vid Sampler",
-    "CogVideoTextEncodeCombine": "CogVideo TextEncode Combine"
+    "CogVideoTextEncodeCombine": "CogVideo TextEncode Combine",
+    "DownloadAndLoadCogVideoGGUFModel": "(Down)load CogVideo GGUF Model"
 }