Allow sequential_cpu_offload for GGUF too

kijai 2024-09-22 17:03:28 +03:00
parent 2a71aba1aa
commit d3d7f043cd
2 changed files with 43 additions and 26 deletions

View File

@@ -1,6 +1,4 @@
 {
-    "_class_name": "CogVideoXTransformer3DModel",
-    "_diffusers_version": "0.30.0.dev0",
     "activation_fn": "gelu-approximate",
     "attention_bias": true,
     "attention_head_dim": 64,

View File

@@ -175,11 +175,13 @@ class DownloadAndLoadCogVideoGGUFModel:
                         "CogVideoX_5b_GGUF_Q4_0.safetensors",
                         "CogVideoX_5b_I2V_GGUF_Q4_0.safetensors",
                         "CogVideoX_5b_fun_GGUF_Q4_0.safetensors",
+                        #"CogVideoX_2b_fun_GGUF_Q4_0.safetensors"
                     ],
                 ),
                 "vae_precision": (["fp16", "fp32", "bf16"], {"default": "bf16", "tooltip": "VAE dtype"}),
                 "fp8_fastmode": ("BOOLEAN", {"default": False, "tooltip": "only supported on 4090 and later GPUs"}),
                 "load_device": (["main_device", "offload_device"], {"default": "main_device"}),
+                "enable_sequential_cpu_offload": ("BOOLEAN", {"default": False, "tooltip": "significantly reducing memory usage and slows down the inference"}),
             },
         }
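The new required input takes effect because ComfyUI passes every key declared in INPUT_TYPES as a keyword argument to the node's FUNCTION, which is why the loadmodel signature in the next hunk gains a matching parameter. A rough illustration only (a hypothetical direct call outside a ComfyUI graph, with return handling assumed, not how the node is normally driven):

    # hypothetical direct call to the node, mirroring the inputs declared above
    loader = DownloadAndLoadCogVideoGGUFModel()
    result = loader.loadmodel(
        model="CogVideoX_5b_GGUF_Q4_0.safetensors",
        vae_precision="bf16",
        fp8_fastmode=False,
        load_device="main_device",
        enable_sequential_cpu_offload=True,  # the flag introduced by this commit
    )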
@@ -188,7 +190,7 @@ class DownloadAndLoadCogVideoGGUFModel:
     FUNCTION = "loadmodel"
     CATEGORY = "CogVideoWrapper"
 
-    def loadmodel(self, model, vae_precision, fp8_fastmode, load_device):
+    def loadmodel(self, model, vae_precision, fp8_fastmode, load_device, enable_sequential_cpu_offload):
         device = mm.get_torch_device()
         offload_device = mm.unet_offload_device()
         mm.soft_empty_cache()
@@ -213,17 +215,24 @@ class DownloadAndLoadCogVideoGGUFModel:
                 local_dir_use_symlinks=False,
             )
 
-        with open(os.path.join(script_directory, 'configs', 'transformer_config_5b.json')) as f:
-            transformer_config = json.load(f)
+        if "5b" in model:
+            scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json')
+            transformer_path = os.path.join(script_directory, 'configs', 'transformer_config_5b.json')
+        elif "2b" in model:
+            scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_2b.json')
+            transformer_path = os.path.join(script_directory, 'configs', 'transformer_config_2b.json')
+
+        with open(transformer_path) as f:
+            transformer_config = json.load(f)
 
         sd = load_torch_file(gguf_path)
-        # for key, value in sd.items():
-        #     print(key, value.shape, value.dtype)
+        #for key, value in sd.items():
+        #    print(key, value.shape, value.dtype)
 
         from . import mz_gguf_loader
         import importlib
         importlib.reload(mz_gguf_loader)
 
         with mz_gguf_loader.quantize_lazy_load():
             if "fun" in model:
                 transformer_config["in_channels"] = 33
@@ -235,7 +244,14 @@ class DownloadAndLoadCogVideoGGUFModel:
                 transformer_config["in_channels"] = 16
             transformer = CogVideoXTransformer3DModel.from_config(transformer_config)
-            transformer.to(torch.float8_e4m3fn)
+            if "2b" in model:
+                for name, param in transformer.named_parameters():
+                    if name != "pos_embedding":
+                        param.data = param.data.to(torch.float8_e4m3fn)
+                    else:
+                        param.data = param.data.to(torch.float16)
+            else:
+                transformer.to(torch.float8_e4m3fn)
 
             transformer = mz_gguf_loader.quantize_load_state_dict(transformer, sd, device="cpu")
             if load_device == "offload_device":
                 transformer.to(offload_device)
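In the 2b branch added above, every transformer parameter except pos_embedding is cast to float8_e4m3fn, while the positional embedding stays in fp16. A minimal standalone sketch of that selective cast (TinyModel and its parameter names are invented for illustration; only a PyTorch build that ships torch.float8_e4m3fn is assumed):

    import torch
    import torch.nn as nn

    class TinyModel(nn.Module):
        # stand-in for the transformer; "pos_embedding" mirrors the name checked above
        def __init__(self):
            super().__init__()
            self.pos_embedding = nn.Parameter(torch.zeros(16))
            self.proj = nn.Linear(16, 16)

    model = TinyModel()
    for name, param in model.named_parameters():
        if name != "pos_embedding":
            param.data = param.data.to(torch.float8_e4m3fn)  # weights stored in fp8
        else:
            param.data = param.data.to(torch.float16)        # embedding kept in fp16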
@@ -246,7 +262,7 @@ class DownloadAndLoadCogVideoGGUFModel:
             from .fp8_optimization import convert_fp8_linear
             convert_fp8_linear(transformer, vae_dtype)
 
-        scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json')
         with open(scheduler_path) as f:
             scheduler_config = json.load(f)
@@ -279,28 +295,31 @@ class DownloadAndLoadCogVideoGGUFModel:
         pipe = CogVideoXPipeline(vae, transformer, scheduler)
 
         # compilation
-        if compile == "torch":
-            torch._dynamo.config.suppress_errors = True
-            pipe.transformer.to(memory_format=torch.channels_last)
-            pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
-        elif compile == "onediff":
-            from onediffx import compile_pipe
-            os.environ['NEXFORT_FX_FORCE_TRITON_SDPA'] = '1'
-            pipe = compile_pipe(
-                pipe,
-                backend="nexfort",
-                options= {"mode": "max-optimize:max-autotune:max-autotune", "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": False, "triton.fuse_attention_allow_fp16_reduction": False}},
-                ignores=["vae"],
-                fuse_qkv_projections=True,
-            )
+        # if compile == "torch":
+        #     torch._dynamo.config.suppress_errors = True
+        #     pipe.transformer.to(memory_format=torch.channels_last)
+        #     pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
+        # elif compile == "onediff":
+        #     from onediffx import compile_pipe
+        #     os.environ['NEXFORT_FX_FORCE_TRITON_SDPA'] = '1'
+        #     pipe = compile_pipe(
+        #         pipe,
+        #         backend="nexfort",
+        #         options= {"mode": "max-optimize:max-autotune:max-autotune", "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": False, "triton.fuse_attention_allow_fp16_reduction": False}},
+        #         ignores=["vae"],
+        #         fuse_qkv_projections=True,
+        #     )
+
+        if enable_sequential_cpu_offload:
+            pipe.enable_sequential_cpu_offload()
 
         pipeline = {
             "pipe": pipe,
             "dtype": vae_dtype,
             "base_path": "Fun" if "fun" in model else "sad",
             "onediff": True if compile == "onediff" else False,
-            "cpu_offloading": False,
+            "cpu_offloading": enable_sequential_cpu_offload,
             "scheduler_config": scheduler_config
         }
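For context, enable_sequential_cpu_offload() is the stock diffusers pipeline method (backed by accelerate) that keeps submodules on the CPU and streams them to the GPU one piece at a time during inference, which is the trade-off the new tooltip describes: much lower VRAM usage, slower generation. A minimal sketch of the same call on a standard Hub checkpoint (the GGUF path in this commit assembles the CogVideoXPipeline from local configs instead, so the loading line here is illustrative only):

    import torch
    from diffusers import CogVideoXPipeline

    # illustrative load from the Hub; the node above constructs the pipeline manually
    pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)

    # stream submodules to the GPU on demand: much lower VRAM use, slower inference
    pipe.enable_sequential_cpu_offload()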