diff --git a/model_loading.py b/model_loading.py
index db9d814..1402b59 100644
--- a/model_loading.py
+++ b/model_loading.py
@@ -108,6 +108,7 @@ class DownloadAndLoadCogVideoModel:
                     "alibaba-pai/CogVideoX-Fun-V1.1-5b-InP",
                     "alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose",
                     "alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose",
+                    "alibaba-pai/CogVideoX-Fun-V1.1-5b-Control",
                     "feizhengcong/CogvideoX-Interpolation",
                     "NimVideo/cogvideox-2b-img2vid"
                 ],
@@ -233,7 +234,7 @@ class DownloadAndLoadCogVideoModel:
             transformer,
             scheduler,
             dtype=dtype,
-            is_fun_inpaint=True if "fun" in model.lower() and "pose" not in model.lower() else False
+            is_fun_inpaint="fun" in model.lower() and not ("pose" in model.lower() or "control" in model.lower())
         )
         if "cogvideox-2b-img2vid" in model:
             pipe.input_with_padding = False
@@ -255,7 +256,6 @@ class DownloadAndLoadCogVideoModel:
                 adapter_weight = l['strength']
                 pipe.load_lora_weights(l['path'], weight_name=l['path'].split("/")[-1], lora_rank=lora_rank, adapter_name=adapter_name)
-                #transformer = load_lora_into_transformer(lora, transformer)
                 adapter_list.append(adapter_name)
                 adapter_weights.append(adapter_weight)
 
             for l in lora:
@@ -549,7 +549,12 @@ class DownloadAndLoadCogVideoGGUFModel:
             vae = AutoencoderKLCogVideoX.from_config(vae_config).to(vae_dtype).to(offload_device)
             vae.load_state_dict(vae_sd)
             del vae_sd
-        pipe = CogVideoXPipeline(transformer, scheduler, dtype=vae_dtype)
+        pipe = CogVideoXPipeline(
+            transformer,
+            scheduler,
+            dtype=vae_dtype,
+            is_fun_inpaint="fun" in model.lower() and not ("pose" in model.lower() or "control" in model.lower())
+        )
 
         if enable_sequential_cpu_offload:
             pipe.enable_sequential_cpu_offload()
@@ -675,7 +680,6 @@ class CogVideoXModelLoader:
                 set_module_tensor_to_device(transformer, name, device=transformer_load_device, dtype=base_dtype, value=sd[name])
         del sd
 
-        #scheduler
         with open(scheduler_config_path) as f:
             scheduler_config = json.load(f)
 
@@ -692,14 +696,12 @@ class CogVideoXModelLoader:
                     module.fuse_projections(fuse=True)
         transformer.attention_mode = attention_mode
 
-        if "fun" in model_type:
-            if not "pose" in model_type:
-                raise NotImplementedError("Fun models besides pose are not supported with this loader yet")
-                pipe = CogVideoX_Fun_Pipeline_Inpaint(vae, transformer, scheduler)
-            else:
-                pipe = CogVideoXPipeline(transformer, scheduler, dtype=base_dtype)
-        else:
-            pipe = CogVideoXPipeline(transformer, scheduler, dtype=base_dtype)
+        pipe = CogVideoXPipeline(
+            transformer,
+            scheduler,
+            dtype=base_dtype,
+            is_fun_inpaint="fun" in model.lower() and not ("pose" in model.lower() or "control" in model.lower())
+        )
 
         if enable_sequential_cpu_offload:
             pipe.enable_sequential_cpu_offload()
@@ -796,11 +798,6 @@ class CogVideoXModelLoader:
 
             manual_offloading = False # to disable manual .to(device) calls
             log.info(f"Quantized transformer blocks to {quantization}")
-
-        # if load_device == "offload_device":
-        #     pipe.transformer.to(offload_device)
-        # else:
-        #     pipe.transformer.to(device)
 
         pipeline = {
             "pipe": pipe,
@@ -812,7 +809,6 @@ class CogVideoXModelLoader:
             "model_name": model,
             "manual_offloading": manual_offloading,
         }
-
         return (pipeline,)
 
 #region VAE
diff --git a/nodes.py b/nodes.py
index 6af6b6c..3764c9d 100644
--- a/nodes.py
+++ b/nodes.py
@@ -343,13 +343,10 @@ class CogVideoImageEncodeFunInP:
 
         bs = 1
         new_mask_pixel_values = []
-        print("input_image shape: ",input_image.shape)
         for i in range(0, input_image.shape[0], bs):
             mask_pixel_values_bs = input_image[i : i + bs]
             mask_pixel_values_bs = vae.encode(mask_pixel_values_bs)[0]
-            print("mask_pixel_values_bs: ",mask_pixel_values_bs.parameters.shape)
             mask_pixel_values_bs = mask_pixel_values_bs.mode()
-            print("mask_pixel_values_bs: ",mask_pixel_values_bs.shape, mask_pixel_values_bs.min(), mask_pixel_values_bs.max())
             new_mask_pixel_values.append(mask_pixel_values_bs)
         masked_image_latents = torch.cat(new_mask_pixel_values, dim = 0)
         masked_image_latents = masked_image_latents.permute(0, 2, 1, 3, 4) # B, T, C, H, W
@@ -601,8 +598,7 @@ class CogVideoSampler:
 
         model_name = model.get("model_name", "")
         supports_image_conds = True if "I2V" in model_name or "interpolation" in model_name.lower() or "fun" in model_name.lower() else False
-
-        if "fun" in model_name.lower() and "pose" not in model_name.lower() and image_cond_latents is not None:
+        if "fun" in model_name.lower() and not ("pose" in model_name.lower() or "control" in model_name.lower()) and image_cond_latents is not None:
             assert image_cond_latents["mask"] is not None, "For fun inpaint models use CogVideoImageEncodeFunInP"
             fun_mask = image_cond_latents["mask"]
         else:
@@ -855,8 +851,8 @@ class CogVideoXFunResizeToClosestBucket:
         from .cogvideox_fun.utils import ASPECT_RATIO_512, get_closest_ratio
 
         B, H, W, C = images.shape
-        # Count most suitable height and width
-        aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
+        # Find most suitable height and width
+        aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
         closest_size, closest_ratio = get_closest_ratio(H, W, ratios=aspect_ratio_sample_size)
         height, width = [int(x / 16) * 16 for x in closest_size]
 
diff --git a/readme.md b/readme.md
index 97c72c1..cfe0578 100644
--- a/readme.md
+++ b/readme.md
@@ -1,5 +1,7 @@
 # WORK IN PROGRESS
 
+Spreadsheet (WIP) of supported models and their supported features: https://docs.google.com/spreadsheets/d/16eA6mSL8XkTcu9fSWkPSHfRIqyAKJbR1O99xnuGdCKY/edit?usp=sharing
+
 ## BREAKING Update8
 
 This is big one, and unfortunately to do the necessary cleanup and refactoring this will break every old workflow as they are.