Add CogVideoX-Fun-V1.1-5b-Control

https://huggingface.co/alibaba-pai/CogVideoX-Fun-V1.1-5b-Control
kijai 2024-11-20 01:23:54 +02:00
parent c9efefe736
commit ecd067260c
3 changed files with 19 additions and 25 deletions
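In short, the change appends the new Control model ID to the loader's download list and rewrites the is_fun_inpaint flag as a single boolean expression, so that both the Pose and the new Control Fun variants bypass the inpaint path. A minimal sketch of that predicate (the helper name is illustrative, not part of the code):

def is_fun_inpaint(model: str) -> bool:
    # True only for CogVideoX-Fun inpaint (InP) variants;
    # Pose and Control variants are routed past the inpaint path.
    name = model.lower()
    return "fun" in name and not ("pose" in name or "control" in name)

# Checks against model IDs from the list below
assert is_fun_inpaint("alibaba-pai/CogVideoX-Fun-V1.1-5b-InP")
assert not is_fun_inpaint("alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose")
assert not is_fun_inpaint("alibaba-pai/CogVideoX-Fun-V1.1-5b-Control")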


@@ -108,6 +108,7 @@ class DownloadAndLoadCogVideoModel:
"alibaba-pai/CogVideoX-Fun-V1.1-5b-InP",
"alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose",
"alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose",
"alibaba-pai/CogVideoX-Fun-V1.1-5b-Control",
"feizhengcong/CogvideoX-Interpolation",
"NimVideo/cogvideox-2b-img2vid"
],
@@ -233,7 +234,7 @@ class DownloadAndLoadCogVideoModel:
transformer,
scheduler,
dtype=dtype,
is_fun_inpaint=True if "fun" in model.lower() and "pose" not in model.lower() else False
is_fun_inpaint="fun" in model.lower() and not ("pose" in model.lower() or "control" in model.lower())
)
if "cogvideox-2b-img2vid" in model:
pipe.input_with_padding = False
@@ -255,7 +256,6 @@ class DownloadAndLoadCogVideoModel:
adapter_weight = l['strength']
pipe.load_lora_weights(l['path'], weight_name=l['path'].split("/")[-1], lora_rank=lora_rank, adapter_name=adapter_name)
#transformer = load_lora_into_transformer(lora, transformer)
adapter_list.append(adapter_name)
adapter_weights.append(adapter_weight)
for l in lora:
@@ -549,7 +549,12 @@ class DownloadAndLoadCogVideoGGUFModel:
vae = AutoencoderKLCogVideoX.from_config(vae_config).to(vae_dtype).to(offload_device)
vae.load_state_dict(vae_sd)
del vae_sd
pipe = CogVideoXPipeline(transformer, scheduler, dtype=vae_dtype)
pipe = CogVideoXPipeline(
transformer,
scheduler,
dtype=vae_dtype,
is_fun_inpaint="fun" in model.lower() and not ("pose" in model.lower() or "control" in model.lower())
)
if enable_sequential_cpu_offload:
pipe.enable_sequential_cpu_offload()
@@ -675,7 +680,6 @@ class CogVideoXModelLoader:
set_module_tensor_to_device(transformer, name, device=transformer_load_device, dtype=base_dtype, value=sd[name])
del sd
#scheduler
with open(scheduler_config_path) as f:
scheduler_config = json.load(f)
@@ -692,14 +696,12 @@ class CogVideoXModelLoader:
module.fuse_projections(fuse=True)
transformer.attention_mode = attention_mode
if "fun" in model_type:
if not "pose" in model_type:
raise NotImplementedError("Fun models besides pose are not supported with this loader yet")
pipe = CogVideoX_Fun_Pipeline_Inpaint(vae, transformer, scheduler)
else:
pipe = CogVideoXPipeline(transformer, scheduler, dtype=base_dtype)
else:
pipe = CogVideoXPipeline(transformer, scheduler, dtype=base_dtype)
pipe = CogVideoXPipeline(
transformer,
scheduler,
dtype=base_dtype,
is_fun_inpaint="fun" in model.lower() and not ("pose" in model.lower() or "control" in model.lower())
)
if enable_sequential_cpu_offload:
pipe.enable_sequential_cpu_offload()
@@ -796,11 +798,6 @@ class CogVideoXModelLoader:
manual_offloading = False # to disable manual .to(device) calls
log.info(f"Quantized transformer blocks to {quantization}")
# if load_device == "offload_device":
# pipe.transformer.to(offload_device)
# else:
# pipe.transformer.to(device)
pipeline = {
"pipe": pipe,
@@ -812,7 +809,6 @@ class CogVideoXModelLoader:
"model_name": model,
"manual_offloading": manual_offloading,
}
return (pipeline,)
#region VAE


@@ -343,13 +343,10 @@ class CogVideoImageEncodeFunInP:
bs = 1
new_mask_pixel_values = []
print("input_image shape: ",input_image.shape)
for i in range(0, input_image.shape[0], bs):
mask_pixel_values_bs = input_image[i : i + bs]
mask_pixel_values_bs = vae.encode(mask_pixel_values_bs)[0]
print("mask_pixel_values_bs: ",mask_pixel_values_bs.parameters.shape)
mask_pixel_values_bs = mask_pixel_values_bs.mode()
print("mask_pixel_values_bs: ",mask_pixel_values_bs.shape, mask_pixel_values_bs.min(), mask_pixel_values_bs.max())
new_mask_pixel_values.append(mask_pixel_values_bs)
masked_image_latents = torch.cat(new_mask_pixel_values, dim = 0)
masked_image_latents = masked_image_latents.permute(0, 2, 1, 3, 4) # B, T, C, H, W
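For reference, the loop above pushes the conditioning frames through the VAE encoder in small batches, takes the mode of each returned latent distribution (no sampling), concatenates the results, and reorders them to (B, T, C, H, W). A standalone sketch of that flow, assuming a video VAE whose encode() returns the latent distribution as its first element:

import torch

def encode_fun_mask_latents(vae, input_image: torch.Tensor, bs: int = 1) -> torch.Tensor:
    # input_image: pixel values shaped (B, C, T, H, W), already on the VAE's device/dtype
    chunks = []
    for i in range(0, input_image.shape[0], bs):
        dist = vae.encode(input_image[i : i + bs])[0]  # latent distribution
        chunks.append(dist.mode())                     # deterministic latents, no sampling
    latents = torch.cat(chunks, dim=0)
    return latents.permute(0, 2, 1, 3, 4)              # (B, C, T, H, W) -> (B, T, C, H, W)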
@@ -601,8 +598,7 @@ class CogVideoSampler:
model_name = model.get("model_name", "")
supports_image_conds = True if "I2V" in model_name or "interpolation" in model_name.lower() or "fun" in model_name.lower() else False
if "fun" in model_name.lower() and "pose" not in model_name.lower() and image_cond_latents is not None:
if "fun" in model_name.lower() and not ("pose" in model_name.lower() or "control" in model_name.lower()) and image_cond_latents is not None:
assert image_cond_latents["mask"] is not None, "For fun inpaint models use CogVideoImageEncodeFunInP"
fun_mask = image_cond_latents["mask"]
else:
@@ -855,8 +851,8 @@ class CogVideoXFunResizeToClosestBucket:
from .cogvideox_fun.utils import ASPECT_RATIO_512, get_closest_ratio
B, H, W, C = images.shape
# Count most suitable height and width
aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
# Find most suitable height and width
aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
closest_size, closest_ratio = get_closest_ratio(H, W, ratios=aspect_ratio_sample_size)
height, width = [int(x / 16) * 16 for x in closest_size]
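For reference, the node above scales the 512-based aspect-ratio buckets to the requested base resolution, lets get_closest_ratio pick the bucket nearest the input frame's aspect ratio, and snaps the result to multiples of 16. A self-contained sketch under the assumption that the bucket table maps ratio keys to [height, width] pairs (closest_bucket is an illustrative stand-in, not the wrapper's actual get_closest_ratio):

def closest_bucket(H: int, W: int, aspect_ratio_512: dict, base_resolution: int = 512):
    # Scale every 512-based bucket [height, width] to the requested base resolution
    buckets = {k: [x / 512 * base_resolution for x in v] for k, v in aspect_ratio_512.items()}
    # Pick the bucket whose height/width ratio is closest to the input frame's
    target = H / W
    key = min(buckets, key=lambda k: abs(buckets[k][0] / buckets[k][1] - target))
    # Snap both dimensions to multiples of 16, as the pipeline expects
    height, width = (int(x / 16) * 16 for x in buckets[key])
    return height, width

# Example with a hypothetical single-entry bucket table:
# closest_bucket(720, 1280, {"0.56": [384, 672]}) -> (384, 672)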


@@ -1,5 +1,7 @@
# WORK IN PROGRESS
Spreadsheet (WIP) of supported models and their features: https://docs.google.com/spreadsheets/d/16eA6mSL8XkTcu9fSWkPSHfRIqyAKJbR1O99xnuGdCKY/edit?usp=sharing
## BREAKING Update8
This is a big one, and unfortunately the necessary cleanup and refactoring will break every old workflow as it is.