Merge remote-tracking branch 'kijai/main'

2026-07-12 03:07:00 +08:00 · 2024-09-21 12:04:20 -04:00 · 2024-09-21 12:04:20 -04:00 · 69dbbd3487
commit 69dbbd3487
parent b5d11895b6 ffece2db59
4 changed files with 575 additions and 79 deletions
--- a/cogvideox_fun/pipeline_cogvideox_inpaint.py
+++ b/cogvideox_fun/pipeline_cogvideox_inpaint.py
@ -209,7 +209,7 @@ class CogVideoX_Fun_Pipeline_Inpaint(DiffusionPipeline):
    """

    _optional_components = []
-    model_cpu_offload_seq = "text_encoder->vae->transformer->vae"
+    model_cpu_offload_seq = ">vae->transformer->vae"

    _callback_tensor_inputs = [
        "latents",
@ -631,7 +631,7 @@ class CogVideoX_Fun_Pipeline_Inpaint(DiffusionPipeline):

        device = self._execution_device

-        self.vae.to(device)
+        #self.vae.to(device)

        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
--- a/cogvideox_fun/utils.py
+++ b/cogvideox_fun/utils.py
@ -1,20 +1,10 @@
 import os
 import gc
-import imageio
 import numpy as np
 import torch
-import torchvision
-import cv2
-from einops import rearrange
 from PIL import Image

 # Copyright (c) OpenMMLab. All rights reserved.
-import os
-import cv2
-import numpy as np
-import torch
-from PIL import Image
-

 def tensor2pil(image):
    return Image.fromarray(np.clip(255. * image.cpu().numpy(), 0, 255).astype(np.uint8))
@ -73,60 +63,6 @@ def get_width_and_height_from_image_and_base_resolution(image, base_resolution):
    height_slider = round(original_height * ratio)
    return height_slider, width_slider

-def color_transfer(sc, dc):
-    """
-    Transfer color distribution from of sc, referred to dc.
-
-    Args:
-        sc (numpy.ndarray): input image to be transfered.
-        dc (numpy.ndarray): reference image
-
-    Returns:
-        numpy.ndarray: Transferred color distribution on the sc.
-    """
-
-    def get_mean_and_std(img):
-        x_mean, x_std = cv2.meanStdDev(img)
-        x_mean = np.hstack(np.around(x_mean, 2))
-        x_std = np.hstack(np.around(x_std, 2))
-        return x_mean, x_std
-
-    sc = cv2.cvtColor(sc, cv2.COLOR_RGB2LAB)
-    s_mean, s_std = get_mean_and_std(sc)
-    dc = cv2.cvtColor(dc, cv2.COLOR_RGB2LAB)
-    t_mean, t_std = get_mean_and_std(dc)
-    img_n = ((sc - s_mean) * (t_std / s_std)) + t_mean
-    np.putmask(img_n, img_n > 255, 255)
-    np.putmask(img_n, img_n < 0, 0)
-    dst = cv2.cvtColor(cv2.convertScaleAbs(img_n), cv2.COLOR_LAB2RGB)
-    return dst
-
-def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=12, imageio_backend=True, color_transfer_post_process=False):
-    videos = rearrange(videos, "b c t h w -> t b c h w")
-    outputs = []
-    for x in videos:
-        x = torchvision.utils.make_grid(x, nrow=n_rows)
-        x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
-        if rescale:
-            x = (x + 1.0) / 2.0  # -1,1 -> 0,1
-        x = (x * 255).numpy().astype(np.uint8)
-        outputs.append(Image.fromarray(x))
-
-    if color_transfer_post_process:
-        for i in range(1, len(outputs)):
-            outputs[i] = Image.fromarray(color_transfer(np.uint8(outputs[i]), np.uint8(outputs[0])))
-
-    os.makedirs(os.path.dirname(path), exist_ok=True)
-    if imageio_backend:
-        if path.endswith("mp4"):
-            imageio.mimsave(path, outputs, fps=fps)
-        else:
-            imageio.mimsave(path, outputs, duration=(1000 * 1/fps))
-    else:
-        if path.endswith("mp4"):
-            path = path.replace('.mp4', '.gif')
-        outputs[0].save(path, format='GIF', append_images=outputs, save_all=True, duration=100, loop=0)
-
 def get_image_to_video_latent(validation_image_start, validation_image_end, video_length, sample_size):
    if validation_image_start is not None and validation_image_end is not None:
        if type(validation_image_start) is str and os.path.isfile(validation_image_start):
@ -224,18 +160,7 @@ def get_image_to_video_latent(validation_image_start, validation_image_end, vide
    return  input_video, input_video_mask, clip_image

 def get_video_to_video_latent(input_video_path, video_length, sample_size):
-    if type(input_video_path) is str:
-        cap = cv2.VideoCapture(input_video_path)
-        input_video = []
-        while True:
-            ret, frame = cap.read()
-            if not ret:
-                break
-            frame = cv2.resize(frame, (sample_size[1], sample_size[0]))
-            input_video.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
-        cap.release()
-    else:
-        input_video = input_video_path
+    input_video = input_video_path

    input_video = torch.from_numpy(np.array(input_video))[:video_length]
    input_video = input_video.permute([3, 0, 1, 2]).unsqueeze(0) / 255
--- a/examples/cogvidex_fun_5b_GGUF_10GB_VRAM_example_01.json
+++ b/examples/cogvidex_fun_5b_GGUF_10GB_VRAM_example_01.json
@ -0,0 +1,570 @@
+{
+  "last_node_id": 48,
+  "last_link_id": 101,
+  "nodes": [
+    {
+      "id": 20,
+      "type": "CLIPLoader",
+      "pos": {
+        "0": -26,
+        "1": 400
+      },
+      "size": {
+        "0": 451.30548095703125,
+        "1": 82
+      },
+      "flags": {},
+      "order": 0,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "CLIP",
+          "type": "CLIP",
+          "links": [
+            54,
+            56
+          ],
+          "slot_index": 0,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CLIPLoader"
+      },
+      "widgets_values": [
+        "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
+        "sd3"
+      ]
+    },
+    {
+      "id": 31,
+      "type": "CogVideoTextEncode",
+      "pos": {
+        "0": 497,
+        "1": 520
+      },
+      "size": {
+        "0": 463.01251220703125,
+        "1": 124
+      },
+      "flags": {},
+      "order": 4,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "clip",
+          "type": "CLIP",
+          "link": 56
+        }
+      ],
+      "outputs": [
+        {
+          "name": "conditioning",
+          "type": "CONDITIONING",
+          "links": [
+            86
+          ],
+          "slot_index": 0,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoTextEncode"
+      },
+      "widgets_values": [
+        "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. ",
+        1,
+        true
+      ]
+    },
+    {
+      "id": 44,
+      "type": "VHS_VideoCombine",
+      "pos": {
+        "0": 1842,
+        "1": 345
+      },
+      "size": [
+        855.81494140625,
+        927.6441243489584
+      ],
+      "flags": {},
+      "order": 8,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "images",
+          "type": "IMAGE",
+          "link": 97
+        },
+        {
+          "name": "audio",
+          "type": "AUDIO",
+          "link": null
+        },
+        {
+          "name": "meta_batch",
+          "type": "VHS_BatchManager",
+          "link": null
+        },
+        {
+          "name": "vae",
+          "type": "VAE",
+          "link": null
+        }
+      ],
+      "outputs": [
+        {
+          "name": "Filenames",
+          "type": "VHS_FILENAMES",
+          "links": null,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "VHS_VideoCombine"
+      },
+      "widgets_values": {
+        "frame_rate": 16,
+        "loop_count": 0,
+        "filename_prefix": "CogVideoX_Fun",
+        "format": "video/h264-mp4",
+        "pix_fmt": "yuv420p",
+        "crf": 19,
+        "save_metadata": true,
+        "pingpong": false,
+        "save_output": false,
+        "videopreview": {
+          "hidden": false,
+          "paused": false,
+          "params": {
+            "filename": "CogVideoX_Fun_00012.mp4",
+            "subfolder": "",
+            "type": "temp",
+            "format": "video/h264-mp4",
+            "frame_rate": 16
+          },
+          "muted": false
+        }
+      }
+    },
+    {
+      "id": 11,
+      "type": "CogVideoDecode",
+      "pos": {
+        "0": 1448,
+        "1": 345
+      },
+      "size": {
+        "0": 300.396484375,
+        "1": 198
+      },
+      "flags": {},
+      "order": 7,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "pipeline",
+          "type": "COGVIDEOPIPE",
+          "link": 89
+        },
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "link": 88
+        }
+      ],
+      "outputs": [
+        {
+          "name": "images",
+          "type": "IMAGE",
+          "links": [
+            97
+          ],
+          "slot_index": 0,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoDecode"
+      },
+      "widgets_values": [
+        true,
+        240,
+        360,
+        0.2,
+        0.2,
+        true
+      ]
+    },
+    {
+      "id": 36,
+      "type": "LoadImage",
+      "pos": {
+        "0": 364,
+        "1": 715
+      },
+      "size": {
+        "0": 391.3421325683594,
+        "1": 456.8497009277344
+      },
+      "flags": {},
+      "order": 1,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "IMAGE",
+          "type": "IMAGE",
+          "links": [
+            71
+          ],
+          "slot_index": 0,
+          "shape": 3
+        },
+        {
+          "name": "MASK",
+          "type": "MASK",
+          "links": null,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "LoadImage"
+      },
+      "widgets_values": [
+        "sd3stag.png",
+        "image"
+      ]
+    },
+    {
+      "id": 37,
+      "type": "ImageResizeKJ",
+      "pos": {
+        "0": 824,
+        "1": 715
+      },
+      "size": {
+        "0": 315,
+        "1": 266
+      },
+      "flags": {},
+      "order": 5,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "image",
+          "type": "IMAGE",
+          "link": 71
+        },
+        {
+          "name": "get_image_size",
+          "type": "IMAGE",
+          "link": null
+        },
+        {
+          "name": "width_input",
+          "type": "INT",
+          "link": null,
+          "widget": {
+            "name": "width_input"
+          }
+        },
+        {
+          "name": "height_input",
+          "type": "INT",
+          "link": null,
+          "widget": {
+            "name": "height_input"
+          }
+        }
+      ],
+      "outputs": [
+        {
+          "name": "IMAGE",
+          "type": "IMAGE",
+          "links": [
+            87
+          ],
+          "slot_index": 0,
+          "shape": 3
+        },
+        {
+          "name": "width",
+          "type": "INT",
+          "links": null,
+          "shape": 3
+        },
+        {
+          "name": "height",
+          "type": "INT",
+          "links": null,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "ImageResizeKJ"
+      },
+      "widgets_values": [
+        720,
+        480,
+        "lanczos",
+        true,
+        16,
+        0,
+        0,
+        "disabled"
+      ]
+    },
+    {
+      "id": 30,
+      "type": "CogVideoTextEncode",
+      "pos": {
+        "0": 493,
+        "1": 303
+      },
+      "size": {
+        "0": 471.90142822265625,
+        "1": 168.08047485351562
+      },
+      "flags": {},
+      "order": 3,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "clip",
+          "type": "CLIP",
+          "link": 54
+        }
+      ],
+      "outputs": [
+        {
+          "name": "conditioning",
+          "type": "CONDITIONING",
+          "links": [
+            85
+          ],
+          "slot_index": 0,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoTextEncode"
+      },
+      "widgets_values": [
+        "majestic stag grazing in a forest and basking in the setting sun",
+        1,
+        true
+      ]
+    },
+    {
+      "id": 48,
+      "type": "DownloadAndLoadCogVideoGGUFModel",
+      "pos": {
+        "0": 584,
+        "1": 103
+      },
+      "size": {
+        "0": 378,
+        "1": 130
+      },
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "cogvideo_pipe",
+          "type": "COGVIDEOPIPE",
+          "links": [
+            101
+          ],
+          "shape": 3,
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "DownloadAndLoadCogVideoGGUFModel"
+      },
+      "widgets_values": [
+        "CogVideoX_5b_fun_GGUF_Q4_0.safetensors",
+        "bf16",
+        false,
+        "offload_device"
+      ]
+    },
+    {
+      "id": 41,
+      "type": "CogVideoXFunSampler",
+      "pos": {
+        "0": 1058,
+        "1": 345
+      },
+      "size": {
+        "0": 315,
+        "1": 302
+      },
+      "flags": {},
+      "order": 6,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "pipeline",
+          "type": "COGVIDEOPIPE",
+          "link": 101
+        },
+        {
+          "name": "positive",
+          "type": "CONDITIONING",
+          "link": 85
+        },
+        {
+          "name": "negative",
+          "type": "CONDITIONING",
+          "link": 86
+        },
+        {
+          "name": "start_img",
+          "type": "IMAGE",
+          "link": 87
+        },
+        {
+          "name": "end_img",
+          "type": "IMAGE",
+          "link": null
+        },
+        {
+          "name": "opt_empty_latent",
+          "type": "LATENT",
+          "link": null
+        }
+      ],
+      "outputs": [
+        {
+          "name": "cogvideo_pipe",
+          "type": "COGVIDEOPIPE",
+          "links": [
+            89
+          ],
+          "slot_index": 0,
+          "shape": 3
+        },
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "links": [
+            88
+          ],
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoXFunSampler"
+      },
+      "widgets_values": [
+        49,
+        512,
+        44,
+        "fixed",
+        30,
+        6,
+        "CogVideoXDPMScheduler"
+      ]
+    }
+  ],
+  "links": [
+    [
+      54,
+      20,
+      0,
+      30,
+      0,
+      "CLIP"
+    ],
+    [
+      56,
+      20,
+      0,
+      31,
+      0,
+      "CLIP"
+    ],
+    [
+      71,
+      36,
+      0,
+      37,
+      0,
+      "IMAGE"
+    ],
+    [
+      85,
+      30,
+      0,
+      41,
+      1,
+      "CONDITIONING"
+    ],
+    [
+      86,
+      31,
+      0,
+      41,
+      2,
+      "CONDITIONING"
+    ],
+    [
+      87,
+      37,
+      0,
+      41,
+      3,
+      "IMAGE"
+    ],
+    [
+      88,
+      41,
+      1,
+      11,
+      1,
+      "LATENT"
+    ],
+    [
+      89,
+      41,
+      0,
+      11,
+      0,
+      "COGVIDEOPIPE"
+    ],
+    [
+      97,
+      11,
+      0,
+      44,
+      0,
+      "IMAGE"
+    ],
+    [
+      101,
+      48,
+      0,
+      41,
+      0,
+      "COGVIDEOPIPE"
+    ]
+  ],
+  "groups": [],
+  "config": {},
+  "extra": {
+    "ds": {
+      "scale": 0.7627768444385654,
+      "offset": [
+        62.58315607223924,
+        102.05205752424705
+      ]
+    }
+  },
+  "version": 0.4
+}
--- a/nodes.py
+++ b/nodes.py
@ -727,7 +727,8 @@ class CogVideoXFunSampler:
        base_path = pipeline["base_path"]
        assert "Fun" in base_path, "'Unfun' models not supported in 'CogVideoXFunSampler', use the 'CogVideoSampler'"

-        pipe.enable_model_cpu_offload(device=device)
+        if not pipeline["cpu_offloading"]:
+            pipe.enable_model_cpu_offload(device=device)

        mm.soft_empty_cache()