Merge remote-tracking branch 'kijai/main'

2026-07-14 15:37:03 +08:00 · 2024-09-21 12:04:20 -04:00 · 2024-09-21 12:04:20 -04:00 · 69dbbd3487
commit 69dbbd3487
parent b5d11895b6 ffece2db59
4 changed files with 575 additions and 79 deletions
--- a/cogvideox_fun/pipeline_cogvideox_inpaint.py
+++ b/cogvideox_fun/pipeline_cogvideox_inpaint.py
@ -209,7 +209,7 @@ class CogVideoX_Fun_Pipeline_Inpaint(DiffusionPipeline):
    """
    _optional_components = []
-    model_cpu_offload_seq = "text_encoder->vae->transformer->vae"
+    model_cpu_offload_seq = ">vae->transformer->vae"
    _callback_tensor_inputs = [
        "latents",
@ -631,7 +631,7 @@ class CogVideoX_Fun_Pipeline_Inpaint(DiffusionPipeline):
        device = self._execution_device
-        self.vae.to(device)
+        #self.vae.to(device)
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
--- a/cogvideox_fun/utils.py
+++ b/cogvideox_fun/utils.py
@ -1,20 +1,10 @@
 import os
 import gc
 import imageio
 import numpy as np
 import torch
 import torchvision
 import cv2
 from einops import rearrange
 from PIL import Image
 # Copyright (c) OpenMMLab. All rights reserved.
 import os
 import cv2
 import numpy as np
 import torch
 from PIL import Image
 def tensor2pil(image):
    """Turn an image tensor into a ``PIL.Image`` with 8-bit channels.

    Args:
        image: Object exposing ``.cpu().numpy()`` (presumably a torch tensor
            with values in ``[0, 1]`` -- confirm against callers).

    Returns:
        The result of ``Image.fromarray`` on the scaled ``uint8`` array.
    """
    # Scale to the 0-255 range on the host, then clamp before the uint8 cast
    # so out-of-range values saturate instead of wrapping around.
    scaled = 255. * image.cpu().numpy()
    pixels = np.clip(scaled, 0, 255).astype(np.uint8)
    return Image.fromarray(pixels)
@ -73,60 +63,6 @@ def get_width_and_height_from_image_and_base_resolution(image, base_resolution):
    height_slider = round(original_height * ratio)
    return height_slider, width_slider
 def color_transfer(sc, dc):
    """
    Transfer the color distribution of a reference image onto a source image.

    Both images are converted from RGB to LAB, the per-channel mean and
    standard deviation of ``sc`` are matched to those of ``dc``, and the
    result is converted back to RGB.

    Args:
        sc (numpy.ndarray): input image to be transferred.
        dc (numpy.ndarray): reference image.

    Returns:
        numpy.ndarray: ``sc`` with the color statistics of ``dc`` applied.
    """

    def get_mean_and_std(img):
        # Flatten the per-channel statistics into 1-D arrays, rounded to two
        # decimals, so they broadcast over the trailing channel axis.
        x_mean, x_std = cv2.meanStdDev(img)
        x_mean = np.hstack(np.around(x_mean, 2))
        x_std = np.hstack(np.around(x_std, 2))
        return x_mean, x_std

    sc = cv2.cvtColor(sc, cv2.COLOR_RGB2LAB)
    s_mean, s_std = get_mean_and_std(sc)
    dc = cv2.cvtColor(dc, cv2.COLOR_RGB2LAB)
    t_mean, t_std = get_mean_and_std(dc)

    # A flat source channel has zero spread (possibly only after rounding),
    # which would otherwise turn the division into inf/NaN and corrupt the
    # output. Such a channel has nothing to rescale, so divide by 1 instead.
    safe_s_std = np.where(s_std == 0, 1.0, s_std)
    img_n = ((sc - s_mean) * (t_std / safe_s_std)) + t_mean

    # Keep values inside the 8-bit range before converting back to uint8.
    img_n = np.clip(img_n, 0, 255)
    dst = cv2.cvtColor(cv2.convertScaleAbs(img_n), cv2.COLOR_LAB2RGB)
    return dst
 def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=12, imageio_backend=True, color_transfer_post_process=False):
    """Tile a batch of videos into one grid image per frame and save it.

    Args:
        videos: Tensor laid out as ``(b, c, t, h, w)``.
        path: Output file. Its parent directory is created when missing.
        rescale: When true, values are mapped from ``[-1, 1]`` to ``[0, 1]``
            before conversion to 8-bit.
        n_rows: Forwarded to ``torchvision.utils.make_grid`` as ``nrow``.
        fps: Frame rate handed to the imageio writer.
        imageio_backend: Write with ``imageio.mimsave`` when true; otherwise
            write a GIF through PIL (an ``.mp4`` suffix is turned into
            ``.gif``).
        color_transfer_post_process: When true, every frame after the first
            is color-matched to the first frame via ``color_transfer``.
    """
    frames = rearrange(videos, "b c t h w -> t b c h w")
    outputs = []
    for frame_batch in frames:
        grid = torchvision.utils.make_grid(frame_batch, nrow=n_rows)
        grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1)
        if rescale:
            grid = (grid + 1.0) / 2.0  # -1,1 -> 0,1
        # Clamp first: casting out-of-range floats to uint8 wraps around and
        # shows up as speckle artifacts in the saved frames.
        grid = (grid.clamp(0, 1) * 255).numpy().astype(np.uint8)
        outputs.append(Image.fromarray(grid))

    if color_transfer_post_process and outputs:
        # The reference frame never changes, so convert it only once.
        reference = np.uint8(outputs[0])
        outputs[1:] = [
            Image.fromarray(color_transfer(np.uint8(frame), reference))
            for frame in outputs[1:]
        ]

    # os.makedirs("") raises, so only create a directory when the path
    # actually has a directory component.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    if imageio_backend:
        if path.endswith("mp4"):
            imageio.mimsave(path, outputs, fps=fps)
        else:
            imageio.mimsave(path, outputs, duration=(1000 * 1/fps))
    else:
        # Swap only the trailing extension; a plain str.replace would also
        # rewrite ".mp4" occurring inside directory names.
        if path.endswith(".mp4"):
            path = path[:-len(".mp4")] + ".gif"
        # append_images holds the frames *after* the first one; passing the
        # full list would write the first frame twice.
        # NOTE(review): the frame duration is fixed at 100 ms and ignores
        # `fps` -- confirm whether that is intended.
        outputs[0].save(path, format='GIF', append_images=outputs[1:], save_all=True, duration=100, loop=0)
 def get_image_to_video_latent(validation_image_start, validation_image_end, video_length, sample_size):
    if validation_image_start is not None and validation_image_end is not None:
        if type(validation_image_start) is str and os.path.isfile(validation_image_start):
@ -224,18 +160,7 @@ def get_image_to_video_latent(validation_image_start, validation_image_end, vide
    return  input_video, input_video_mask, clip_image
 def get_video_to_video_latent(input_video_path, video_length, sample_size):
-    if type(input_video_path) is str:
+    input_video = input_video_path
        cap = cv2.VideoCapture(input_video_path)
        input_video = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.resize(frame, (sample_size[1], sample_size[0]))
            input_video.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        cap.release()
    else:
        input_video = input_video_path
    input_video = torch.from_numpy(np.array(input_video))[:video_length]
    input_video = input_video.permute([3, 0, 1, 2]).unsqueeze(0) / 255
--- a/examples/cogvidex_fun_5b_GGUF_10GB_VRAM_example_01.json
+++ b/examples/cogvidex_fun_5b_GGUF_10GB_VRAM_example_01.json
@ -0,0 +1,570 @@
 {
  "last_node_id": 48,
  "last_link_id": 101,
  "nodes": [
    {
      "id": 20,
      "type": "CLIPLoader",
      "pos": {
        "0": -26,
        "1": 400
      },
      "size": {
        "0": 451.30548095703125,
        "1": 82
      },
      "flags": {},
      "order": 0,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "CLIP",
          "type": "CLIP",
          "links": [
            54,
            56
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "CLIPLoader"
      },
      "widgets_values": [
        "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
        "sd3"
      ]
    },
    {
      "id": 31,
      "type": "CogVideoTextEncode",
      "pos": {
        "0": 497,
        "1": 520
      },
      "size": {
        "0": 463.01251220703125,
        "1": 124
      },
      "flags": {},
      "order": 4,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
          "link": 56
        }
      ],
      "outputs": [
        {
          "name": "conditioning",
          "type": "CONDITIONING",
          "links": [
            86
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoTextEncode"
      },
      "widgets_values": [
        "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. ",
        1,
        true
      ]
    },
    {
      "id": 44,
      "type": "VHS_VideoCombine",
      "pos": {
        "0": 1842,
        "1": 345
      },
      "size": [
        855.81494140625,
        927.6441243489584
      ],
      "flags": {},
      "order": 8,
      "mode": 0,
      "inputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "link": 97
        },
        {
          "name": "audio",
          "type": "AUDIO",
          "link": null
        },
        {
          "name": "meta_batch",
          "type": "VHS_BatchManager",
          "link": null
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": null
        }
      ],
      "outputs": [
        {
          "name": "Filenames",
          "type": "VHS_FILENAMES",
          "links": null,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "VHS_VideoCombine"
      },
      "widgets_values": {
        "frame_rate": 16,
        "loop_count": 0,
        "filename_prefix": "CogVideoX_Fun",
        "format": "video/h264-mp4",
        "pix_fmt": "yuv420p",
        "crf": 19,
        "save_metadata": true,
        "pingpong": false,
        "save_output": false,
        "videopreview": {
          "hidden": false,
          "paused": false,
          "params": {
            "filename": "CogVideoX_Fun_00012.mp4",
            "subfolder": "",
            "type": "temp",
            "format": "video/h264-mp4",
            "frame_rate": 16
          },
          "muted": false
        }
      }
    },
    {
      "id": 11,
      "type": "CogVideoDecode",
      "pos": {
        "0": 1448,
        "1": 345
      },
      "size": {
        "0": 300.396484375,
        "1": 198
      },
      "flags": {},
      "order": 7,
      "mode": 0,
      "inputs": [
        {
          "name": "pipeline",
          "type": "COGVIDEOPIPE",
          "link": 89
        },
        {
          "name": "samples",
          "type": "LATENT",
          "link": 88
        }
      ],
      "outputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "links": [
            97
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoDecode"
      },
      "widgets_values": [
        true,
        240,
        360,
        0.2,
        0.2,
        true
      ]
    },
    {
      "id": 36,
      "type": "LoadImage",
      "pos": {
        "0": 364,
        "1": 715
      },
      "size": {
        "0": 391.3421325683594,
        "1": 456.8497009277344
      },
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            71
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "MASK",
          "type": "MASK",
          "links": null,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "LoadImage"
      },
      "widgets_values": [
        "sd3stag.png",
        "image"
      ]
    },
    {
      "id": 37,
      "type": "ImageResizeKJ",
      "pos": {
        "0": 824,
        "1": 715
      },
      "size": {
        "0": 315,
        "1": 266
      },
      "flags": {},
      "order": 5,
      "mode": 0,
      "inputs": [
        {
          "name": "image",
          "type": "IMAGE",
          "link": 71
        },
        {
          "name": "get_image_size",
          "type": "IMAGE",
          "link": null
        },
        {
          "name": "width_input",
          "type": "INT",
          "link": null,
          "widget": {
            "name": "width_input"
          }
        },
        {
          "name": "height_input",
          "type": "INT",
          "link": null,
          "widget": {
            "name": "height_input"
          }
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            87
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "width",
          "type": "INT",
          "links": null,
          "shape": 3
        },
        {
          "name": "height",
          "type": "INT",
          "links": null,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "ImageResizeKJ"
      },
      "widgets_values": [
        720,
        480,
        "lanczos",
        true,
        16,
        0,
        0,
        "disabled"
      ]
    },
    {
      "id": 30,
      "type": "CogVideoTextEncode",
      "pos": {
        "0": 493,
        "1": 303
      },
      "size": {
        "0": 471.90142822265625,
        "1": 168.08047485351562
      },
      "flags": {},
      "order": 3,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
          "link": 54
        }
      ],
      "outputs": [
        {
          "name": "conditioning",
          "type": "CONDITIONING",
          "links": [
            85
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoTextEncode"
      },
      "widgets_values": [
        "majestic stag grazing in a forest and basking in the setting sun",
        1,
        true
      ]
    },
    {
      "id": 48,
      "type": "DownloadAndLoadCogVideoGGUFModel",
      "pos": {
        "0": 584,
        "1": 103
      },
      "size": {
        "0": 378,
        "1": 130
      },
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "cogvideo_pipe",
          "type": "COGVIDEOPIPE",
          "links": [
            101
          ],
          "shape": 3,
          "slot_index": 0
        }
      ],
      "properties": {
        "Node name for S&R": "DownloadAndLoadCogVideoGGUFModel"
      },
      "widgets_values": [
        "CogVideoX_5b_fun_GGUF_Q4_0.safetensors",
        "bf16",
        false,
        "offload_device"
      ]
    },
    {
      "id": 41,
      "type": "CogVideoXFunSampler",
      "pos": {
        "0": 1058,
        "1": 345
      },
      "size": {
        "0": 315,
        "1": 302
      },
      "flags": {},
      "order": 6,
      "mode": 0,
      "inputs": [
        {
          "name": "pipeline",
          "type": "COGVIDEOPIPE",
          "link": 101
        },
        {
          "name": "positive",
          "type": "CONDITIONING",
          "link": 85
        },
        {
          "name": "negative",
          "type": "CONDITIONING",
          "link": 86
        },
        {
          "name": "start_img",
          "type": "IMAGE",
          "link": 87
        },
        {
          "name": "end_img",
          "type": "IMAGE",
          "link": null
        },
        {
          "name": "opt_empty_latent",
          "type": "LATENT",
          "link": null
        }
      ],
      "outputs": [
        {
          "name": "cogvideo_pipe",
          "type": "COGVIDEOPIPE",
          "links": [
            89
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "samples",
          "type": "LATENT",
          "links": [
            88
          ],
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoXFunSampler"
      },
      "widgets_values": [
        49,
        512,
        44,
        "fixed",
        30,
        6,
        "CogVideoXDPMScheduler"
      ]
    }
  ],
  "links": [
    [
      54,
      20,
      0,
      30,
      0,
      "CLIP"
    ],
    [
      56,
      20,
      0,
      31,
      0,
      "CLIP"
    ],
    [
      71,
      36,
      0,
      37,
      0,
      "IMAGE"
    ],
    [
      85,
      30,
      0,
      41,
      1,
      "CONDITIONING"
    ],
    [
      86,
      31,
      0,
      41,
      2,
      "CONDITIONING"
    ],
    [
      87,
      37,
      0,
      41,
      3,
      "IMAGE"
    ],
    [
      88,
      41,
      1,
      11,
      1,
      "LATENT"
    ],
    [
      89,
      41,
      0,
      11,
      0,
      "COGVIDEOPIPE"
    ],
    [
      97,
      11,
      0,
      44,
      0,
      "IMAGE"
    ],
    [
      101,
      48,
      0,
      41,
      0,
      "COGVIDEOPIPE"
    ]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "ds": {
      "scale": 0.7627768444385654,
      "offset": [
        62.58315607223924,
        102.05205752424705
      ]
    }
  },
  "version": 0.4
 }
--- a/nodes.py
+++ b/nodes.py
@ -727,7 +727,8 @@ class CogVideoXFunSampler:
        base_path = pipeline["base_path"]
        assert "Fun" in base_path, "'Unfun' models not supported in 'CogVideoXFunSampler', use the 'CogVideoSampler'"
-        pipe.enable_model_cpu_offload(device=device)
+        if not pipeline["cpu_offloading"]:
            pipe.enable_model_cpu_offload(device=device)
        mm.soft_empty_cache()