From 73fa4be48fb495a1b8360c26eec628a9e53d60db Mon Sep 17 00:00:00 2001 From: kijai <40791699+kijai@users.noreply.github.com> Date: Sat, 21 Sep 2024 16:37:54 +0300 Subject: [PATCH 1/4] Cleanup unused dependencies --- cogvideox_fun/utils.py | 77 +----------------------------------------- 1 file changed, 1 insertion(+), 76 deletions(-) diff --git a/cogvideox_fun/utils.py b/cogvideox_fun/utils.py index e9c5cc7..0b71f55 100644 --- a/cogvideox_fun/utils.py +++ b/cogvideox_fun/utils.py @@ -1,20 +1,10 @@ import os import gc -import imageio import numpy as np import torch -import torchvision -import cv2 -from einops import rearrange from PIL import Image # Copyright (c) OpenMMLab. All rights reserved. -import os -import cv2 -import numpy as np -import torch -from PIL import Image - def tensor2pil(image): return Image.fromarray(np.clip(255. * image.cpu().numpy(), 0, 255).astype(np.uint8)) @@ -73,60 +63,6 @@ def get_width_and_height_from_image_and_base_resolution(image, base_resolution): height_slider = round(original_height * ratio) return height_slider, width_slider -def color_transfer(sc, dc): - """ - Transfer color distribution from of sc, referred to dc. - - Args: - sc (numpy.ndarray): input image to be transfered. - dc (numpy.ndarray): reference image - - Returns: - numpy.ndarray: Transferred color distribution on the sc. - """ - - def get_mean_and_std(img): - x_mean, x_std = cv2.meanStdDev(img) - x_mean = np.hstack(np.around(x_mean, 2)) - x_std = np.hstack(np.around(x_std, 2)) - return x_mean, x_std - - sc = cv2.cvtColor(sc, cv2.COLOR_RGB2LAB) - s_mean, s_std = get_mean_and_std(sc) - dc = cv2.cvtColor(dc, cv2.COLOR_RGB2LAB) - t_mean, t_std = get_mean_and_std(dc) - img_n = ((sc - s_mean) * (t_std / s_std)) + t_mean - np.putmask(img_n, img_n > 255, 255) - np.putmask(img_n, img_n < 0, 0) - dst = cv2.cvtColor(cv2.convertScaleAbs(img_n), cv2.COLOR_LAB2RGB) - return dst - -def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=12, imageio_backend=True, color_transfer_post_process=False): - videos = rearrange(videos, "b c t h w -> t b c h w") - outputs = [] - for x in videos: - x = torchvision.utils.make_grid(x, nrow=n_rows) - x = x.transpose(0, 1).transpose(1, 2).squeeze(-1) - if rescale: - x = (x + 1.0) / 2.0 # -1,1 -> 0,1 - x = (x * 255).numpy().astype(np.uint8) - outputs.append(Image.fromarray(x)) - - if color_transfer_post_process: - for i in range(1, len(outputs)): - outputs[i] = Image.fromarray(color_transfer(np.uint8(outputs[i]), np.uint8(outputs[0]))) - - os.makedirs(os.path.dirname(path), exist_ok=True) - if imageio_backend: - if path.endswith("mp4"): - imageio.mimsave(path, outputs, fps=fps) - else: - imageio.mimsave(path, outputs, duration=(1000 * 1/fps)) - else: - if path.endswith("mp4"): - path = path.replace('.mp4', '.gif') - outputs[0].save(path, format='GIF', append_images=outputs, save_all=True, duration=100, loop=0) - def get_image_to_video_latent(validation_image_start, validation_image_end, video_length, sample_size): if validation_image_start is not None and validation_image_end is not None: if type(validation_image_start) is str and os.path.isfile(validation_image_start): @@ -224,18 +160,7 @@ def get_image_to_video_latent(validation_image_start, validation_image_end, vide return input_video, input_video_mask, clip_image def get_video_to_video_latent(input_video_path, video_length, sample_size): - if type(input_video_path) is str: - cap = cv2.VideoCapture(input_video_path) - input_video = [] - while True: - ret, frame = cap.read() - if not ret: - break - frame = cv2.resize(frame, (sample_size[1], sample_size[0])) - input_video.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) - cap.release() - else: - input_video = input_video_path + input_video = input_video_path input_video = torch.from_numpy(np.array(input_video))[:video_length] input_video = input_video.permute([3, 0, 1, 2]).unsqueeze(0) / 255 From 006507a9fe56154438c60b45241cb404e66b281d Mon Sep 17 00:00:00 2001 From: kijai <40791699+kijai@users.noreply.github.com> Date: Sat, 21 Sep 2024 16:46:59 +0300 Subject: [PATCH 2/4] Don't overwrite sequential cpu offloading in the fun sampler --- nodes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nodes.py b/nodes.py index fc853c2..4577310 100644 --- a/nodes.py +++ b/nodes.py @@ -737,7 +737,8 @@ class CogVideoXFunSampler: base_path = pipeline["base_path"] assert "Fun" in base_path, "'Unfun' models not supported in 'CogVideoXFunSampler', use the 'CogVideoSampler'" - pipe.enable_model_cpu_offload(device=device) + if not pipeline["cpu_offloading"]: + pipe.enable_model_cpu_offload(device=device) mm.soft_empty_cache() From bc31982707f40180a1e3423c2b8b4e5fb10d0e80 Mon Sep 17 00:00:00 2001 From: kijai <40791699+kijai@users.noreply.github.com> Date: Sat, 21 Sep 2024 18:43:26 +0300 Subject: [PATCH 3/4] fix for sequential_cpu_offload in "fun" pipeline --- cogvideox_fun/pipeline_cogvideox_inpaint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cogvideox_fun/pipeline_cogvideox_inpaint.py b/cogvideox_fun/pipeline_cogvideox_inpaint.py index 466da13..f372342 100644 --- a/cogvideox_fun/pipeline_cogvideox_inpaint.py +++ b/cogvideox_fun/pipeline_cogvideox_inpaint.py @@ -209,7 +209,7 @@ class CogVideoX_Fun_Pipeline_Inpaint(DiffusionPipeline): """ _optional_components = [] - model_cpu_offload_seq = "text_encoder->vae->transformer->vae" + model_cpu_offload_seq = ">vae->transformer->vae" _callback_tensor_inputs = [ "latents", @@ -631,7 +631,7 @@ class CogVideoX_Fun_Pipeline_Inpaint(DiffusionPipeline): device = self._execution_device - self.vae.to(device) + #self.vae.to(device) # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` From ffece2db599caa8d90ab13860461dee091f722d0 Mon Sep 17 00:00:00 2001 From: kijai <40791699+kijai@users.noreply.github.com> Date: Sat, 21 Sep 2024 18:58:26 +0300 Subject: [PATCH 4/4] Create cogvidex_fun_5b_GGUF_10GB_VRAM_example_01.json --- ...idex_fun_5b_GGUF_10GB_VRAM_example_01.json | 570 ++++++++++++++++++ 1 file changed, 570 insertions(+) create mode 100644 examples/cogvidex_fun_5b_GGUF_10GB_VRAM_example_01.json diff --git a/examples/cogvidex_fun_5b_GGUF_10GB_VRAM_example_01.json b/examples/cogvidex_fun_5b_GGUF_10GB_VRAM_example_01.json new file mode 100644 index 0000000..1dc562c --- /dev/null +++ b/examples/cogvidex_fun_5b_GGUF_10GB_VRAM_example_01.json @@ -0,0 +1,570 @@ +{ + "last_node_id": 48, + "last_link_id": 101, + "nodes": [ + { + "id": 20, + "type": "CLIPLoader", + "pos": { + "0": -26, + "1": 400 + }, + "size": { + "0": 451.30548095703125, + "1": 82 + }, + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "CLIP", + "type": "CLIP", + "links": [ + 54, + 56 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CLIPLoader" + }, + "widgets_values": [ + "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", + "sd3" + ] + }, + { + "id": 31, + "type": "CogVideoTextEncode", + "pos": { + "0": 497, + "1": 520 + }, + "size": { + "0": 463.01251220703125, + "1": 124 + }, + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 56 + } + ], + "outputs": [ + { + "name": "conditioning", + "type": "CONDITIONING", + "links": [ + 86 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoTextEncode" + }, + "widgets_values": [ + "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. ", + 1, + true + ] + }, + { + "id": 44, + "type": "VHS_VideoCombine", + "pos": { + "0": 1842, + "1": 345 + }, + "size": [ + 855.81494140625, + 927.6441243489584 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 97 + }, + { + "name": "audio", + "type": "AUDIO", + "link": null + }, + { + "name": "meta_batch", + "type": "VHS_BatchManager", + "link": null + }, + { + "name": "vae", + "type": "VAE", + "link": null + } + ], + "outputs": [ + { + "name": "Filenames", + "type": "VHS_FILENAMES", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "VHS_VideoCombine" + }, + "widgets_values": { + "frame_rate": 16, + "loop_count": 0, + "filename_prefix": "CogVideoX_Fun", + "format": "video/h264-mp4", + "pix_fmt": "yuv420p", + "crf": 19, + "save_metadata": true, + "pingpong": false, + "save_output": false, + "videopreview": { + "hidden": false, + "paused": false, + "params": { + "filename": "CogVideoX_Fun_00012.mp4", + "subfolder": "", + "type": "temp", + "format": "video/h264-mp4", + "frame_rate": 16 + }, + "muted": false + } + } + }, + { + "id": 11, + "type": "CogVideoDecode", + "pos": { + "0": 1448, + "1": 345 + }, + "size": { + "0": 300.396484375, + "1": 198 + }, + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 89 + }, + { + "name": "samples", + "type": "LATENT", + "link": 88 + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 97 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoDecode" + }, + "widgets_values": [ + true, + 240, + 360, + 0.2, + 0.2, + true + ] + }, + { + "id": 36, + "type": "LoadImage", + "pos": { + "0": 364, + "1": 715 + }, + "size": { + "0": 391.3421325683594, + "1": 456.8497009277344 + }, + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 71 + ], + "slot_index": 0, + "shape": 3 + }, + { + "name": "MASK", + "type": "MASK", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "sd3stag.png", + "image" + ] + }, + { + "id": 37, + "type": "ImageResizeKJ", + "pos": { + "0": 824, + "1": 715 + }, + "size": { + "0": 315, + "1": 266 + }, + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "image", + "type": "IMAGE", + "link": 71 + }, + { + "name": "get_image_size", + "type": "IMAGE", + "link": null + }, + { + "name": "width_input", + "type": "INT", + "link": null, + "widget": { + "name": "width_input" + } + }, + { + "name": "height_input", + "type": "INT", + "link": null, + "widget": { + "name": "height_input" + } + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 87 + ], + "slot_index": 0, + "shape": 3 + }, + { + "name": "width", + "type": "INT", + "links": null, + "shape": 3 + }, + { + "name": "height", + "type": "INT", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "ImageResizeKJ" + }, + "widgets_values": [ + 720, + 480, + "lanczos", + true, + 16, + 0, + 0, + "disabled" + ] + }, + { + "id": 30, + "type": "CogVideoTextEncode", + "pos": { + "0": 493, + "1": 303 + }, + "size": { + "0": 471.90142822265625, + "1": 168.08047485351562 + }, + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 54 + } + ], + "outputs": [ + { + "name": "conditioning", + "type": "CONDITIONING", + "links": [ + 85 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoTextEncode" + }, + "widgets_values": [ + "majestic stag grazing in a forest and basking in the setting sun", + 1, + true + ] + }, + { + "id": 48, + "type": "DownloadAndLoadCogVideoGGUFModel", + "pos": { + "0": 584, + "1": 103 + }, + "size": { + "0": 378, + "1": 130 + }, + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "cogvideo_pipe", + "type": "COGVIDEOPIPE", + "links": [ + 101 + ], + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DownloadAndLoadCogVideoGGUFModel" + }, + "widgets_values": [ + "CogVideoX_5b_fun_GGUF_Q4_0.safetensors", + "bf16", + false, + "offload_device" + ] + }, + { + "id": 41, + "type": "CogVideoXFunSampler", + "pos": { + "0": 1058, + "1": 345 + }, + "size": { + "0": 315, + "1": 302 + }, + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "COGVIDEOPIPE", + "link": 101 + }, + { + "name": "positive", + "type": "CONDITIONING", + "link": 85 + }, + { + "name": "negative", + "type": "CONDITIONING", + "link": 86 + }, + { + "name": "start_img", + "type": "IMAGE", + "link": 87 + }, + { + "name": "end_img", + "type": "IMAGE", + "link": null + }, + { + "name": "opt_empty_latent", + "type": "LATENT", + "link": null + } + ], + "outputs": [ + { + "name": "cogvideo_pipe", + "type": "COGVIDEOPIPE", + "links": [ + 89 + ], + "slot_index": 0, + "shape": 3 + }, + { + "name": "samples", + "type": "LATENT", + "links": [ + 88 + ], + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CogVideoXFunSampler" + }, + "widgets_values": [ + 49, + 512, + 44, + "fixed", + 30, + 6, + "CogVideoXDPMScheduler" + ] + } + ], + "links": [ + [ + 54, + 20, + 0, + 30, + 0, + "CLIP" + ], + [ + 56, + 20, + 0, + 31, + 0, + "CLIP" + ], + [ + 71, + 36, + 0, + 37, + 0, + "IMAGE" + ], + [ + 85, + 30, + 0, + 41, + 1, + "CONDITIONING" + ], + [ + 86, + 31, + 0, + 41, + 2, + "CONDITIONING" + ], + [ + 87, + 37, + 0, + 41, + 3, + "IMAGE" + ], + [ + 88, + 41, + 1, + 11, + 1, + "LATENT" + ], + [ + 89, + 41, + 0, + 11, + 0, + "COGVIDEOPIPE" + ], + [ + 97, + 11, + 0, + 44, + 0, + "IMAGE" + ], + [ + 101, + 48, + 0, + 41, + 0, + "COGVIDEOPIPE" + ] + ], + "groups": [], + "config": {}, + "extra": { + "ds": { + "scale": 0.7627768444385654, + "offset": [ + 62.58315607223924, + 102.05205752424705 + ] + } + }, + "version": 0.4 +} \ No newline at end of file