Merge remote-tracking branch 'kijai/main'

This commit is contained in:
Phr00t 2024-09-21 12:04:20 -04:00
commit 69dbbd3487
4 changed files with 575 additions and 79 deletions

View File

@ -209,7 +209,7 @@ class CogVideoX_Fun_Pipeline_Inpaint(DiffusionPipeline):
"""
_optional_components = []
model_cpu_offload_seq = "text_encoder->vae->transformer->vae"
model_cpu_offload_seq = ">vae->transformer->vae"
_callback_tensor_inputs = [
"latents",
@ -631,7 +631,7 @@ class CogVideoX_Fun_Pipeline_Inpaint(DiffusionPipeline):
device = self._execution_device
self.vae.to(device)
#self.vae.to(device)
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`

View File

@ -1,20 +1,10 @@
import os
import gc
import imageio
import numpy as np
import torch
import torchvision
import cv2
from einops import rearrange
from PIL import Image
# Copyright (c) OpenMMLab. All rights reserved.
import os
import cv2
import numpy as np
import torch
from PIL import Image
def tensor2pil(image):
    """Convert a tensor to a PIL image.

    The tensor is moved to the CPU, scaled by 255, clipped to [0, 255] and
    cast to ``uint8`` before being handed to ``Image.fromarray``.
    """
    scaled = 255. * image.cpu().numpy()
    pixels = np.clip(scaled, 0, 255).astype(np.uint8)
    return Image.fromarray(pixels)
@ -73,60 +63,6 @@ def get_width_and_height_from_image_and_base_resolution(image, base_resolution):
height_slider = round(original_height * ratio)
return height_slider, width_slider
def color_transfer(sc, dc):
    """
    Transfer the color distribution of ``dc`` onto ``sc``.

    Both images are converted from RGB to LAB. Each LAB channel of ``sc`` is
    then shifted and scaled so that its mean and standard deviation match
    those of ``dc``, and the result is converted back to RGB.

    Args:
        sc (numpy.ndarray): RGB input image to be recolored.
        dc (numpy.ndarray): RGB reference image supplying the target statistics.

    Returns:
        numpy.ndarray: ``sc`` with the color distribution of ``dc``, in RGB.
    """

    def get_mean_and_std(img):
        # Per-channel mean / standard deviation, rounded to 2 decimals.
        x_mean, x_std = cv2.meanStdDev(img)
        x_mean = np.hstack(np.around(x_mean, 2))
        x_std = np.hstack(np.around(x_std, 2))
        return x_mean, x_std

    sc = cv2.cvtColor(sc, cv2.COLOR_RGB2LAB)
    s_mean, s_std = get_mean_and_std(sc)
    dc = cv2.cvtColor(dc, cv2.COLOR_RGB2LAB)
    t_mean, t_std = get_mean_and_std(dc)

    # A flat source channel has a (rounded) std of 0; dividing by it would
    # produce inf/NaN pixels. Use a scale of 0 for such channels so they
    # simply take on the reference mean.
    scale = np.divide(t_std, s_std, out=np.zeros_like(t_std), where=s_std != 0)

    img_n = ((sc - s_mean) * scale) + t_mean
    # Keep values inside the 8-bit range before converting back.
    img_n = np.clip(img_n, 0, 255)
    dst = cv2.cvtColor(cv2.convertScaleAbs(img_n), cv2.COLOR_LAB2RGB)
    return dst
def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=12, imageio_backend=True, color_transfer_post_process=False):
    """
    Tile a batch of videos into a grid and write it to ``path``.

    Args:
        videos (torch.Tensor): Batch laid out as ``(b, c, t, h, w)``.
        path (str): Output file path. Missing parent directories are created.
        rescale (bool): If True, map values from [-1, 1] to [0, 1] first.
        n_rows (int): Passed as ``nrow`` to ``torchvision.utils.make_grid``.
        fps (int): Frame rate used by the imageio backend.
        imageio_backend (bool): If True write with ``imageio.mimsave``;
            otherwise write an animated GIF through PIL (a ``.mp4`` suffix
            is replaced by ``.gif``).
        color_transfer_post_process (bool): If True, recolor every frame
            after the first with ``color_transfer`` using the first frame
            as the reference.
    """
    videos = rearrange(videos, "b c t h w -> t b c h w")
    outputs = []
    for x in videos:
        x = torchvision.utils.make_grid(x, nrow=n_rows)
        x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
        if rescale:
            x = (x + 1.0) / 2.0  # -1,1 -> 0,1
        # .cpu() so tensors living on an accelerator can be converted too.
        x = (x * 255).cpu().numpy().astype(np.uint8)
        outputs.append(Image.fromarray(x))

    if color_transfer_post_process and outputs:
        # The first frame is the fixed reference; convert it only once.
        reference = np.uint8(outputs[0])
        outputs[1:] = [
            Image.fromarray(color_transfer(np.uint8(frame), reference))
            for frame in outputs[1:]
        ]

    # os.makedirs("") raises, so only create a directory when path has one.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    if imageio_backend:
        if path.endswith("mp4"):
            imageio.mimsave(path, outputs, fps=fps)
        else:
            imageio.mimsave(path, outputs, duration=(1000 * 1/fps))
    else:
        if path.endswith(".mp4"):
            # Swap only the file suffix; leave any ".mp4" in directory names alone.
            path = path[:-len(".mp4")] + ".gif"
        # append_images must exclude the first frame, which save() already writes.
        outputs[0].save(path, format='GIF', append_images=outputs[1:], save_all=True, duration=100, loop=0)
def get_image_to_video_latent(validation_image_start, validation_image_end, video_length, sample_size):
if validation_image_start is not None and validation_image_end is not None:
if type(validation_image_start) is str and os.path.isfile(validation_image_start):
@ -224,18 +160,7 @@ def get_image_to_video_latent(validation_image_start, validation_image_end, vide
return input_video, input_video_mask, clip_image
def get_video_to_video_latent(input_video_path, video_length, sample_size):
if type(input_video_path) is str:
cap = cv2.VideoCapture(input_video_path)
input_video = []
while True:
ret, frame = cap.read()
if not ret:
break
frame = cv2.resize(frame, (sample_size[1], sample_size[0]))
input_video.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
cap.release()
else:
input_video = input_video_path
input_video = input_video_path
input_video = torch.from_numpy(np.array(input_video))[:video_length]
input_video = input_video.permute([3, 0, 1, 2]).unsqueeze(0) / 255

View File

@ -0,0 +1,570 @@
{
"last_node_id": 48,
"last_link_id": 101,
"nodes": [
{
"id": 20,
"type": "CLIPLoader",
"pos": {
"0": -26,
"1": 400
},
"size": {
"0": 451.30548095703125,
"1": 82
},
"flags": {},
"order": 0,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "CLIP",
"type": "CLIP",
"links": [
54,
56
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CLIPLoader"
},
"widgets_values": [
"t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
"sd3"
]
},
{
"id": 31,
"type": "CogVideoTextEncode",
"pos": {
"0": 497,
"1": 520
},
"size": {
"0": 463.01251220703125,
"1": 124
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 56
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
86
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. ",
1,
true
]
},
{
"id": 44,
"type": "VHS_VideoCombine",
"pos": {
"0": 1842,
"1": 345
},
"size": [
855.81494140625,
927.6441243489584
],
"flags": {},
"order": 8,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 97
},
{
"name": "audio",
"type": "AUDIO",
"link": null
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
},
{
"name": "vae",
"type": "VAE",
"link": null
}
],
"outputs": [
{
"name": "Filenames",
"type": "VHS_FILENAMES",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_VideoCombine"
},
"widgets_values": {
"frame_rate": 16,
"loop_count": 0,
"filename_prefix": "CogVideoX_Fun",
"format": "video/h264-mp4",
"pix_fmt": "yuv420p",
"crf": 19,
"save_metadata": true,
"pingpong": false,
"save_output": false,
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "CogVideoX_Fun_00012.mp4",
"subfolder": "",
"type": "temp",
"format": "video/h264-mp4",
"frame_rate": 16
},
"muted": false
}
}
},
{
"id": 11,
"type": "CogVideoDecode",
"pos": {
"0": 1448,
"1": 345
},
"size": {
"0": 300.396484375,
"1": 198
},
"flags": {},
"order": 7,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 89
},
{
"name": "samples",
"type": "LATENT",
"link": 88
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
97
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
},
"widgets_values": [
true,
240,
360,
0.2,
0.2,
true
]
},
{
"id": 36,
"type": "LoadImage",
"pos": {
"0": 364,
"1": 715
},
"size": {
"0": 391.3421325683594,
"1": 456.8497009277344
},
"flags": {},
"order": 1,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
71
],
"slot_index": 0,
"shape": 3
},
{
"name": "MASK",
"type": "MASK",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "LoadImage"
},
"widgets_values": [
"sd3stag.png",
"image"
]
},
{
"id": 37,
"type": "ImageResizeKJ",
"pos": {
"0": 824,
"1": 715
},
"size": {
"0": 315,
"1": 266
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 71
},
{
"name": "get_image_size",
"type": "IMAGE",
"link": null
},
{
"name": "width_input",
"type": "INT",
"link": null,
"widget": {
"name": "width_input"
}
},
{
"name": "height_input",
"type": "INT",
"link": null,
"widget": {
"name": "height_input"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
87
],
"slot_index": 0,
"shape": 3
},
{
"name": "width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "height",
"type": "INT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": [
720,
480,
"lanczos",
true,
16,
0,
0,
"disabled"
]
},
{
"id": 30,
"type": "CogVideoTextEncode",
"pos": {
"0": 493,
"1": 303
},
"size": {
"0": 471.90142822265625,
"1": 168.08047485351562
},
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 54
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
85
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"majestic stag grazing in a forest and basking in the setting sun",
1,
true
]
},
{
"id": 48,
"type": "DownloadAndLoadCogVideoGGUFModel",
"pos": {
"0": 584,
"1": 103
},
"size": {
"0": 378,
"1": 130
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
101
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoGGUFModel"
},
"widgets_values": [
"CogVideoX_5b_fun_GGUF_Q4_0.safetensors",
"bf16",
false,
"offload_device"
]
},
{
"id": 41,
"type": "CogVideoXFunSampler",
"pos": {
"0": 1058,
"1": 345
},
"size": {
"0": 315,
"1": 302
},
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 101
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 85
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 86
},
{
"name": "start_img",
"type": "IMAGE",
"link": 87
},
{
"name": "end_img",
"type": "IMAGE",
"link": null
},
{
"name": "opt_empty_latent",
"type": "LATENT",
"link": null
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
89
],
"slot_index": 0,
"shape": 3
},
{
"name": "samples",
"type": "LATENT",
"links": [
88
],
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoXFunSampler"
},
"widgets_values": [
49,
512,
44,
"fixed",
30,
6,
"CogVideoXDPMScheduler"
]
}
],
"links": [
[
54,
20,
0,
30,
0,
"CLIP"
],
[
56,
20,
0,
31,
0,
"CLIP"
],
[
71,
36,
0,
37,
0,
"IMAGE"
],
[
85,
30,
0,
41,
1,
"CONDITIONING"
],
[
86,
31,
0,
41,
2,
"CONDITIONING"
],
[
87,
37,
0,
41,
3,
"IMAGE"
],
[
88,
41,
1,
11,
1,
"LATENT"
],
[
89,
41,
0,
11,
0,
"COGVIDEOPIPE"
],
[
97,
11,
0,
44,
0,
"IMAGE"
],
[
101,
48,
0,
41,
0,
"COGVIDEOPIPE"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.7627768444385654,
"offset": [
62.58315607223924,
102.05205752424705
]
}
},
"version": 0.4
}

View File

@ -727,7 +727,8 @@ class CogVideoXFunSampler:
base_path = pipeline["base_path"]
assert "Fun" in base_path, "'Unfun' models not supported in 'CogVideoXFunSampler', use the 'CogVideoSampler'"
pipe.enable_model_cpu_offload(device=device)
if not pipeline["cpu_offloading"]:
pipe.enable_model_cpu_offload(device=device)
mm.soft_empty_cache()