update examples, expose scheduler, force T5 offload

This commit is contained in:
kijai 2024-08-07 01:10:19 +03:00
parent 8a0af3b663
commit 97e89d596e
4 changed files with 436 additions and 410 deletions

View File

@ -1,46 +1,7 @@
{
"last_node_id": 59,
"last_link_id": 137,
"last_node_id": 64,
"last_link_id": 167,
"nodes": [
{
"id": 31,
"type": "CogVideoTextEncode",
"pos": [
503,
521
],
"size": {
"0": 463.01251220703125,
"1": 98.10446166992188
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 56
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
80
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
""
]
},
{
"id": 1,
"type": "DownloadAndLoadCogVideoModel",
@ -60,8 +21,8 @@
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
78,
83
83,
159
],
"shape": 3,
"slot_index": 0
@ -108,47 +69,6 @@
"sd3"
]
},
{
"id": 11,
"type": "CogVideoDecode",
"pos": [
1199,
661
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 9,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 81
},
{
"name": "samples",
"type": "LATENT",
"link": 82
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
118
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{
"id": 56,
"type": "SimpleMath+",
@ -233,7 +153,7 @@
"name": "samples",
"type": "LATENT",
"links": [
122
162
],
"shape": 3,
"slot_index": 0
@ -301,84 +221,6 @@
"Node name for S&R": "GetImageSizeAndCount"
}
},
{
"id": 41,
"type": "ImageResizeKJ",
"pos": [
315,
-19
],
"size": {
"0": 315,
"1": 242
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 128
},
{
"name": "get_image_size",
"type": "IMAGE",
"link": null
},
{
"name": "width_input",
"type": "INT",
"link": null,
"widget": {
"name": "width_input"
}
},
{
"name": "height_input",
"type": "INT",
"link": null,
"widget": {
"name": "height_input"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
126
],
"shape": 3,
"slot_index": 0
},
{
"name": "width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "height",
"type": "INT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": [
720,
480,
"lanczos",
false,
2,
0,
0
]
},
{
"id": 59,
"type": "GetImageRangeFromBatch",
@ -448,10 +290,10 @@
1451,
368
],
"size": [
315,
102
],
"size": {
"0": 315,
"1": 102
},
"flags": {
"collapsed": true
},
@ -552,12 +394,12 @@
"id": 47,
"type": "VHS_VideoCombine",
"pos": [
1789,
1790,
-104
],
"size": [
1113.3311767578125,
712.4437255859375
1110,
711.3333333333333
],
"flags": {},
"order": 15,
@ -610,7 +452,7 @@
"hidden": false,
"paused": false,
"params": {
"filename": "AnimateDiff_00011.mp4",
"filename": "AnimateDiff_00008.mp4",
"subfolder": "",
"type": "temp",
"format": "video/nvenc_h264-mp4",
@ -619,6 +461,190 @@
}
}
},
{
"id": 57,
"type": "GetImageSizeAndCount",
"pos": [
674,
2
],
"size": {
"0": 210,
"1": 86
},
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 126,
"slot_index": 0
}
],
"outputs": [
{
"name": "image",
"type": "IMAGE",
"links": [
129,
136
],
"shape": 3,
"slot_index": 0
},
{
"name": "720 width",
"type": "INT",
"links": [
165
],
"shape": 3,
"slot_index": 1
},
{
"name": "480 height",
"type": "INT",
"links": [
164
],
"shape": 3,
"slot_index": 2
},
{
"name": "16 count",
"type": "INT",
"links": [
163
],
"shape": 3,
"slot_index": 3
}
],
"properties": {
"Node name for S&R": "GetImageSizeAndCount"
}
},
{
"id": 41,
"type": "ImageResizeKJ",
"pos": [
315,
-19
],
"size": {
"0": 315,
"1": 242
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 128
},
{
"name": "get_image_size",
"type": "IMAGE",
"link": null
},
{
"name": "width_input",
"type": "INT",
"link": null,
"widget": {
"name": "width_input"
}
},
{
"name": "height_input",
"type": "INT",
"link": null,
"widget": {
"name": "height_input"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
126
],
"shape": 3,
"slot_index": 0
},
{
"name": "width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "height",
"type": "INT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": [
720,
480,
"lanczos",
false,
2,
0,
0
]
},
{
"id": 11,
"type": "CogVideoDecode",
"pos": [
1201,
684
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 9,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 166
},
{
"name": "samples",
"type": "LATENT",
"link": 167
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
118
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{
"id": 30,
"type": "CogVideoTextEncode",
@ -645,7 +671,7 @@
"name": "conditioning",
"type": "CONDITIONING",
"links": [
79
160
],
"shape": 3,
"slot_index": 0
@ -655,84 +681,46 @@
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"video of dinosaur turning it's head in a cinematic and dramatic scene from a movie"
"cinematic video of a red panda turning it's head"
]
},
{
"id": 36,
"type": "CogVideoSampler",
"id": 31,
"type": "CogVideoTextEncode",
"pos": [
1093,
292
],
"size": [
315,
310
503,
521
],
"size": {
"0": 463.01251220703125,
"1": 98.10446166992188
},
"flags": {},
"order": 8,
"order": 4,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 78
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 79
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 80
},
{
"name": "samples",
"type": "LATENT",
"link": 122
},
{
"name": "num_frames",
"type": "INT",
"link": 137,
"widget": {
"name": "num_frames"
}
"name": "clip",
"type": "CLIP",
"link": 56
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"name": "conditioning",
"type": "CONDITIONING",
"links": [
81
161
],
"shape": 3
},
{
"name": "samples",
"type": "LATENT",
"links": [
82
],
"shape": 3
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoSampler"
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
480,
720,
16,
8,
25,
8,
1119546789766856,
"fixed",
0.8
"bad quality video, blurry, messy"
]
},
{
@ -819,63 +807,98 @@
}
},
{
"id": 57,
"type": "GetImageSizeAndCount",
"id": 64,
"type": "CogVideoSampler",
"pos": [
674,
2
1090,
290
],
"size": {
"0": 210,
"1": 86
"0": 315,
"1": 342
},
"flags": {},
"order": 6,
"order": 8,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 126,
"slot_index": 0
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 159
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 160
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 161
},
{
"name": "samples",
"type": "LATENT",
"link": 162
},
{
"name": "num_frames",
"type": "INT",
"link": 163,
"widget": {
"name": "num_frames"
}
},
{
"name": "height",
"type": "INT",
"link": 164,
"widget": {
"name": "height"
}
},
{
"name": "width",
"type": "INT",
"link": 165,
"widget": {
"name": "width"
}
}
],
"outputs": [
{
"name": "image",
"type": "IMAGE",
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
129,
136
166
],
"shape": 3,
"slot_index": 0
},
{
"name": "720 width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "480 height",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "16 count",
"type": "INT",
"name": "samples",
"type": "LATENT",
"links": [
137
167
],
"shape": 3,
"slot_index": 3
"shape": 3
}
],
"properties": {
"Node name for S&R": "GetImageSizeAndCount"
}
"Node name for S&R": "CogVideoSampler"
},
"widgets_values": [
480,
720,
48,
8,
35,
9,
6,
"fixed",
"DPM",
0.7000000000000001
]
}
],
"links": [
@ -895,46 +918,6 @@
0,
"CLIP"
],
[
78,
1,
0,
36,
0,
"COGVIDEOPIPE"
],
[
79,
30,
0,
36,
1,
"CONDITIONING"
],
[
80,
31,
0,
36,
2,
"CONDITIONING"
],
[
81,
36,
0,
11,
0,
"COGVIDEOPIPE"
],
[
82,
36,
1,
11,
1,
"LATENT"
],
[
83,
1,
@ -975,14 +958,6 @@
0,
"INT,FLOAT"
],
[
122,
37,
0,
36,
3,
"LATENT"
],
[
126,
41,
@ -1048,22 +1023,86 @@
"IMAGE"
],
[
137,
159,
1,
0,
64,
0,
"COGVIDEOPIPE"
],
[
160,
30,
0,
64,
1,
"CONDITIONING"
],
[
161,
31,
0,
64,
2,
"CONDITIONING"
],
[
162,
37,
0,
64,
3,
"LATENT"
],
[
163,
57,
3,
36,
64,
4,
"INT"
],
[
164,
57,
2,
64,
5,
"INT"
],
[
165,
57,
1,
64,
6,
"INT"
],
[
166,
64,
0,
11,
0,
"COGVIDEOPIPE"
],
[
167,
64,
1,
11,
1,
"LATENT"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.7513148009015777,
"scale": 0.6830134553650705,
"offset": [
45.633655208726886,
389.8041242612087
56.628416841109384,
394.7727729054069
]
}
},

View File

@ -11,7 +11,7 @@
],
"size": {
"0": 315,
"1": 266
"1": 334
},
"flags": {},
"order": 4,
@ -32,6 +32,11 @@
"name": "negative",
"type": "CONDITIONING",
"link": 57
},
{
"name": "samples",
"type": "LATENT",
"link": null
}
],
"outputs": [
@ -63,50 +68,11 @@
25,
6,
806286757407561,
"fixed"
"fixed",
"DDIM",
1
]
},
{
"id": 11,
"type": "CogVideoDecode",
"pos": [
1142,
658
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 37
},
{
"name": "samples",
"type": "LATENT",
"link": 38
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
51
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{
"id": 28,
"type": "VHS_VideoCombine",
@ -169,7 +135,7 @@
"hidden": false,
"paused": false,
"params": {
"filename": "AnimateDiff_00001.mp4",
"filename": "CogVideoX_00001.mp4",
"subfolder": "",
"type": "temp",
"format": "video/h264-mp4",
@ -185,10 +151,10 @@
500,
308
],
"size": [
474.84501511852204,
164.74235966960538
],
"size": {
"0": 474.8450012207031,
"1": 164.7423553466797
},
"flags": {},
"order": 2,
"mode": 0,
@ -258,10 +224,10 @@
503,
521
],
"size": [
463.01251866466464,
98.10446321574796
],
"size": {
"0": 463.01251220703125,
"1": 98.10446166992188
},
"flags": {},
"order": 3,
"mode": 0,
@ -321,6 +287,47 @@
"widgets_values": [
"fp16"
]
},
{
"id": 11,
"type": "CogVideoDecode",
"pos": [
1138,
725
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 37
},
{
"name": "samples",
"type": "LATENT",
"link": 38
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
51
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
}
],
"links": [
@ -393,10 +400,10 @@
"config": {},
"extra": {
"ds": {
"scale": 0.6830134553650706,
"scale": 0.9090909090909092,
"offset": [
359.4381777891929,
334.95283678425216
12.99028921497383,
38.21608107136124
]
}
},

View File

@ -2,7 +2,7 @@ import os
import torch
import folder_paths
import comfy.model_management as mm
from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
from .pipeline_cogvideox import CogVideoXPipeline
import logging
@ -54,11 +54,11 @@ class DownloadAndLoadCogVideoModel:
)
pipe = CogVideoXPipeline.from_pretrained(base_path, torch_dtype=dtype).to(offload_device)
pipeline = {
"pipe": pipe,
"dtype": dtype
"dtype": dtype,
"base_path": base_path
}
return (pipeline,)
@ -115,11 +115,15 @@ class CogVideoTextEncode:
CATEGORY = "CogVideoWrapper"
def process(self, clip, prompt):
    """Encode ``prompt`` with the T5 text encoder held by ``clip``.

    The t5xxl tokenizer on ``clip`` is configured to pad to a maximum
    length of 226 tokens. The text encoder is moved to the device reported
    by ``mm.text_encoder_device()`` for the encode and is always moved back
    to ``mm.text_encoder_offload_device()`` afterwards, including when
    tokenizing or encoding raises.

    Args:
        clip: Object exposing ``tokenizer.t5xxl``, ``cond_stage_model``,
            ``tokenize`` and ``encode_from_tokens``.
        prompt: Text to encode.

    Returns:
        A one-element tuple containing the value returned by
        ``clip.encode_from_tokens``.
    """
    load_device = mm.text_encoder_device()
    offload_device = mm.text_encoder_offload_device()

    # NOTE: these settings are written onto the shared tokenizer object and
    # therefore persist after this call.
    clip.tokenizer.t5xxl.pad_to_max_length = True
    clip.tokenizer.t5xxl.max_length = 226

    try:
        clip.cond_stage_model.to(load_device)
        tokens = clip.tokenize(prompt, return_word_ids=True)
        embeds = clip.encode_from_tokens(tokens, return_pooled=False, return_dict=False)
    finally:
        # Offload unconditionally so a failed encode cannot leave the text
        # encoder resident on the load device.
        clip.cond_stage_model.to(offload_device)

    return (embeds, )
@ -194,6 +198,7 @@ class CogVideoSampler:
"steps": ("INT", {"default": 25, "min": 1}),
"cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
"seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
"scheduler": (["DDIM", "DPM"],),
},
"optional": {
"samples": ("LATENT", ),
@ -206,16 +211,22 @@ class CogVideoSampler:
FUNCTION = "process"
CATEGORY = "CogVideoWrapper"
def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, samples=None, denoise_strength=1.0):
def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, samples=None, denoise_strength=1.0):
mm.soft_empty_cache()
device = mm.get_torch_device()
offload_device = mm.unet_offload_device()
pipe = pipeline["pipe"]
dtype = pipeline["dtype"]
base_path = pipeline["base_path"]
pipe.transformer.to(device)
generator = torch.Generator(device=device).manual_seed(seed)
if scheduler == "DDIM":
pipe.scheduler = CogVideoXDDIMScheduler.from_pretrained(base_path, subfolder="scheduler")
elif scheduler == "DPM":
pipe.scheduler = CogVideoXDPMScheduler.from_pretrained(base_path, subfolder="scheduler")
latents = pipeline["pipe"](
num_inference_steps=steps,
height = height,
@ -227,7 +238,6 @@ class CogVideoSampler:
denoise_strength=denoise_strength,
prompt_embeds=positive.to(dtype).to(device),
negative_prompt_embeds=negative.to(dtype).to(device),
#negative_prompt_embeds=torch.zeros_like(embeds),
generator=generator,
output_type="latents",
device=device
@ -264,11 +274,10 @@ class CogVideoDecode:
if "num_frames" in pipeline:
num_frames = pipeline["num_frames"]
fps = pipeline["fps"]
else:
num_frames = latents.shape[2]
fps = 8
num_seconds = num_frames // fps
latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width]
latents = 1 / vae.config.scaling_factor * latents
@ -278,17 +287,14 @@ class CogVideoDecode:
# Whether or not to clear fake context parallel cache
fake_cp = i + 1 < num_seconds
start_frame, end_frame = (0, 3) if i == 0 else (2 * i + 1, 2 * i + 3)
current_frames = vae.decode(latents[:, :, start_frame:end_frame], fake_cp=fake_cp).sample
frames.append(current_frames)
mm.soft_empty_cache()
vae.to(offload_device)
frames = torch.cat(frames, dim=2)
print(frames.min(), frames.max())
video = pipeline["pipe"].video_processor.postprocess_video(video=frames, output_type="pt")
print(video.shape)
video = video[0].permute(0, 2, 3, 1).cpu().float()
print(video.min(), video.max())
return (video,)

View File

@ -222,22 +222,6 @@ class CogVideoXPipeline(DiffusionPipeline):
latents = latents * self.scheduler.init_noise_sigma
return latents, timesteps
def decode_latents(self, latents: torch.Tensor, num_seconds: int):
    """Decode latents with ``self.vae`` in ``num_seconds`` consecutive windows.

    The input is permuted with ``(0, 2, 1, 3, 4)`` and multiplied by
    ``1 / self.vae.config.scaling_factor`` before decoding. The first window
    covers indices 0..2 along dim 2; every later window ``i`` covers
    ``2 * i + 1 .. 2 * i + 2``. Decoded windows are concatenated along dim 2.

    Args:
        latents: Latent tensor; after the permute it is treated as
            [batch_size, num_channels, num_frames, height, width].
        num_seconds: Number of windows to decode.

    Returns:
        The decoded windows concatenated along dim 2.
    """
    # [batch_size, num_channels, num_frames, height, width]
    scaled = latents.permute(0, 2, 1, 3, 4)
    scaled = 1 / self.vae.config.scaling_factor * scaled

    decoded = []
    for second in range(num_seconds):
        if second == 0:
            lo, hi = 0, 3
        else:
            lo, hi = 2 * second + 1, 2 * second + 3
        # Whether or not to clear fake context parallel cache:
        # True for every window except the last one.
        not_last = second + 1 < num_seconds
        window = self.vae.decode(scaled[:, :, lo:hi], fake_cp=not_last).sample
        decoded.append(window)

    return torch.cat(decoded, dim=2)
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
def prepare_extra_step_kwargs(self, generator, eta):
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
@ -534,17 +518,7 @@ class CogVideoXPipeline(DiffusionPipeline):
progress_bar.update()
comfy_pbar.update(1)
if not output_type == "latents":
video = self.decode_latents(latents, num_frames // fps)
video = self.video_processor.postprocess_video(video=video, output_type=output_type)
else:
video = latents
print(video.shape)
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (video,)
return latents
#return CogVideoXPipelineOutput(frames=video)