update examples, expose scheduler, force T5 offload

This commit is contained in:
kijai 2024-08-07 01:10:19 +03:00
parent 8a0af3b663
commit 97e89d596e
4 changed files with 436 additions and 410 deletions

View File

@ -1,46 +1,7 @@
{ {
"last_node_id": 59, "last_node_id": 64,
"last_link_id": 137, "last_link_id": 167,
"nodes": [ "nodes": [
{
"id": 31,
"type": "CogVideoTextEncode",
"pos": [
503,
521
],
"size": {
"0": 463.01251220703125,
"1": 98.10446166992188
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 56
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
80
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
""
]
},
{ {
"id": 1, "id": 1,
"type": "DownloadAndLoadCogVideoModel", "type": "DownloadAndLoadCogVideoModel",
@ -60,8 +21,8 @@
"name": "cogvideo_pipe", "name": "cogvideo_pipe",
"type": "COGVIDEOPIPE", "type": "COGVIDEOPIPE",
"links": [ "links": [
78, 83,
83 159
], ],
"shape": 3, "shape": 3,
"slot_index": 0 "slot_index": 0
@ -108,47 +69,6 @@
"sd3" "sd3"
] ]
}, },
{
"id": 11,
"type": "CogVideoDecode",
"pos": [
1199,
661
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 9,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 81
},
{
"name": "samples",
"type": "LATENT",
"link": 82
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
118
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{ {
"id": 56, "id": 56,
"type": "SimpleMath+", "type": "SimpleMath+",
@ -233,7 +153,7 @@
"name": "samples", "name": "samples",
"type": "LATENT", "type": "LATENT",
"links": [ "links": [
122 162
], ],
"shape": 3, "shape": 3,
"slot_index": 0 "slot_index": 0
@ -301,84 +221,6 @@
"Node name for S&R": "GetImageSizeAndCount" "Node name for S&R": "GetImageSizeAndCount"
} }
}, },
{
"id": 41,
"type": "ImageResizeKJ",
"pos": [
315,
-19
],
"size": {
"0": 315,
"1": 242
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 128
},
{
"name": "get_image_size",
"type": "IMAGE",
"link": null
},
{
"name": "width_input",
"type": "INT",
"link": null,
"widget": {
"name": "width_input"
}
},
{
"name": "height_input",
"type": "INT",
"link": null,
"widget": {
"name": "height_input"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
126
],
"shape": 3,
"slot_index": 0
},
{
"name": "width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "height",
"type": "INT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": [
720,
480,
"lanczos",
false,
2,
0,
0
]
},
{ {
"id": 59, "id": 59,
"type": "GetImageRangeFromBatch", "type": "GetImageRangeFromBatch",
@ -448,10 +290,10 @@
1451, 1451,
368 368
], ],
"size": [ "size": {
315, "0": 315,
102 "1": 102
], },
"flags": { "flags": {
"collapsed": true "collapsed": true
}, },
@ -552,12 +394,12 @@
"id": 47, "id": 47,
"type": "VHS_VideoCombine", "type": "VHS_VideoCombine",
"pos": [ "pos": [
1789, 1790,
-104 -104
], ],
"size": [ "size": [
1113.3311767578125, 1110,
712.4437255859375 711.3333333333333
], ],
"flags": {}, "flags": {},
"order": 15, "order": 15,
@ -610,7 +452,7 @@
"hidden": false, "hidden": false,
"paused": false, "paused": false,
"params": { "params": {
"filename": "AnimateDiff_00011.mp4", "filename": "AnimateDiff_00008.mp4",
"subfolder": "", "subfolder": "",
"type": "temp", "type": "temp",
"format": "video/nvenc_h264-mp4", "format": "video/nvenc_h264-mp4",
@ -619,6 +461,190 @@
} }
} }
}, },
{
"id": 57,
"type": "GetImageSizeAndCount",
"pos": [
674,
2
],
"size": {
"0": 210,
"1": 86
},
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 126,
"slot_index": 0
}
],
"outputs": [
{
"name": "image",
"type": "IMAGE",
"links": [
129,
136
],
"shape": 3,
"slot_index": 0
},
{
"name": "720 width",
"type": "INT",
"links": [
165
],
"shape": 3,
"slot_index": 1
},
{
"name": "480 height",
"type": "INT",
"links": [
164
],
"shape": 3,
"slot_index": 2
},
{
"name": "16 count",
"type": "INT",
"links": [
163
],
"shape": 3,
"slot_index": 3
}
],
"properties": {
"Node name for S&R": "GetImageSizeAndCount"
}
},
{
"id": 41,
"type": "ImageResizeKJ",
"pos": [
315,
-19
],
"size": {
"0": 315,
"1": 242
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 128
},
{
"name": "get_image_size",
"type": "IMAGE",
"link": null
},
{
"name": "width_input",
"type": "INT",
"link": null,
"widget": {
"name": "width_input"
}
},
{
"name": "height_input",
"type": "INT",
"link": null,
"widget": {
"name": "height_input"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
126
],
"shape": 3,
"slot_index": 0
},
{
"name": "width",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "height",
"type": "INT",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": [
720,
480,
"lanczos",
false,
2,
0,
0
]
},
{
"id": 11,
"type": "CogVideoDecode",
"pos": [
1201,
684
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 9,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 166
},
{
"name": "samples",
"type": "LATENT",
"link": 167
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
118
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{ {
"id": 30, "id": 30,
"type": "CogVideoTextEncode", "type": "CogVideoTextEncode",
@ -645,7 +671,7 @@
"name": "conditioning", "name": "conditioning",
"type": "CONDITIONING", "type": "CONDITIONING",
"links": [ "links": [
79 160
], ],
"shape": 3, "shape": 3,
"slot_index": 0 "slot_index": 0
@ -655,84 +681,46 @@
"Node name for S&R": "CogVideoTextEncode" "Node name for S&R": "CogVideoTextEncode"
}, },
"widgets_values": [ "widgets_values": [
"video of dinosaur turning it's head in a cinematic and dramatic scene from a movie" "cinematic video of a red panda turning it's head"
] ]
}, },
{ {
"id": 36, "id": 31,
"type": "CogVideoSampler", "type": "CogVideoTextEncode",
"pos": [ "pos": [
1093, 503,
292 521
],
"size": [
315,
310
], ],
"size": {
"0": 463.01251220703125,
"1": 98.10446166992188
},
"flags": {}, "flags": {},
"order": 8, "order": 4,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
"name": "pipeline", "name": "clip",
"type": "COGVIDEOPIPE", "type": "CLIP",
"link": 78 "link": 56
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 79
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 80
},
{
"name": "samples",
"type": "LATENT",
"link": 122
},
{
"name": "num_frames",
"type": "INT",
"link": 137,
"widget": {
"name": "num_frames"
}
} }
], ],
"outputs": [ "outputs": [
{ {
"name": "cogvideo_pipe", "name": "conditioning",
"type": "COGVIDEOPIPE", "type": "CONDITIONING",
"links": [ "links": [
81 161
], ],
"shape": 3 "shape": 3,
}, "slot_index": 0
{
"name": "samples",
"type": "LATENT",
"links": [
82
],
"shape": 3
} }
], ],
"properties": { "properties": {
"Node name for S&R": "CogVideoSampler" "Node name for S&R": "CogVideoTextEncode"
}, },
"widgets_values": [ "widgets_values": [
480, "bad quality video, blurry, messy"
720,
16,
8,
25,
8,
1119546789766856,
"fixed",
0.8
] ]
}, },
{ {
@ -819,63 +807,98 @@
} }
}, },
{ {
"id": 57, "id": 64,
"type": "GetImageSizeAndCount", "type": "CogVideoSampler",
"pos": [ "pos": [
674, 1090,
2 290
], ],
"size": { "size": {
"0": 210, "0": 315,
"1": 86 "1": 342
}, },
"flags": {}, "flags": {},
"order": 6, "order": 8,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
"name": "image", "name": "pipeline",
"type": "IMAGE", "type": "COGVIDEOPIPE",
"link": 126, "link": 159
"slot_index": 0 },
{
"name": "positive",
"type": "CONDITIONING",
"link": 160
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 161
},
{
"name": "samples",
"type": "LATENT",
"link": 162
},
{
"name": "num_frames",
"type": "INT",
"link": 163,
"widget": {
"name": "num_frames"
}
},
{
"name": "height",
"type": "INT",
"link": 164,
"widget": {
"name": "height"
}
},
{
"name": "width",
"type": "INT",
"link": 165,
"widget": {
"name": "width"
}
} }
], ],
"outputs": [ "outputs": [
{ {
"name": "image", "name": "cogvideo_pipe",
"type": "IMAGE", "type": "COGVIDEOPIPE",
"links": [ "links": [
129, 166
136
], ],
"shape": 3,
"slot_index": 0
},
{
"name": "720 width",
"type": "INT",
"links": null,
"shape": 3 "shape": 3
}, },
{ {
"name": "480 height", "name": "samples",
"type": "INT", "type": "LATENT",
"links": null,
"shape": 3
},
{
"name": "16 count",
"type": "INT",
"links": [ "links": [
137 167
], ],
"shape": 3, "shape": 3
"slot_index": 3
} }
], ],
"properties": { "properties": {
"Node name for S&R": "GetImageSizeAndCount" "Node name for S&R": "CogVideoSampler"
} },
"widgets_values": [
480,
720,
48,
8,
35,
9,
6,
"fixed",
"DPM",
0.7000000000000001
]
} }
], ],
"links": [ "links": [
@ -895,46 +918,6 @@
0, 0,
"CLIP" "CLIP"
], ],
[
78,
1,
0,
36,
0,
"COGVIDEOPIPE"
],
[
79,
30,
0,
36,
1,
"CONDITIONING"
],
[
80,
31,
0,
36,
2,
"CONDITIONING"
],
[
81,
36,
0,
11,
0,
"COGVIDEOPIPE"
],
[
82,
36,
1,
11,
1,
"LATENT"
],
[ [
83, 83,
1, 1,
@ -975,14 +958,6 @@
0, 0,
"INT,FLOAT" "INT,FLOAT"
], ],
[
122,
37,
0,
36,
3,
"LATENT"
],
[ [
126, 126,
41, 41,
@ -1048,22 +1023,86 @@
"IMAGE" "IMAGE"
], ],
[ [
137, 159,
1,
0,
64,
0,
"COGVIDEOPIPE"
],
[
160,
30,
0,
64,
1,
"CONDITIONING"
],
[
161,
31,
0,
64,
2,
"CONDITIONING"
],
[
162,
37,
0,
64,
3,
"LATENT"
],
[
163,
57, 57,
3, 3,
36, 64,
4, 4,
"INT" "INT"
],
[
164,
57,
2,
64,
5,
"INT"
],
[
165,
57,
1,
64,
6,
"INT"
],
[
166,
64,
0,
11,
0,
"COGVIDEOPIPE"
],
[
167,
64,
1,
11,
1,
"LATENT"
] ]
], ],
"groups": [], "groups": [],
"config": {}, "config": {},
"extra": { "extra": {
"ds": { "ds": {
"scale": 0.7513148009015777, "scale": 0.6830134553650705,
"offset": [ "offset": [
45.633655208726886, 56.628416841109384,
389.8041242612087 394.7727729054069
] ]
} }
}, },

View File

@ -11,7 +11,7 @@
], ],
"size": { "size": {
"0": 315, "0": 315,
"1": 266 "1": 334
}, },
"flags": {}, "flags": {},
"order": 4, "order": 4,
@ -32,6 +32,11 @@
"name": "negative", "name": "negative",
"type": "CONDITIONING", "type": "CONDITIONING",
"link": 57 "link": 57
},
{
"name": "samples",
"type": "LATENT",
"link": null
} }
], ],
"outputs": [ "outputs": [
@ -63,50 +68,11 @@
25, 25,
6, 6,
806286757407561, 806286757407561,
"fixed" "fixed",
"DDIM",
1
] ]
}, },
{
"id": 11,
"type": "CogVideoDecode",
"pos": [
1142,
658
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 37
},
{
"name": "samples",
"type": "LATENT",
"link": 38
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
51
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{ {
"id": 28, "id": 28,
"type": "VHS_VideoCombine", "type": "VHS_VideoCombine",
@ -169,7 +135,7 @@
"hidden": false, "hidden": false,
"paused": false, "paused": false,
"params": { "params": {
"filename": "AnimateDiff_00001.mp4", "filename": "CogVideoX_00001.mp4",
"subfolder": "", "subfolder": "",
"type": "temp", "type": "temp",
"format": "video/h264-mp4", "format": "video/h264-mp4",
@ -185,10 +151,10 @@
500, 500,
308 308
], ],
"size": [ "size": {
474.84501511852204, "0": 474.8450012207031,
164.74235966960538 "1": 164.7423553466797
], },
"flags": {}, "flags": {},
"order": 2, "order": 2,
"mode": 0, "mode": 0,
@ -258,10 +224,10 @@
503, 503,
521 521
], ],
"size": [ "size": {
463.01251866466464, "0": 463.01251220703125,
98.10446321574796 "1": 98.10446166992188
], },
"flags": {}, "flags": {},
"order": 3, "order": 3,
"mode": 0, "mode": 0,
@ -321,6 +287,47 @@
"widgets_values": [ "widgets_values": [
"fp16" "fp16"
] ]
},
{
"id": 11,
"type": "CogVideoDecode",
"pos": [
1138,
725
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 37
},
{
"name": "samples",
"type": "LATENT",
"link": 38
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
51
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
} }
], ],
"links": [ "links": [
@ -393,10 +400,10 @@
"config": {}, "config": {},
"extra": { "extra": {
"ds": { "ds": {
"scale": 0.6830134553650706, "scale": 0.9090909090909092,
"offset": [ "offset": [
359.4381777891929, 12.99028921497383,
334.95283678425216 38.21608107136124
] ]
} }
}, },

View File

@ -2,7 +2,7 @@ import os
import torch import torch
import folder_paths import folder_paths
import comfy.model_management as mm import comfy.model_management as mm
from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
from .pipeline_cogvideox import CogVideoXPipeline from .pipeline_cogvideox import CogVideoXPipeline
import logging import logging
@ -54,11 +54,11 @@ class DownloadAndLoadCogVideoModel:
) )
pipe = CogVideoXPipeline.from_pretrained(base_path, torch_dtype=dtype).to(offload_device) pipe = CogVideoXPipeline.from_pretrained(base_path, torch_dtype=dtype).to(offload_device)
pipeline = { pipeline = {
"pipe": pipe, "pipe": pipe,
"dtype": dtype "dtype": dtype,
"base_path": base_path
} }
return (pipeline,) return (pipeline,)
@ -115,11 +115,15 @@ class CogVideoTextEncode:
CATEGORY = "CogVideoWrapper" CATEGORY = "CogVideoWrapper"
def process(self, clip, prompt): def process(self, clip, prompt):
load_device = mm.text_encoder_device()
offload_device = mm.text_encoder_offload_device()
clip.tokenizer.t5xxl.pad_to_max_length = True clip.tokenizer.t5xxl.pad_to_max_length = True
clip.tokenizer.t5xxl.max_length = 226 clip.tokenizer.t5xxl.max_length = 226
clip.cond_stage_model.to(load_device)
tokens = clip.tokenize(prompt, return_word_ids=True) tokens = clip.tokenize(prompt, return_word_ids=True)
embeds = clip.encode_from_tokens(tokens, return_pooled=False, return_dict=False) embeds = clip.encode_from_tokens(tokens, return_pooled=False, return_dict=False)
clip.cond_stage_model.to(offload_device)
return (embeds, ) return (embeds, )
@ -194,6 +198,7 @@ class CogVideoSampler:
"steps": ("INT", {"default": 25, "min": 1}), "steps": ("INT", {"default": 25, "min": 1}),
"cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}), "cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
"seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}), "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
"scheduler": (["DDIM", "DPM"],),
}, },
"optional": { "optional": {
"samples": ("LATENT", ), "samples": ("LATENT", ),
@ -206,16 +211,22 @@ class CogVideoSampler:
FUNCTION = "process" FUNCTION = "process"
CATEGORY = "CogVideoWrapper" CATEGORY = "CogVideoWrapper"
def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, samples=None, denoise_strength=1.0): def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, samples=None, denoise_strength=1.0):
mm.soft_empty_cache() mm.soft_empty_cache()
device = mm.get_torch_device() device = mm.get_torch_device()
offload_device = mm.unet_offload_device() offload_device = mm.unet_offload_device()
pipe = pipeline["pipe"] pipe = pipeline["pipe"]
dtype = pipeline["dtype"] dtype = pipeline["dtype"]
base_path = pipeline["base_path"]
pipe.transformer.to(device) pipe.transformer.to(device)
generator = torch.Generator(device=device).manual_seed(seed) generator = torch.Generator(device=device).manual_seed(seed)
if scheduler == "DDIM":
pipe.scheduler = CogVideoXDDIMScheduler.from_pretrained(base_path, subfolder="scheduler")
elif scheduler == "DPM":
pipe.scheduler = CogVideoXDPMScheduler.from_pretrained(base_path, subfolder="scheduler")
latents = pipeline["pipe"]( latents = pipeline["pipe"](
num_inference_steps=steps, num_inference_steps=steps,
height = height, height = height,
@ -227,7 +238,6 @@ class CogVideoSampler:
denoise_strength=denoise_strength, denoise_strength=denoise_strength,
prompt_embeds=positive.to(dtype).to(device), prompt_embeds=positive.to(dtype).to(device),
negative_prompt_embeds=negative.to(dtype).to(device), negative_prompt_embeds=negative.to(dtype).to(device),
#negative_prompt_embeds=torch.zeros_like(embeds),
generator=generator, generator=generator,
output_type="latents", output_type="latents",
device=device device=device
@ -264,11 +274,10 @@ class CogVideoDecode:
if "num_frames" in pipeline: if "num_frames" in pipeline:
num_frames = pipeline["num_frames"] num_frames = pipeline["num_frames"]
fps = pipeline["fps"] fps = pipeline["fps"]
else: else:
num_frames = latents.shape[2] num_frames = latents.shape[2]
fps = 8 fps = 8
num_seconds = num_frames // fps num_seconds = num_frames // fps
latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width] latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width]
latents = 1 / vae.config.scaling_factor * latents latents = 1 / vae.config.scaling_factor * latents
@ -278,17 +287,14 @@ class CogVideoDecode:
# Whether or not to clear fake context parallel cache # Whether or not to clear fake context parallel cache
fake_cp = i + 1 < num_seconds fake_cp = i + 1 < num_seconds
start_frame, end_frame = (0, 3) if i == 0 else (2 * i + 1, 2 * i + 3) start_frame, end_frame = (0, 3) if i == 0 else (2 * i + 1, 2 * i + 3)
current_frames = vae.decode(latents[:, :, start_frame:end_frame], fake_cp=fake_cp).sample current_frames = vae.decode(latents[:, :, start_frame:end_frame], fake_cp=fake_cp).sample
frames.append(current_frames) frames.append(current_frames)
mm.soft_empty_cache()
vae.to(offload_device) vae.to(offload_device)
frames = torch.cat(frames, dim=2) frames = torch.cat(frames, dim=2)
print(frames.min(), frames.max())
video = pipeline["pipe"].video_processor.postprocess_video(video=frames, output_type="pt") video = pipeline["pipe"].video_processor.postprocess_video(video=frames, output_type="pt")
print(video.shape)
video = video[0].permute(0, 2, 3, 1).cpu().float() video = video[0].permute(0, 2, 3, 1).cpu().float()
print(video.min(), video.max())
return (video,) return (video,)

View File

@ -222,22 +222,6 @@ class CogVideoXPipeline(DiffusionPipeline):
latents = latents * self.scheduler.init_noise_sigma latents = latents * self.scheduler.init_noise_sigma
return latents, timesteps return latents, timesteps
def decode_latents(self, latents: torch.Tensor, num_seconds: int):
latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width]
latents = 1 / self.vae.config.scaling_factor * latents
frames = []
for i in range(num_seconds):
# Whether or not to clear fake context parallel cache
fake_cp = i + 1 < num_seconds
start_frame, end_frame = (0, 3) if i == 0 else (2 * i + 1, 2 * i + 3)
current_frames = self.vae.decode(latents[:, :, start_frame:end_frame], fake_cp=fake_cp).sample
frames.append(current_frames)
frames = torch.cat(frames, dim=2)
return frames
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
def prepare_extra_step_kwargs(self, generator, eta): def prepare_extra_step_kwargs(self, generator, eta):
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
@ -534,17 +518,7 @@ class CogVideoXPipeline(DiffusionPipeline):
progress_bar.update() progress_bar.update()
comfy_pbar.update(1) comfy_pbar.update(1)
if not output_type == "latents":
video = self.decode_latents(latents, num_frames // fps)
video = self.video_processor.postprocess_video(video=video, output_type=output_type)
else:
video = latents
print(video.shape)
# Offload all models # Offload all models
self.maybe_free_model_hooks() self.maybe_free_model_hooks()
if not return_dict:
return (video,)
return latents return latents
#return CogVideoXPipelineOutput(frames=video)