mirror of
https://git.datalinker.icu/kijai/ComfyUI-CogVideoXWrapper.git
synced 2025-12-08 20:34:23 +08:00
cleanup, update example
This commit is contained in:
parent
e98c428e1e
commit
06b5e021ad
@ -1,41 +1,7 @@
|
||||
{
|
||||
"last_node_id": 33,
|
||||
"last_link_id": 59,
|
||||
"last_node_id": 34,
|
||||
"last_link_id": 64,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 20,
|
||||
"type": "CLIPLoader",
|
||||
"pos": [
|
||||
-59,
|
||||
397
|
||||
],
|
||||
"size": {
|
||||
"0": 451.30548095703125,
|
||||
"1": 82
|
||||
},
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "CLIP",
|
||||
"type": "CLIP",
|
||||
"links": [
|
||||
54,
|
||||
56
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CLIPLoader"
|
||||
},
|
||||
"widgets_values": [
|
||||
"t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
|
||||
"sd3"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 31,
|
||||
"type": "CogVideoTextEncode",
|
||||
@ -62,7 +28,7 @@
|
||||
"name": "conditioning",
|
||||
"type": "CONDITIONING",
|
||||
"links": [
|
||||
57
|
||||
62
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
@ -84,7 +50,7 @@
|
||||
],
|
||||
"size": {
|
||||
"0": 210,
|
||||
"1": 46
|
||||
"1": 78
|
||||
},
|
||||
"flags": {},
|
||||
"order": 5,
|
||||
@ -93,12 +59,12 @@
|
||||
{
|
||||
"name": "pipeline",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"link": 37
|
||||
"link": 63
|
||||
},
|
||||
{
|
||||
"name": "samples",
|
||||
"type": "LATENT",
|
||||
"link": 38
|
||||
"link": 64
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
@ -114,7 +80,10 @@
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CogVideoDecode"
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
false
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
@ -128,14 +97,14 @@
|
||||
"1": 82
|
||||
},
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "cogvideo_pipe",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"links": [
|
||||
36
|
||||
60
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
@ -150,16 +119,55 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 22,
|
||||
"id": 30,
|
||||
"type": "CogVideoTextEncode",
|
||||
"pos": [
|
||||
500,
|
||||
308
|
||||
],
|
||||
"size": [
|
||||
471.90143257018326,
|
||||
168.0804709842023
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "clip",
|
||||
"type": "CLIP",
|
||||
"link": 54
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "conditioning",
|
||||
"type": "CONDITIONING",
|
||||
"links": [
|
||||
61
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CogVideoTextEncode"
|
||||
},
|
||||
"widgets_values": [
|
||||
"A golden retriever, sporting sleek black sunglasses, with its lengthy fur flowing in the breeze, sprints playfully across a rooftop terrace, recently refreshed by a light rain. The scene unfolds from a distance, the dog's energetic bounds growing larger as it approaches the camera, its tail wagging with unrestrained joy, while droplets of water glisten on the concrete behind it. The overcast sky provides a dramatic backdrop, emphasizing the vibrant golden coat of the canine as it dashes towards the viewer.\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 34,
|
||||
"type": "CogVideoSampler",
|
||||
"pos": [
|
||||
1041,
|
||||
342
|
||||
],
|
||||
"size": {
|
||||
"0": 315,
|
||||
"1": 382
|
||||
},
|
||||
"size": [
|
||||
315.84047081854465,
|
||||
358
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
@ -167,18 +175,17 @@
|
||||
{
|
||||
"name": "pipeline",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"link": 36
|
||||
"link": 60
|
||||
},
|
||||
{
|
||||
"name": "positive",
|
||||
"type": "CONDITIONING",
|
||||
"link": 55,
|
||||
"slot_index": 1
|
||||
"link": 61
|
||||
},
|
||||
{
|
||||
"name": "negative",
|
||||
"type": "CONDITIONING",
|
||||
"link": 57
|
||||
"link": 62
|
||||
},
|
||||
{
|
||||
"name": "samples",
|
||||
@ -191,7 +198,7 @@
|
||||
"name": "cogvideo_pipe",
|
||||
"type": "COGVIDEOPIPE",
|
||||
"links": [
|
||||
37
|
||||
63
|
||||
],
|
||||
"shape": 3
|
||||
},
|
||||
@ -199,7 +206,7 @@
|
||||
"name": "samples",
|
||||
"type": "LATENT",
|
||||
"links": [
|
||||
38
|
||||
64
|
||||
],
|
||||
"shape": 3
|
||||
}
|
||||
@ -211,9 +218,8 @@
|
||||
480,
|
||||
720,
|
||||
49,
|
||||
8,
|
||||
50,
|
||||
7,
|
||||
6,
|
||||
806286757407563,
|
||||
"fixed",
|
||||
"DPM",
|
||||
@ -226,8 +232,8 @@
|
||||
"id": 33,
|
||||
"type": "VHS_VideoCombine",
|
||||
"pos": [
|
||||
1533,
|
||||
136
|
||||
1441,
|
||||
129
|
||||
],
|
||||
"size": [
|
||||
778.7022705078125,
|
||||
@ -284,7 +290,7 @@
|
||||
"hidden": false,
|
||||
"paused": false,
|
||||
"params": {
|
||||
"filename": "CogVideoX5B.mp4",
|
||||
"filename": "CogVideoX5B_00009.mp4",
|
||||
"subfolder": "",
|
||||
"type": "temp",
|
||||
"format": "video/nvenc_h264-mp4",
|
||||
@ -295,70 +301,41 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"type": "CogVideoTextEncode",
|
||||
"id": 20,
|
||||
"type": "CLIPLoader",
|
||||
"pos": [
|
||||
500,
|
||||
308
|
||||
-26,
|
||||
400
|
||||
],
|
||||
"size": {
|
||||
"0": 474.8450012207031,
|
||||
"1": 164.7423553466797
|
||||
"0": 451.30548095703125,
|
||||
"1": 82
|
||||
},
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "clip",
|
||||
"type": "CLIP",
|
||||
"link": 54
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "conditioning",
|
||||
"type": "CONDITIONING",
|
||||
"name": "CLIP",
|
||||
"type": "CLIP",
|
||||
"links": [
|
||||
55
|
||||
54,
|
||||
56
|
||||
],
|
||||
"slot_index": 0,
|
||||
"shape": 3
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CogVideoTextEncode"
|
||||
"Node name for S&R": "CLIPLoader"
|
||||
},
|
||||
"widgets_values": [
|
||||
"The camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a steep mountain slope, dust kicks up from its tires, the sunlight shines on the SUV as it speeds along the dirt road, casting a warm glow over the scene. The dirt road curves gently into the distance, with no other cars or vehicles in sight. The trees on either side of the road are redwoods, with patches of greenery scattered throughout. The car is seen from the rear following the curve with ease, making it seem as if it is on a rugged drive through the rugged terrain. The dirt road itself is surrounded by steep hills and mountains, with a clear blue sky above with wispy clouds.\n"
|
||||
"t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
|
||||
"sd3"
|
||||
]
|
||||
}
|
||||
],
|
||||
"links": [
|
||||
[
|
||||
36,
|
||||
1,
|
||||
0,
|
||||
22,
|
||||
0,
|
||||
"COGVIDEOPIPE"
|
||||
],
|
||||
[
|
||||
37,
|
||||
22,
|
||||
0,
|
||||
11,
|
||||
0,
|
||||
"COGVIDEOPIPE"
|
||||
],
|
||||
[
|
||||
38,
|
||||
22,
|
||||
1,
|
||||
11,
|
||||
1,
|
||||
"LATENT"
|
||||
],
|
||||
[
|
||||
54,
|
||||
20,
|
||||
@ -367,14 +344,6 @@
|
||||
0,
|
||||
"CLIP"
|
||||
],
|
||||
[
|
||||
55,
|
||||
30,
|
||||
0,
|
||||
22,
|
||||
1,
|
||||
"CONDITIONING"
|
||||
],
|
||||
[
|
||||
56,
|
||||
20,
|
||||
@ -383,14 +352,6 @@
|
||||
0,
|
||||
"CLIP"
|
||||
],
|
||||
[
|
||||
57,
|
||||
31,
|
||||
0,
|
||||
22,
|
||||
2,
|
||||
"CONDITIONING"
|
||||
],
|
||||
[
|
||||
59,
|
||||
11,
|
||||
@ -398,16 +359,56 @@
|
||||
33,
|
||||
0,
|
||||
"IMAGE"
|
||||
],
|
||||
[
|
||||
60,
|
||||
1,
|
||||
0,
|
||||
34,
|
||||
0,
|
||||
"COGVIDEOPIPE"
|
||||
],
|
||||
[
|
||||
61,
|
||||
30,
|
||||
0,
|
||||
34,
|
||||
1,
|
||||
"CONDITIONING"
|
||||
],
|
||||
[
|
||||
62,
|
||||
31,
|
||||
0,
|
||||
34,
|
||||
2,
|
||||
"CONDITIONING"
|
||||
],
|
||||
[
|
||||
63,
|
||||
34,
|
||||
0,
|
||||
11,
|
||||
0,
|
||||
"COGVIDEOPIPE"
|
||||
],
|
||||
[
|
||||
64,
|
||||
34,
|
||||
1,
|
||||
11,
|
||||
1,
|
||||
"LATENT"
|
||||
]
|
||||
],
|
||||
"groups": [],
|
||||
"config": {},
|
||||
"extra": {
|
||||
"ds": {
|
||||
"scale": 0.7513148009015782,
|
||||
"scale": 0.7513148009015777,
|
||||
"offset": [
|
||||
106.37225000664994,
|
||||
78.14886929032406
|
||||
209.1392882550122,
|
||||
105.74671444060245
|
||||
]
|
||||
}
|
||||
},
|
||||
|
||||
43
nodes.py
43
nodes.py
@ -31,7 +31,7 @@ class DownloadAndLoadCogVideoModel:
|
||||
"fp32",
|
||||
"bf16",
|
||||
],
|
||||
{"default": "bf16"},
|
||||
{"default": "bf16", "tooltip": "official recommendation is that 2b model should be fp16, 5b model should be bf16"},
|
||||
),
|
||||
},
|
||||
}
|
||||
@ -209,13 +209,12 @@ class CogVideoSampler:
|
||||
"height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
|
||||
"width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
|
||||
"num_frames": ("INT", {"default": 48, "min": 8, "max": 1024, "step": 1}),
|
||||
"fps": ("INT", {"default": 8, "min": 1, "max": 100, "step": 1}),
|
||||
"steps": ("INT", {"default": 25, "min": 1}),
|
||||
"steps": ("INT", {"default": 50, "min": 1}),
|
||||
"cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
|
||||
"seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
|
||||
"scheduler": (["DDIM", "DPM"],),
|
||||
"t_tile_length": ("INT", {"default": 16, "min": 2, "max": 128, "step": 1}),
|
||||
"t_tile_overlap": ("INT", {"default": 8, "min": 2, "max": 128, "step": 1}),
|
||||
"scheduler": (["DDIM", "DPM"], {"tooltip": "5B likes DPM, but it doesn't support temporal tiling"}),
|
||||
"t_tile_length": ("INT", {"default": 16, "min": 2, "max": 128, "step": 1, "tooltip": "Length of temporal tiling, use same value as num_frames to disable, disabled automatically for DPM"}),
|
||||
"t_tile_overlap": ("INT", {"default": 8, "min": 2, "max": 128, "step": 1, "tooltip": "Overlap of temporal tiling"}),
|
||||
},
|
||||
"optional": {
|
||||
"samples": ("LATENT", ),
|
||||
@ -228,7 +227,7 @@ class CogVideoSampler:
|
||||
FUNCTION = "process"
|
||||
CATEGORY = "CogVideoWrapper"
|
||||
|
||||
def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames, scheduler, t_tile_length, t_tile_overlap, samples=None, denoise_strength=1.0):
|
||||
def process(self, pipeline, positive, negative, steps, cfg, seed, height, width, num_frames, scheduler, t_tile_length, t_tile_overlap, samples=None, denoise_strength=1.0):
|
||||
mm.soft_empty_cache()
|
||||
|
||||
assert t_tile_length > t_tile_overlap, "t_tile_length must be greater than t_tile_overlap"
|
||||
@ -257,7 +256,6 @@ class CogVideoSampler:
|
||||
num_frames = num_frames,
|
||||
t_tile_length = t_tile_length,
|
||||
t_tile_overlap = t_tile_overlap,
|
||||
fps = fps,
|
||||
guidance_scale=cfg,
|
||||
latents=samples["samples"] if samples is not None else None,
|
||||
denoise_strength=denoise_strength,
|
||||
@ -269,8 +267,6 @@ class CogVideoSampler:
|
||||
pipe.transformer.to(offload_device)
|
||||
mm.soft_empty_cache()
|
||||
print(latents.shape)
|
||||
pipeline["fps"] = fps
|
||||
pipeline["num_frames"] = num_frames
|
||||
|
||||
return (pipeline, {"samples": latents})
|
||||
|
||||
@ -280,6 +276,7 @@ class CogVideoDecode:
|
||||
return {"required": {
|
||||
"pipeline": ("COGVIDEOPIPE",),
|
||||
"samples": ("LATENT", ),
|
||||
"enable_vae_tiling": ("BOOLEAN", {"default": False}),
|
||||
}
|
||||
}
|
||||
|
||||
@ -288,37 +285,27 @@ class CogVideoDecode:
|
||||
FUNCTION = "decode"
|
||||
CATEGORY = "CogVideoWrapper"
|
||||
|
||||
def decode(self, pipeline, samples):
|
||||
def decode(self, pipeline, samples, enable_vae_tiling):
|
||||
device = mm.get_torch_device()
|
||||
offload_device = mm.unet_offload_device()
|
||||
latents = samples["samples"]
|
||||
vae = pipeline["pipe"].vae
|
||||
vae.to(device)
|
||||
if enable_vae_tiling:
|
||||
vae.enable_tiling(
|
||||
tile_sample_min_height=96,
|
||||
tile_sample_min_width=96,
|
||||
tile_overlap_factor_height=1 / 12,
|
||||
tile_overlap_factor_width=1 / 12,
|
||||
)
|
||||
|
||||
if "num_frames" in pipeline:
|
||||
num_frames = pipeline["num_frames"]
|
||||
fps = pipeline["fps"]
|
||||
else:
|
||||
num_frames = latents.shape[2]
|
||||
fps = 8
|
||||
|
||||
num_seconds = num_frames // fps
|
||||
latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width]
|
||||
latents = 1 / vae.config.scaling_factor * latents
|
||||
|
||||
frames = []
|
||||
pbar = ProgressBar(num_seconds)
|
||||
# for i in range(num_seconds):
|
||||
# start_frame, end_frame = (0, 3) if i == 0 else (2 * i + 1, 2 * i + 3)
|
||||
# current_frames = vae.decode(latents[:, :, start_frame:end_frame]).sample
|
||||
# frames.append(current_frames)
|
||||
|
||||
# pbar.update(1)
|
||||
frames = vae.decode(latents).sample
|
||||
vae.to(offload_device)
|
||||
mm.soft_empty_cache()
|
||||
|
||||
#frames = torch.cat(frames, dim=2)
|
||||
video = pipeline["pipe"].video_processor.postprocess_video(video=frames, output_type="pt")
|
||||
video = video[0].permute(0, 2, 3, 1).cpu().float()
|
||||
print(video.min(), video.max())
|
||||
|
||||
@ -315,7 +315,6 @@ class CogVideoXPipeline(DiffusionPipeline):
|
||||
num_frames: int = 48,
|
||||
t_tile_length: int = 12,
|
||||
t_tile_overlap: int = 4,
|
||||
fps: int = 8,
|
||||
num_inference_steps: int = 50,
|
||||
timesteps: Optional[List[int]] = None,
|
||||
guidance_scale: float = 6,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user