mirror of
https://git.datalinker.icu/kijai/ComfyUI-CogVideoXWrapper.git
synced 2025-12-10 05:14:22 +08:00
Merge branch 'main' into add-5b
This commit is contained in:
commit
dc9de70259
415
examples/cogvideox_5b_example_01.json
Normal file
415
examples/cogvideox_5b_example_01.json
Normal file
@ -0,0 +1,415 @@
|
|||||||
|
{
|
||||||
|
"last_node_id": 33,
|
||||||
|
"last_link_id": 59,
|
||||||
|
"nodes": [
|
||||||
|
{
|
||||||
|
"id": 20,
|
||||||
|
"type": "CLIPLoader",
|
||||||
|
"pos": [
|
||||||
|
-59,
|
||||||
|
397
|
||||||
|
],
|
||||||
|
"size": {
|
||||||
|
"0": 451.30548095703125,
|
||||||
|
"1": 82
|
||||||
|
},
|
||||||
|
"flags": {},
|
||||||
|
"order": 0,
|
||||||
|
"mode": 0,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "CLIP",
|
||||||
|
"type": "CLIP",
|
||||||
|
"links": [
|
||||||
|
54,
|
||||||
|
56
|
||||||
|
],
|
||||||
|
"slot_index": 0,
|
||||||
|
"shape": 3
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"Node name for S&R": "CLIPLoader"
|
||||||
|
},
|
||||||
|
"widgets_values": [
|
||||||
|
"t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
|
||||||
|
"sd3"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 31,
|
||||||
|
"type": "CogVideoTextEncode",
|
||||||
|
"pos": [
|
||||||
|
503,
|
||||||
|
521
|
||||||
|
],
|
||||||
|
"size": {
|
||||||
|
"0": 463.01251220703125,
|
||||||
|
"1": 98.10446166992188
|
||||||
|
},
|
||||||
|
"flags": {},
|
||||||
|
"order": 3,
|
||||||
|
"mode": 0,
|
||||||
|
"inputs": [
|
||||||
|
{
|
||||||
|
"name": "clip",
|
||||||
|
"type": "CLIP",
|
||||||
|
"link": 56
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "conditioning",
|
||||||
|
"type": "CONDITIONING",
|
||||||
|
"links": [
|
||||||
|
57
|
||||||
|
],
|
||||||
|
"slot_index": 0,
|
||||||
|
"shape": 3
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"Node name for S&R": "CogVideoTextEncode"
|
||||||
|
},
|
||||||
|
"widgets_values": [
|
||||||
|
""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 11,
|
||||||
|
"type": "CogVideoDecode",
|
||||||
|
"pos": [
|
||||||
|
1140,
|
||||||
|
783
|
||||||
|
],
|
||||||
|
"size": {
|
||||||
|
"0": 210,
|
||||||
|
"1": 46
|
||||||
|
},
|
||||||
|
"flags": {},
|
||||||
|
"order": 5,
|
||||||
|
"mode": 0,
|
||||||
|
"inputs": [
|
||||||
|
{
|
||||||
|
"name": "pipeline",
|
||||||
|
"type": "COGVIDEOPIPE",
|
||||||
|
"link": 37
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "samples",
|
||||||
|
"type": "LATENT",
|
||||||
|
"link": 38
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "images",
|
||||||
|
"type": "IMAGE",
|
||||||
|
"links": [
|
||||||
|
59
|
||||||
|
],
|
||||||
|
"slot_index": 0,
|
||||||
|
"shape": 3
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"Node name for S&R": "CogVideoDecode"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"type": "DownloadAndLoadCogVideoModel",
|
||||||
|
"pos": [
|
||||||
|
649,
|
||||||
|
182
|
||||||
|
],
|
||||||
|
"size": {
|
||||||
|
"0": 315,
|
||||||
|
"1": 82
|
||||||
|
},
|
||||||
|
"flags": {},
|
||||||
|
"order": 1,
|
||||||
|
"mode": 0,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "cogvideo_pipe",
|
||||||
|
"type": "COGVIDEOPIPE",
|
||||||
|
"links": [
|
||||||
|
36
|
||||||
|
],
|
||||||
|
"slot_index": 0,
|
||||||
|
"shape": 3
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"Node name for S&R": "DownloadAndLoadCogVideoModel"
|
||||||
|
},
|
||||||
|
"widgets_values": [
|
||||||
|
"THUDM/CogVideoX-5b",
|
||||||
|
"bf16"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 22,
|
||||||
|
"type": "CogVideoSampler",
|
||||||
|
"pos": [
|
||||||
|
1041,
|
||||||
|
342
|
||||||
|
],
|
||||||
|
"size": {
|
||||||
|
"0": 315,
|
||||||
|
"1": 382
|
||||||
|
},
|
||||||
|
"flags": {},
|
||||||
|
"order": 4,
|
||||||
|
"mode": 0,
|
||||||
|
"inputs": [
|
||||||
|
{
|
||||||
|
"name": "pipeline",
|
||||||
|
"type": "COGVIDEOPIPE",
|
||||||
|
"link": 36
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "positive",
|
||||||
|
"type": "CONDITIONING",
|
||||||
|
"link": 55,
|
||||||
|
"slot_index": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "negative",
|
||||||
|
"type": "CONDITIONING",
|
||||||
|
"link": 57
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "samples",
|
||||||
|
"type": "LATENT",
|
||||||
|
"link": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "cogvideo_pipe",
|
||||||
|
"type": "COGVIDEOPIPE",
|
||||||
|
"links": [
|
||||||
|
37
|
||||||
|
],
|
||||||
|
"shape": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "samples",
|
||||||
|
"type": "LATENT",
|
||||||
|
"links": [
|
||||||
|
38
|
||||||
|
],
|
||||||
|
"shape": 3
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"Node name for S&R": "CogVideoSampler"
|
||||||
|
},
|
||||||
|
"widgets_values": [
|
||||||
|
480,
|
||||||
|
720,
|
||||||
|
49,
|
||||||
|
8,
|
||||||
|
50,
|
||||||
|
7,
|
||||||
|
806286757407563,
|
||||||
|
"fixed",
|
||||||
|
"DPM",
|
||||||
|
49,
|
||||||
|
8,
|
||||||
|
1
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 33,
|
||||||
|
"type": "VHS_VideoCombine",
|
||||||
|
"pos": [
|
||||||
|
1533,
|
||||||
|
136
|
||||||
|
],
|
||||||
|
"size": [
|
||||||
|
778.7022705078125,
|
||||||
|
853.801513671875
|
||||||
|
],
|
||||||
|
"flags": {},
|
||||||
|
"order": 6,
|
||||||
|
"mode": 0,
|
||||||
|
"inputs": [
|
||||||
|
{
|
||||||
|
"name": "images",
|
||||||
|
"type": "IMAGE",
|
||||||
|
"link": 59
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "audio",
|
||||||
|
"type": "AUDIO",
|
||||||
|
"link": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "meta_batch",
|
||||||
|
"type": "VHS_BatchManager",
|
||||||
|
"link": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "vae",
|
||||||
|
"type": "VAE",
|
||||||
|
"link": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "Filenames",
|
||||||
|
"type": "VHS_FILENAMES",
|
||||||
|
"links": null,
|
||||||
|
"shape": 3
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"Node name for S&R": "VHS_VideoCombine"
|
||||||
|
},
|
||||||
|
"widgets_values": {
|
||||||
|
"frame_rate": 8,
|
||||||
|
"loop_count": 0,
|
||||||
|
"filename_prefix": "CogVideoX5B",
|
||||||
|
"format": "video/nvenc_h264-mp4",
|
||||||
|
"pix_fmt": "yuv420p",
|
||||||
|
"bitrate": 10,
|
||||||
|
"megabit": true,
|
||||||
|
"save_metadata": true,
|
||||||
|
"pingpong": false,
|
||||||
|
"save_output": false,
|
||||||
|
"videopreview": {
|
||||||
|
"hidden": false,
|
||||||
|
"paused": false,
|
||||||
|
"params": {
|
||||||
|
"filename": "CogVideoX5B.mp4",
|
||||||
|
"subfolder": "",
|
||||||
|
"type": "temp",
|
||||||
|
"format": "video/nvenc_h264-mp4",
|
||||||
|
"frame_rate": 8
|
||||||
|
},
|
||||||
|
"muted": false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 30,
|
||||||
|
"type": "CogVideoTextEncode",
|
||||||
|
"pos": [
|
||||||
|
500,
|
||||||
|
308
|
||||||
|
],
|
||||||
|
"size": {
|
||||||
|
"0": 474.8450012207031,
|
||||||
|
"1": 164.7423553466797
|
||||||
|
},
|
||||||
|
"flags": {},
|
||||||
|
"order": 2,
|
||||||
|
"mode": 0,
|
||||||
|
"inputs": [
|
||||||
|
{
|
||||||
|
"name": "clip",
|
||||||
|
"type": "CLIP",
|
||||||
|
"link": 54
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "conditioning",
|
||||||
|
"type": "CONDITIONING",
|
||||||
|
"links": [
|
||||||
|
55
|
||||||
|
],
|
||||||
|
"slot_index": 0,
|
||||||
|
"shape": 3
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"Node name for S&R": "CogVideoTextEncode"
|
||||||
|
},
|
||||||
|
"widgets_values": [
|
||||||
|
"The camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a steep mountain slope, dust kicks up from its tires, the sunlight shines on the SUV as it speeds along the dirt road, casting a warm glow over the scene. The dirt road curves gently into the distance, with no other cars or vehicles in sight. The trees on either side of the road are redwoods, with patches of greenery scattered throughout. The car is seen from the rear following the curve with ease, making it seem as if it is on a rugged drive through the rugged terrain. The dirt road itself is surrounded by steep hills and mountains, with a clear blue sky above with wispy clouds.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"links": [
|
||||||
|
[
|
||||||
|
36,
|
||||||
|
1,
|
||||||
|
0,
|
||||||
|
22,
|
||||||
|
0,
|
||||||
|
"COGVIDEOPIPE"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
37,
|
||||||
|
22,
|
||||||
|
0,
|
||||||
|
11,
|
||||||
|
0,
|
||||||
|
"COGVIDEOPIPE"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
38,
|
||||||
|
22,
|
||||||
|
1,
|
||||||
|
11,
|
||||||
|
1,
|
||||||
|
"LATENT"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
54,
|
||||||
|
20,
|
||||||
|
0,
|
||||||
|
30,
|
||||||
|
0,
|
||||||
|
"CLIP"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
55,
|
||||||
|
30,
|
||||||
|
0,
|
||||||
|
22,
|
||||||
|
1,
|
||||||
|
"CONDITIONING"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
56,
|
||||||
|
20,
|
||||||
|
0,
|
||||||
|
31,
|
||||||
|
0,
|
||||||
|
"CLIP"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
57,
|
||||||
|
31,
|
||||||
|
0,
|
||||||
|
22,
|
||||||
|
2,
|
||||||
|
"CONDITIONING"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
59,
|
||||||
|
11,
|
||||||
|
0,
|
||||||
|
33,
|
||||||
|
0,
|
||||||
|
"IMAGE"
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"groups": [],
|
||||||
|
"config": {},
|
||||||
|
"extra": {
|
||||||
|
"ds": {
|
||||||
|
"scale": 0.7513148009015782,
|
||||||
|
"offset": [
|
||||||
|
106.37225000664994,
|
||||||
|
78.14886929032406
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"version": 0.4
|
||||||
|
}
|
||||||
28
nodes.py
28
nodes.py
@ -48,7 +48,10 @@ class DownloadAndLoadCogVideoModel:
|
|||||||
|
|
||||||
dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision]
|
dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision]
|
||||||
|
|
||||||
|
if "2b" in model:
|
||||||
base_path = os.path.join(folder_paths.models_dir, "CogVideo", "CogVideo2B")
|
base_path = os.path.join(folder_paths.models_dir, "CogVideo", "CogVideo2B")
|
||||||
|
elif "5b" in model:
|
||||||
|
base_path = os.path.join(folder_paths.models_dir, "CogVideo", "CogVideoX-5b")
|
||||||
|
|
||||||
if not os.path.exists(base_path):
|
if not os.path.exists(base_path):
|
||||||
log.info(f"Downloading model to: {base_path}")
|
log.info(f"Downloading model to: {base_path}")
|
||||||
@ -205,14 +208,14 @@ class CogVideoSampler:
|
|||||||
"negative": ("CONDITIONING", ),
|
"negative": ("CONDITIONING", ),
|
||||||
"height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
|
"height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
|
||||||
"width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
|
"width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
|
||||||
"num_frames": ("INT", {"default": 48, "min": 8, "max": 1024, "step": 8}),
|
"num_frames": ("INT", {"default": 48, "min": 8, "max": 1024, "step": 1}),
|
||||||
"fps": ("INT", {"default": 8, "min": 1, "max": 100, "step": 1}),
|
"fps": ("INT", {"default": 8, "min": 1, "max": 100, "step": 1}),
|
||||||
"steps": ("INT", {"default": 25, "min": 1}),
|
"steps": ("INT", {"default": 25, "min": 1}),
|
||||||
"cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
|
"cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
|
||||||
"seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
|
"seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
|
||||||
"scheduler": (["DDIM", "DPM"],),
|
"scheduler": (["DDIM", "DPM"],),
|
||||||
"t_tile_length": ("INT", {"default": 16, "min": 16, "max": 128, "step": 4}),
|
"t_tile_length": ("INT", {"default": 16, "min": 2, "max": 128, "step": 1}),
|
||||||
"t_tile_overlap": ("INT", {"default": 8, "min": 8, "max": 128, "step": 2}),
|
"t_tile_overlap": ("INT", {"default": 8, "min": 2, "max": 128, "step": 1}),
|
||||||
},
|
},
|
||||||
"optional": {
|
"optional": {
|
||||||
"samples": ("LATENT", ),
|
"samples": ("LATENT", ),
|
||||||
@ -282,10 +285,10 @@ class CogVideoDecode:
|
|||||||
|
|
||||||
RETURN_TYPES = ("IMAGE",)
|
RETURN_TYPES = ("IMAGE",)
|
||||||
RETURN_NAMES = ("images",)
|
RETURN_NAMES = ("images",)
|
||||||
FUNCTION = "process"
|
FUNCTION = "decode"
|
||||||
CATEGORY = "CogVideoWrapper"
|
CATEGORY = "CogVideoWrapper"
|
||||||
|
|
||||||
def process(self, pipeline, samples):
|
def decode(self, pipeline, samples):
|
||||||
device = mm.get_torch_device()
|
device = mm.get_torch_device()
|
||||||
offload_device = mm.unet_offload_device()
|
offload_device = mm.unet_offload_device()
|
||||||
latents = samples["samples"]
|
latents = samples["samples"]
|
||||||
@ -305,19 +308,20 @@ class CogVideoDecode:
|
|||||||
|
|
||||||
frames = []
|
frames = []
|
||||||
pbar = ProgressBar(num_seconds)
|
pbar = ProgressBar(num_seconds)
|
||||||
for i in range(num_seconds):
|
# for i in range(num_seconds):
|
||||||
start_frame, end_frame = (0, 3) if i == 0 else (2 * i + 1, 2 * i + 3)
|
# start_frame, end_frame = (0, 3) if i == 0 else (2 * i + 1, 2 * i + 3)
|
||||||
current_frames = vae.decode(latents[:, :, start_frame:end_frame]).sample
|
# current_frames = vae.decode(latents[:, :, start_frame:end_frame]).sample
|
||||||
frames.append(current_frames)
|
# frames.append(current_frames)
|
||||||
|
|
||||||
pbar.update(1)
|
# pbar.update(1)
|
||||||
vae.clear_fake_context_parallel_cache()
|
frames = vae.decode(latents).sample
|
||||||
vae.to(offload_device)
|
vae.to(offload_device)
|
||||||
mm.soft_empty_cache()
|
mm.soft_empty_cache()
|
||||||
|
|
||||||
frames = torch.cat(frames, dim=2)
|
#frames = torch.cat(frames, dim=2)
|
||||||
video = pipeline["pipe"].video_processor.postprocess_video(video=frames, output_type="pt")
|
video = pipeline["pipe"].video_processor.postprocess_video(video=frames, output_type="pt")
|
||||||
video = video[0].permute(0, 2, 3, 1).cpu().float()
|
video = video[0].permute(0, 2, 3, 1).cpu().float()
|
||||||
|
print(video.min(), video.max())
|
||||||
|
|
||||||
return (video,)
|
return (video,)
|
||||||
|
|
||||||
|
|||||||
@ -17,6 +17,7 @@ import inspect
|
|||||||
from typing import Callable, Dict, List, Optional, Tuple, Union
|
from typing import Callable, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
import math
|
||||||
|
|
||||||
from diffusers.models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
|
from diffusers.models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
|
||||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
|
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
|
||||||
@ -24,11 +25,29 @@ from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
|
|||||||
from diffusers.utils import logging
|
from diffusers.utils import logging
|
||||||
from diffusers.utils.torch_utils import randn_tensor
|
from diffusers.utils.torch_utils import randn_tensor
|
||||||
from diffusers.video_processor import VideoProcessor
|
from diffusers.video_processor import VideoProcessor
|
||||||
|
from diffusers.models.embeddings import get_3d_rotary_pos_embed
|
||||||
|
|
||||||
from comfy.utils import ProgressBar
|
from comfy.utils import ProgressBar
|
||||||
|
|
||||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||||
|
|
||||||
|
def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
    """Fit a source grid inside a target grid and return the centred region.

    The source is scaled, keeping its aspect ratio, so that it fits within
    ``tgt_width`` x ``tgt_height``; the scaled rectangle is then centred
    inside the target.

    Args:
        src: ``(height, width)`` pair describing the source grid.
        tgt_width: Width of the target grid.
        tgt_height: Height of the target grid.

    Returns:
        ``((top, left), (bottom, right))`` corner coordinates of the scaled
        source rectangle inside the target grid.
    """
    src_height, src_width = src

    if src_height / src_width > (tgt_height / tgt_width):
        # Source is relatively taller than the target: height is the limiting side.
        fitted_height = tgt_height
        fitted_width = int(round(tgt_height / src_height * src_width))
    else:
        # Source is relatively wider (or equal): width is the limiting side.
        fitted_width = tgt_width
        fitted_height = int(round(tgt_width / src_width * src_height))

    # Centre the fitted rectangle by splitting the leftover space evenly.
    top = int(round((tgt_height - fitted_height) / 2.0))
    left = int(round((tgt_width - fitted_width) / 2.0))

    top_left = (top, left)
    bottom_right = (top + fitted_height, left + fitted_width)
    return top_left, bottom_right
|
||||||
|
|
||||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
|
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
|
||||||
def retrieve_timesteps(
|
def retrieve_timesteps(
|
||||||
scheduler,
|
scheduler,
|
||||||
@ -229,6 +248,46 @@ class CogVideoXPipeline(DiffusionPipeline):
|
|||||||
weights = weights.unsqueeze(0).unsqueeze(2).unsqueeze(3).unsqueeze(4).repeat(1, t_batch_size,1, 1, 1)
|
weights = weights.unsqueeze(0).unsqueeze(2).unsqueeze(3).unsqueeze(4).repeat(1, t_batch_size,1, 1, 1)
|
||||||
return weights
|
return weights
|
||||||
|
|
||||||
|
def fuse_qkv_projections(self) -> None:
    r"""Turn on fused QKV projections for the transformer.

    Records the fused state on ``self.fusing_transformer`` first, then
    delegates the actual fusion to ``self.transformer``.
    """
    # Flag is set before delegating, so it is already True while the
    # transformer performs the fusion.
    self.fusing_transformer = True
    transformer = self.transformer
    transformer.fuse_qkv_projections()
|
||||||
|
|
||||||
|
def unfuse_qkv_projections(self) -> None:
    r"""Disable QKV projection fusion if it is currently enabled.

    When the transformer is not fused, a warning is logged and nothing else
    happens. Otherwise the transformer is unfused and the
    ``fusing_transformer`` flag is cleared.
    """
    # ``fusing_transformer`` is only assigned by ``fuse_qkv_projections``; fall
    # back to False so calling this method first warns instead of raising
    # AttributeError.
    if not getattr(self, "fusing_transformer", False):
        logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.")
        return

    self.transformer.unfuse_qkv_projections()
    self.fusing_transformer = False
|
||||||
|
|
||||||
|
def _prepare_rotary_positional_embeddings(
    self,
    height: int,
    width: int,
    num_frames: int,
    device: torch.device,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Build the 3D rotary positional embedding tensors for the transformer.

    Args:
        height: Height that is divided down to the embedding grid height.
        width: Width that is divided down to the embedding grid width.
        num_frames: Value forwarded as ``temporal_size`` to
            ``get_3d_rotary_pos_embed``.
        device: Device the resulting tensors are moved to.

    Returns:
        ``(freqs_cos, freqs_sin)``, both placed on ``device``.
    """
    # Divisor from input size to grid size: VAE spatial scale factor times
    # the transformer patch size.
    divisor = self.vae_scale_factor_spatial * self.transformer.config.patch_size

    grid_size = (height // divisor, width // divisor)
    # Reference grid derived from a fixed 720x480 base resolution.
    base_width = 720 // divisor
    base_height = 480 // divisor

    crop_region = get_resize_crop_region_for_grid(grid_size, base_width, base_height)

    cos_part, sin_part = get_3d_rotary_pos_embed(
        embed_dim=self.transformer.config.attention_head_dim,
        crops_coords=crop_region,
        grid_size=grid_size,
        temporal_size=num_frames,
        use_real=True,
    )

    return cos_part.to(device=device), sin_part.to(device=device)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def guidance_scale(self):
|
def guidance_scale(self):
|
||||||
return self._guidance_scale
|
return self._guidance_scale
|
||||||
@ -374,6 +433,15 @@ class CogVideoXPipeline(DiffusionPipeline):
|
|||||||
t_tile_weights = self._gaussian_weights(t_tile_length=t_tile_length, t_batch_size=1).to(latents.device).to(latents.dtype)
|
t_tile_weights = self._gaussian_weights(t_tile_length=t_tile_length, t_batch_size=1).to(latents.device).to(latents.dtype)
|
||||||
print("latents.shape", latents.shape)
|
print("latents.shape", latents.shape)
|
||||||
print("latents.device", latents.device)
|
print("latents.device", latents.device)
|
||||||
|
|
||||||
|
|
||||||
|
# 6.5. Create rotary embeds if required
|
||||||
|
image_rotary_emb = (
|
||||||
|
self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
|
||||||
|
if self.transformer.config.use_rotary_positional_embeddings
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
# 7. Denoising loop
|
# 7. Denoising loop
|
||||||
num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
|
num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
|
||||||
comfy_pbar = ProgressBar(num_inference_steps)
|
comfy_pbar = ProgressBar(num_inference_steps)
|
||||||
@ -383,7 +451,7 @@ class CogVideoXPipeline(DiffusionPipeline):
|
|||||||
for i, t in enumerate(timesteps):
|
for i, t in enumerate(timesteps):
|
||||||
if self.interrupt:
|
if self.interrupt:
|
||||||
continue
|
continue
|
||||||
|
if not isinstance(self.scheduler, CogVideoXDPMScheduler):
|
||||||
#temporal tiling code based on https://github.com/mayuelala/FollowYourEmoji/blob/main/models/video_pipeline.py
|
#temporal tiling code based on https://github.com/mayuelala/FollowYourEmoji/blob/main/models/video_pipeline.py
|
||||||
# =====================================================
|
# =====================================================
|
||||||
grid_ts = 0
|
grid_ts = 0
|
||||||
@ -420,6 +488,7 @@ class CogVideoXPipeline(DiffusionPipeline):
|
|||||||
hidden_states=latent_model_input_tile,
|
hidden_states=latent_model_input_tile,
|
||||||
encoder_hidden_states=prompt_embeds,
|
encoder_hidden_states=prompt_embeds,
|
||||||
timestep=t_input,
|
timestep=t_input,
|
||||||
|
image_rotary_emb=image_rotary_emb,
|
||||||
return_dict=False,
|
return_dict=False,
|
||||||
)[0]
|
)[0]
|
||||||
noise_pred = noise_pred.float()
|
noise_pred = noise_pred.float()
|
||||||
@ -429,21 +498,7 @@ class CogVideoXPipeline(DiffusionPipeline):
|
|||||||
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
|
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
|
||||||
|
|
||||||
# compute the previous noisy sample x_t -> x_t-1
|
# compute the previous noisy sample x_t -> x_t-1
|
||||||
if not isinstance(self.scheduler, CogVideoXDPMScheduler):
|
|
||||||
latents_tile = self.scheduler.step(noise_pred, t, latents_tile, **extra_step_kwargs, return_dict=False)[0]
|
latents_tile = self.scheduler.step(noise_pred, t, latents_tile, **extra_step_kwargs, return_dict=False)[0]
|
||||||
else:
|
|
||||||
raise NotImplementedError("DPM is not supported with temporal tiling")
|
|
||||||
# else:
|
|
||||||
# latents_tile, old_pred_original_sample = self.scheduler.step(
|
|
||||||
# noise_pred,
|
|
||||||
# old_pred_original_sample,
|
|
||||||
# t,
|
|
||||||
# t_input[t_i - 1] if t_i > 0 else None,
|
|
||||||
# latents_tile,
|
|
||||||
# **extra_step_kwargs,
|
|
||||||
# return_dict=False,
|
|
||||||
# )
|
|
||||||
|
|
||||||
latents_all_list.append(latents_tile)
|
latents_all_list.append(latents_tile)
|
||||||
|
|
||||||
# ==========================================
|
# ==========================================
|
||||||
@ -465,13 +520,57 @@ class CogVideoXPipeline(DiffusionPipeline):
|
|||||||
latents_all /= contributors
|
latents_all /= contributors
|
||||||
|
|
||||||
latents = latents_all
|
latents = latents_all
|
||||||
# ==========================================
|
|
||||||
|
|
||||||
|
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||||
|
progress_bar.update()
|
||||||
|
comfy_pbar.update(1)
|
||||||
|
# ==========================================
|
||||||
|
else:
|
||||||
|
latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
|
||||||
|
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
||||||
|
|
||||||
|
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
|
||||||
|
timestep = t.expand(latent_model_input.shape[0])
|
||||||
|
|
||||||
|
# predict noise model_output
|
||||||
|
noise_pred = self.transformer(
|
||||||
|
hidden_states=latent_model_input,
|
||||||
|
encoder_hidden_states=prompt_embeds,
|
||||||
|
timestep=timestep,
|
||||||
|
image_rotary_emb=image_rotary_emb,
|
||||||
|
return_dict=False,
|
||||||
|
)[0]
|
||||||
|
noise_pred = noise_pred.float()
|
||||||
|
|
||||||
|
|
||||||
|
self._guidance_scale = 1 + guidance_scale * (
|
||||||
|
(1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
|
||||||
|
)
|
||||||
|
|
||||||
|
if do_classifier_free_guidance:
|
||||||
|
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
||||||
|
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
|
||||||
|
|
||||||
|
# compute the previous noisy sample x_t -> x_t-1
|
||||||
|
if not isinstance(self.scheduler, CogVideoXDPMScheduler):
|
||||||
|
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
|
||||||
|
else:
|
||||||
|
latents, old_pred_original_sample = self.scheduler.step(
|
||||||
|
noise_pred,
|
||||||
|
old_pred_original_sample,
|
||||||
|
t,
|
||||||
|
timesteps[i - 1] if i > 0 else None,
|
||||||
|
latents,
|
||||||
|
**extra_step_kwargs,
|
||||||
|
return_dict=False,
|
||||||
|
)
|
||||||
|
latents = latents.to(prompt_embeds.dtype)
|
||||||
|
|
||||||
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||||
progress_bar.update()
|
progress_bar.update()
|
||||||
comfy_pbar.update(1)
|
comfy_pbar.update(1)
|
||||||
|
|
||||||
|
|
||||||
# Offload all models
|
# Offload all models
|
||||||
self.maybe_free_model_hooks()
|
self.maybe_free_model_hooks()
|
||||||
|
|
||||||
|
|||||||
@ -1,2 +1,2 @@
|
|||||||
huggingface_hub
|
huggingface_hub
|
||||||
diffusers>=0.30.0
|
diffusers>=0.30.1
|
||||||
Loading…
x
Reference in New Issue
Block a user