Support CogVideoX-Interpolation model

https://github.com/feizc/CogvideX-Interpolation
kijai 2024-10-17 13:41:23 +03:00
parent 09ed641575
commit 4f8f3aa74f
3 changed files with 933 additions and 8 deletions


@@ -0,0 +1,831 @@
{
"last_node_id": 67,
"last_link_id": 152,
"nodes": [
{
"id": 20,
"type": "CLIPLoader",
"pos": {
"0": -26,
"1": 400
},
"size": {
"0": 451.30548095703125,
"1": 82
},
"flags": {},
"order": 0,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "CLIP",
"type": "CLIP",
"links": [
54,
56
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CLIPLoader"
},
"widgets_values": [
"t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
"sd3"
]
},
{
"id": 31,
"type": "CogVideoTextEncode",
"pos": {
"0": 497,
"1": 520
},
"size": {
"0": 463.01251220703125,
"1": 124
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 56
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
123
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. ",
1,
true
]
},
{
"id": 30,
"type": "CogVideoTextEncode",
"pos": {
"0": 493,
"1": 303
},
"size": {
"0": 471.90142822265625,
"1": 168.08047485351562
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 54
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
122
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"a majestic stag is grazing in an enhanced forest, basking in the setting sun filtered by the trees",
1,
true
]
},
{
"id": 57,
"type": "CogVideoSampler",
"pos": {
"0": 1138,
"1": 150
},
"size": {
"0": 399.8780822753906,
"1": 370
},
"flags": {},
"order": 9,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 121
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 122
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 123
},
{
"name": "samples",
"type": "LATENT",
"link": null,
"shape": 7
},
{
"name": "image_cond_latents",
"type": "LATENT",
"link": 146,
"shape": 7
},
{
"name": "context_options",
"type": "COGCONTEXT",
"link": null,
"shape": 7
},
{
"name": "controlnet",
"type": "COGVIDECONTROLNET",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
128
],
"slot_index": 0,
"shape": 3
},
{
"name": "samples",
"type": "LATENT",
"links": [
127
],
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoSampler"
},
"widgets_values": [
480,
720,
49,
20,
6,
65334758276105,
"fixed",
"CogVideoXDPMScheduler",
1
]
},
{
"id": 1,
"type": "DownloadAndLoadCogVideoModel",
"pos": {
"0": 633,
"1": 44
},
"size": {
"0": 337.8885192871094,
"1": 194
},
"flags": {},
"order": 1,
"mode": 0,
"inputs": [
{
"name": "pab_config",
"type": "PAB_CONFIG",
"link": null,
"shape": 7
},
{
"name": "block_edit",
"type": "TRANSFORMERBLOCKS",
"link": null,
"shape": 7
},
{
"name": "lora",
"type": "COGLORA",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
121,
149
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoModel"
},
"widgets_values": [
"feizhengcong/CogvideoX-Interpolation",
"bf16",
"disabled",
"disabled",
false
]
},
{
"id": 65,
"type": "CogVideoImageInterpolationEncode",
"pos": {
"0": 1123,
"1": 647
},
"size": [
331.6177535935244,
118
],
"flags": {},
"order": 8,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 149
},
{
"name": "start_image",
"type": "IMAGE",
"link": 147
},
{
"name": "end_image",
"type": "IMAGE",
"link": 152
},
{
"name": "mask",
"type": "MASK",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "samples",
"type": "LATENT",
"links": [
146
],
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoImageInterpolationEncode"
},
"widgets_values": [
false
]
},
{
"id": 44,
"type": "VHS_VideoCombine",
"pos": {
"0": 1927,
"1": 146
},
"size": [
605.3909912109375,
714.2606608072917
],
"flags": {},
"order": 11,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 118
},
{
"name": "audio",
"type": "AUDIO",
"link": null,
"shape": 7
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null,
"shape": 7
},
{
"name": "vae",
"type": "VAE",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "Filenames",
"type": "VHS_FILENAMES",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_VideoCombine"
},
"widgets_values": {
"frame_rate": 8,
"loop_count": 0,
"filename_prefix": "CogVideoX_interpolation",
"format": "video/h264-mp4",
"pix_fmt": "yuv420p",
"crf": 19,
"save_metadata": true,
"pingpong": false,
"save_output": false,
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "CogVideoX-I2V_00001.mp4",
"subfolder": "",
"type": "temp",
"format": "video/h264-mp4",
"frame_rate": 8
},
"muted": false
}
}
},
{
"id": 67,
"type": "ImageResizeKJ",
"pos": {
"0": 569,
"1": 1173
},
"size": [
315,
266
],
"flags": {
"collapsed": true
},
"order": 7,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 151
},
{
"name": "get_image_size",
"type": "IMAGE",
"link": 150,
"shape": 7
},
{
"name": "width_input",
"type": "INT",
"link": null,
"widget": {
"name": "width_input"
},
"shape": 7
},
{
"name": "height_input",
"type": "INT",
"link": null,
"widget": {
"name": "height_input"
},
"shape": 7
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
152
],
"slot_index": 0,
"shape": 3
},
{
"name": "width",
"type": "INT",
"links": [],
"slot_index": 1,
"shape": 3
},
{
"name": "height",
"type": "INT",
"links": [],
"slot_index": 2,
"shape": 3
}
],
"properties": {
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": [
720,
480,
"lanczos",
false,
16,
0,
0,
"disabled"
]
},
{
"id": 37,
"type": "ImageResizeKJ",
"pos": {
"0": 537,
"1": 722
},
"size": {
"0": 315,
"1": 266
},
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 71
},
{
"name": "get_image_size",
"type": "IMAGE",
"link": null,
"shape": 7
},
{
"name": "width_input",
"type": "INT",
"link": null,
"widget": {
"name": "width_input"
}
},
{
"name": "height_input",
"type": "INT",
"link": null,
"widget": {
"name": "height_input"
}
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
147,
150
],
"slot_index": 0,
"shape": 3
},
{
"name": "width",
"type": "INT",
"links": [],
"slot_index": 1,
"shape": 3
},
{
"name": "height",
"type": "INT",
"links": [],
"slot_index": 2,
"shape": 3
}
],
"properties": {
"Node name for S&R": "ImageResizeKJ"
},
"widgets_values": [
720,
480,
"lanczos",
false,
16,
0,
0,
"disabled"
]
},
{
"id": 36,
"type": "LoadImage",
"pos": {
"0": 20,
"1": 674
},
"size": {
"0": 402.06353759765625,
"1": 396.6225891113281
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
71
],
"slot_index": 0,
"shape": 3
},
{
"name": "MASK",
"type": "MASK",
"links": null,
"shape": 3
}
],
"title": "Load Image: Start",
"properties": {
"Node name for S&R": "LoadImage"
},
"widgets_values": [
"sd3stag.png",
"image"
]
},
{
"id": 66,
"type": "LoadImage",
"pos": {
"0": 20,
"1": 1121
},
"size": {
"0": 402.06353759765625,
"1": 396.6225891113281
},
"flags": {},
"order": 3,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
151
],
"slot_index": 0,
"shape": 3
},
{
"name": "MASK",
"type": "MASK",
"links": null,
"shape": 3
}
],
"title": "Load Image: End",
"properties": {
"Node name for S&R": "LoadImage"
},
"widgets_values": [
"sd3stag.png",
"image"
]
},
{
"id": 56,
"type": "CogVideoDecode",
"pos": {
"0": 1581,
"1": 148
},
"size": {
"0": 300.396484375,
"1": 198
},
"flags": {},
"order": 10,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 128
},
{
"name": "samples",
"type": "LATENT",
"link": 127
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
118
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
},
"widgets_values": [
true,
240,
360,
0.2,
0.2,
true
]
}
],
"links": [
[
54,
20,
0,
30,
0,
"CLIP"
],
[
56,
20,
0,
31,
0,
"CLIP"
],
[
71,
36,
0,
37,
0,
"IMAGE"
],
[
118,
56,
0,
44,
0,
"IMAGE"
],
[
121,
1,
0,
57,
0,
"COGVIDEOPIPE"
],
[
122,
30,
0,
57,
1,
"CONDITIONING"
],
[
123,
31,
0,
57,
2,
"CONDITIONING"
],
[
127,
57,
1,
56,
1,
"LATENT"
],
[
128,
57,
0,
56,
0,
"COGVIDEOPIPE"
],
[
146,
65,
0,
57,
4,
"LATENT"
],
[
147,
37,
0,
65,
1,
"IMAGE"
],
[
149,
1,
0,
65,
0,
"COGVIDEOPIPE"
],
[
150,
37,
0,
67,
1,
"IMAGE"
],
[
151,
66,
0,
67,
0,
"IMAGE"
],
[
152,
67,
0,
65,
2,
"IMAGE"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.693433494944327,
"offset": [
225.6761629383604,
-15.041612364034256
]
}
},
"version": 0.4
}


@@ -258,6 +258,7 @@ class DownloadAndLoadCogVideoModel:
                    "alibaba-pai/CogVideoX-Fun-V1.1-5b-InP",
                    "alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose",
                    "alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose",
+                   "feizhengcong/CogvideoX-Interpolation",
                ],
            ),
@@ -313,14 +314,15 @@ class DownloadAndLoadCogVideoModel:
            base_path = os.path.join(download_path, "CogVideo2B")
            download_path = base_path
            repo_id = model
-       elif "5b" in model:
+       else:
            base_path = os.path.join(download_path, (model.split("/")[-1]))
            download_path = base_path
            repo_id = model

        if "2b" in model:
            scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_2b.json')
-       elif "5b" in model:
+       else:
            scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json')

        if not os.path.exists(base_path):
@@ -799,7 +801,7 @@ class CogVideoImageEncode:
                "image": ("IMAGE", ),
            },
            "optional": {
-               "chunk_size": ("INT", {"default": 16, "min": 1}),
+               "chunk_size": ("INT", {"default": 16, "min": 4}),
                "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}),
                "mask": ("MASK", ),
            },
@@ -875,6 +877,77 @@ class CogVideoImageEncode:
            vae.to(offload_device)
        return ({"samples": final_latents}, )

class CogVideoImageInterpolationEncode:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "pipeline": ("COGVIDEOPIPE",),
            "start_image": ("IMAGE", ),
            "end_image": ("IMAGE", ),
            },
            "optional": {
                "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}),
                "mask": ("MASK", ),
            },
        }

    RETURN_TYPES = ("LATENT",)
    RETURN_NAMES = ("samples",)
    FUNCTION = "encode"
    CATEGORY = "CogVideoWrapper"

    def encode(self, pipeline, start_image, end_image, chunk_size=8, enable_tiling=False, mask=None):
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
        generator = torch.Generator(device=device).manual_seed(0)

        B, H, W, C = start_image.shape
        vae = pipeline["pipe"].vae
        vae.enable_slicing()
        if enable_tiling:
            from .mz_enable_vae_encode_tiling import enable_vae_encode_tiling
            enable_vae_encode_tiling(vae)

        if not pipeline["cpu_offloading"]:
            vae.to(device)

        check_diffusers_version()
        vae._clear_fake_context_parallel_cache()

        if mask is not None:
            pipeline["pipe"].original_mask = mask
            # print(mask.shape)
            # mask = mask.repeat(B, 1, 1) # Shape: [B, H, W]
            # mask = mask.unsqueeze(-1).repeat(1, 1, 1, C)
            # print(mask.shape)
            # input_image = input_image * (1 - mask)
        else:
            pipeline["pipe"].original_mask = None

        start_image = (start_image * 2.0 - 1.0).to(vae.dtype).to(device).unsqueeze(0).permute(0, 4, 1, 2, 3) # B, C, T, H, W
        end_image = (end_image * 2.0 - 1.0).to(vae.dtype).to(device).unsqueeze(0).permute(0, 4, 1, 2, 3)
        B, T, C, H, W = start_image.shape

        latents_list = []
        # Encode the chunk of images
        start_latents = vae.encode(start_image).latent_dist.sample(generator) * vae.config.scaling_factor
        end_latents = vae.encode(end_image).latent_dist.sample(generator) * vae.config.scaling_factor
        start_latents = start_latents.permute(0, 2, 1, 3, 4) # B, T, C, H, W
        end_latents = end_latents.permute(0, 2, 1, 3, 4) # B, T, C, H, W
        latents_list = [start_latents, end_latents]

        # Concatenate all the chunks along the temporal dimension
        final_latents = torch.cat(latents_list, dim=1)
        log.info(f"Encoded latents shape: {final_latents.shape}")

        if not pipeline["cpu_offloading"]:
            vae.to(offload_device)
        return ({"samples": final_latents}, )

class CogVideoSampler:
    @classmethod
@@ -1500,6 +1573,7 @@ NODE_CLASS_MAPPINGS = {
    "CogVideoTextEncode": CogVideoTextEncode,
    "CogVideoDualTextEncode_311": CogVideoDualTextEncode_311,
    "CogVideoImageEncode": CogVideoImageEncode,
+   "CogVideoImageInterpolationEncode": CogVideoImageInterpolationEncode,
    "CogVideoXFunSampler": CogVideoXFunSampler,
    "CogVideoXFunVid2VidSampler": CogVideoXFunVid2VidSampler,
    "CogVideoXFunControlSampler": CogVideoXFunControlSampler,
@@ -1520,6 +1594,7 @@ NODE_DISPLAY_NAME_MAPPINGS = {
    "CogVideoTextEncode": "CogVideo TextEncode",
    "CogVideoDualTextEncode_311": "CogVideo DualTextEncode",
    "CogVideoImageEncode": "CogVideo ImageEncode",
+   "CogVideoImageInterpolationEncode": "CogVideo ImageInterpolation Encode",
    "CogVideoXFunSampler": "CogVideoXFun Sampler",
    "CogVideoXFunVid2VidSampler": "CogVideoXFun Vid2Vid Sampler",
    "CogVideoXFunControlSampler": "CogVideoXFun Control Sampler",


@@ -501,15 +501,34 @@ class CogVideoXPipeline(VideoSysPipeline):
        # 5.5.
        if image_cond_latents is not None:
-           padding_shape = (
-               batch_size,
-               (latents.shape[1] - 1),
-               self.vae.config.latent_channels,
-               height // self.vae_scale_factor_spatial,
-               width // self.vae_scale_factor_spatial,
-           )
-           latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
-           image_cond_latents = torch.cat([image_cond_latents, latent_padding], dim=1)
+           if image_cond_latents.shape[1] > 1:
+               logger.info("More than one image conditioning frame received, interpolating")
+               padding_shape = (
+                   batch_size,
+                   (latents.shape[1] - 2),
+                   self.vae.config.latent_channels,
+                   height // self.vae_scale_factor_spatial,
+                   width // self.vae_scale_factor_spatial,
+               )
+               print("padding_shape: ", padding_shape)
+               latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
+               print(image_cond_latents.shape)
+               print(image_cond_latents[:, 0, :, :, :].shape)
+               print(image_cond_latents[:, -1, :, :, :].shape)
+               image_cond_latents = torch.cat([image_cond_latents[:, 0, :, :, :].unsqueeze(1), latent_padding, image_cond_latents[:, -1, :, :, :].unsqueeze(1)], dim=1)
+               print("image cond latents shape", image_cond_latents.shape)
+           else:
+               logger.info("Only one image conditioning frame received, img2vid")
+               padding_shape = (
+                   batch_size,
+                   (latents.shape[1] - 1),
+                   self.vae.config.latent_channels,
+                   height // self.vae_scale_factor_spatial,
+                   width // self.vae_scale_factor_spatial,
+               )
+               latent_padding = torch.zeros(padding_shape, device=device, dtype=self.vae.dtype)
+               image_cond_latents = torch.cat([image_cond_latents, latent_padding], dim=1)

        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
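
For orientation, the interpolation branch above keeps only the first and last conditioning latents and zero-fills the frames between them. The standalone sketch below reproduces that tensor construction with made-up sizes; the shapes, names, and values are illustrative assumptions, not code from this commit.

import torch

# Hypothetical sizes for illustration: 13 latent frames (49 video frames with
# temporal compression 4), 16 latent channels, 60x90 latent resolution
# (480x720 divided by spatial factor 8).
batch_size, num_latent_frames, latent_channels, latent_h, latent_w = 1, 13, 16, 60, 90

# Stand-ins for the encoded start/end frame latents produced by the
# interpolation encode node (each has a temporal length of 1).
start_latent = torch.randn(batch_size, 1, latent_channels, latent_h, latent_w)
end_latent = torch.randn(batch_size, 1, latent_channels, latent_h, latent_w)

# Interpolation conditioning: keep the first and last latent frames and fill
# everything in between with zeros, so the model only "sees" the two key
# frames and generates the motion between them.
padding = torch.zeros(batch_size, num_latent_frames - 2, latent_channels, latent_h, latent_w)
image_cond_latents = torch.cat([start_latent, padding, end_latent], dim=1)

print(image_cond_latents.shape)  # torch.Size([1, 13, 16, 60, 90])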