mirror of
https://git.datalinker.icu/kijai/ComfyUI-CogVideoXWrapper.git
synced 2026-05-13 17:24:48 +08:00
Modify CogVideoXfun pipeline to fix Vid2Vid sampler issues
This commit is contained in:
parent
25f16462aa
commit
c2950dfb47
@ -262,111 +262,111 @@ class CogVideoX_Fun_Pipeline_Inpaint(VideoSysPipeline):
|
|||||||
set_pab_manager(pab_config)
|
set_pab_manager(pab_config)
|
||||||
|
|
||||||
def prepare_latents(
|
def prepare_latents(
|
||||||
self,
|
self,
|
||||||
batch_size,
|
|
||||||
num_channels_latents,
|
|
||||||
height,
|
|
||||||
width,
|
|
||||||
video_length,
|
|
||||||
dtype,
|
|
||||||
device,
|
|
||||||
generator,
|
|
||||||
latents=None,
|
|
||||||
video=None,
|
|
||||||
timestep=None,
|
|
||||||
is_strength_max=True,
|
|
||||||
return_noise=False,
|
|
||||||
return_video_latents=False,
|
|
||||||
context_size=None,
|
|
||||||
context_overlap=None,
|
|
||||||
freenoise=False,
|
|
||||||
):
|
|
||||||
shape = (
|
|
||||||
batch_size,
|
batch_size,
|
||||||
(video_length - 1) // self.vae_scale_factor_temporal + 1,
|
|
||||||
num_channels_latents,
|
num_channels_latents,
|
||||||
height // self.vae_scale_factor_spatial,
|
height,
|
||||||
width // self.vae_scale_factor_spatial,
|
width,
|
||||||
)
|
video_length,
|
||||||
if isinstance(generator, list) and len(generator) != batch_size:
|
dtype,
|
||||||
raise ValueError(
|
device,
|
||||||
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
generator,
|
||||||
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
latents=None,
|
||||||
|
video=None,
|
||||||
|
timestep=None,
|
||||||
|
is_strength_max=True,
|
||||||
|
return_noise=False,
|
||||||
|
return_video_latents=False,
|
||||||
|
context_size=None,
|
||||||
|
context_overlap=None,
|
||||||
|
freenoise=False,
|
||||||
|
):
|
||||||
|
shape = (
|
||||||
|
batch_size,
|
||||||
|
(video_length - 1) // self.vae_scale_factor_temporal + 1,
|
||||||
|
num_channels_latents,
|
||||||
|
height // self.vae_scale_factor_spatial,
|
||||||
|
width // self.vae_scale_factor_spatial,
|
||||||
)
|
)
|
||||||
|
if isinstance(generator, list) and len(generator) != batch_size:
|
||||||
|
raise ValueError(
|
||||||
|
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
||||||
|
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
||||||
|
)
|
||||||
|
|
||||||
if return_video_latents or (latents is None and not is_strength_max):
|
if return_video_latents or (latents is None and not is_strength_max):
|
||||||
video = video.to(device=device, dtype=self.vae.dtype)
|
video = video.to(device=device, dtype=self.vae.dtype)
|
||||||
|
|
||||||
bs = 1
|
bs = 1
|
||||||
new_video = []
|
new_video = []
|
||||||
for i in range(0, video.shape[0], bs):
|
for i in range(0, video.shape[0], bs):
|
||||||
video_bs = video[i : i + bs]
|
video_bs = video[i : i + bs]
|
||||||
video_bs = self.vae.encode(video_bs)[0]
|
video_bs = self.vae.encode(video_bs)[0]
|
||||||
video_bs = video_bs.sample()
|
video_bs = video_bs.sample()
|
||||||
new_video.append(video_bs)
|
new_video.append(video_bs)
|
||||||
video = torch.cat(new_video, dim = 0)
|
video = torch.cat(new_video, dim = 0)
|
||||||
video = video * self.vae.config.scaling_factor
|
video = video * self.vae.config.scaling_factor
|
||||||
|
|
||||||
video_latents = video.repeat(batch_size // video.shape[0], 1, 1, 1, 1)
|
video_latents = video.repeat(batch_size // video.shape[0], 1, 1, 1, 1)
|
||||||
video_latents = video_latents.to(device=device, dtype=dtype)
|
video_latents = video_latents.to(device=device, dtype=dtype)
|
||||||
video_latents = rearrange(video_latents, "b c f h w -> b f c h w")
|
video_latents = rearrange(video_latents, "b c f h w -> b f c h w")
|
||||||
|
|
||||||
if latents is None:
|
if latents is None:
|
||||||
noise = randn_tensor(shape, generator=generator, device=torch.device("cpu"), dtype=dtype)
|
noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
||||||
if freenoise:
|
if freenoise:
|
||||||
print("Applying FreeNoise")
|
print("Applying FreeNoise")
|
||||||
# code and comments from AnimateDiff-Evolved by Kosinkadink (https://github.com/Kosinkadink/ComfyUI-AnimateDiff-Evolved)
|
# code and comments from AnimateDiff-Evolved by Kosinkadink (https://github.com/Kosinkadink/ComfyUI-AnimateDiff-Evolved)
|
||||||
video_length = video_length // 4
|
video_length_adjusted = video_length // 4
|
||||||
delta = context_size - context_overlap
|
delta = context_size - context_overlap
|
||||||
for start_idx in range(0, video_length-context_size, delta):
|
for start_idx in range(0, video_length_adjusted - context_size, delta):
|
||||||
# start_idx corresponds to the beginning of a context window
|
# start_idx corresponds to the beginning of a context window
|
||||||
# goal: place shuffled in the delta region right after the end of the context window
|
# goal: place shuffled in the delta region right after the end of the context window
|
||||||
# if space after context window is not enough to place the noise, adjust and finish
|
# if space after context window is not enough to place the noise, adjust and finish
|
||||||
place_idx = start_idx + context_size
|
place_idx = start_idx + context_size
|
||||||
# if place_idx is outside the valid indexes, we are already finished
|
# if place_idx is outside the valid indexes, we are already finished
|
||||||
if place_idx >= video_length:
|
if place_idx >= video_length_adjusted:
|
||||||
break
|
break
|
||||||
end_idx = place_idx - 1
|
end_idx = place_idx - 1
|
||||||
#print("video_length:", video_length, "start_idx:", start_idx, "end_idx:", end_idx, "place_idx:", place_idx, "delta:", delta)
|
#print("video_length:", video_length, "start_idx:", start_idx, "end_idx:", end_idx, "place_idx:", place_idx, "delta:", delta)
|
||||||
|
|
||||||
# if there is not enough room to copy delta amount of indexes, copy limited amount and finish
|
# if there is not enough room to copy delta amount of indexes, copy limited amount and finish
|
||||||
if end_idx + delta >= video_length:
|
if end_idx + delta >= video_length_adjusted:
|
||||||
final_delta = video_length - place_idx
|
final_delta = video_length_adjusted - place_idx
|
||||||
# generate list of indexes in final delta region
|
# generate list of indexes in final delta region
|
||||||
list_idx = torch.tensor(list(range(start_idx,start_idx+final_delta)), device=torch.device("cpu"), dtype=torch.long)
|
list_idx = torch.tensor(list(range(start_idx, start_idx + final_delta)), device=device, dtype=torch.long)
|
||||||
|
# shuffle list
|
||||||
|
list_idx = list_idx[torch.randperm(final_delta, generator=generator)]
|
||||||
|
# apply shuffled indexes
|
||||||
|
noise[:, place_idx:place_idx + final_delta, :, :, :] = noise[:, list_idx, :, :, :]
|
||||||
|
break
|
||||||
|
# otherwise, do normal behavior
|
||||||
|
# generate list of indexes in delta region
|
||||||
|
list_idx = torch.tensor(list(range(start_idx, start_idx + delta)), device=device, dtype=torch.long)
|
||||||
# shuffle list
|
# shuffle list
|
||||||
list_idx = list_idx[torch.randperm(final_delta, generator=generator)]
|
list_idx = list_idx[torch.randperm(delta, generator=generator)]
|
||||||
# apply shuffled indexes
|
# apply shuffled indexes
|
||||||
noise[:, place_idx:place_idx + final_delta, :, :, :] = noise[:, list_idx, :, :, :]
|
#print("place_idx:", place_idx, "delta:", delta, "list_idx:", list_idx)
|
||||||
break
|
noise[:, place_idx:place_idx + delta, :, :, :] = noise[:, list_idx, :, :, :]
|
||||||
# otherwise, do normal behavior
|
|
||||||
# generate list of indexes in delta region
|
|
||||||
list_idx = torch.tensor(list(range(start_idx,start_idx+delta)), device=torch.device("cpu"), dtype=torch.long)
|
|
||||||
# shuffle list
|
|
||||||
list_idx = list_idx[torch.randperm(delta, generator=generator)]
|
|
||||||
# apply shuffled indexes
|
|
||||||
#print("place_idx:", place_idx, "delta:", delta, "list_idx:", list_idx)
|
|
||||||
noise[:, place_idx:place_idx + delta, :, :, :] = noise[:, list_idx, :, :, :]
|
|
||||||
|
|
||||||
# if strength is 1. then initialise the latents to noise, else initial to image + noise
|
# if strength is 1. then initialise the latents to noise, else initial to image + noise
|
||||||
latents = noise if is_strength_max else self.scheduler.add_noise(video_latents, noise, timestep)
|
latents = noise if is_strength_max else self.scheduler.add_noise(video_latents, noise, timestep)
|
||||||
# if pure noise then scale the initial latents by the Scheduler's init sigma
|
# if pure noise then scale the initial latents by the Scheduler's init sigma
|
||||||
latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
|
latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
|
||||||
latents = latents.to(device)
|
latents = latents.to(device)
|
||||||
else:
|
else:
|
||||||
noise = latents.to(device)
|
noise = latents.to(device)
|
||||||
latents = noise * self.scheduler.init_noise_sigma
|
latents = noise * self.scheduler.init_noise_sigma
|
||||||
|
|
||||||
# scale the initial noise by the standard deviation required by the scheduler
|
# scale the initial noise by the standard deviation required by the scheduler
|
||||||
outputs = (latents,)
|
outputs = (latents,)
|
||||||
|
|
||||||
if return_noise:
|
if return_noise:
|
||||||
outputs += (noise,)
|
outputs += (noise,)
|
||||||
|
|
||||||
if return_video_latents:
|
if return_video_latents:
|
||||||
outputs += (video_latents,)
|
outputs += (video_latents,)
|
||||||
|
|
||||||
return outputs
|
return outputs
|
||||||
|
|
||||||
def prepare_mask_latents(
|
def prepare_mask_latents(
|
||||||
self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance, noise_aug_strength
|
self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance, noise_aug_strength
|
||||||
|
|||||||
614
examples/cogvideox_fun_V2V_example_01.json
Normal file
614
examples/cogvideox_fun_V2V_example_01.json
Normal file
@ -0,0 +1,614 @@
|
|||||||
|
{
|
||||||
|
"last_node_id": 224,
|
||||||
|
"last_link_id": 333,
|
||||||
|
"nodes": [
|
||||||
|
{
|
||||||
|
"id": 216,
|
||||||
|
"type": "CogVideoTextEncode",
|
||||||
|
"pos": {
|
||||||
|
"0": 1320,
|
||||||
|
"1": 2224
|
||||||
|
},
|
||||||
|
"size": [
|
||||||
|
400,
|
||||||
|
200
|
||||||
|
],
|
||||||
|
"flags": {},
|
||||||
|
"order": 4,
|
||||||
|
"mode": 0,
|
||||||
|
"inputs": [
|
||||||
|
{
|
||||||
|
"name": "clip",
|
||||||
|
"type": "CLIP",
|
||||||
|
"link": 315
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "conditioning",
|
||||||
|
"type": "CONDITIONING",
|
||||||
|
"links": [
|
||||||
|
314
|
||||||
|
],
|
||||||
|
"slot_index": 0,
|
||||||
|
"shape": 3
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"Node name for S&R": "CogVideoTextEncode"
|
||||||
|
},
|
||||||
|
"widgets_values": [
|
||||||
|
"",
|
||||||
|
1,
|
||||||
|
true
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 217,
|
||||||
|
"type": "CLIPLoader",
|
||||||
|
"pos": {
|
||||||
|
"0": 937,
|
||||||
|
"1": 2245
|
||||||
|
},
|
||||||
|
"size": {
|
||||||
|
"0": 315,
|
||||||
|
"1": 82
|
||||||
|
},
|
||||||
|
"flags": {},
|
||||||
|
"order": 0,
|
||||||
|
"mode": 0,
|
||||||
|
"inputs": [],
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "CLIP",
|
||||||
|
"type": "CLIP",
|
||||||
|
"links": [
|
||||||
|
315,
|
||||||
|
317
|
||||||
|
],
|
||||||
|
"slot_index": 0,
|
||||||
|
"shape": 3
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"Node name for S&R": "CLIPLoader"
|
||||||
|
},
|
||||||
|
"widgets_values": [
|
||||||
|
"t5xxl_fp8_e4m3fn.safetensors",
|
||||||
|
"sd3"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 206,
|
||||||
|
"type": "VHS_VideoCombine",
|
||||||
|
"pos": {
|
||||||
|
"0": 2648,
|
||||||
|
"1": 2268
|
||||||
|
},
|
||||||
|
"size": [
|
||||||
|
595.7279663085938,
|
||||||
|
652.9874093191964
|
||||||
|
],
|
||||||
|
"flags": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"order": 8,
|
||||||
|
"mode": 0,
|
||||||
|
"inputs": [
|
||||||
|
{
|
||||||
|
"name": "images",
|
||||||
|
"type": "IMAGE",
|
||||||
|
"link": 333,
|
||||||
|
"label": "images"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "audio",
|
||||||
|
"type": "AUDIO",
|
||||||
|
"link": null,
|
||||||
|
"label": "audio"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "meta_batch",
|
||||||
|
"type": "VHS_BatchManager",
|
||||||
|
"link": null,
|
||||||
|
"label": "meta_batch"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "vae",
|
||||||
|
"type": "VAE",
|
||||||
|
"link": null,
|
||||||
|
"label": "vae"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "Filenames",
|
||||||
|
"type": "VHS_FILENAMES",
|
||||||
|
"links": null,
|
||||||
|
"shape": 3,
|
||||||
|
"label": "Filenames"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"Node name for S&R": "VHS_VideoCombine",
|
||||||
|
"ttNbgOverride": {
|
||||||
|
"color": "#223",
|
||||||
|
"bgcolor": "#335",
|
||||||
|
"groupcolor": "#88A"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"widgets_values": {
|
||||||
|
"frame_rate": 24,
|
||||||
|
"loop_count": 0,
|
||||||
|
"filename_prefix": "1009/",
|
||||||
|
"format": "video/h264-mp4",
|
||||||
|
"pix_fmt": "yuv420p",
|
||||||
|
"crf": 19,
|
||||||
|
"save_metadata": false,
|
||||||
|
"pingpong": false,
|
||||||
|
"save_output": true,
|
||||||
|
"videopreview": {
|
||||||
|
"hidden": false,
|
||||||
|
"paused": false,
|
||||||
|
"params": {
|
||||||
|
"filename": "1009_00373.mp4",
|
||||||
|
"subfolder": "",
|
||||||
|
"type": "output",
|
||||||
|
"format": "video/h264-mp4",
|
||||||
|
"frame_rate": 24
|
||||||
|
},
|
||||||
|
"muted": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"color": "#223",
|
||||||
|
"bgcolor": "#335"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 220,
|
||||||
|
"type": "CogVideoDecode",
|
||||||
|
"pos": {
|
||||||
|
"0": 2242.513671875,
|
||||||
|
"1": 2250.128662109375
|
||||||
|
},
|
||||||
|
"size": {
|
||||||
|
"0": 315,
|
||||||
|
"1": 198
|
||||||
|
},
|
||||||
|
"flags": {},
|
||||||
|
"order": 7,
|
||||||
|
"mode": 0,
|
||||||
|
"inputs": [
|
||||||
|
{
|
||||||
|
"name": "pipeline",
|
||||||
|
"type": "COGVIDEOPIPE",
|
||||||
|
"link": 320
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "samples",
|
||||||
|
"type": "LATENT",
|
||||||
|
"link": 321
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "images",
|
||||||
|
"type": "IMAGE",
|
||||||
|
"links": [
|
||||||
|
333
|
||||||
|
],
|
||||||
|
"slot_index": 0,
|
||||||
|
"shape": 3
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"Node name for S&R": "CogVideoDecode"
|
||||||
|
},
|
||||||
|
"widgets_values": [
|
||||||
|
false,
|
||||||
|
240,
|
||||||
|
360,
|
||||||
|
0.2,
|
||||||
|
0.2,
|
||||||
|
true
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 218,
|
||||||
|
"type": "CogVideoTextEncode",
|
||||||
|
"pos": {
|
||||||
|
"0": 1318,
|
||||||
|
"1": 2492
|
||||||
|
},
|
||||||
|
"size": {
|
||||||
|
"0": 400,
|
||||||
|
"1": 200
|
||||||
|
},
|
||||||
|
"flags": {},
|
||||||
|
"order": 5,
|
||||||
|
"mode": 0,
|
||||||
|
"inputs": [
|
||||||
|
{
|
||||||
|
"name": "clip",
|
||||||
|
"type": "CLIP",
|
||||||
|
"link": 317
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "conditioning",
|
||||||
|
"type": "CONDITIONING",
|
||||||
|
"links": [
|
||||||
|
318
|
||||||
|
],
|
||||||
|
"slot_index": 0,
|
||||||
|
"shape": 3
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"Node name for S&R": "CogVideoTextEncode"
|
||||||
|
},
|
||||||
|
"widgets_values": [
|
||||||
|
"The video is not of a high quality, it has a low resolution. Strange motion trajectory, a poor composition and deformed video, low resolution, duplicate and ugly, strange body structure, long and strange neck, bad teeth, bad eyes, bad limbs, bad hands, blurry camera, shaking camera. Deformation, blurry, ugly, distortion. ",
|
||||||
|
1,
|
||||||
|
true
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 215,
|
||||||
|
"type": "DownloadAndLoadCogVideoModel",
|
||||||
|
"pos": {
|
||||||
|
"0": 1388,
|
||||||
|
"1": 1986
|
||||||
|
},
|
||||||
|
"size": {
|
||||||
|
"0": 315,
|
||||||
|
"1": 194
|
||||||
|
},
|
||||||
|
"flags": {},
|
||||||
|
"order": 1,
|
||||||
|
"mode": 0,
|
||||||
|
"inputs": [
|
||||||
|
{
|
||||||
|
"name": "pab_config",
|
||||||
|
"type": "PAB_CONFIG",
|
||||||
|
"link": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "block_edit",
|
||||||
|
"type": "TRANSFORMERBLOCKS",
|
||||||
|
"link": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "lora",
|
||||||
|
"type": "COGLORA",
|
||||||
|
"link": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "cogvideo_pipe",
|
||||||
|
"type": "COGVIDEOPIPE",
|
||||||
|
"links": [
|
||||||
|
313
|
||||||
|
],
|
||||||
|
"shape": 3
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"Node name for S&R": "DownloadAndLoadCogVideoModel"
|
||||||
|
},
|
||||||
|
"widgets_values": [
|
||||||
|
"kijai/CogVideoX-Fun-5b",
|
||||||
|
"bf16",
|
||||||
|
"disabled",
|
||||||
|
"disabled",
|
||||||
|
false
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 214,
|
||||||
|
"type": "CogVideoXFunVid2VidSampler",
|
||||||
|
"pos": {
|
||||||
|
"0": 1823,
|
||||||
|
"1": 2249
|
||||||
|
},
|
||||||
|
"size": {
|
||||||
|
"0": 380.4000244140625,
|
||||||
|
"1": 306
|
||||||
|
},
|
||||||
|
"flags": {},
|
||||||
|
"order": 6,
|
||||||
|
"mode": 0,
|
||||||
|
"inputs": [
|
||||||
|
{
|
||||||
|
"name": "pipeline",
|
||||||
|
"type": "COGVIDEOPIPE",
|
||||||
|
"link": 313
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "positive",
|
||||||
|
"type": "CONDITIONING",
|
||||||
|
"link": 314
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "negative",
|
||||||
|
"type": "CONDITIONING",
|
||||||
|
"link": 318
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "validation_video",
|
||||||
|
"type": "IMAGE",
|
||||||
|
"link": 332
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "context_options",
|
||||||
|
"type": "COGCONTEXT",
|
||||||
|
"link": 327
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "cogvideo_pipe",
|
||||||
|
"type": "COGVIDEOPIPE",
|
||||||
|
"links": [
|
||||||
|
320
|
||||||
|
],
|
||||||
|
"slot_index": 0,
|
||||||
|
"shape": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "samples",
|
||||||
|
"type": "LATENT",
|
||||||
|
"links": [
|
||||||
|
321
|
||||||
|
],
|
||||||
|
"slot_index": 1,
|
||||||
|
"shape": 3
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"Node name for S&R": "CogVideoXFunVid2VidSampler"
|
||||||
|
},
|
||||||
|
"widgets_values": [
|
||||||
|
49,
|
||||||
|
768,
|
||||||
|
15861381752719,
|
||||||
|
"fixed",
|
||||||
|
30,
|
||||||
|
7,
|
||||||
|
"DDIM",
|
||||||
|
0.5
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 222,
|
||||||
|
"type": "CogVideoContextOptions",
|
||||||
|
"pos": {
|
||||||
|
"0": 1379,
|
||||||
|
"1": 2771
|
||||||
|
},
|
||||||
|
"size": [
|
||||||
|
315,
|
||||||
|
154
|
||||||
|
],
|
||||||
|
"flags": {},
|
||||||
|
"order": 2,
|
||||||
|
"mode": 0,
|
||||||
|
"inputs": [],
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "context_options",
|
||||||
|
"type": "COGCONTEXT",
|
||||||
|
"links": [
|
||||||
|
327
|
||||||
|
],
|
||||||
|
"slot_index": 0,
|
||||||
|
"shape": 3
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"Node name for S&R": "CogVideoContextOptions"
|
||||||
|
},
|
||||||
|
"widgets_values": [
|
||||||
|
"uniform_standard",
|
||||||
|
48,
|
||||||
|
4,
|
||||||
|
4,
|
||||||
|
true
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 209,
|
||||||
|
"type": "VHS_LoadVideo",
|
||||||
|
"pos": {
|
||||||
|
"0": 286,
|
||||||
|
"1": 2110
|
||||||
|
},
|
||||||
|
"size": [
|
||||||
|
426.63287353515625,
|
||||||
|
508.3616420200893
|
||||||
|
],
|
||||||
|
"flags": {},
|
||||||
|
"order": 3,
|
||||||
|
"mode": 0,
|
||||||
|
"inputs": [
|
||||||
|
{
|
||||||
|
"name": "meta_batch",
|
||||||
|
"type": "VHS_BatchManager",
|
||||||
|
"link": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "vae",
|
||||||
|
"type": "VAE",
|
||||||
|
"link": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "IMAGE",
|
||||||
|
"type": "IMAGE",
|
||||||
|
"links": [
|
||||||
|
332
|
||||||
|
],
|
||||||
|
"slot_index": 0,
|
||||||
|
"shape": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "frame_count",
|
||||||
|
"type": "INT",
|
||||||
|
"links": null,
|
||||||
|
"shape": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "audio",
|
||||||
|
"type": "AUDIO",
|
||||||
|
"links": null,
|
||||||
|
"shape": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "video_info",
|
||||||
|
"type": "VHS_VIDEOINFO",
|
||||||
|
"links": null,
|
||||||
|
"shape": 3
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"Node name for S&R": "VHS_LoadVideo"
|
||||||
|
},
|
||||||
|
"widgets_values": {
|
||||||
|
"video": "CogVideoX-Fun_00006.mp4",
|
||||||
|
"force_rate": 0,
|
||||||
|
"force_size": "Disabled",
|
||||||
|
"custom_width": 512,
|
||||||
|
"custom_height": 512,
|
||||||
|
"frame_load_cap": 0,
|
||||||
|
"skip_first_frames": 0,
|
||||||
|
"select_every_nth": 3,
|
||||||
|
"choose video to upload": "image",
|
||||||
|
"videopreview": {
|
||||||
|
"hidden": false,
|
||||||
|
"paused": false,
|
||||||
|
"params": {
|
||||||
|
"frame_load_cap": 0,
|
||||||
|
"skip_first_frames": 0,
|
||||||
|
"force_rate": 0,
|
||||||
|
"filename": "CogVideoX-Fun_00006.mp4",
|
||||||
|
"type": "input",
|
||||||
|
"format": "video/mp4",
|
||||||
|
"select_every_nth": 3
|
||||||
|
},
|
||||||
|
"muted": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"color": "#223",
|
||||||
|
"bgcolor": "#335"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"links": [
|
||||||
|
[
|
||||||
|
313,
|
||||||
|
215,
|
||||||
|
0,
|
||||||
|
214,
|
||||||
|
0,
|
||||||
|
"COGVIDEOPIPE"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
314,
|
||||||
|
216,
|
||||||
|
0,
|
||||||
|
214,
|
||||||
|
1,
|
||||||
|
"CONDITIONING"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
315,
|
||||||
|
217,
|
||||||
|
0,
|
||||||
|
216,
|
||||||
|
0,
|
||||||
|
"CLIP"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
317,
|
||||||
|
217,
|
||||||
|
0,
|
||||||
|
218,
|
||||||
|
0,
|
||||||
|
"CLIP"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
318,
|
||||||
|
218,
|
||||||
|
0,
|
||||||
|
214,
|
||||||
|
2,
|
||||||
|
"CONDITIONING"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
320,
|
||||||
|
214,
|
||||||
|
0,
|
||||||
|
220,
|
||||||
|
0,
|
||||||
|
"COGVIDEOPIPE"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
321,
|
||||||
|
214,
|
||||||
|
1,
|
||||||
|
220,
|
||||||
|
1,
|
||||||
|
"LATENT"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
327,
|
||||||
|
222,
|
||||||
|
0,
|
||||||
|
214,
|
||||||
|
4,
|
||||||
|
"COGCONTEXT"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
332,
|
||||||
|
209,
|
||||||
|
0,
|
||||||
|
214,
|
||||||
|
3,
|
||||||
|
"IMAGE"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
333,
|
||||||
|
220,
|
||||||
|
0,
|
||||||
|
206,
|
||||||
|
0,
|
||||||
|
"IMAGE"
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"groups": [
|
||||||
|
{
|
||||||
|
"title": "cogvideoxfun v2v sampler",
|
||||||
|
"bounding": [
|
||||||
|
78,
|
||||||
|
1820,
|
||||||
|
3385,
|
||||||
|
1603
|
||||||
|
],
|
||||||
|
"color": "#3f789e",
|
||||||
|
"font_size": 24,
|
||||||
|
"flags": {}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"config": {},
|
||||||
|
"extra": {
|
||||||
|
"ds": {
|
||||||
|
"scale": 0.3797498335832498,
|
||||||
|
"offset": [
|
||||||
|
590.2813176508071,
|
||||||
|
-1550.4283526080162
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"version": 0.4
|
||||||
|
}
|
||||||
76
nodes.py
76
nodes.py
@ -1538,6 +1538,9 @@ class CogVideoXFunVid2VidSampler:
|
|||||||
"denoise_strength": ("FLOAT", {"default": 0.70, "min": 0.05, "max": 1.00, "step": 0.01}),
|
"denoise_strength": ("FLOAT", {"default": 0.70, "min": 0.05, "max": 1.00, "step": 0.01}),
|
||||||
"validation_video": ("IMAGE",),
|
"validation_video": ("IMAGE",),
|
||||||
},
|
},
|
||||||
|
"optional": {
|
||||||
|
"context_options": ("COGCONTEXT", ),
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
RETURN_TYPES = ("COGVIDEOPIPE", "LATENT",)
|
RETURN_TYPES = ("COGVIDEOPIPE", "LATENT",)
|
||||||
@ -1545,8 +1548,7 @@ class CogVideoXFunVid2VidSampler:
|
|||||||
FUNCTION = "process"
|
FUNCTION = "process"
|
||||||
CATEGORY = "CogVideoWrapper"
|
CATEGORY = "CogVideoWrapper"
|
||||||
|
|
||||||
def process(self, pipeline, positive, negative, video_length, base_resolution, seed, steps, cfg, denoise_strength, scheduler,
|
def process(self, pipeline, positive, negative, video_length, base_resolution, seed, steps, cfg, denoise_strength, scheduler, validation_video, context_options=None):
|
||||||
validation_video):
|
|
||||||
device = mm.get_torch_device()
|
device = mm.get_torch_device()
|
||||||
offload_device = mm.unet_offload_device()
|
offload_device = mm.unet_offload_device()
|
||||||
pipe = pipeline["pipe"]
|
pipe = pipeline["pipe"]
|
||||||
@ -1562,15 +1564,20 @@ class CogVideoXFunVid2VidSampler:
|
|||||||
mm.soft_empty_cache()
|
mm.soft_empty_cache()
|
||||||
|
|
||||||
# Count most suitable height and width
|
# Count most suitable height and width
|
||||||
aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
|
aspect_ratio_sample_size = {key: [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
|
||||||
|
|
||||||
validation_video = np.array(validation_video.cpu().numpy() * 255, np.uint8)
|
validation_video = np.array(validation_video.cpu().numpy() * 255, np.uint8)
|
||||||
original_width, original_height = Image.fromarray(validation_video[0]).size
|
original_width, original_height = Image.fromarray(validation_video[0]).size
|
||||||
|
|
||||||
closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
|
closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
|
||||||
height, width = [int(x / 16) * 16 for x in closest_size]
|
height, width = [int(x / 16) * 16 for x in closest_size]
|
||||||
|
log.info(f"Closest bucket size: {width}x{height}")
|
||||||
|
|
||||||
|
# Handle context_options and adjust scheduler if needed
|
||||||
|
if context_options is not None and context_options["context_schedule"] == "temporal_tiling":
|
||||||
|
logging.info("Temporal tiling enabled, changing scheduler to CogVideoXDDIM")
|
||||||
|
scheduler = "CogVideoXDDIM"
|
||||||
|
|
||||||
# Load Sampler
|
|
||||||
scheduler_config = pipeline["scheduler_config"]
|
scheduler_config = pipeline["scheduler_config"]
|
||||||
if scheduler in scheduler_mapping:
|
if scheduler in scheduler_mapping:
|
||||||
noise_scheduler = scheduler_mapping[scheduler].from_config(scheduler_config)
|
noise_scheduler = scheduler_mapping[scheduler].from_config(scheduler_config)
|
||||||
@ -1578,7 +1585,15 @@ class CogVideoXFunVid2VidSampler:
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown scheduler: {scheduler}")
|
raise ValueError(f"Unknown scheduler: {scheduler}")
|
||||||
|
|
||||||
generator = torch.Generator(device=torch.device("cpu")).manual_seed(seed)
|
if context_options is not None:
|
||||||
|
context_frames = context_options["context_frames"] // 4
|
||||||
|
context_stride = context_options["context_stride"] // 4
|
||||||
|
context_overlap = context_options["context_overlap"] // 4
|
||||||
|
else:
|
||||||
|
context_frames, context_stride, context_overlap = None, None, None
|
||||||
|
|
||||||
|
# Create generator on the correct device
|
||||||
|
generator = torch.Generator(device=device).manual_seed(seed)
|
||||||
|
|
||||||
autocastcondition = not pipeline["onediff"] or not dtype == torch.float32
|
autocastcondition = not pipeline["onediff"] or not dtype == torch.float32
|
||||||
autocast_context = torch.autocast(mm.get_autocast_device(device)) if autocastcondition else nullcontext()
|
autocast_context = torch.autocast(mm.get_autocast_device(device)) if autocastcondition else nullcontext()
|
||||||
@ -1586,12 +1601,33 @@ class CogVideoXFunVid2VidSampler:
|
|||||||
video_length = int((video_length - 1) // pipe.vae.config.temporal_compression_ratio * pipe.vae.config.temporal_compression_ratio) + 1 if video_length != 1 else 1
|
video_length = int((video_length - 1) // pipe.vae.config.temporal_compression_ratio * pipe.vae.config.temporal_compression_ratio) + 1 if video_length != 1 else 1
|
||||||
input_video, input_video_mask, clip_image = get_video_to_video_latent(validation_video, video_length=video_length, sample_size=(height, width))
|
input_video, input_video_mask, clip_image = get_video_to_video_latent(validation_video, video_length=video_length, sample_size=(height, width))
|
||||||
|
|
||||||
# for _lora_path, _lora_weight in zip(cogvideoxfun_model.get("loras", []), cogvideoxfun_model.get("strength_model", [])):
|
# Move tensors to the correct device
|
||||||
# pipeline = merge_lora(pipeline, _lora_path, _lora_weight)
|
input_video = input_video.to(device)
|
||||||
|
input_video_mask = input_video_mask.to(device)
|
||||||
|
if clip_image is not None:
|
||||||
|
clip_image = clip_image.to(device)
|
||||||
|
|
||||||
|
# Move positive and negative prompts to the correct device
|
||||||
|
positive = positive.to(dtype).to(device)
|
||||||
|
negative = negative.to(dtype).to(device)
|
||||||
|
|
||||||
|
# Add print statements to debug devices
|
||||||
|
print(f"Device Information:")
|
||||||
|
print(f" device: {device}")
|
||||||
|
print(f" positive.device: {positive.device}")
|
||||||
|
print(f" negative.device: {negative.device}")
|
||||||
|
print(f" input_video.device: {input_video.device}")
|
||||||
|
print(f" input_video_mask.device: {input_video_mask.device}")
|
||||||
|
if clip_image is not None:
|
||||||
|
print(f" clip_image.device: {clip_image.device}")
|
||||||
|
print(f" generator device: {generator.device}")
|
||||||
|
print(f" pipe.device: {device}")
|
||||||
|
print(f" pipe.vae.device: {next(pipe.vae.parameters()).device}")
|
||||||
|
print(f" pipe.transformer.device: {next(pipe.transformer.parameters()).device}")
|
||||||
|
|
||||||
common_params = {
|
common_params = {
|
||||||
"prompt_embeds": positive.to(dtype).to(device),
|
"prompt_embeds": positive,
|
||||||
"negative_prompt_embeds": negative.to(dtype).to(device),
|
"negative_prompt_embeds": negative,
|
||||||
"num_frames": video_length,
|
"num_frames": video_length,
|
||||||
"height": height,
|
"height": height,
|
||||||
"width": width,
|
"width": width,
|
||||||
@ -1605,25 +1641,15 @@ class CogVideoXFunVid2VidSampler:
|
|||||||
**common_params,
|
**common_params,
|
||||||
video=input_video,
|
video=input_video,
|
||||||
mask_video=input_video_mask,
|
mask_video=input_video_mask,
|
||||||
strength=float(denoise_strength)
|
strength=float(denoise_strength),
|
||||||
|
context_schedule=context_options["context_schedule"] if context_options is not None else None,
|
||||||
|
context_frames=context_frames,
|
||||||
|
context_stride=context_stride,
|
||||||
|
context_overlap=context_overlap,
|
||||||
|
freenoise=context_options["freenoise"] if context_options is not None else None
|
||||||
)
|
)
|
||||||
|
|
||||||
# for _lora_path, _lora_weight in zip(cogvideoxfun_model.get("loras", []), cogvideoxfun_model.get("strength_model", [])):
|
|
||||||
# pipeline = unmerge_lora(pipeline, _lora_path, _lora_weight)
|
|
||||||
return (pipeline, {"samples": latents})
|
return (pipeline, {"samples": latents})
|
||||||
|
|
||||||
def add_noise_to_reference_video(image, ratio=None):
|
|
||||||
if ratio is None:
|
|
||||||
sigma = torch.normal(mean=-3.0, std=0.5, size=(image.shape[0],)).to(image.device)
|
|
||||||
sigma = torch.exp(sigma).to(image.dtype)
|
|
||||||
else:
|
|
||||||
sigma = torch.ones((image.shape[0],)).to(image.device, image.dtype) * ratio
|
|
||||||
|
|
||||||
image_noise = torch.randn_like(image) * sigma[:, None, None, None, None]
|
|
||||||
image_noise = torch.where(image==-1, torch.zeros_like(image), image_noise)
|
|
||||||
image = image + image_noise
|
|
||||||
return image
|
|
||||||
|
|
||||||
class CogVideoControlImageEncode:
|
class CogVideoControlImageEncode:
|
||||||
@classmethod
|
@classmethod
|
||||||
def INPUT_TYPES(s):
|
def INPUT_TYPES(s):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user