Modify CogVideoXfun pipeline to fix Vid2Vid sampler issues

This commit is contained in:
wangxi 2024-10-26 07:38:14 +08:00
parent 25f16462aa
commit c2950dfb47
3 changed files with 756 additions and 116 deletions

View File

@@ -262,111 +262,111 @@ class CogVideoX_Fun_Pipeline_Inpaint(VideoSysPipeline):
         set_pab_manager(pab_config)

    def prepare_latents(
        self,
        batch_size,
        num_channels_latents,
        height,
        width,
        video_length,
        dtype,
        device,
        generator,
        latents=None,
        video=None,
        timestep=None,
        is_strength_max=True,
        return_noise=False,
        return_video_latents=False,
        context_size=None,
        context_overlap=None,
        freenoise=False,
    ):
        shape = (
            batch_size,
            (video_length - 1) // self.vae_scale_factor_temporal + 1,
            num_channels_latents,
            height // self.vae_scale_factor_spatial,
            width // self.vae_scale_factor_spatial,
        )
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if return_video_latents or (latents is None and not is_strength_max):
            video = video.to(device=device, dtype=self.vae.dtype)
            bs = 1
            new_video = []
            for i in range(0, video.shape[0], bs):
                video_bs = video[i : i + bs]
                video_bs = self.vae.encode(video_bs)[0]
                video_bs = video_bs.sample()
                new_video.append(video_bs)
            video = torch.cat(new_video, dim=0)
            video = video * self.vae.config.scaling_factor

            video_latents = video.repeat(batch_size // video.shape[0], 1, 1, 1, 1)
            video_latents = video_latents.to(device=device, dtype=dtype)
            video_latents = rearrange(video_latents, "b c f h w -> b f c h w")

        if latents is None:
-           noise = randn_tensor(shape, generator=generator, device=torch.device("cpu"), dtype=dtype)
+           noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
            if freenoise:
                print("Applying FreeNoise")
                # code and comments from AnimateDiff-Evolved by Kosinkadink (https://github.com/Kosinkadink/ComfyUI-AnimateDiff-Evolved)
-               video_length = video_length // 4
+               video_length_adjusted = video_length // 4
                delta = context_size - context_overlap
-               for start_idx in range(0, video_length-context_size, delta):
+               for start_idx in range(0, video_length_adjusted - context_size, delta):
                    # start_idx corresponds to the beginning of a context window
                    # goal: place shuffled in the delta region right after the end of the context window
                    # if space after context window is not enough to place the noise, adjust and finish
                    place_idx = start_idx + context_size
                    # if place_idx is outside the valid indexes, we are already finished
-                   if place_idx >= video_length:
+                   if place_idx >= video_length_adjusted:
                        break
                    end_idx = place_idx - 1
                    #print("video_length:", video_length, "start_idx:", start_idx, "end_idx:", end_idx, "place_idx:", place_idx, "delta:", delta)
                    # if there is not enough room to copy delta amount of indexes, copy limited amount and finish
-                   if end_idx + delta >= video_length:
-                       final_delta = video_length - place_idx
+                   if end_idx + delta >= video_length_adjusted:
+                       final_delta = video_length_adjusted - place_idx
                        # generate list of indexes in final delta region
-                       list_idx = torch.tensor(list(range(start_idx,start_idx+final_delta)), device=torch.device("cpu"), dtype=torch.long)
+                       list_idx = torch.tensor(list(range(start_idx, start_idx + final_delta)), device=device, dtype=torch.long)
                        # shuffle list
                        list_idx = list_idx[torch.randperm(final_delta, generator=generator)]
                        # apply shuffled indexes
                        noise[:, place_idx:place_idx + final_delta, :, :, :] = noise[:, list_idx, :, :, :]
                        break
                    # otherwise, do normal behavior
                    # generate list of indexes in delta region
-                   list_idx = torch.tensor(list(range(start_idx,start_idx+delta)), device=torch.device("cpu"), dtype=torch.long)
+                   list_idx = torch.tensor(list(range(start_idx, start_idx + delta)), device=device, dtype=torch.long)
                    # shuffle list
-                   list_idx = list_idx[torch.randperm(final_delta, generator=generator)]
+                   list_idx = list_idx[torch.randperm(delta, generator=generator)]
                    # apply shuffled indexes
                    #print("place_idx:", place_idx, "delta:", delta, "list_idx:", list_idx)
                    noise[:, place_idx:place_idx + delta, :, :, :] = noise[:, list_idx, :, :, :]

            # if strength is 1. then initialise the latents to noise, else initial to image + noise
            latents = noise if is_strength_max else self.scheduler.add_noise(video_latents, noise, timestep)
            # if pure noise then scale the initial latents by the Scheduler's init sigma
            latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
            latents = latents.to(device)
        else:
            noise = latents.to(device)
            # scale the initial noise by the standard deviation required by the scheduler
            latents = noise * self.scheduler.init_noise_sigma

        outputs = (latents,)

        if return_noise:
            outputs += (noise,)

        if return_video_latents:
            outputs += (video_latents,)

        return outputs

    def prepare_mask_latents(
        self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance, noise_aug_strength
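
As an aside, the FreeNoise behavior patched above is easier to see in isolation: each context window's trailing delta region is overwritten with a shuffled copy of frames from that window's start, so overlapping windows sample correlated noise. A minimal, self-contained sketch with assumed toy dimensions (not code from this commit):

import torch

# Toy sizes (assumptions for illustration): 13 latent frames, windows of 6 with overlap 2.
latent_frames, context_size, context_overlap = 13, 6, 2
noise = torch.randn(1, latent_frames, 4, 8, 8)  # (batch, frames, channels, height, width)
generator = torch.Generator().manual_seed(0)

delta = context_size - context_overlap
for start_idx in range(0, latent_frames - context_size, delta):
    place_idx = start_idx + context_size
    if place_idx >= latent_frames:
        break
    # copy at most `delta` shuffled frames from the window start into the gap after it
    span = min(delta, latent_frames - place_idx)
    idx = torch.arange(start_idx, start_idx + span)
    idx = idx[torch.randperm(span, generator=generator)]
    noise[:, place_idx:place_idx + span] = noise[:, idx]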

View File

@ -0,0 +1,614 @@
{
"last_node_id": 224,
"last_link_id": 333,
"nodes": [
{
"id": 216,
"type": "CogVideoTextEncode",
"pos": {
"0": 1320,
"1": 2224
},
"size": [
400,
200
],
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 315
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
314
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"",
1,
true
]
},
{
"id": 217,
"type": "CLIPLoader",
"pos": {
"0": 937,
"1": 2245
},
"size": {
"0": 315,
"1": 82
},
"flags": {},
"order": 0,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "CLIP",
"type": "CLIP",
"links": [
315,
317
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CLIPLoader"
},
"widgets_values": [
"t5xxl_fp8_e4m3fn.safetensors",
"sd3"
]
},
{
"id": 206,
"type": "VHS_VideoCombine",
"pos": {
"0": 2648,
"1": 2268
},
"size": [
595.7279663085938,
652.9874093191964
],
"flags": {
"collapsed": false
},
"order": 8,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 333,
"label": "images"
},
{
"name": "audio",
"type": "AUDIO",
"link": null,
"label": "audio"
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null,
"label": "meta_batch"
},
{
"name": "vae",
"type": "VAE",
"link": null,
"label": "vae"
}
],
"outputs": [
{
"name": "Filenames",
"type": "VHS_FILENAMES",
"links": null,
"shape": 3,
"label": "Filenames"
}
],
"properties": {
"Node name for S&R": "VHS_VideoCombine",
"ttNbgOverride": {
"color": "#223",
"bgcolor": "#335",
"groupcolor": "#88A"
}
},
"widgets_values": {
"frame_rate": 24,
"loop_count": 0,
"filename_prefix": "1009/",
"format": "video/h264-mp4",
"pix_fmt": "yuv420p",
"crf": 19,
"save_metadata": false,
"pingpong": false,
"save_output": true,
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "1009_00373.mp4",
"subfolder": "",
"type": "output",
"format": "video/h264-mp4",
"frame_rate": 24
},
"muted": false
}
},
"color": "#223",
"bgcolor": "#335"
},
{
"id": 220,
"type": "CogVideoDecode",
"pos": {
"0": 2242.513671875,
"1": 2250.128662109375
},
"size": {
"0": 315,
"1": 198
},
"flags": {},
"order": 7,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 320
},
{
"name": "samples",
"type": "LATENT",
"link": 321
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
333
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
},
"widgets_values": [
false,
240,
360,
0.2,
0.2,
true
]
},
{
"id": 218,
"type": "CogVideoTextEncode",
"pos": {
"0": 1318,
"1": 2492
},
"size": {
"0": 400,
"1": 200
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 317
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
318
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"The video is not of a high quality, it has a low resolution. Strange motion trajectory, a poor composition and deformed video, low resolution, duplicate and ugly, strange body structure, long and strange neck, bad teeth, bad eyes, bad limbs, bad hands, blurry camera, shaking camera. Deformation, blurry, ugly, distortion. ",
1,
true
]
},
{
"id": 215,
"type": "DownloadAndLoadCogVideoModel",
"pos": {
"0": 1388,
"1": 1986
},
"size": {
"0": 315,
"1": 194
},
"flags": {},
"order": 1,
"mode": 0,
"inputs": [
{
"name": "pab_config",
"type": "PAB_CONFIG",
"link": null
},
{
"name": "block_edit",
"type": "TRANSFORMERBLOCKS",
"link": null
},
{
"name": "lora",
"type": "COGLORA",
"link": null
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
313
],
"shape": 3
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoModel"
},
"widgets_values": [
"kijai/CogVideoX-Fun-5b",
"bf16",
"disabled",
"disabled",
false
]
},
{
"id": 214,
"type": "CogVideoXFunVid2VidSampler",
"pos": {
"0": 1823,
"1": 2249
},
"size": {
"0": 380.4000244140625,
"1": 306
},
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 313
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 314
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 318
},
{
"name": "validation_video",
"type": "IMAGE",
"link": 332
},
{
"name": "context_options",
"type": "COGCONTEXT",
"link": 327
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
320
],
"slot_index": 0,
"shape": 3
},
{
"name": "samples",
"type": "LATENT",
"links": [
321
],
"slot_index": 1,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoXFunVid2VidSampler"
},
"widgets_values": [
49,
768,
15861381752719,
"fixed",
30,
7,
"DDIM",
0.5
]
},
{
"id": 222,
"type": "CogVideoContextOptions",
"pos": {
"0": 1379,
"1": 2771
},
"size": [
315,
154
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "context_options",
"type": "COGCONTEXT",
"links": [
327
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoContextOptions"
},
"widgets_values": [
"uniform_standard",
48,
4,
4,
true
]
},
{
"id": 209,
"type": "VHS_LoadVideo",
"pos": {
"0": 286,
"1": 2110
},
"size": [
426.63287353515625,
508.3616420200893
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
},
{
"name": "vae",
"type": "VAE",
"link": null
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
332
],
"slot_index": 0,
"shape": 3
},
{
"name": "frame_count",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "audio",
"type": "AUDIO",
"links": null,
"shape": 3
},
{
"name": "video_info",
"type": "VHS_VIDEOINFO",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_LoadVideo"
},
"widgets_values": {
"video": "CogVideoX-Fun_00006.mp4",
"force_rate": 0,
"force_size": "Disabled",
"custom_width": 512,
"custom_height": 512,
"frame_load_cap": 0,
"skip_first_frames": 0,
"select_every_nth": 3,
"choose video to upload": "image",
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"frame_load_cap": 0,
"skip_first_frames": 0,
"force_rate": 0,
"filename": "CogVideoX-Fun_00006.mp4",
"type": "input",
"format": "video/mp4",
"select_every_nth": 3
},
"muted": false
}
},
"color": "#223",
"bgcolor": "#335"
}
],
"links": [
[
313,
215,
0,
214,
0,
"COGVIDEOPIPE"
],
[
314,
216,
0,
214,
1,
"CONDITIONING"
],
[
315,
217,
0,
216,
0,
"CLIP"
],
[
317,
217,
0,
218,
0,
"CLIP"
],
[
318,
218,
0,
214,
2,
"CONDITIONING"
],
[
320,
214,
0,
220,
0,
"COGVIDEOPIPE"
],
[
321,
214,
1,
220,
1,
"LATENT"
],
[
327,
222,
0,
214,
4,
"COGCONTEXT"
],
[
332,
209,
0,
214,
3,
"IMAGE"
],
[
333,
220,
0,
206,
0,
"IMAGE"
]
],
"groups": [
{
"title": "cogvideoxfun v2v sampler",
"bounding": [
78,
1820,
3385,
1603
],
"color": "#3f789e",
"font_size": 24,
"flags": {}
}
],
"config": {},
"extra": {
"ds": {
"scale": 0.3797498335832498,
"offset": [
590.2813176508071,
-1550.4283526080162
]
}
},
"version": 0.4
}
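
For convenience, the workflow graph above can be inspected outside ComfyUI with plain Python. This is an illustrative sketch, not part of the commit; the filename is assumed:

import json

# Filename is an assumption; save the workflow JSON above under any name.
with open("cogvideoxfun_v2v_workflow.json") as f:
    graph = json.load(f)

# Each node: id and type; each link: [id, from_node, from_slot, to_node, to_slot, type].
for node in graph["nodes"]:
    print(f'node {node["id"]}: {node["type"]}')
for link_id, src, src_slot, dst, dst_slot, ltype in graph["links"]:
    print(f"link {link_id}: {src}[{src_slot}] -> {dst}[{dst_slot}] ({ltype})")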

View File

@@ -1538,6 +1538,9 @@ class CogVideoXFunVid2VidSampler:
                "denoise_strength": ("FLOAT", {"default": 0.70, "min": 0.05, "max": 1.00, "step": 0.01}),
                "validation_video": ("IMAGE",),
            },
+           "optional": {
+               "context_options": ("COGCONTEXT", ),
+           },
        }

    RETURN_TYPES = ("COGVIDEOPIPE", "LATENT",)
@@ -1545,8 +1548,7 @@ class CogVideoXFunVid2VidSampler:
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"

-   def process(self, pipeline, positive, negative, video_length, base_resolution, seed, steps, cfg, denoise_strength, scheduler,
-               validation_video):
+   def process(self, pipeline, positive, negative, video_length, base_resolution, seed, steps, cfg, denoise_strength, scheduler, validation_video, context_options=None):
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
        pipe = pipeline["pipe"]
@@ -1562,15 +1564,20 @@ class CogVideoXFunVid2VidSampler:
        mm.soft_empty_cache()

        # Count most suitable height and width
-       aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
+       aspect_ratio_sample_size = {key: [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
        validation_video = np.array(validation_video.cpu().numpy() * 255, np.uint8)
        original_width, original_height = Image.fromarray(validation_video[0]).size
        closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
        height, width = [int(x / 16) * 16 for x in closest_size]
        log.info(f"Closest bucket size: {width}x{height}")

+       # Handle context_options and adjust scheduler if needed
+       if context_options is not None and context_options["context_schedule"] == "temporal_tiling":
+           logging.info("Temporal tiling enabled, changing scheduler to CogVideoXDDIM")
+           scheduler = "CogVideoXDDIM"
+
        # Load Sampler
        scheduler_config = pipeline["scheduler_config"]
        if scheduler in scheduler_mapping:
            noise_scheduler = scheduler_mapping[scheduler].from_config(scheduler_config)
@ -1578,7 +1585,15 @@ class CogVideoXFunVid2VidSampler:
else:
raise ValueError(f"Unknown scheduler: {scheduler}")
generator = torch.Generator(device=torch.device("cpu")).manual_seed(seed)
if context_options is not None:
context_frames = context_options["context_frames"] // 4
context_stride = context_options["context_stride"] // 4
context_overlap = context_options["context_overlap"] // 4
else:
context_frames, context_stride, context_overlap = None, None, None
# Create generator on the correct device
generator = torch.Generator(device=device).manual_seed(seed)
autocastcondition = not pipeline["onediff"] or not dtype == torch.float32
autocast_context = torch.autocast(mm.get_autocast_device(device)) if autocastcondition else nullcontext()
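
A note on the // 4 divisions added above: CogVideoX's VAE compresses the temporal axis by a factor of 4 (pipe.vae.config.temporal_compression_ratio), so context window sizes given in pixel-space frames must be mapped to latent frames before sampling. A quick check under that assumption:

# Assumed: CogVideoX's VAE temporal compression is 4 (pipe.vae.config.temporal_compression_ratio).
temporal_compression = 4
context_frames_px = 48                            # "context_frames" in the workflow above
print(context_frames_px // temporal_compression)  # -> 12 latent frames per context window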
@@ -1586,12 +1601,33 @@ class CogVideoXFunVid2VidSampler:
            video_length = int((video_length - 1) // pipe.vae.config.temporal_compression_ratio * pipe.vae.config.temporal_compression_ratio) + 1 if video_length != 1 else 1
            input_video, input_video_mask, clip_image = get_video_to_video_latent(validation_video, video_length=video_length, sample_size=(height, width))

-           # for _lora_path, _lora_weight in zip(cogvideoxfun_model.get("loras", []), cogvideoxfun_model.get("strength_model", [])):
-           #     pipeline = merge_lora(pipeline, _lora_path, _lora_weight)
+           # Move tensors to the correct device
+           input_video = input_video.to(device)
+           input_video_mask = input_video_mask.to(device)
+           if clip_image is not None:
+               clip_image = clip_image.to(device)
+
+           # Move positive and negative prompts to the correct device
+           positive = positive.to(dtype).to(device)
+           negative = negative.to(dtype).to(device)
+
+           # Add print statements to debug devices
+           print(f"Device Information:")
+           print(f"  device: {device}")
+           print(f"  positive.device: {positive.device}")
+           print(f"  negative.device: {negative.device}")
+           print(f"  input_video.device: {input_video.device}")
+           print(f"  input_video_mask.device: {input_video_mask.device}")
+           if clip_image is not None:
+               print(f"  clip_image.device: {clip_image.device}")
+           print(f"  generator device: {generator.device}")
+           print(f"  pipe.device: {device}")
+           print(f"  pipe.vae.device: {next(pipe.vae.parameters()).device}")
+           print(f"  pipe.transformer.device: {next(pipe.transformer.parameters()).device}")

            common_params = {
-               "prompt_embeds": positive.to(dtype).to(device),
-               "negative_prompt_embeds": negative.to(dtype).to(device),
+               "prompt_embeds": positive,
+               "negative_prompt_embeds": negative,
                "num_frames": video_length,
                "height": height,
                "width": width,
@@ -1605,25 +1641,15 @@ class CogVideoXFunVid2VidSampler:
                **common_params,
                video=input_video,
                mask_video=input_video_mask,
-               strength=float(denoise_strength)
+               strength=float(denoise_strength),
+               context_schedule=context_options["context_schedule"] if context_options is not None else None,
+               context_frames=context_frames,
+               context_stride=context_stride,
+               context_overlap=context_overlap,
+               freenoise=context_options["freenoise"] if context_options is not None else None
            )

-           # for _lora_path, _lora_weight in zip(cogvideoxfun_model.get("loras", []), cogvideoxfun_model.get("strength_model", [])):
-           #     pipeline = unmerge_lora(pipeline, _lora_path, _lora_weight)
-
        return (pipeline, {"samples": latents})

-def add_noise_to_reference_video(image, ratio=None):
-   if ratio is None:
-       sigma = torch.normal(mean=-3.0, std=0.5, size=(image.shape[0],)).to(image.device)
-       sigma = torch.exp(sigma).to(image.dtype)
-   else:
-       sigma = torch.ones((image.shape[0],)).to(image.device, image.dtype) * ratio
-
-   image_noise = torch.randn_like(image) * sigma[:, None, None, None, None]
-   image_noise = torch.where(image==-1, torch.zeros_like(image), image_noise)
-   image = image + image_noise
-   return image
-
class CogVideoControlImageEncode:
    @classmethod
    def INPUT_TYPES(s):
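
For orientation, the context_options dict that process() now accepts carries the five keys read in the hunks above, produced by the CogVideoContextOptions node. A hedged sketch of its shape, with values mirroring the example workflow:

# Illustrative only; keys match what process() reads above, values mirror the example workflow.
context_options = {
    "context_schedule": "uniform_standard",  # "temporal_tiling" would force the CogVideoXDDIM scheduler
    "context_frames": 48,                    # window length, in pixel-space frames
    "context_stride": 4,
    "context_overlap": 4,
    "freenoise": True,
}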