From ed167ecfffaf13415e885745506fbcb9844e3c96 Mon Sep 17 00:00:00 2001
From: kijai <40791699+kijai@users.noreply.github.com>
Date: Sun, 10 Nov 2024 19:27:40 +0200
Subject: [PATCH] allow scheduling cfg

---
 nodes.py              | 12 ++++++++++--
 pipeline_cogvideox.py | 12 ++++++------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/nodes.py b/nodes.py
index 7fa9606..2ac987a 100644
--- a/nodes.py
+++ b/nodes.py
@@ -349,6 +349,7 @@ class CogVideoImageEncode:
                 "chunk_size": ("INT", {"default": 16, "min": 4}),
                 "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}),
                 "mask": ("MASK", ),
+                "noise_aug_strength": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001, "tooltip": "Augment image with noise"}),
             },
         }
 
@@ -357,7 +358,7 @@ class CogVideoImageEncode:
     FUNCTION = "encode"
     CATEGORY = "CogVideoWrapper"
 
-    def encode(self, pipeline, image, chunk_size=8, enable_tiling=False, mask=None):
+    def encode(self, pipeline, image, chunk_size=8, enable_tiling=False, mask=None, noise_aug_strength=0.0):
         device = mm.get_torch_device()
         offload_device = mm.unet_offload_device()
         generator = torch.Generator(device=device).manual_seed(0)
@@ -395,6 +396,8 @@ class CogVideoImageEncode:
         input_image = input_image.to(vae.dtype).to(device)
         input_image = input_image.unsqueeze(0).permute(0, 4, 1, 2, 3) # B, C, T, H, W
         B, C, T, H, W = input_image.shape
+        if noise_aug_strength > 0:
+            input_image = add_noise_to_reference_video(input_image, ratio=noise_aug_strength)
 
         latents_list = []
         # Loop through the temporal dimension in chunks of 16
@@ -786,7 +789,7 @@ class CogVideoSampler:
                 "negative": ("CONDITIONING", ),
                 "height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 16}),
                 "width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 16}),
-                "num_frames": ("INT", {"default": 48, "min": 16, "max": 1024, "step": 1}),
+                "num_frames": ("INT", {"default": 49, "min": 17, "max": 1024, "step": 4}),
                 "steps": ("INT", {"default": 50, "min": 1}),
                 "cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
                 "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
@@ -868,6 +871,11 @@ class CogVideoSampler:
             pipe.transformer.use_fastercache = False
             pipe.transformer.fastercache_counter = 0
 
+        if not isinstance(cfg, list):
+            cfg = [cfg for _ in range(steps)]
+        else:
+            assert len(cfg) == steps, "Length of cfg list must match number of steps"
+
         autocastcondition = not pipeline["onediff"] or not dtype == torch.float32
         autocast_context = torch.autocast(mm.get_autocast_device(device), dtype=dtype) if autocastcondition else nullcontext()
         with autocast_context:
diff --git a/pipeline_cogvideox.py b/pipeline_cogvideox.py
index 22bb8e7..a9353b2 100644
--- a/pipeline_cogvideox.py
+++ b/pipeline_cogvideox.py
@@ -451,7 +451,7 @@ class CogVideoXPipeline(VideoSysPipeline, CogVideoXLoraLoaderMixin):
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
+        do_classifier_free_guidance = guidance_scale[0] > 1.0
         if do_classifier_free_guidance:
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
 
@@ -660,7 +660,7 @@ class CogVideoXPipeline(VideoSysPipeline, CogVideoXLoraLoaderMixin):
 
                         if self.do_classifier_free_guidance:
                             noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                            noise_pred = noise_pred_uncond + self._guidance_scale * (noise_pred_text - noise_pred_uncond)
+                            noise_pred = noise_pred_uncond + self._guidance_scale[i] * (noise_pred_text - noise_pred_uncond)
 
                         # compute the previous noisy sample x_t -> x_t-1
                         latents_tile = self.scheduler.step(noise_pred, t, latents_tile.to(self.vae.dtype), **extra_step_kwargs, return_dict=False)[0]
@@ -801,7 +801,7 @@ class CogVideoXPipeline(VideoSysPipeline, CogVideoXLoraLoaderMixin):
                     noise_pred /= counter
                     if do_classifier_free_guidance:
                         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                        noise_pred = noise_pred_uncond + self._guidance_scale * (noise_pred_text - noise_pred_uncond)
+                        noise_pred = noise_pred_uncond + self._guidance_scale[i] * (noise_pred_text - noise_pred_uncond)
 
                     # compute the previous noisy sample x_t -> x_t-1
                     if not isinstance(self.scheduler, CogVideoXDPMScheduler):
@@ -865,15 +865,15 @@ class CogVideoXPipeline(VideoSysPipeline, CogVideoXLoraLoaderMixin):
                             video_flow_features=video_flow_features if (tora is not None and tora["start_percent"] <= current_step_percentage <= tora["end_percent"]) else None,
                         )[0]
                     noise_pred = noise_pred.float()
-
+                    print(self._guidance_scale[i])
                     if isinstance(self.scheduler, CogVideoXDPMScheduler):
-                        self._guidance_scale = 1 + guidance_scale * (
+                        self._guidance_scale[i] = 1 + guidance_scale[i] * (
                             (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
                         )
 
                     if do_classifier_free_guidance:
                         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                        noise_pred = noise_pred_uncond + self._guidance_scale * (noise_pred_text - noise_pred_uncond)
+                        noise_pred = noise_pred_uncond + self._guidance_scale[i] * (noise_pred_text - noise_pred_uncond)
 
                     # compute the previous noisy sample x_t -> x_t-1
                     if not isinstance(self.scheduler, CogVideoXDPMScheduler):
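
With this change, the `cfg` input to CogVideoSampler accepts either a single float, which the sampler expands to a constant list of length `steps`, or a list with one guidance value per step, which the pipeline then indexes as `self._guidance_scale[i]`. A minimal sketch of building such a schedule on the caller side; the `cosine_ramp` helper is an illustrative assumption, not part of this patch:

    import math

    steps = 50

    # Constant schedule: equivalent to passing cfg=6.0 directly.
    cfg_constant = [6.0] * steps

    # Illustrative helper (not in the patch): ramp guidance from `start`
    # down to `end` with a half-cosine, so early steps are guided strongly
    # and late steps less so.
    def cosine_ramp(start, end, n):
        return [
            end + (start - end) * (1 + math.cos(math.pi * i / max(n - 1, 1))) / 2
            for i in range(n)
        ]

    cfg_schedule = cosine_ramp(8.0, 4.0, steps)
    assert len(cfg_schedule) == steps  # the patched sampler asserts the same

    # Per step i, the pipeline then applies:
    # noise_pred = noise_pred_uncond + cfg_schedule[i] * (noise_pred_text - noise_pred_uncond)

Passing a plain float preserves the previous behaviour, since it is expanded before the pipeline ever indexes it; note that `guidance_scale[0] > 1.0` means classifier-free guidance is enabled or disabled based on the first step's value alone.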
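
The image-encode path can now perturb the reference frames before VAE encoding. `add_noise_to_reference_video` is imported from elsewhere in the repository and is not shown in this patch; a minimal sketch of a helper with the same interface, assuming it simply adds Gaussian noise scaled by `ratio` to the (B, C, T, H, W) input:

    import torch

    def add_noise_to_reference_video(image, ratio):
        # One noise scale per batch element; here just the constant `ratio`
        # (assumed behaviour; the real helper may shape this differently).
        sigma = torch.full((image.shape[0],), ratio, dtype=image.dtype, device=image.device)
        noise = torch.randn_like(image) * sigma[:, None, None, None, None]
        return image + noise

At the default `noise_aug_strength` of 0.0 the branch is skipped entirely, so existing workflows are unaffected.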