Mirror of https://git.datalinker.icu/kijai/ComfyUI-CogVideoXWrapper.git (synced 2025-12-08 20:34:23 +08:00)
allow scheduling cfg
commit ed167ecfff
parent 87ed4a56cf

nodes.py
@@ -349,6 +349,7 @@ class CogVideoImageEncode:
                 "chunk_size": ("INT", {"default": 16, "min": 4}),
                 "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}),
                 "mask": ("MASK", ),
+                "noise_aug_strength": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001, "tooltip": "Augment image with noise"}),
             },
         }
@@ -357,7 +358,7 @@ class CogVideoImageEncode:
     FUNCTION = "encode"
     CATEGORY = "CogVideoWrapper"
 
-    def encode(self, pipeline, image, chunk_size=8, enable_tiling=False, mask=None):
+    def encode(self, pipeline, image, chunk_size=8, enable_tiling=False, mask=None, noise_aug_strength=0.0):
         device = mm.get_torch_device()
         offload_device = mm.unet_offload_device()
         generator = torch.Generator(device=device).manual_seed(0)
@@ -395,6 +396,8 @@ class CogVideoImageEncode:
         input_image = input_image.to(vae.dtype).to(device)
         input_image = input_image.unsqueeze(0).permute(0, 4, 1, 2, 3) # B, C, T, H, W
         B, C, T, H, W = input_image.shape
+        if noise_aug_strength > 0:
+            input_image = add_noise_to_reference_video(input_image, ratio=noise_aug_strength)
 
         latents_list = []
         # Loop through the temporal dimension in chunks of 16
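The new noise_aug_strength path routes the stacked frames through add_noise_to_reference_video before encoding. As a hedged sketch of what such a helper typically does (the repo's actual implementation may differ, e.g. by masking padded frames), it adds Gaussian noise scaled per batch element:

    import torch

    def add_noise_to_reference_video(image: torch.Tensor, ratio: float) -> torch.Tensor:
        # image: (B, C, T, H, W); ratio sets the noise sigma per batch element
        sigma = torch.full((image.shape[0],), ratio, device=image.device, dtype=image.dtype)
        # broadcast one sigma across C, T, H, W
        return image + torch.randn_like(image) * sigma[:, None, None, None, None]

Lightly noising the reference frames is a common trick in image-to-video conditioning to keep the model from locking onto compression artifacts in the input image.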
@@ -786,7 +789,7 @@ class CogVideoSampler:
                 "negative": ("CONDITIONING", ),
                 "height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 16}),
                 "width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 16}),
-                "num_frames": ("INT", {"default": 48, "min": 16, "max": 1024, "step": 1}),
+                "num_frames": ("INT", {"default": 49, "min": 17, "max": 1024, "step": 4}),
                 "steps": ("INT", {"default": 50, "min": 1}),
                 "cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
                 "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
@@ -868,6 +871,11 @@ class CogVideoSampler:
         pipe.transformer.use_fastercache = False
         pipe.transformer.fastercache_counter = 0
 
+        if not isinstance(cfg, list):
+            cfg = [cfg for _ in range(steps)]
+        else:
+            assert len(cfg) == steps, "Length of cfg list must match number of steps"
+
         autocastcondition = not pipeline["onediff"] or not dtype == torch.float32
         autocast_context = torch.autocast(mm.get_autocast_device(device), dtype=dtype) if autocastcondition else nullcontext()
         with autocast_context:
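With this change, cfg can arrive either as a single float, which is broadcast to one value per step, or as a list that must already have length steps. A minimal sketch of how a caller might build such a schedule (a hypothetical linear ramp, not something this commit ships):

    steps = 50
    cfg_start, cfg_end = 6.0, 1.0  # hypothetical endpoints for the ramp
    cfg = [cfg_start + (cfg_end - cfg_start) * i / (steps - 1) for i in range(steps)]
    assert len(cfg) == steps  # satisfies the check added above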
pipeline_cogvideox.py

@@ -451,7 +451,7 @@ class CogVideoXPipeline(VideoSysPipeline, CogVideoXLoraLoaderMixin):
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
+        do_classifier_free_guidance = guidance_scale[0] > 1.0
 
         if do_classifier_free_guidance:
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
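Because guidance_scale is now a list, whether classifier-free guidance runs at all is decided once from its first element; the per-step values are consumed inside the denoising loop, as the following hunks show.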
@@ -660,7 +660,7 @@ class CogVideoXPipeline(VideoSysPipeline, CogVideoXLoraLoaderMixin):
 
                     if self.do_classifier_free_guidance:
                         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                        noise_pred = noise_pred_uncond + self._guidance_scale * (noise_pred_text - noise_pred_uncond)
+                        noise_pred = noise_pred_uncond + self._guidance_scale[i] * (noise_pred_text - noise_pred_uncond)
 
                     # compute the previous noisy sample x_t -> x_t-1
                     latents_tile = self.scheduler.step(noise_pred, t, latents_tile.to(self.vae.dtype), **extra_step_kwargs, return_dict=False)[0]
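Each guidance application now indexes the list with the loop counter i from the pipeline's step loop. Stripped of the tiling and caching details, the per-step CFG pattern looks like this (placeholder names, not the pipeline's actual signature):

    for i, t in enumerate(timesteps):
        # one batched forward pass over [uncond, cond]
        noise_pred_uncond, noise_pred_text = model(latent_model_input, t).chunk(2)
        # per-step guidance weight instead of a single scalar
        noise_pred = noise_pred_uncond + guidance_scale[i] * (noise_pred_text - noise_pred_uncond)
        latents = scheduler.step(noise_pred, t, latents, return_dict=False)[0]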
@@ -801,7 +801,7 @@ class CogVideoXPipeline(VideoSysPipeline, CogVideoXLoraLoaderMixin):
                 noise_pred /= counter
                 if do_classifier_free_guidance:
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                    noise_pred = noise_pred_uncond + self._guidance_scale * (noise_pred_text - noise_pred_uncond)
+                    noise_pred = noise_pred_uncond + self._guidance_scale[i] * (noise_pred_text - noise_pred_uncond)
 
                 # compute the previous noisy sample x_t -> x_t-1
                 if not isinstance(self.scheduler, CogVideoXDPMScheduler):
@@ -865,15 +865,15 @@ class CogVideoXPipeline(VideoSysPipeline, CogVideoXLoraLoaderMixin):
                     video_flow_features=video_flow_features if (tora is not None and tora["start_percent"] <= current_step_percentage <= tora["end_percent"]) else None,
                 )[0]
                 noise_pred = noise_pred.float()
 
+                print(self._guidance_scale[i])
                 if isinstance(self.scheduler, CogVideoXDPMScheduler):
-                    self._guidance_scale = 1 + guidance_scale * (
+                    self._guidance_scale[i] = 1 + guidance_scale[i] * (
                         (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
                     )
 
                 if do_classifier_free_guidance:
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                    noise_pred = noise_pred_uncond + self._guidance_scale * (noise_pred_text - noise_pred_uncond)
+                    noise_pred = noise_pred_uncond + self._guidance_scale[i] * (noise_pred_text - noise_pred_uncond)
 
                 # compute the previous noisy sample x_t -> x_t-1
                 if not isinstance(self.scheduler, CogVideoXDPMScheduler):
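In math form, the CogVideoXDPMScheduler branch modulates the per-step weight with a cosine curve. Writing $N$ for num_inference_steps, $t$ for the current timestep, and $g_i$ for guidance_scale[i], the assignment above is

$$ w_i = 1 + g_i \cdot \frac{1 - \cos\!\left(\pi \left((N - t)/N\right)^{5}\right)}{2} $$

and it is this $w_i$ (stored back into self._guidance_scale[i]) that the CFG combination below consumes.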