Add start/end percent to image_conds

kijai 2024-11-20 02:07:29 +02:00
parent b9688f3cd2
commit 5cc570a467
2 changed files with 21 additions and 4 deletions

View File

@@ -221,6 +221,8 @@ class CogVideoImageEncode:
                 "enable_tiling": ("BOOLEAN", {"default": False, "tooltip": "Enable tiling for the VAE to reduce memory usage"}),
                 "noise_aug_strength": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001, "tooltip": "Augment image with noise"}),
                 "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
+                "start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01}),
+                "end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
             },
         }
@@ -229,7 +231,7 @@ class CogVideoImageEncode:
     FUNCTION = "encode"
     CATEGORY = "CogVideoWrapper"
-    def encode(self, vae, start_image, end_image=None, enable_tiling=False, noise_aug_strength=0.0, strength=1.0):
+    def encode(self, vae, start_image, end_image=None, enable_tiling=False, noise_aug_strength=0.0, strength=1.0, start_percent=0.0, end_percent=1.0):
         device = mm.get_torch_device()
         offload_device = mm.unet_offload_device()
         generator = torch.Generator(device=device).manual_seed(0)
@@ -277,7 +279,11 @@ class CogVideoImageEncode:
         log.info(f"Encoded latents shape: {final_latents.shape}")
         vae.to(offload_device)
-        return ({"samples": final_latents}, )
+        return ({
+            "samples": final_latents,
+            "start_percent": start_percent,
+            "end_percent": end_percent
+        }, )
 class CogVideoImageEncodeFunInP:
     @classmethod
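With these changes the encode node attaches the conditioning window to its latent output instead of returning the samples alone. A minimal sketch of the resulting payload and of the fallback read performed downstream (the tensor here is a dummy placeholder, not a real CogVideoX latent shape):

    import torch

    # Illustrative payload from the updated encode(); the tensor shape is a placeholder.
    image_cond_latents = {
        "samples": torch.zeros(1, 2, 16, 60, 90),  # encoded image latents
        "start_percent": 0.0,                      # conditioning active from the first step...
        "end_percent": 0.5,                        # ...until halfway through sampling
    }

    # Consumers fall back to the full window when the keys are absent, so latent
    # dicts produced before this commit keep conditioning every step as before.
    start = image_cond_latents.get("start_percent", 0.0)
    end = image_cond_latents.get("end_percent", 1.0)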
@@ -608,6 +614,8 @@ class CogVideoSampler:
         if image_cond_latents is not None:
             assert supports_image_conds, "Image condition latents only supported for I2V and Interpolation models"
             image_conds = image_cond_latents["samples"]
+            image_cond_start_percent = image_cond_latents.get("start_percent", 0.0)
+            image_cond_end_percent = image_cond_latents.get("end_percent", 1.0)
             if "1.5" in model_name or "1_5" in model_name:
                 image_conds = image_conds / 0.7 # needed for 1.5 models
             else:
@@ -704,6 +712,8 @@ class CogVideoSampler:
             freenoise=context_options["freenoise"] if context_options is not None else None,
             controlnet=controlnet,
             tora=tora_trajectory if tora_trajectory is not None else None,
+            image_cond_start_percent=image_cond_start_percent,
+            image_cond_end_percent=image_cond_end_percent
         )
         if not model["cpu_offloading"] and model["manual_offloading"]:
             pipe.transformer.to(offload_device)
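The sampler simply reads the two values back (with full-window defaults) and threads them through to the pipeline call as keyword arguments. As the pipeline hunks below show, the percentages are measured against the denoising schedule: step i of num_inference_steps counts as i / num_inference_steps, so a 0.0-0.5 window covers roughly the first half of the steps. A small self-contained illustration of which steps a given window covers (step count and window are arbitrary example values):

    def steps_in_window(num_inference_steps: int, start_percent: float, end_percent: float):
        """Step indices at which the image conditioning stays active."""
        return [
            i for i in range(num_inference_steps)
            if start_percent <= i / num_inference_steps <= end_percent
        ]

    # 10 steps with a 0.0-0.5 window -> conditioning is fed for steps 0..5
    print(steps_in_window(10, 0.0, 0.5))  # [0, 1, 2, 3, 4, 5]

Note that the step percentage never quite reaches 1.0 (the last step is (N - 1) / N), so the default end_percent of 1.0 keeps conditioning active through the final step.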

View File

@@ -349,6 +349,8 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
         freenoise: Optional[bool] = True,
         controlnet: Optional[dict] = None,
         tora: Optional[dict] = None,
+        image_cond_start_percent: float = 0.0,
+        image_cond_end_percent: float = 1.0,
     ):
         """
@@ -708,8 +710,13 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
                 latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                current_step_percentage = i / num_inference_steps
                 if image_cond_latents is not None:
-                    latent_image_input = torch.cat([image_cond_latents] * 2) if do_classifier_free_guidance else image_cond_latents
+                    if not image_cond_start_percent <= current_step_percentage <= image_cond_end_percent:
+                        latent_image_input = torch.zeros_like(latent_model_input)
+                    else:
+                        latent_image_input = torch.cat([image_cond_latents] * 2) if do_classifier_free_guidance else image_cond_latents
                     if fun_mask is not None: #for fun img2vid and interpolation
                         fun_inpaint_mask = torch.cat([fun_mask] * 2) if do_classifier_free_guidance else fun_mask
                         masks_input = torch.cat([fun_inpaint_mask, latent_image_input], dim=2)
@@ -726,7 +733,7 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                 timestep = t.expand(latent_model_input.shape[0])
-                current_step_percentage = i / num_inference_steps
                 if controlnet is not None:
                     controlnet_states = None
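The net effect in the denoising loop: current_step_percentage is now computed before the image-conditioning branch (its old occurrence further down is removed), and whenever the current step falls outside [image_cond_start_percent, image_cond_end_percent] the image conditioning input is replaced with zeros, so the image latents only steer the chosen fraction of the schedule. A stripped-down sketch of that gating, with dummy tensor shapes and with classifier-free guidance and the rest of the loop omitted:

    import torch

    num_inference_steps = 10
    image_cond_start_percent, image_cond_end_percent = 0.25, 0.75  # example window

    # Dummy tensors standing in for the real latents
    latents = torch.randn(1, 4, 16, 8, 8)
    image_cond_latents = torch.randn_like(latents)

    for i in range(num_inference_steps):
        current_step_percentage = i / num_inference_steps
        if not image_cond_start_percent <= current_step_percentage <= image_cond_end_percent:
            # Outside the window: zeros, so the image condition contributes nothing this step
            latent_image_input = torch.zeros_like(latents)
        else:
            # Inside the window: pass the encoded image latents through as before
            latent_image_input = image_cond_latents
        # ...latent_image_input is then concatenated onto the model input as in the hunk above

Feeding zeros rather than skipping the concatenation altogether presumably keeps the transformer's expected input channel count unchanged, so the same model graph handles steps inside and outside the window.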