Mirror of https://git.datalinker.icu/kijai/ComfyUI-CogVideoXWrapper.git
Deprecate CogVideoXFunVid2VidSampler and move its functionality to CogVideoXFunSampler
too many nodes
parent 5b4819ba65
commit 3f97f07275
@@ -349,7 +349,7 @@ class CogVideoX_Fun_Pipeline_Inpaint(VideoSysPipeline):
     noise[:, place_idx:place_idx + delta, :, :, :] = noise[:, list_idx, :, :, :]

 # if strength is 1. then initialise the latents to noise, else initial to image + noise
-latents = noise if is_strength_max else self.scheduler.add_noise(video_latents, noise, timestep)
+latents = noise if is_strength_max else self.scheduler.add_noise(video_latents.to(noise), noise, timestep)
 # if pure noise then scale the initial latents by the Scheduler's init sigma
 latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
 latents = latents.to(device)
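The only functional change in this hunk is the added .to(noise) call, which moves video_latents onto the same dtype and device as noise before scheduler.add_noise combines them. A minimal illustration of what Tensor.to(other) does here, using made-up tensor shapes rather than the pipeline's real latents:

import torch

# Tensor.to(other) adopts BOTH the dtype and the device of `other`,
# so the two operands passed to add_noise can no longer mismatch.
noise = torch.randn(1, 4, 16, 16, dtype=torch.float16)
video_latents = torch.zeros(1, 4, 16, 16, dtype=torch.float32)

aligned = video_latents.to(noise)
print(aligned.dtype == noise.dtype)    # True
print(aligned.device == noise.device)  # True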
nodes.py (153 lines changed)
@@ -469,7 +469,7 @@ class DownloadAndLoadCogVideoGGUFModel:
 "optional": {
     "pab_config": ("PAB_CONFIG", {"default": None}),
     "block_edit": ("TRANSFORMERBLOCKS", {"default": None}),
-    "compile": (["disabled","onediff","torch"], {"tooltip": "compile the model for faster inference, these are advanced options only available on Linux, see readme for more info"}),
+    "compile": (["disabled","torch"], {"tooltip": "compile the model for faster inference, these are advanced options only available on Linux, see readme for more info"}),

 }
 }
@@ -569,9 +569,10 @@ class DownloadAndLoadCogVideoGGUFModel:
 from .fp8_optimization import convert_fp8_linear
 convert_fp8_linear(transformer, vae_dtype)

-# compilation
-for i, block in enumerate(transformer.transformer_blocks):
-    transformer.transformer_blocks[i] = torch.compile(block, fullgraph=False, dynamic=False, backend="inductor")
+if compile == "torch":
+    # compilation
+    for i, block in enumerate(transformer.transformer_blocks):
+        transformer.transformer_blocks[i] = torch.compile(block, fullgraph=False, dynamic=False, backend="inductor")
 with open(scheduler_path) as f:
     scheduler_config = json.load(f)

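With this hunk the per-block torch.compile wrapping only runs when the node's compile option is set to "torch". A self-contained sketch of that guarded pattern, using a toy module in place of the real CogVideoX transformer (only torch.compile and its arguments come from the diff; everything else is illustrative, and actually compiling requires a working inductor backend):

import torch
import torch.nn as nn

class TinyTransformer(nn.Module):
    # stand-in for the transformer: a list of blocks that can be wrapped one by one
    def __init__(self, n_blocks=2, dim=8):
        super().__init__()
        self.transformer_blocks = nn.ModuleList(nn.Linear(dim, dim) for _ in range(n_blocks))

    def forward(self, x):
        for block in self.transformer_blocks:
            x = block(x)
        return x

transformer = TinyTransformer()
compile_choice = "torch"  # mirrors the node's "compile" dropdown: "disabled" or "torch"

if compile_choice == "torch":
    # compile each block separately, as the diff does, rather than the whole model
    for i, block in enumerate(transformer.transformer_blocks):
        transformer.transformer_blocks[i] = torch.compile(block, fullgraph=False, dynamic=False, backend="inductor")

out = transformer(torch.randn(1, 8))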
@@ -1107,7 +1108,7 @@ class ToraEncodeTrajectory:
 "coordinates": ("STRING", {"forceInput": True}),
 "width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
 "height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
-"num_frames": ("INT", {"default": 49, "min": 16, "max": 1024, "step": 1}),
+"num_frames": ("INT", {"default": 49, "min": 2, "max": 1024, "step": 1}),
 "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
 "start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01}),
 "end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
@@ -1482,6 +1483,8 @@ class CogVideoXFunSampler:
 "context_options": ("COGCONTEXT", ),
 "tora_trajectory": ("TORAFEATURES", ),
 "fastercache": ("FASTERCACHEARGS",),
+"vid2vid_images": ("IMAGE",),
+"vid2vid_denoise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001}),
 },
 }

@@ -1491,7 +1494,8 @@ class CogVideoXFunSampler:
 CATEGORY = "CogVideoWrapper"

 def process(self, pipeline, positive, negative, video_length, base_resolution, seed, steps, cfg, scheduler,
-    start_img=None, end_img=None, opt_empty_latent=None, noise_aug_strength=0.0563, context_options=None, fastercache=None, tora_trajectory=None):
+    start_img=None, end_img=None, opt_empty_latent=None, noise_aug_strength=0.0563, context_options=None, fastercache=None,
+    tora_trajectory=None, vid2vid_images=None, vid2vid_denoise=1.0):
     device = mm.get_torch_device()
     offload_device = mm.unet_offload_device()
     pipe = pipeline["pipe"]
@@ -1506,8 +1510,12 @@ class CogVideoXFunSampler:
 mm.soft_empty_cache()

 aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
-if start_img is not None:
+#vid2vid
+if vid2vid_images is not None:
+    validation_video = np.array(vid2vid_images.cpu().numpy() * 255, np.uint8)
+    original_width, original_height = Image.fromarray(validation_video[0]).size
+#img2vid
+elif start_img is not None:
     start_img = [to_pil(_start_img) for _start_img in start_img] if start_img is not None else None
     end_img = [to_pil(_end_img) for _end_img in end_img] if end_img is not None else None
 # Count most suitable height and width
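The new vid2vid branch converts the incoming images into a uint8 numpy array and reads the source resolution from the first frame, which later drives the closest-bucket size selection. A hedged sketch of just that conversion with dummy data; the frame-height-width-channel layout and 0..1 float range are assumptions about ComfyUI's IMAGE type:

import numpy as np
import torch
from PIL import Image

# dummy stand-in for vid2vid_images: 4 frames of 480x720 RGB, floats in 0..1
vid2vid_images = torch.rand(4, 480, 720, 3)

# same conversion the sampler performs: scale to 0..255 and cast to uint8
validation_video = np.array(vid2vid_images.cpu().numpy() * 255, np.uint8)

# PIL reports (width, height) of the first frame
original_width, original_height = Image.fromarray(validation_video[0]).size
print(original_width, original_height)  # 720 480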
@@ -1560,28 +1568,34 @@ class CogVideoXFunSampler:
 autocast_context = torch.autocast(mm.get_autocast_device(device)) if autocastcondition else nullcontext()
 with autocast_context:
     video_length = int((video_length - 1) // pipe.vae.config.temporal_compression_ratio * pipe.vae.config.temporal_compression_ratio) + 1 if video_length != 1 else 1
-    input_video, input_video_mask, clip_image = get_image_to_video_latent(start_img, end_img, video_length=video_length, sample_size=(height, width))
+    if vid2vid_images is not None:
+        input_video, input_video_mask, clip_image = get_video_to_video_latent(validation_video, video_length=video_length, sample_size=(height, width))
+    else:
+        input_video, input_video_mask, clip_image = get_image_to_video_latent(start_img, end_img, video_length=video_length, sample_size=(height, width))

+    common_params = {
+        "prompt_embeds": positive.to(dtype).to(device),
+        "negative_prompt_embeds": negative.to(dtype).to(device),
+        "num_frames": video_length,
+        "height": height,
+        "width": width,
+        "generator": generator,
+        "guidance_scale": cfg,
+        "num_inference_steps": steps,
+        "comfyui_progressbar": True,
+        "context_schedule":context_options["context_schedule"] if context_options is not None else None,
+        "context_frames":context_frames,
+        "context_stride": context_stride,
+        "context_overlap": context_overlap,
+        "freenoise":context_options["freenoise"] if context_options is not None else None,
+        "tora":tora_trajectory if tora_trajectory is not None else None,
+    }
     latents = pipe(
-        prompt_embeds=positive.to(dtype).to(device),
-        negative_prompt_embeds=negative.to(dtype).to(device),
-        num_frames = video_length,
-        height = height,
-        width = width,
-        generator = generator,
-        guidance_scale = cfg,
-        num_inference_steps = steps,
+        **common_params,

         video = input_video,
         mask_video = input_video_mask,
-        comfyui_progressbar = True,
         noise_aug_strength = noise_aug_strength,
-        context_schedule=context_options["context_schedule"] if context_options is not None else None,
-        context_frames=context_frames,
-        context_stride= context_stride,
-        context_overlap= context_overlap,
-        freenoise=context_options["freenoise"] if context_options is not None else None,
-        tora=tora_trajectory if tora_trajectory is not None else None,
+        strength = vid2vid_denoise,
     )
     #if not pipeline["cpu_offloading"]:
     #    pipe.transformer.to(offload_device)
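Shared arguments are now built once as common_params and unpacked into the pipeline call with **, leaving only the mode-specific pieces (video, mask_video, noise_aug_strength, strength) as explicit keywords. A stripped-down sketch of that kwargs-dict pattern; fake_pipe is a placeholder, not the real CogVideoX-Fun pipeline:

def fake_pipe(**kwargs):
    # stand-in for pipe(...): just report which arguments arrived
    return sorted(kwargs)

common_params = {
    "num_frames": 49,
    "height": 480,
    "width": 720,
    "guidance_scale": 6.0,
    "num_inference_steps": 25,
}

print(fake_pipe(
    **common_params,   # everything shared between img2vid and vid2vid
    video=None,
    mask_video=None,
    strength=1.0,      # vid2vid_denoise in the node
))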
@@ -1594,95 +1608,16 @@ class CogVideoXFunVid2VidSampler:
     def INPUT_TYPES(s):
         return {
             "required": {
-                "pipeline": ("COGVIDEOPIPE",),
-                "positive": ("CONDITIONING", ),
-                "negative": ("CONDITIONING", ),
-                "video_length": ("INT", {"default": 49, "min": 5, "max": 49, "step": 4}),
-                "base_resolution": ("INT", {"min": 64, "max": 1280, "step": 64, "default": 512, "tooltip": "Base resolution, closest training data bucket resolution is chosen based on the selection."}),
-                "seed": ("INT", {"default": 42, "min": 0, "max": 0xffffffffffffffff}),
-                "steps": ("INT", {"default": 25, "min": 1, "max": 200, "step": 1}),
-                "cfg": ("FLOAT", {"default": 6.0, "min": 1.0, "max": 20.0, "step": 0.01}),
-                "scheduler": (available_schedulers,
-                    {
-                        "default": 'DDIM'
-                    }
-                ),
-                "denoise_strength": ("FLOAT", {"default": 0.70, "min": 0.05, "max": 1.00, "step": 0.01}),
-                "validation_video": ("IMAGE",),
+                "note": ("STRING", {"default": "This node is deprecated, functionality moved to 'CogVideoXFunSampler' node instead.", "multiline": True}),
             },
         }

-    RETURN_TYPES = ("COGVIDEOPIPE", "LATENT",)
-    RETURN_NAMES = ("cogvideo_pipe", "samples",)
+    RETURN_TYPES = ()
     FUNCTION = "process"
     CATEGORY = "CogVideoWrapper"
-    def process(self, pipeline, positive, negative, video_length, base_resolution, seed, steps, cfg, denoise_strength, scheduler,
-        validation_video):
-        device = mm.get_torch_device()
-        offload_device = mm.unet_offload_device()
-        pipe = pipeline["pipe"]
-        dtype = pipeline["dtype"]
-        base_path = pipeline["base_path"]
-
-        assert "fun" in base_path.lower(), "'Unfun' models not supported in 'CogVideoXFunSampler', use the 'CogVideoSampler'"
-        assert "pose" not in base_path.lower(), "'Pose' models not supported in 'CogVideoXFunVid2VidSampler', use the 'CogVideoXFunControlSampler'"
-
-        if not pipeline["cpu_offloading"]:
-            pipe.enable_model_cpu_offload(device=device)
-
-        mm.soft_empty_cache()
-
-        # Count most suitable height and width
-        aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
-
-        validation_video = np.array(validation_video.cpu().numpy() * 255, np.uint8)
-        original_width, original_height = Image.fromarray(validation_video[0]).size
-
-        closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
-        height, width = [int(x / 16) * 16 for x in closest_size]
-
-        # Load Sampler
-        scheduler_config = pipeline["scheduler_config"]
-        if scheduler in scheduler_mapping:
-            noise_scheduler = scheduler_mapping[scheduler].from_config(scheduler_config)
-            pipe.scheduler = noise_scheduler
-        else:
-            raise ValueError(f"Unknown scheduler: {scheduler}")
-
-        generator = torch.Generator(device=torch.device("cpu")).manual_seed(seed)
-
-        autocastcondition = not pipeline["onediff"] or not dtype == torch.float32
-        autocast_context = torch.autocast(mm.get_autocast_device(device)) if autocastcondition else nullcontext()
-        with autocast_context:
-            video_length = int((video_length - 1) // pipe.vae.config.temporal_compression_ratio * pipe.vae.config.temporal_compression_ratio) + 1 if video_length != 1 else 1
-            input_video, input_video_mask, clip_image = get_video_to_video_latent(validation_video, video_length=video_length, sample_size=(height, width))
-
-            # for _lora_path, _lora_weight in zip(cogvideoxfun_model.get("loras", []), cogvideoxfun_model.get("strength_model", [])):
-            #     pipeline = merge_lora(pipeline, _lora_path, _lora_weight)
-
-            common_params = {
-                "prompt_embeds": positive.to(dtype).to(device),
-                "negative_prompt_embeds": negative.to(dtype).to(device),
-                "num_frames": video_length,
-                "height": height,
-                "width": width,
-                "generator": generator,
-                "guidance_scale": cfg,
-                "num_inference_steps": steps,
-                "comfyui_progressbar": True,
-            }
-
-            latents = pipe(
-                **common_params,
-                video=input_video,
-                mask_video=input_video_mask,
-                strength=float(denoise_strength)
-            )
-
-            # for _lora_path, _lora_weight in zip(cogvideoxfun_model.get("loras", []), cogvideoxfun_model.get("strength_model", [])):
-            #     pipeline = unmerge_lora(pipeline, _lora_path, _lora_weight)
-            return (pipeline, {"samples": latents})
+    DEPRECATED = True
+    def process(self):
+        return ()

 def add_noise_to_reference_video(image, ratio=None):
     if ratio is None:
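The old vid2vid node is reduced to a stub: a single informational input, no outputs, and a DEPRECATED = True attribute marking it as deprecated while existing workflows can still load the class. A minimal standalone sketch of that deprecation pattern for a ComfyUI-style node (the class name is illustrative; the note text and DEPRECATED attribute mirror the diff):

class DeprecatedNodeStub:
    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "note": ("STRING", {
                    "default": "This node is deprecated, functionality moved to 'CogVideoXFunSampler' node instead.",
                    "multiline": True,
                }),
            },
        }

    RETURN_TYPES = ()
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"
    DEPRECATED = True

    def process(self, note):
        # no outputs: the node exists only to surface the deprecation note
        return ()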