Repository: https://git.datalinker.icu/kijai/ComfyUI-CogVideoXWrapper.git
Add CogVideoX-Fun-V1.1-5b-Control
https://huggingface.co/alibaba-pai/CogVideoX-Fun-V1.1-5b-Control
Commit ecd067260c (parent c9efefe736)
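The commit does two things: it adds the new alibaba-pai/CogVideoX-Fun-V1.1-5b-Control repo id to the downloadable model list, and it extends the fun-inpaint check so that Control variants, like Pose variants, skip the inpaint input path. A minimal sketch of that predicate, using a hypothetical helper name and the model ids that appear in the diff:

    # Sketch only: `is_fun_inpaint_model` is a hypothetical name illustrating the
    # predicate this commit changes; the real code inlines it as a keyword argument.
    def is_fun_inpaint_model(model: str) -> bool:
        name = model.lower()
        # Fun models take the inpaint input path unless they are Pose or Control variants.
        return "fun" in name and not ("pose" in name or "control" in name)

    assert is_fun_inpaint_model("alibaba-pai/CogVideoX-Fun-V1.1-5b-InP")
    assert not is_fun_inpaint_model("alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose")
    assert not is_fun_inpaint_model("alibaba-pai/CogVideoX-Fun-V1.1-5b-Control")  # newly excluded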
@@ -108,6 +108,7 @@ class DownloadAndLoadCogVideoModel:
                     "alibaba-pai/CogVideoX-Fun-V1.1-5b-InP",
                     "alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose",
                     "alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose",
+                    "alibaba-pai/CogVideoX-Fun-V1.1-5b-Control",
                     "feizhengcong/CogvideoX-Interpolation",
                     "NimVideo/cogvideox-2b-img2vid"
                 ],
@@ -233,7 +234,7 @@ class DownloadAndLoadCogVideoModel:
             transformer,
             scheduler,
             dtype=dtype,
-            is_fun_inpaint=True if "fun" in model.lower() and "pose" not in model.lower() else False
+            is_fun_inpaint="fun" in model.lower() and not ("pose" in model.lower() or "control" in model.lower())
             )
         if "cogvideox-2b-img2vid" in model:
             pipe.input_with_padding = False
@@ -255,7 +256,6 @@ class DownloadAndLoadCogVideoModel:
                 adapter_weight = l['strength']
                 pipe.load_lora_weights(l['path'], weight_name=l['path'].split("/")[-1], lora_rank=lora_rank, adapter_name=adapter_name)

-                #transformer = load_lora_into_transformer(lora, transformer)
                 adapter_list.append(adapter_name)
                 adapter_weights.append(adapter_weight)
             for l in lora:
@@ -549,7 +549,12 @@ class DownloadAndLoadCogVideoGGUFModel:
         vae = AutoencoderKLCogVideoX.from_config(vae_config).to(vae_dtype).to(offload_device)
         vae.load_state_dict(vae_sd)
         del vae_sd
-        pipe = CogVideoXPipeline(transformer, scheduler, dtype=vae_dtype)
+        pipe = CogVideoXPipeline(
+            transformer,
+            scheduler,
+            dtype=vae_dtype,
+            is_fun_inpaint="fun" in model.lower() and not ("pose" in model.lower() or "control" in model.lower())
+            )

         if enable_sequential_cpu_offload:
             pipe.enable_sequential_cpu_offload()
@@ -675,7 +680,6 @@ class CogVideoXModelLoader:
                 set_module_tensor_to_device(transformer, name, device=transformer_load_device, dtype=base_dtype, value=sd[name])
         del sd
-

         #scheduler
         with open(scheduler_config_path) as f:
             scheduler_config = json.load(f)
@@ -692,14 +696,12 @@ class CogVideoXModelLoader:
                 module.fuse_projections(fuse=True)
         transformer.attention_mode = attention_mode

-        if "fun" in model_type:
-            if not "pose" in model_type:
-                raise NotImplementedError("Fun models besides pose are not supported with this loader yet")
-                pipe = CogVideoX_Fun_Pipeline_Inpaint(vae, transformer, scheduler)
-            else:
-                pipe = CogVideoXPipeline(transformer, scheduler, dtype=base_dtype)
-        else:
-            pipe = CogVideoXPipeline(transformer, scheduler, dtype=base_dtype)
+        pipe = CogVideoXPipeline(
+            transformer,
+            scheduler,
+            dtype=base_dtype,
+            is_fun_inpaint="fun" in model.lower() and not ("pose" in model.lower() or "control" in model.lower())
+            )

         if enable_sequential_cpu_offload:
             pipe.enable_sequential_cpu_offload()
@@ -797,11 +799,6 @@ class CogVideoXModelLoader:
             manual_offloading = False # to disable manual .to(device) calls
             log.info(f"Quantized transformer blocks to {quantization}")

-        # if load_device == "offload_device":
-        # pipe.transformer.to(offload_device)
-        # else:
-        # pipe.transformer.to(device)
-
         pipeline = {
             "pipe": pipe,
             "dtype": base_dtype,
@@ -812,7 +809,6 @@ class CogVideoXModelLoader:
             "model_name": model,
             "manual_offloading": manual_offloading,
         }
-
         return (pipeline,)

 #region VAE
nodes.py (8 changed lines)
@@ -343,13 +343,10 @@ class CogVideoImageEncodeFunInP:

         bs = 1
         new_mask_pixel_values = []
-        print("input_image shape: ",input_image.shape)
         for i in range(0, input_image.shape[0], bs):
             mask_pixel_values_bs = input_image[i : i + bs]
             mask_pixel_values_bs = vae.encode(mask_pixel_values_bs)[0]
-            print("mask_pixel_values_bs: ",mask_pixel_values_bs.parameters.shape)
             mask_pixel_values_bs = mask_pixel_values_bs.mode()
-            print("mask_pixel_values_bs: ",mask_pixel_values_bs.shape, mask_pixel_values_bs.min(), mask_pixel_values_bs.max())
             new_mask_pixel_values.append(mask_pixel_values_bs)
         masked_image_latents = torch.cat(new_mask_pixel_values, dim = 0)
         masked_image_latents = masked_image_latents.permute(0, 2, 1, 3, 4) # B, T, C, H, W
@@ -601,8 +598,7 @@ class CogVideoSampler:

         model_name = model.get("model_name", "")
         supports_image_conds = True if "I2V" in model_name or "interpolation" in model_name.lower() or "fun" in model_name.lower() else False
-
-        if "fun" in model_name.lower() and "pose" not in model_name.lower() and image_cond_latents is not None:
+        if "fun" in model_name.lower() and not ("pose" in model_name.lower() or "control" in model_name.lower()) and image_cond_latents is not None:
             assert image_cond_latents["mask"] is not None, "For fun inpaint models use CogVideoImageEncodeFunInP"
             fun_mask = image_cond_latents["mask"]
         else:
@@ -855,7 +851,7 @@ class CogVideoXFunResizeToClosestBucket:
         from .cogvideox_fun.utils import ASPECT_RATIO_512, get_closest_ratio

         B, H, W, C = images.shape
-        # Count most suitable height and width
+        # Find most suitable height and width
         aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}

         closest_size, closest_ratio = get_closest_ratio(H, W, ratios=aspect_ratio_sample_size)
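For reference, the bucket selection touched by the last nodes.py hunk rescales the ASPECT_RATIO_512 table from its 512 base to the requested base_resolution, then picks the bucket whose aspect ratio is closest to the input image. A self-contained sketch under assumptions: the tiny table and get_closest_ratio below are stand-ins for the real ones in cogvideox_fun/utils.py.

    # Stand-in table: ratio key -> [height, width] at a 512 base resolution (assumed format).
    ASPECT_RATIO_512 = {
        "0.5": [352, 704],
        "1.0": [512, 512],
        "2.0": [704, 352],
    }

    def get_closest_ratio(height, width, ratios):
        # Stand-in: pick the bucket whose h/w ratio is closest to the input's.
        target = height / width
        key = min(ratios, key=lambda k: abs(float(k) - target))
        return ratios[key], float(key)

    base_resolution = 768
    # Same rescaling as in the node: every bucket is scaled by base_resolution / 512.
    aspect_ratio_sample_size = {key: [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]]
                                for key in ASPECT_RATIO_512.keys()}

    closest_size, closest_ratio = get_closest_ratio(720, 1280, ratios=aspect_ratio_sample_size)
    print(closest_size, closest_ratio)  # [528.0, 1056.0] 0.5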
@@ -1,5 +1,7 @@
 # WORK IN PROGRESS

+Spreadsheet (WIP) of supported models and their supported features: https://docs.google.com/spreadsheets/d/16eA6mSL8XkTcu9fSWkPSHfRIqyAKJbR1O99xnuGdCKY/edit?usp=sharing
+
 ## BREAKING Update8

 This is big one, and unfortunately to do the necessary cleanup and refactoring this will break every old workflow as they are.