Add CogVideoX-Fun-V1.1-5b-Control

https://huggingface.co/alibaba-pai/CogVideoX-Fun-V1.1-5b-Control
kijai 2024-11-20 01:23:54 +02:00
parent c9efefe736
commit ecd067260c
3 changed files with 19 additions and 25 deletions
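In short, the change appends the new Control model ID to the loader's download list and rewrites the is_fun_inpaint flag as a single boolean expression, so that both the Pose and the new Control Fun variants bypass the inpaint path. A minimal sketch of that predicate (the helper name is illustrative, not part of the code):

def is_fun_inpaint(model: str) -> bool:
    # True only for CogVideoX-Fun inpaint (InP) variants;
    # Pose and Control variants are routed past the inpaint path.
    name = model.lower()
    return "fun" in name and not ("pose" in name or "control" in name)

# Checks against model IDs from the list below
assert is_fun_inpaint("alibaba-pai/CogVideoX-Fun-V1.1-5b-InP")
assert not is_fun_inpaint("alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose")
assert not is_fun_inpaint("alibaba-pai/CogVideoX-Fun-V1.1-5b-Control")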


@@ -108,6 +108,7 @@ class DownloadAndLoadCogVideoModel:
"alibaba-pai/CogVideoX-Fun-V1.1-5b-InP",
"alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose",
"alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose",
"alibaba-pai/CogVideoX-Fun-V1.1-5b-Control",
"feizhengcong/CogvideoX-Interpolation",
"NimVideo/cogvideox-2b-img2vid"
],
@@ -233,7 +234,7 @@ class DownloadAndLoadCogVideoModel:
transformer,
scheduler,
dtype=dtype,
is_fun_inpaint=True if "fun" in model.lower() and "pose" not in model.lower() else False
is_fun_inpaint="fun" in model.lower() and not ("pose" in model.lower() or "control" in model.lower())
)
if "cogvideox-2b-img2vid" in model:
pipe.input_with_padding = False
@@ -255,7 +256,6 @@ class DownloadAndLoadCogVideoModel:
adapter_weight = l['strength']
pipe.load_lora_weights(l['path'], weight_name=l['path'].split("/")[-1], lora_rank=lora_rank, adapter_name=adapter_name)
#transformer = load_lora_into_transformer(lora, transformer)
adapter_list.append(adapter_name)
adapter_weights.append(adapter_weight)
for l in lora:
@@ -549,7 +549,12 @@ class DownloadAndLoadCogVideoGGUFModel:
vae = AutoencoderKLCogVideoX.from_config(vae_config).to(vae_dtype).to(offload_device)
vae.load_state_dict(vae_sd)
del vae_sd
pipe = CogVideoXPipeline(transformer, scheduler, dtype=vae_dtype)
pipe = CogVideoXPipeline(
transformer,
scheduler,
dtype=vae_dtype,
is_fun_inpaint="fun" in model.lower() and not ("pose" in model.lower() or "control" in model.lower())
)
if enable_sequential_cpu_offload:
pipe.enable_sequential_cpu_offload()
@@ -675,7 +680,6 @@ class CogVideoXModelLoader:
set_module_tensor_to_device(transformer, name, device=transformer_load_device, dtype=base_dtype, value=sd[name])
del sd
#scheduler
with open(scheduler_config_path) as f:
scheduler_config = json.load(f)
@@ -692,14 +696,12 @@ class CogVideoXModelLoader:
module.fuse_projections(fuse=True)
transformer.attention_mode = attention_mode
if "fun" in model_type:
if not "pose" in model_type:
raise NotImplementedError("Fun models besides pose are not supported with this loader yet")
pipe = CogVideoX_Fun_Pipeline_Inpaint(vae, transformer, scheduler)
else:
pipe = CogVideoXPipeline(transformer, scheduler, dtype=base_dtype)
else:
pipe = CogVideoXPipeline(transformer, scheduler, dtype=base_dtype)
pipe = CogVideoXPipeline(
transformer,
scheduler,
dtype=base_dtype,
is_fun_inpaint="fun" in model.lower() and not ("pose" in model.lower() or "control" in model.lower())
)
if enable_sequential_cpu_offload:
pipe.enable_sequential_cpu_offload()
@@ -796,11 +798,6 @@ class CogVideoXModelLoader:
manual_offloading = False # to disable manual .to(device) calls
log.info(f"Quantized transformer blocks to {quantization}")
# if load_device == "offload_device":
# pipe.transformer.to(offload_device)
# else:
# pipe.transformer.to(device)
pipeline = {
"pipe": pipe,
@@ -812,7 +809,6 @@ class CogVideoXModelLoader:
"model_name": model,
"manual_offloading": manual_offloading,
}
return (pipeline,)
#region VAE


@@ -343,13 +343,10 @@ class CogVideoImageEncodeFunInP:
bs = 1
new_mask_pixel_values = []
print("input_image shape: ",input_image.shape)
for i in range(0, input_image.shape[0], bs):
mask_pixel_values_bs = input_image[i : i + bs]
mask_pixel_values_bs = vae.encode(mask_pixel_values_bs)[0]
print("mask_pixel_values_bs: ",mask_pixel_values_bs.parameters.shape)
mask_pixel_values_bs = mask_pixel_values_bs.mode()
print("mask_pixel_values_bs: ",mask_pixel_values_bs.shape, mask_pixel_values_bs.min(), mask_pixel_values_bs.max())
new_mask_pixel_values.append(mask_pixel_values_bs)
masked_image_latents = torch.cat(new_mask_pixel_values, dim = 0)
masked_image_latents = masked_image_latents.permute(0, 2, 1, 3, 4) # B, T, C, H, W
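For reference, the loop above pushes the conditioning frames through the VAE encoder in small batches, takes the mode of each returned latent distribution (no sampling), concatenates the results, and reorders them to (B, T, C, H, W). A standalone sketch of that flow, assuming a video VAE whose encode() returns the latent distribution as its first element:

import torch

def encode_fun_mask_latents(vae, input_image: torch.Tensor, bs: int = 1) -> torch.Tensor:
    # input_image: pixel values shaped (B, C, T, H, W), already on the VAE's device/dtype
    chunks = []
    for i in range(0, input_image.shape[0], bs):
        dist = vae.encode(input_image[i : i + bs])[0]  # latent distribution
        chunks.append(dist.mode())                     # deterministic latents, no sampling
    latents = torch.cat(chunks, dim=0)
    return latents.permute(0, 2, 1, 3, 4)              # (B, C, T, H, W) -> (B, T, C, H, W)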
@@ -601,8 +598,7 @@ class CogVideoSampler:
model_name = model.get("model_name", "")
supports_image_conds = True if "I2V" in model_name or "interpolation" in model_name.lower() or "fun" in model_name.lower() else False
if "fun" in model_name.lower() and "pose" not in model_name.lower() and image_cond_latents is not None:
if "fun" in model_name.lower() and not ("pose" in model_name.lower() or "control" in model_name.lower()) and image_cond_latents is not None:
assert image_cond_latents["mask"] is not None, "For fun inpaint models use CogVideoImageEncodeFunInP"
fun_mask = image_cond_latents["mask"]
else:
@@ -855,8 +851,8 @@ class CogVideoXFunResizeToClosestBucket:
from .cogvideox_fun.utils import ASPECT_RATIO_512, get_closest_ratio
B, H, W, C = images.shape
# Count most suitable height and width
aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
# Find most suitable height and width
aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
closest_size, closest_ratio = get_closest_ratio(H, W, ratios=aspect_ratio_sample_size)
height, width = [int(x / 16) * 16 for x in closest_size]
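For reference, the node above scales the 512-based aspect-ratio buckets to the requested base resolution, lets get_closest_ratio pick the bucket nearest the input frame's aspect ratio, and snaps the result to multiples of 16. A self-contained sketch under the assumption that the bucket table maps ratio keys to [height, width] pairs (closest_bucket is an illustrative stand-in, not the wrapper's actual get_closest_ratio):

def closest_bucket(H: int, W: int, aspect_ratio_512: dict, base_resolution: int = 512):
    # Scale every 512-based bucket [height, width] to the requested base resolution
    buckets = {k: [x / 512 * base_resolution for x in v] for k, v in aspect_ratio_512.items()}
    # Pick the bucket whose height/width ratio is closest to the input frame's
    target = H / W
    key = min(buckets, key=lambda k: abs(buckets[k][0] / buckets[k][1] - target))
    # Snap both dimensions to multiples of 16, as the pipeline expects
    height, width = (int(x / 16) * 16 for x in buckets[key])
    return height, width

# Example with a hypothetical single-entry bucket table:
# closest_bucket(720, 1280, {"0.56": [384, 672]}) -> (384, 672)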


@@ -1,5 +1,7 @@
# WORK IN PROGRESS
Spreadsheet (WIP) of supported models and their features: https://docs.google.com/spreadsheets/d/16eA6mSL8XkTcu9fSWkPSHfRIqyAKJbR1O99xnuGdCKY/edit?usp=sharing
## BREAKING Update8
This is a big one, and unfortunately the necessary cleanup and refactoring will break every old workflow as it is.