Add CogVideoX-Fun-V1.1-5b-Control

https://huggingface.co/alibaba-pai/CogVideoX-Fun-V1.1-5b-Control
kijai 2024-11-20 01:23:54 +02:00
parent c9efefe736
commit ecd067260c
3 changed files with 19 additions and 25 deletions

View File

@@ -108,6 +108,7 @@ class DownloadAndLoadCogVideoModel:
                     "alibaba-pai/CogVideoX-Fun-V1.1-5b-InP",
                     "alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose",
                     "alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose",
+                    "alibaba-pai/CogVideoX-Fun-V1.1-5b-Control",
                     "feizhengcong/CogvideoX-Interpolation",
                     "NimVideo/cogvideox-2b-img2vid"
                 ],
@@ -233,7 +234,7 @@ class DownloadAndLoadCogVideoModel:
             transformer,
             scheduler,
             dtype=dtype,
-            is_fun_inpaint=True if "fun" in model.lower() and "pose" not in model.lower() else False
+            is_fun_inpaint="fun" in model.lower() and not ("pose" in model.lower() or "control" in model.lower())
         )
         if "cogvideox-2b-img2vid" in model:
             pipe.input_with_padding = False
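For reference, a minimal sketch of how the revised `is_fun_inpaint` expression classifies the model IDs from the list above; the helper name and the asserts are illustrative, only the boolean expression itself comes from this commit:

def is_fun_inpaint_model(model: str) -> bool:
    # Fun models are treated as inpaint pipelines unless they are the Pose
    # or the newly added Control variant.
    name = model.lower()
    return "fun" in name and not ("pose" in name or "control" in name)

assert is_fun_inpaint_model("alibaba-pai/CogVideoX-Fun-V1.1-5b-InP")
assert not is_fun_inpaint_model("alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose")
assert not is_fun_inpaint_model("alibaba-pai/CogVideoX-Fun-V1.1-5b-Control")
assert not is_fun_inpaint_model("NimVideo/cogvideox-2b-img2vid")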
@@ -255,7 +256,6 @@ class DownloadAndLoadCogVideoModel:
                 adapter_weight = l['strength']
                 pipe.load_lora_weights(l['path'], weight_name=l['path'].split("/")[-1], lora_rank=lora_rank, adapter_name=adapter_name)
-                #transformer = load_lora_into_transformer(lora, transformer)
                 adapter_list.append(adapter_name)
                 adapter_weights.append(adapter_weight)
             for l in lora:
@@ -549,7 +549,12 @@ class DownloadAndLoadCogVideoGGUFModel:
         vae = AutoencoderKLCogVideoX.from_config(vae_config).to(vae_dtype).to(offload_device)
         vae.load_state_dict(vae_sd)
         del vae_sd
-        pipe = CogVideoXPipeline(transformer, scheduler, dtype=vae_dtype)
+        pipe = CogVideoXPipeline(
+            transformer,
+            scheduler,
+            dtype=vae_dtype,
+            is_fun_inpaint="fun" in model.lower() and not ("pose" in model.lower() or "control" in model.lower())
+        )
         if enable_sequential_cpu_offload:
             pipe.enable_sequential_cpu_offload()
@@ -675,7 +680,6 @@ class CogVideoXModelLoader:
                 set_module_tensor_to_device(transformer, name, device=transformer_load_device, dtype=base_dtype, value=sd[name])
         del sd
         #scheduler
         with open(scheduler_config_path) as f:
             scheduler_config = json.load(f)
@@ -692,14 +696,12 @@ class CogVideoXModelLoader:
                     module.fuse_projections(fuse=True)
         transformer.attention_mode = attention_mode
-        if "fun" in model_type:
-            if not "pose" in model_type:
-                raise NotImplementedError("Fun models besides pose are not supported with this loader yet")
-            pipe = CogVideoX_Fun_Pipeline_Inpaint(vae, transformer, scheduler)
-        else:
-            pipe = CogVideoXPipeline(transformer, scheduler, dtype=base_dtype)
-        else:
-            pipe = CogVideoXPipeline(transformer, scheduler, dtype=base_dtype)
+        pipe = CogVideoXPipeline(
+            transformer,
+            scheduler,
+            dtype=base_dtype,
+            is_fun_inpaint="fun" in model.lower() and not ("pose" in model.lower() or "control" in model.lower())
+        )
         if enable_sequential_cpu_offload:
             pipe.enable_sequential_cpu_offload()
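This hunk drops the loader's Fun-Pose special case (and the NotImplementedError for other Fun variants): all three loaders now build the same CogVideoXPipeline and only toggle inpaint handling through the constructor flag. A hypothetical consolidation, reusing the is_fun_inpaint_model helper sketched after the first hunk (not part of the commit), would reduce each call site to:

pipe = CogVideoXPipeline(
    transformer,
    scheduler,
    dtype=base_dtype,
    # same expression as in the diff, factored out into the hypothetical helper
    is_fun_inpaint=is_fun_inpaint_model(model),
)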
@@ -797,11 +799,6 @@ class CogVideoXModelLoader:
             manual_offloading = False # to disable manual .to(device) calls
             log.info(f"Quantized transformer blocks to {quantization}")
-        # if load_device == "offload_device":
-        #     pipe.transformer.to(offload_device)
-        # else:
-        #     pipe.transformer.to(device)
         pipeline = {
             "pipe": pipe,
             "dtype": base_dtype,
@@ -812,7 +809,6 @@ class CogVideoXModelLoader:
             "model_name": model,
             "manual_offloading": manual_offloading,
         }
         return (pipeline,)
 #region VAE

View File

@@ -343,13 +343,10 @@ class CogVideoImageEncodeFunInP:
         bs = 1
         new_mask_pixel_values = []
-        print("input_image shape: ",input_image.shape)
         for i in range(0, input_image.shape[0], bs):
             mask_pixel_values_bs = input_image[i : i + bs]
             mask_pixel_values_bs = vae.encode(mask_pixel_values_bs)[0]
-            print("mask_pixel_values_bs: ",mask_pixel_values_bs.parameters.shape)
             mask_pixel_values_bs = mask_pixel_values_bs.mode()
-            print("mask_pixel_values_bs: ",mask_pixel_values_bs.shape, mask_pixel_values_bs.min(), mask_pixel_values_bs.max())
             new_mask_pixel_values.append(mask_pixel_values_bs)
         masked_image_latents = torch.cat(new_mask_pixel_values, dim = 0)
         masked_image_latents = masked_image_latents.permute(0, 2, 1, 3, 4) # B, T, C, H, W
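This hunk only removes debug prints; the encode loop itself is unchanged. An annotated sketch of what it does, with the tensor layout assumed from the trailing comment and .mode() assumed to be the deterministic latent of the distribution returned by the VAE encoder:

new_mask_pixel_values = []
for i in range(0, input_image.shape[0], bs):
    chunk = input_image[i : i + bs]             # pixel frames, assumed (bs, C, T, H, W)
    latent_dist = vae.encode(chunk)[0]          # latent distribution from the VAE encoder
    new_mask_pixel_values.append(latent_dist.mode())  # deterministic latents, no sampling noise
masked_image_latents = torch.cat(new_mask_pixel_values, dim=0)
masked_image_latents = masked_image_latents.permute(0, 2, 1, 3, 4)  # -> B, T, C, H, W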
@@ -601,8 +598,7 @@ class CogVideoSampler:
         model_name = model.get("model_name", "")
         supports_image_conds = True if "I2V" in model_name or "interpolation" in model_name.lower() or "fun" in model_name.lower() else False
-        if "fun" in model_name.lower() and "pose" not in model_name.lower() and image_cond_latents is not None:
+        if "fun" in model_name.lower() and not ("pose" in model_name.lower() or "control" in model_name.lower()) and image_cond_latents is not None:
             assert image_cond_latents["mask"] is not None, "For fun inpaint models use CogVideoImageEncodeFunInP"
             fun_mask = image_cond_latents["mask"]
         else:
@@ -855,7 +851,7 @@ class CogVideoXFunResizeToClosestBucket:
         from .cogvideox_fun.utils import ASPECT_RATIO_512, get_closest_ratio
         B, H, W, C = images.shape
-        # Count most suitable height and width
+        # Find most suitable height and width
         aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
         closest_size, closest_ratio = get_closest_ratio(H, W, ratios=aspect_ratio_sample_size)
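For context, a hedged sketch of how this bucket selection behaves, assuming get_closest_ratio picks the ratio key nearest to H/W and returns that bucket's (height, width); only the dict comprehension and the call signature come from the diff:

from .cogvideox_fun.utils import ASPECT_RATIO_512, get_closest_ratio

base_resolution = 512  # node input; the 512-based bucket table is rescaled to this
aspect_ratio_sample_size = {
    key: [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]]
    for key in ASPECT_RATIO_512.keys()
}
H, W = 720, 1280  # example landscape frame, H / W = 0.5625
closest_size, closest_ratio = get_closest_ratio(H, W, ratios=aspect_ratio_sample_size)
# closest_size is the bucket (height, width) nearest to the input aspect ratio;
# the node then resizes the frames to that bucket.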

View File

@@ -1,5 +1,7 @@
 # WORK IN PROGRESS
+Spreadsheet (WIP) of supported models and their supported features: https://docs.google.com/spreadsheets/d/16eA6mSL8XkTcu9fSWkPSHfRIqyAKJbR1O99xnuGdCKY/edit?usp=sharing
 ## BREAKING Update8
 This is big one, and unfortunately to do the necessary cleanup and refactoring this will break every old workflow as they are.