This commit is contained in:
kijai 2024-11-09 12:13:52 +02:00
parent 2eb9b81d27
commit 643bbc18c1
4 changed files with 36 additions and 22 deletions

View File

@ -254,7 +254,7 @@ class CogVideoXBlock(nn.Module):
norm_hidden_states = rearrange(h, "(B T) C H W -> B (T H W) C", T=T)
del h, fuser
#fastercache
#region fastercache
B = norm_hidden_states.shape[0]
if fastercache_counter >= fastercache_start_step + 3 and fastercache_counter%3!=0 and self.cached_hidden_states[-1].shape[0] >= B:
attn_hidden_states = (
@ -365,6 +365,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
flip_sin_to_cos: bool = True,
freq_shift: int = 0,
time_embed_dim: int = 512,
ofs_embed_dim: Optional[int] = None,
text_embed_dim: int = 4096,
num_layers: int = 30,
dropout: float = 0.0,
@ -373,7 +374,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
sample_height: int = 60,
sample_frames: int = 49,
patch_size: int = 2,
patch_size_t: int = 2,
patch_size_t: int = None,
temporal_compression_ratio: int = 4,
max_text_seq_length: int = 226,
activation_fn: str = "gelu-approximate",
@ -420,6 +421,11 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift)
self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn)
self.ofs_embedding = None
if ofs_embed_dim:
self.ofs_embedding = TimestepEmbedding(ofs_embed_dim, ofs_embed_dim, timestep_activation_fn) # same as time embeddings, for ofs
# 3. Define spatio-temporal transformers blocks
self.transformer_blocks = nn.ModuleList(
[
@ -553,6 +559,9 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
# there might be better ways to encapsulate this.
t_emb = t_emb.to(dtype=hidden_states.dtype)
emb = self.time_embedding(t_emb, timestep_cond)
if self.ofs_embedding is not None: #1.5 I2V
emb_ofs = self.ofs_embedding(emb, timestep_cond)
emb = emb + emb_ofs
# 2. Patch embedding
p = self.config.patch_size

View File

@ -72,6 +72,7 @@ class DownloadAndLoadCogVideoModel:
"THUDM/CogVideoX-5b",
"THUDM/CogVideoX-5b-I2V",
"kijai/CogVideoX-5b-1.5-T2V",
"kijai/CogVideoX-5b-1.5-I2V",
"bertjiazheng/KoolCogVideoX-5b",
"kijai/CogVideoX-Fun-2b",
"kijai/CogVideoX-Fun-5b",
@ -97,6 +98,7 @@ class DownloadAndLoadCogVideoModel:
"block_edit": ("TRANSFORMERBLOCKS", {"default": None}),
"lora": ("COGLORA", {"default": None}),
"compile_args":("COMPILEARGS", ),
"load_device": (["main_device", "offload_device"], {"default": "main_device"}),
}
}
@ -106,12 +108,13 @@ class DownloadAndLoadCogVideoModel:
CATEGORY = "CogVideoWrapper"
DESCRIPTION = "Downloads and loads the selected CogVideo model from Huggingface to 'ComfyUI/models/CogVideo'"
def loadmodel(self, model, precision, fp8_transformer="disabled", compile="disabled", enable_sequential_cpu_offload=False, pab_config=None, block_edit=None, lora=None, compile_args=None):
def loadmodel(self, model, precision, fp8_transformer="disabled", compile="disabled", enable_sequential_cpu_offload=False, pab_config=None, block_edit=None, lora=None, compile_args=None, load_device="main_device"):
check_diffusers_version()
device = mm.get_torch_device()
offload_device = mm.unet_offload_device()
transformer_load_device = device if load_device == "main_device" else offload_device
mm.soft_empty_cache()
dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision]
@ -134,6 +137,8 @@ class DownloadAndLoadCogVideoModel:
if not os.path.exists(base_path):
base_path = os.path.join(download_path, (model.split("/")[-1]))
download_path = base_path
subfolder = "transformer"
allow_patterns = ["*transformer*", "*scheduler*", "*vae*"]
elif "2b" in model:
if 'img2vid' in model:
@ -144,27 +149,33 @@ class DownloadAndLoadCogVideoModel:
base_path = os.path.join(download_path, "CogVideo2B")
download_path = base_path
repo_id = model
elif "1.5-T2V" in model:
subfolder = "transformer"
allow_patterns = ["*transformer*", "*scheduler*", "*vae*"]
elif "1.5-T2V" in model or "1.5-I2V" in model:
base_path = os.path.join(download_path, "CogVideoX-5b-1.5")
download_path = base_path
transformer_path = os.path.join(base_path, "transformer_T2V")
subfolder = "transformer_T2V" if "1.5-T2V" in model else "transformer_I2V"
allow_patterns = [f"*{subfolder}*"]
repo_id = "kijai/CogVideoX-5b-1.5"
else:
base_path = os.path.join(download_path, (model.split("/")[-1]))
download_path = base_path
repo_id = model
subfolder = "transformer"
allow_patterns = ["*transformer*", "*scheduler*", "*vae*"]
if "2b" in model:
scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_2b.json')
else:
scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json')
if not os.path.exists(base_path) or not os.path.exists(os.path.join(base_path, "transformer")):
if not os.path.exists(base_path) or not os.path.exists(os.path.join(base_path, subfolder)):
log.info(f"Downloading model to: {base_path}")
from huggingface_hub import snapshot_download
snapshot_download(
repo_id=repo_id,
allow_patterns=allow_patterns,
ignore_patterns=["*text_encoder*", "*tokenizer*"],
local_dir=download_path,
local_dir_use_symlinks=False,
@ -173,18 +184,16 @@ class DownloadAndLoadCogVideoModel:
# transformer
if "Fun" in model:
if pab_config is not None:
transformer = CogVideoXTransformer3DModelFunPAB.from_pretrained(base_path, subfolder="transformer")
transformer = CogVideoXTransformer3DModelFunPAB.from_pretrained(base_path, subfolder=subfolder)
else:
transformer = CogVideoXTransformer3DModelFun.from_pretrained(base_path, subfolder="transformer")
elif "1.5-T2V" in model:
transformer = CogVideoXTransformer3DModel.from_pretrained(transformer_path)
transformer = CogVideoXTransformer3DModelFun.from_pretrained(base_path, subfolder=subfolder)
else:
if pab_config is not None:
transformer = CogVideoXTransformer3DModelPAB.from_pretrained(base_path, subfolder="transformer")
transformer = CogVideoXTransformer3DModelPAB.from_pretrained(base_path, subfolder=subfolder)
else:
transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder="transformer")
transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder=subfolder)
transformer = transformer.to(dtype).to(offload_device)
transformer = transformer.to(dtype).to(transformer_load_device)
if block_edit is not None:
transformer = remove_specific_blocks(transformer, block_edit)

View File

@ -782,9 +782,9 @@ class CogVideoSampler:
"pipeline": ("COGVIDEOPIPE",),
"positive": ("CONDITIONING", ),
"negative": ("CONDITIONING", ),
"height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
"width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
"num_frames": ("INT", {"default": 49, "min": 16, "max": 1024, "step": 1}),
"height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 16}),
"width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 16}),
"num_frames": ("INT", {"default": 48, "min": 16, "max": 1024, "step": 1}),
"steps": ("INT", {"default": 50, "min": 1}),
"cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
"seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),

View File

@ -442,10 +442,6 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
argument.
"""
#assert (
# num_frames <= 48 and num_frames % fps == 0 and fps == 8
#), f"The number of frames must be divisible by {fps=} and less than 48 frames (for now). Other values are not supported in CogVideoX."
height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
num_videos_per_prompt = 1
@ -480,8 +476,8 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
# 5. Prepare latents.
latent_channels = self.vae.config.latent_channels
if latents is None and num_frames == t_tile_length:
num_frames += 1
#if latents is None and num_frames == t_tile_length:
# num_frames += 1
if self.original_mask is not None:
image_latents = latents