diff --git a/custom_cogvideox_transformer_3d.py b/custom_cogvideox_transformer_3d.py
index 9bbd87b..1003aa7 100644
--- a/custom_cogvideox_transformer_3d.py
+++ b/custom_cogvideox_transformer_3d.py
@@ -254,7 +254,7 @@ class CogVideoXBlock(nn.Module):
             norm_hidden_states = rearrange(h, "(B T) C H W -> B (T H W) C", T=T)
             del h, fuser
 
-        #fastercache
+        #region fastercache
         B = norm_hidden_states.shape[0]
         if fastercache_counter >= fastercache_start_step + 3 and fastercache_counter%3!=0 and self.cached_hidden_states[-1].shape[0] >= B:
             attn_hidden_states = (
@@ -365,6 +365,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         flip_sin_to_cos: bool = True,
         freq_shift: int = 0,
         time_embed_dim: int = 512,
+        ofs_embed_dim: Optional[int] = None,
         text_embed_dim: int = 4096,
         num_layers: int = 30,
         dropout: float = 0.0,
@@ -373,7 +374,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         sample_height: int = 60,
         sample_frames: int = 49,
         patch_size: int = 2,
-        patch_size_t: int = 2,
+        patch_size_t: int = None,
         temporal_compression_ratio: int = 4,
         max_text_seq_length: int = 226,
         activation_fn: str = "gelu-approximate",
@@ -420,6 +421,11 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift)
         self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn)
 
+        self.ofs_embedding = None
+
+        if ofs_embed_dim:
+            self.ofs_embedding = TimestepEmbedding(ofs_embed_dim, ofs_embed_dim, timestep_activation_fn) # same as time embeddings, for ofs
+
         # 3. Define spatio-temporal transformers blocks
         self.transformer_blocks = nn.ModuleList(
             [
@@ -553,6 +559,9 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         # there might be better ways to encapsulate this.
         t_emb = t_emb.to(dtype=hidden_states.dtype)
         emb = self.time_embedding(t_emb, timestep_cond)
+        if self.ofs_embedding is not None: #1.5 I2V
+            emb_ofs = self.ofs_embedding(emb, timestep_cond)
+            emb = emb + emb_ofs
 
         # 2. Patch embedding
         p = self.config.patch_size
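For context on the `ofs_embedding` additions above: the 1.5 I2V variant carries an extra learned embedding that is computed from the time embedding `emb` and added back to it residually. A minimal, self-contained sketch of that shape flow, using a hand-rolled stand-in for diffusers' `TimestepEmbedding` (the Linear-SiLU-Linear layout and the 512 dimension are assumptions for illustration, not the checkpoint's exact config):

```python
import torch
import torch.nn as nn

class TimestepEmbeddingSketch(nn.Module):
    """Stand-in for diffusers' TimestepEmbedding: Linear -> SiLU -> Linear."""
    def __init__(self, in_channels: int, embed_dim: int):
        super().__init__()
        self.linear_1 = nn.Linear(in_channels, embed_dim)
        self.act = nn.SiLU()
        self.linear_2 = nn.Linear(embed_dim, embed_dim)

    def forward(self, sample: torch.Tensor) -> torch.Tensor:
        return self.linear_2(self.act(self.linear_1(sample)))

ofs_embed_dim = 512  # illustrative value; 1.5 checkpoints set this via their config
ofs_embedding = TimestepEmbeddingSketch(ofs_embed_dim, ofs_embed_dim)

emb = torch.randn(2, ofs_embed_dim)  # stands in for the output of time_embedding(t_emb)
emb = emb + ofs_embedding(emb)       # residual add, mirroring the forward hunk above
```

Because `ofs_embedding` defaults to `None` and is only built when the config defines `ofs_embed_dim`, older T2V checkpoints load unchanged; only the new I2V configs grow the extra module.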
diff --git a/model_loading.py b/model_loading.py
index 7adc9d6..bb3f774 100644
--- a/model_loading.py
+++ b/model_loading.py
@@ -72,6 +72,7 @@ class DownloadAndLoadCogVideoModel:
                     "THUDM/CogVideoX-5b",
                     "THUDM/CogVideoX-5b-I2V",
                     "kijai/CogVideoX-5b-1.5-T2V",
+                    "kijai/CogVideoX-5b-1.5-I2V",
                     "bertjiazheng/KoolCogVideoX-5b",
                     "kijai/CogVideoX-Fun-2b",
                     "kijai/CogVideoX-Fun-5b",
@@ -97,6 +98,7 @@ class DownloadAndLoadCogVideoModel:
                 "block_edit": ("TRANSFORMERBLOCKS", {"default": None}),
                 "lora": ("COGLORA", {"default": None}),
                 "compile_args":("COMPILEARGS", ),
+                "load_device": (["main_device", "offload_device"], {"default": "main_device"}),
             }
         }
 
@@ -106,12 +108,13 @@ class DownloadAndLoadCogVideoModel:
     CATEGORY = "CogVideoWrapper"
     DESCRIPTION = "Downloads and loads the selected CogVideo model from Huggingface to 'ComfyUI/models/CogVideo'"
 
-    def loadmodel(self, model, precision, fp8_transformer="disabled", compile="disabled", enable_sequential_cpu_offload=False, pab_config=None, block_edit=None, lora=None, compile_args=None):
+    def loadmodel(self, model, precision, fp8_transformer="disabled", compile="disabled", enable_sequential_cpu_offload=False, pab_config=None, block_edit=None, lora=None, compile_args=None, load_device="main_device"):
        check_diffusers_version()
 
         device = mm.get_torch_device()
         offload_device = mm.unet_offload_device()
+        transformer_load_device = device if load_device == "main_device" else offload_device
         mm.soft_empty_cache()
 
         dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision]
 
@@ -134,6 +137,8 @@ class DownloadAndLoadCogVideoModel:
             if not os.path.exists(base_path):
                 base_path = os.path.join(download_path, (model.split("/")[-1]))
             download_path = base_path
+            subfolder = "transformer"
+            allow_patterns = ["*transformer*", "*scheduler*", "*vae*"]
 
         elif "2b" in model:
             if 'img2vid' in model:
@@ -144,27 +149,33 @@ class DownloadAndLoadCogVideoModel:
                 base_path = os.path.join(download_path, "CogVideo2B")
                 download_path = base_path
             repo_id = model
-        elif "1.5-T2V" in model:
+            subfolder = "transformer"
+            allow_patterns = ["*transformer*", "*scheduler*", "*vae*"]
+        elif "1.5-T2V" in model or "1.5-I2V" in model:
             base_path = os.path.join(download_path, "CogVideoX-5b-1.5")
             download_path = base_path
-            transformer_path = os.path.join(base_path, "transformer_T2V")
+            subfolder = "transformer_T2V" if "1.5-T2V" in model else "transformer_I2V"
+            allow_patterns = [f"*{subfolder}*"]
             repo_id = "kijai/CogVideoX-5b-1.5"
         else:
             base_path = os.path.join(download_path, (model.split("/")[-1]))
             download_path = base_path
             repo_id = model
+            subfolder = "transformer"
+            allow_patterns = ["*transformer*", "*scheduler*", "*vae*"]
 
         if "2b" in model:
             scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_2b.json')
         else:
             scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json')
 
-        if not os.path.exists(base_path) or not os.path.exists(os.path.join(base_path, "transformer")):
+        if not os.path.exists(base_path) or not os.path.exists(os.path.join(base_path, subfolder)):
             log.info(f"Downloading model to: {base_path}")
             from huggingface_hub import snapshot_download
 
             snapshot_download(
                 repo_id=repo_id,
+                allow_patterns=allow_patterns,
                 ignore_patterns=["*text_encoder*", "*tokenizer*"],
                 local_dir=download_path,
                 local_dir_use_symlinks=False,
@@ -173,18 +184,16 @@ class DownloadAndLoadCogVideoModel:
         # transformer
         if "Fun" in model:
             if pab_config is not None:
-                transformer = CogVideoXTransformer3DModelFunPAB.from_pretrained(base_path, subfolder="transformer")
+                transformer = CogVideoXTransformer3DModelFunPAB.from_pretrained(base_path, subfolder=subfolder)
             else:
-                transformer = CogVideoXTransformer3DModelFun.from_pretrained(base_path, subfolder="transformer")
-        elif "1.5-T2V" in model:
-            transformer = CogVideoXTransformer3DModel.from_pretrained(transformer_path)
+                transformer = CogVideoXTransformer3DModelFun.from_pretrained(base_path, subfolder=subfolder)
         else:
             if pab_config is not None:
-                transformer = CogVideoXTransformer3DModelPAB.from_pretrained(base_path, subfolder="transformer")
+                transformer = CogVideoXTransformer3DModelPAB.from_pretrained(base_path, subfolder=subfolder)
             else:
-                transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder="transformer")
+                transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder=subfolder)
 
-        transformer = transformer.to(dtype).to(offload_device)
+        transformer = transformer.to(dtype).to(transformer_load_device)
 
         if block_edit is not None:
             transformer = remove_specific_blocks(transformer, block_edit)
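The `allow_patterns` plumbing above narrows `snapshot_download` to just the files the selected variant needs, so choosing the 1.5 I2V model no longer pulls every transformer folder in the shared `kijai/CogVideoX-5b-1.5` repo. A standalone sketch of the resulting call for the I2V branch (the `local_dir` path here is a placeholder, not the node's computed download path):

```python
from huggingface_hub import snapshot_download

# Mirrors the 1.5-I2V branch above.
subfolder = "transformer_I2V"

snapshot_download(
    repo_id="kijai/CogVideoX-5b-1.5",
    allow_patterns=[f"*{subfolder}*"],                  # only the chosen transformer variant
    ignore_patterns=["*text_encoder*", "*tokenizer*"],  # still skipped, as before
    local_dir="ComfyUI/models/CogVideo/CogVideoX-5b-1.5",
    local_dir_use_symlinks=False,
)
```

The same `subfolder` string then drives both the existence check (`os.path.join(base_path, subfolder)`) and the `from_pretrained(..., subfolder=subfolder)` calls, which is what lets the separate `1.5-T2V` loading branch collapse into the shared path.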
diff --git a/nodes.py b/nodes.py
index fa5e3ad..d8a7cbc 100644
--- a/nodes.py
+++ b/nodes.py
@@ -782,9 +782,9 @@ class CogVideoSampler:
                 "pipeline": ("COGVIDEOPIPE",),
                 "positive": ("CONDITIONING", ),
                 "negative": ("CONDITIONING", ),
-                "height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
-                "width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
-                "num_frames": ("INT", {"default": 49, "min": 16, "max": 1024, "step": 1}),
+                "height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 16}),
+                "width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 16}),
+                "num_frames": ("INT", {"default": 48, "min": 16, "max": 1024, "step": 1}),
                 "steps": ("INT", {"default": 50, "min": 1}),
                 "cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
                 "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
diff --git a/pipeline_cogvideox.py b/pipeline_cogvideox.py
index f2fb927..29ebfb3 100644
--- a/pipeline_cogvideox.py
+++ b/pipeline_cogvideox.py
@@ -442,10 +442,6 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
             argument.
         """
 
-        #assert (
-        #    num_frames <= 48 and num_frames % fps == 0 and fps == 8
-        #), f"The number of frames must be divisible by {fps=} and less than 48 frames (for now). Other values are not supported in CogVideoX."
-
         height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
         width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
         num_videos_per_prompt = 1
@@ -480,8 +476,8 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
 
         # 5. Prepare latents.
         latent_channels = self.vae.config.latent_channels
-        if latents is None and num_frames == t_tile_length:
-            num_frames += 1
+        #if latents is None and num_frames == t_tile_length:
+        #    num_frames += 1
 
         if self.original_mask is not None:
             image_latents = latents
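Finally, on the new `load_device` option in `model_loading.py`: rather than always parking the freshly loaded transformer on the offload device, the node can now place it directly on the main compute device. A sketch of the selection logic, assuming ComfyUI's `comfy.model_management` helpers that this repo already imports as `mm`:

```python
import comfy.model_management as mm  # ComfyUI helper module (assumed available)

device = mm.get_torch_device()             # main compute device, e.g. cuda:0
offload_device = mm.unet_offload_device()  # typically cpu

load_device = "main_device"  # node input; the other choice is "offload_device"
transformer_load_device = device if load_device == "main_device" else offload_device

# Then, as in the diff:
# transformer = transformer.to(dtype).to(transformer_load_device)
```

Loading straight to the main device avoids a host-to-GPU weight copy when sampling starts, at the cost of holding the transformer in VRAM from load time onward; the offload choice preserves the old behavior.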