mirror of
https://git.datalinker.icu/kijai/ComfyUI-CogVideoXWrapper.git
synced 2025-12-09 04:44:22 +08:00
i2v
This commit is contained in:
parent
2eb9b81d27
commit
643bbc18c1
@ -254,7 +254,7 @@ class CogVideoXBlock(nn.Module):
|
||||
norm_hidden_states = rearrange(h, "(B T) C H W -> B (T H W) C", T=T)
|
||||
del h, fuser
|
||||
|
||||
#fastercache
|
||||
#region fastercache
|
||||
B = norm_hidden_states.shape[0]
|
||||
if fastercache_counter >= fastercache_start_step + 3 and fastercache_counter%3!=0 and self.cached_hidden_states[-1].shape[0] >= B:
|
||||
attn_hidden_states = (
|
||||
@ -365,6 +365,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
||||
flip_sin_to_cos: bool = True,
|
||||
freq_shift: int = 0,
|
||||
time_embed_dim: int = 512,
|
||||
ofs_embed_dim: Optional[int] = None,
|
||||
text_embed_dim: int = 4096,
|
||||
num_layers: int = 30,
|
||||
dropout: float = 0.0,
|
||||
@ -373,7 +374,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
||||
sample_height: int = 60,
|
||||
sample_frames: int = 49,
|
||||
patch_size: int = 2,
|
||||
patch_size_t: int = 2,
|
||||
patch_size_t: int = None,
|
||||
temporal_compression_ratio: int = 4,
|
||||
max_text_seq_length: int = 226,
|
||||
activation_fn: str = "gelu-approximate",
|
||||
@ -420,6 +421,11 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
||||
self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift)
|
||||
self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn)
|
||||
|
||||
self.ofs_embedding = None
|
||||
|
||||
if ofs_embed_dim:
|
||||
self.ofs_embedding = TimestepEmbedding(ofs_embed_dim, ofs_embed_dim, timestep_activation_fn) # same as time embeddings, for ofs
|
||||
|
||||
# 3. Define spatio-temporal transformers blocks
|
||||
self.transformer_blocks = nn.ModuleList(
|
||||
[
|
||||
@ -553,6 +559,9 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
||||
# there might be better ways to encapsulate this.
|
||||
t_emb = t_emb.to(dtype=hidden_states.dtype)
|
||||
emb = self.time_embedding(t_emb, timestep_cond)
|
||||
if self.ofs_embedding is not None: #1.5 I2V
|
||||
emb_ofs = self.ofs_embedding(emb, timestep_cond)
|
||||
emb = emb + emb_ofs
|
||||
|
||||
# 2. Patch embedding
|
||||
p = self.config.patch_size
|
||||
|
||||
@ -72,6 +72,7 @@ class DownloadAndLoadCogVideoModel:
|
||||
"THUDM/CogVideoX-5b",
|
||||
"THUDM/CogVideoX-5b-I2V",
|
||||
"kijai/CogVideoX-5b-1.5-T2V",
|
||||
"kijai/CogVideoX-5b-1.5-I2V",
|
||||
"bertjiazheng/KoolCogVideoX-5b",
|
||||
"kijai/CogVideoX-Fun-2b",
|
||||
"kijai/CogVideoX-Fun-5b",
|
||||
@ -97,6 +98,7 @@ class DownloadAndLoadCogVideoModel:
|
||||
"block_edit": ("TRANSFORMERBLOCKS", {"default": None}),
|
||||
"lora": ("COGLORA", {"default": None}),
|
||||
"compile_args":("COMPILEARGS", ),
|
||||
"load_device": (["main_device", "offload_device"], {"default": "main_device"}),
|
||||
}
|
||||
}
|
||||
|
||||
@ -106,12 +108,13 @@ class DownloadAndLoadCogVideoModel:
|
||||
CATEGORY = "CogVideoWrapper"
|
||||
DESCRIPTION = "Downloads and loads the selected CogVideo model from Huggingface to 'ComfyUI/models/CogVideo'"
|
||||
|
||||
def loadmodel(self, model, precision, fp8_transformer="disabled", compile="disabled", enable_sequential_cpu_offload=False, pab_config=None, block_edit=None, lora=None, compile_args=None):
|
||||
def loadmodel(self, model, precision, fp8_transformer="disabled", compile="disabled", enable_sequential_cpu_offload=False, pab_config=None, block_edit=None, lora=None, compile_args=None, load_device="main_device"):
|
||||
|
||||
check_diffusers_version()
|
||||
|
||||
device = mm.get_torch_device()
|
||||
offload_device = mm.unet_offload_device()
|
||||
transformer_load_device = device if load_device == "main_device" else offload_device
|
||||
mm.soft_empty_cache()
|
||||
|
||||
dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision]
|
||||
@ -134,6 +137,8 @@ class DownloadAndLoadCogVideoModel:
|
||||
if not os.path.exists(base_path):
|
||||
base_path = os.path.join(download_path, (model.split("/")[-1]))
|
||||
download_path = base_path
|
||||
subfolder = "transformer"
|
||||
allow_patterns = ["*transformer*", "*scheduler*", "*vae*"]
|
||||
|
||||
elif "2b" in model:
|
||||
if 'img2vid' in model:
|
||||
@ -144,27 +149,33 @@ class DownloadAndLoadCogVideoModel:
|
||||
base_path = os.path.join(download_path, "CogVideo2B")
|
||||
download_path = base_path
|
||||
repo_id = model
|
||||
elif "1.5-T2V" in model:
|
||||
subfolder = "transformer"
|
||||
allow_patterns = ["*transformer*", "*scheduler*", "*vae*"]
|
||||
elif "1.5-T2V" in model or "1.5-I2V" in model:
|
||||
base_path = os.path.join(download_path, "CogVideoX-5b-1.5")
|
||||
download_path = base_path
|
||||
transformer_path = os.path.join(base_path, "transformer_T2V")
|
||||
subfolder = "transformer_T2V" if "1.5-T2V" in model else "transformer_I2V"
|
||||
allow_patterns = [f"*{subfolder}*"]
|
||||
repo_id = "kijai/CogVideoX-5b-1.5"
|
||||
else:
|
||||
base_path = os.path.join(download_path, (model.split("/")[-1]))
|
||||
download_path = base_path
|
||||
repo_id = model
|
||||
subfolder = "transformer"
|
||||
allow_patterns = ["*transformer*", "*scheduler*", "*vae*"]
|
||||
|
||||
if "2b" in model:
|
||||
scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_2b.json')
|
||||
else:
|
||||
scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json')
|
||||
|
||||
if not os.path.exists(base_path) or not os.path.exists(os.path.join(base_path, "transformer")):
|
||||
if not os.path.exists(base_path) or not os.path.exists(os.path.join(base_path, subfolder)):
|
||||
log.info(f"Downloading model to: {base_path}")
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
snapshot_download(
|
||||
repo_id=repo_id,
|
||||
allow_patterns=allow_patterns,
|
||||
ignore_patterns=["*text_encoder*", "*tokenizer*"],
|
||||
local_dir=download_path,
|
||||
local_dir_use_symlinks=False,
|
||||
@ -173,18 +184,16 @@ class DownloadAndLoadCogVideoModel:
|
||||
# transformer
|
||||
if "Fun" in model:
|
||||
if pab_config is not None:
|
||||
transformer = CogVideoXTransformer3DModelFunPAB.from_pretrained(base_path, subfolder="transformer")
|
||||
transformer = CogVideoXTransformer3DModelFunPAB.from_pretrained(base_path, subfolder=subfolder)
|
||||
else:
|
||||
transformer = CogVideoXTransformer3DModelFun.from_pretrained(base_path, subfolder="transformer")
|
||||
elif "1.5-T2V" in model:
|
||||
transformer = CogVideoXTransformer3DModel.from_pretrained(transformer_path)
|
||||
transformer = CogVideoXTransformer3DModelFun.from_pretrained(base_path, subfolder=subfolder)
|
||||
else:
|
||||
if pab_config is not None:
|
||||
transformer = CogVideoXTransformer3DModelPAB.from_pretrained(base_path, subfolder="transformer")
|
||||
transformer = CogVideoXTransformer3DModelPAB.from_pretrained(base_path, subfolder=subfolder)
|
||||
else:
|
||||
transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder="transformer")
|
||||
transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder=subfolder)
|
||||
|
||||
transformer = transformer.to(dtype).to(offload_device)
|
||||
transformer = transformer.to(dtype).to(transformer_load_device)
|
||||
|
||||
if block_edit is not None:
|
||||
transformer = remove_specific_blocks(transformer, block_edit)
|
||||
|
||||
6
nodes.py
6
nodes.py
@ -782,9 +782,9 @@ class CogVideoSampler:
|
||||
"pipeline": ("COGVIDEOPIPE",),
|
||||
"positive": ("CONDITIONING", ),
|
||||
"negative": ("CONDITIONING", ),
|
||||
"height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
|
||||
"width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
|
||||
"num_frames": ("INT", {"default": 49, "min": 16, "max": 1024, "step": 1}),
|
||||
"height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 16}),
|
||||
"width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 16}),
|
||||
"num_frames": ("INT", {"default": 48, "min": 16, "max": 1024, "step": 1}),
|
||||
"steps": ("INT", {"default": 50, "min": 1}),
|
||||
"cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
|
||||
"seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
|
||||
|
||||
@ -442,10 +442,6 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
|
||||
argument.
|
||||
"""
|
||||
|
||||
#assert (
|
||||
# num_frames <= 48 and num_frames % fps == 0 and fps == 8
|
||||
#), f"The number of frames must be divisible by {fps=} and less than 48 frames (for now). Other values are not supported in CogVideoX."
|
||||
|
||||
height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
|
||||
width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
|
||||
num_videos_per_prompt = 1
|
||||
@ -480,8 +476,8 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
|
||||
# 5. Prepare latents.
|
||||
latent_channels = self.vae.config.latent_channels
|
||||
|
||||
if latents is None and num_frames == t_tile_length:
|
||||
num_frames += 1
|
||||
#if latents is None and num_frames == t_tile_length:
|
||||
# num_frames += 1
|
||||
|
||||
if self.original_mask is not None:
|
||||
image_latents = latents
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user