mirror of
https://git.datalinker.icu/kijai/ComfyUI-CogVideoXWrapper.git
synced 2025-12-10 05:14:22 +08:00
i2v
This commit is contained in:
parent
2eb9b81d27
commit
643bbc18c1
@ -254,7 +254,7 @@ class CogVideoXBlock(nn.Module):
|
|||||||
norm_hidden_states = rearrange(h, "(B T) C H W -> B (T H W) C", T=T)
|
norm_hidden_states = rearrange(h, "(B T) C H W -> B (T H W) C", T=T)
|
||||||
del h, fuser
|
del h, fuser
|
||||||
|
|
||||||
#fastercache
|
#region fastercache
|
||||||
B = norm_hidden_states.shape[0]
|
B = norm_hidden_states.shape[0]
|
||||||
if fastercache_counter >= fastercache_start_step + 3 and fastercache_counter%3!=0 and self.cached_hidden_states[-1].shape[0] >= B:
|
if fastercache_counter >= fastercache_start_step + 3 and fastercache_counter%3!=0 and self.cached_hidden_states[-1].shape[0] >= B:
|
||||||
attn_hidden_states = (
|
attn_hidden_states = (
|
||||||
@ -365,6 +365,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|||||||
flip_sin_to_cos: bool = True,
|
flip_sin_to_cos: bool = True,
|
||||||
freq_shift: int = 0,
|
freq_shift: int = 0,
|
||||||
time_embed_dim: int = 512,
|
time_embed_dim: int = 512,
|
||||||
|
ofs_embed_dim: Optional[int] = None,
|
||||||
text_embed_dim: int = 4096,
|
text_embed_dim: int = 4096,
|
||||||
num_layers: int = 30,
|
num_layers: int = 30,
|
||||||
dropout: float = 0.0,
|
dropout: float = 0.0,
|
||||||
@ -373,7 +374,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|||||||
sample_height: int = 60,
|
sample_height: int = 60,
|
||||||
sample_frames: int = 49,
|
sample_frames: int = 49,
|
||||||
patch_size: int = 2,
|
patch_size: int = 2,
|
||||||
patch_size_t: int = 2,
|
patch_size_t: int = None,
|
||||||
temporal_compression_ratio: int = 4,
|
temporal_compression_ratio: int = 4,
|
||||||
max_text_seq_length: int = 226,
|
max_text_seq_length: int = 226,
|
||||||
activation_fn: str = "gelu-approximate",
|
activation_fn: str = "gelu-approximate",
|
||||||
@ -420,6 +421,11 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|||||||
self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift)
|
self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift)
|
||||||
self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn)
|
self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn)
|
||||||
|
|
||||||
|
self.ofs_embedding = None
|
||||||
|
|
||||||
|
if ofs_embed_dim:
|
||||||
|
self.ofs_embedding = TimestepEmbedding(ofs_embed_dim, ofs_embed_dim, timestep_activation_fn) # same as time embeddings, for ofs
|
||||||
|
|
||||||
# 3. Define spatio-temporal transformers blocks
|
# 3. Define spatio-temporal transformers blocks
|
||||||
self.transformer_blocks = nn.ModuleList(
|
self.transformer_blocks = nn.ModuleList(
|
||||||
[
|
[
|
||||||
@ -553,6 +559,9 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|||||||
# there might be better ways to encapsulate this.
|
# there might be better ways to encapsulate this.
|
||||||
t_emb = t_emb.to(dtype=hidden_states.dtype)
|
t_emb = t_emb.to(dtype=hidden_states.dtype)
|
||||||
emb = self.time_embedding(t_emb, timestep_cond)
|
emb = self.time_embedding(t_emb, timestep_cond)
|
||||||
|
if self.ofs_embedding is not None: #1.5 I2V
|
||||||
|
emb_ofs = self.ofs_embedding(emb, timestep_cond)
|
||||||
|
emb = emb + emb_ofs
|
||||||
|
|
||||||
# 2. Patch embedding
|
# 2. Patch embedding
|
||||||
p = self.config.patch_size
|
p = self.config.patch_size
|
||||||
|
|||||||
@ -72,6 +72,7 @@ class DownloadAndLoadCogVideoModel:
|
|||||||
"THUDM/CogVideoX-5b",
|
"THUDM/CogVideoX-5b",
|
||||||
"THUDM/CogVideoX-5b-I2V",
|
"THUDM/CogVideoX-5b-I2V",
|
||||||
"kijai/CogVideoX-5b-1.5-T2V",
|
"kijai/CogVideoX-5b-1.5-T2V",
|
||||||
|
"kijai/CogVideoX-5b-1.5-I2V",
|
||||||
"bertjiazheng/KoolCogVideoX-5b",
|
"bertjiazheng/KoolCogVideoX-5b",
|
||||||
"kijai/CogVideoX-Fun-2b",
|
"kijai/CogVideoX-Fun-2b",
|
||||||
"kijai/CogVideoX-Fun-5b",
|
"kijai/CogVideoX-Fun-5b",
|
||||||
@ -97,6 +98,7 @@ class DownloadAndLoadCogVideoModel:
|
|||||||
"block_edit": ("TRANSFORMERBLOCKS", {"default": None}),
|
"block_edit": ("TRANSFORMERBLOCKS", {"default": None}),
|
||||||
"lora": ("COGLORA", {"default": None}),
|
"lora": ("COGLORA", {"default": None}),
|
||||||
"compile_args":("COMPILEARGS", ),
|
"compile_args":("COMPILEARGS", ),
|
||||||
|
"load_device": (["main_device", "offload_device"], {"default": "main_device"}),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -106,12 +108,13 @@ class DownloadAndLoadCogVideoModel:
|
|||||||
CATEGORY = "CogVideoWrapper"
|
CATEGORY = "CogVideoWrapper"
|
||||||
DESCRIPTION = "Downloads and loads the selected CogVideo model from Huggingface to 'ComfyUI/models/CogVideo'"
|
DESCRIPTION = "Downloads and loads the selected CogVideo model from Huggingface to 'ComfyUI/models/CogVideo'"
|
||||||
|
|
||||||
def loadmodel(self, model, precision, fp8_transformer="disabled", compile="disabled", enable_sequential_cpu_offload=False, pab_config=None, block_edit=None, lora=None, compile_args=None):
|
def loadmodel(self, model, precision, fp8_transformer="disabled", compile="disabled", enable_sequential_cpu_offload=False, pab_config=None, block_edit=None, lora=None, compile_args=None, load_device="main_device"):
|
||||||
|
|
||||||
check_diffusers_version()
|
check_diffusers_version()
|
||||||
|
|
||||||
device = mm.get_torch_device()
|
device = mm.get_torch_device()
|
||||||
offload_device = mm.unet_offload_device()
|
offload_device = mm.unet_offload_device()
|
||||||
|
transformer_load_device = device if load_device == "main_device" else offload_device
|
||||||
mm.soft_empty_cache()
|
mm.soft_empty_cache()
|
||||||
|
|
||||||
dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision]
|
dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision]
|
||||||
@ -134,6 +137,8 @@ class DownloadAndLoadCogVideoModel:
|
|||||||
if not os.path.exists(base_path):
|
if not os.path.exists(base_path):
|
||||||
base_path = os.path.join(download_path, (model.split("/")[-1]))
|
base_path = os.path.join(download_path, (model.split("/")[-1]))
|
||||||
download_path = base_path
|
download_path = base_path
|
||||||
|
subfolder = "transformer"
|
||||||
|
allow_patterns = ["*transformer*", "*scheduler*", "*vae*"]
|
||||||
|
|
||||||
elif "2b" in model:
|
elif "2b" in model:
|
||||||
if 'img2vid' in model:
|
if 'img2vid' in model:
|
||||||
@ -144,27 +149,33 @@ class DownloadAndLoadCogVideoModel:
|
|||||||
base_path = os.path.join(download_path, "CogVideo2B")
|
base_path = os.path.join(download_path, "CogVideo2B")
|
||||||
download_path = base_path
|
download_path = base_path
|
||||||
repo_id = model
|
repo_id = model
|
||||||
elif "1.5-T2V" in model:
|
subfolder = "transformer"
|
||||||
|
allow_patterns = ["*transformer*", "*scheduler*", "*vae*"]
|
||||||
|
elif "1.5-T2V" in model or "1.5-I2V" in model:
|
||||||
base_path = os.path.join(download_path, "CogVideoX-5b-1.5")
|
base_path = os.path.join(download_path, "CogVideoX-5b-1.5")
|
||||||
download_path = base_path
|
download_path = base_path
|
||||||
transformer_path = os.path.join(base_path, "transformer_T2V")
|
subfolder = "transformer_T2V" if "1.5-T2V" in model else "transformer_I2V"
|
||||||
|
allow_patterns = [f"*{subfolder}*"]
|
||||||
repo_id = "kijai/CogVideoX-5b-1.5"
|
repo_id = "kijai/CogVideoX-5b-1.5"
|
||||||
else:
|
else:
|
||||||
base_path = os.path.join(download_path, (model.split("/")[-1]))
|
base_path = os.path.join(download_path, (model.split("/")[-1]))
|
||||||
download_path = base_path
|
download_path = base_path
|
||||||
repo_id = model
|
repo_id = model
|
||||||
|
subfolder = "transformer"
|
||||||
|
allow_patterns = ["*transformer*", "*scheduler*", "*vae*"]
|
||||||
|
|
||||||
if "2b" in model:
|
if "2b" in model:
|
||||||
scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_2b.json')
|
scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_2b.json')
|
||||||
else:
|
else:
|
||||||
scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json')
|
scheduler_path = os.path.join(script_directory, 'configs', 'scheduler_config_5b.json')
|
||||||
|
|
||||||
if not os.path.exists(base_path) or not os.path.exists(os.path.join(base_path, "transformer")):
|
if not os.path.exists(base_path) or not os.path.exists(os.path.join(base_path, subfolder)):
|
||||||
log.info(f"Downloading model to: {base_path}")
|
log.info(f"Downloading model to: {base_path}")
|
||||||
from huggingface_hub import snapshot_download
|
from huggingface_hub import snapshot_download
|
||||||
|
|
||||||
snapshot_download(
|
snapshot_download(
|
||||||
repo_id=repo_id,
|
repo_id=repo_id,
|
||||||
|
allow_patterns=allow_patterns,
|
||||||
ignore_patterns=["*text_encoder*", "*tokenizer*"],
|
ignore_patterns=["*text_encoder*", "*tokenizer*"],
|
||||||
local_dir=download_path,
|
local_dir=download_path,
|
||||||
local_dir_use_symlinks=False,
|
local_dir_use_symlinks=False,
|
||||||
@ -173,18 +184,16 @@ class DownloadAndLoadCogVideoModel:
|
|||||||
# transformer
|
# transformer
|
||||||
if "Fun" in model:
|
if "Fun" in model:
|
||||||
if pab_config is not None:
|
if pab_config is not None:
|
||||||
transformer = CogVideoXTransformer3DModelFunPAB.from_pretrained(base_path, subfolder="transformer")
|
transformer = CogVideoXTransformer3DModelFunPAB.from_pretrained(base_path, subfolder=subfolder)
|
||||||
else:
|
else:
|
||||||
transformer = CogVideoXTransformer3DModelFun.from_pretrained(base_path, subfolder="transformer")
|
transformer = CogVideoXTransformer3DModelFun.from_pretrained(base_path, subfolder=subfolder)
|
||||||
elif "1.5-T2V" in model:
|
|
||||||
transformer = CogVideoXTransformer3DModel.from_pretrained(transformer_path)
|
|
||||||
else:
|
else:
|
||||||
if pab_config is not None:
|
if pab_config is not None:
|
||||||
transformer = CogVideoXTransformer3DModelPAB.from_pretrained(base_path, subfolder="transformer")
|
transformer = CogVideoXTransformer3DModelPAB.from_pretrained(base_path, subfolder=subfolder)
|
||||||
else:
|
else:
|
||||||
transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder="transformer")
|
transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder=subfolder)
|
||||||
|
|
||||||
transformer = transformer.to(dtype).to(offload_device)
|
transformer = transformer.to(dtype).to(transformer_load_device)
|
||||||
|
|
||||||
if block_edit is not None:
|
if block_edit is not None:
|
||||||
transformer = remove_specific_blocks(transformer, block_edit)
|
transformer = remove_specific_blocks(transformer, block_edit)
|
||||||
|
|||||||
6
nodes.py
6
nodes.py
@ -782,9 +782,9 @@ class CogVideoSampler:
|
|||||||
"pipeline": ("COGVIDEOPIPE",),
|
"pipeline": ("COGVIDEOPIPE",),
|
||||||
"positive": ("CONDITIONING", ),
|
"positive": ("CONDITIONING", ),
|
||||||
"negative": ("CONDITIONING", ),
|
"negative": ("CONDITIONING", ),
|
||||||
"height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
|
"height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 16}),
|
||||||
"width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
|
"width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 16}),
|
||||||
"num_frames": ("INT", {"default": 49, "min": 16, "max": 1024, "step": 1}),
|
"num_frames": ("INT", {"default": 48, "min": 16, "max": 1024, "step": 1}),
|
||||||
"steps": ("INT", {"default": 50, "min": 1}),
|
"steps": ("INT", {"default": 50, "min": 1}),
|
||||||
"cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
|
"cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
|
||||||
"seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
|
"seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
|
||||||
|
|||||||
@ -442,10 +442,6 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
|
|||||||
argument.
|
argument.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
#assert (
|
|
||||||
# num_frames <= 48 and num_frames % fps == 0 and fps == 8
|
|
||||||
#), f"The number of frames must be divisible by {fps=} and less than 48 frames (for now). Other values are not supported in CogVideoX."
|
|
||||||
|
|
||||||
height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
|
height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
|
||||||
width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
|
width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
|
||||||
num_videos_per_prompt = 1
|
num_videos_per_prompt = 1
|
||||||
@ -480,8 +476,8 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
|
|||||||
# 5. Prepare latents.
|
# 5. Prepare latents.
|
||||||
latent_channels = self.vae.config.latent_channels
|
latent_channels = self.vae.config.latent_channels
|
||||||
|
|
||||||
if latents is None and num_frames == t_tile_length:
|
#if latents is None and num_frames == t_tile_length:
|
||||||
num_frames += 1
|
# num_frames += 1
|
||||||
|
|
||||||
if self.original_mask is not None:
|
if self.original_mask is not None:
|
||||||
image_latents = latents
|
image_latents = latents
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user