From e8a289112f0dfbaec327368f9bd5e01a969fe797 Mon Sep 17 00:00:00 2001
From: kijai <40791699+kijai@users.noreply.github.com>
Date: Wed, 13 Nov 2024 15:37:45 +0200
Subject: [PATCH] fix VAE scaling (again)

---
 model_loading.py |  5 -----
 nodes.py         | 10 ++++++----
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/model_loading.py b/model_loading.py
index 959c8ff..fe9245b 100644
--- a/model_loading.py
+++ b/model_loading.py
@@ -350,8 +350,6 @@ class DownloadAndLoadCogVideoGGUFModel:
 
     def loadmodel(self, model, vae_precision, fp8_fastmode, load_device, enable_sequential_cpu_offload,
                   pab_config=None, block_edit=None, compile="disabled", attention_mode="sdpa"):
-        check_diffusers_version()
-
         device = mm.get_torch_device()
         offload_device = mm.unet_offload_device()
         mm.soft_empty_cache()
@@ -597,9 +595,6 @@ class DownloadAndLoadToraModel:
     DESCRIPTION = "Downloads and loads the the Tora model from Huggingface to 'ComfyUI/models/CogVideo/CogVideoX-5b-Tora'"
 
     def loadmodel(self, model):
-
-        check_diffusers_version()
-
         device = mm.get_torch_device()
         offload_device = mm.unet_offload_device()
         mm.soft_empty_cache()
diff --git a/nodes.py b/nodes.py
index f2874cd..ecea9db 100644
--- a/nodes.py
+++ b/nodes.py
@@ -298,7 +298,7 @@ class CogVideoTextEncode:
 
         embeds = clip.encode_from_tokens(tokens, return_pooled=False, return_dict=False)
 
-        if embeds.shape[1] > 226:
+        if embeds.shape[1] > max_tokens:
             raise ValueError(f"Prompt is too long, max tokens supported is {max_tokens} or less, got {embeds.shape[1]}")
         embeds *= strength
         if force_offload:
@@ -371,7 +371,7 @@ class CogVideoImageEncode:
 
         model_name = pipeline.get("model_name", "")
         if ("1.5" in model_name or "1_5" in model_name) and image.shape[0] == 1:
-            vae_scaling_factor = 1 / vae.config.scaling_factor
+            vae_scaling_factor = 1 #/ vae.config.scaling_factor
         else:
             vae_scaling_factor = vae.config.scaling_factor
 
@@ -599,16 +599,18 @@ class ToraEncodeTrajectory:
 
         vae.to(device)
         video_flow = vae.encode(video_flow).latent_dist.sample(generator) * vae.config.scaling_factor
+        log.info(f"video_flow shape after encoding: {video_flow.shape}") #torch.Size([1, 16, 4, 80, 80])
         if not pipeline["cpu_offloading"]:
             vae.to(offload_device)
-
+        #print("video_flow shape before traj_extractor: ", video_flow.shape) #torch.Size([1, 16, 4, 80, 80])
         video_flow_features = tora_model["traj_extractor"](video_flow.to(torch.float32))
         video_flow_features = torch.stack(video_flow_features)
+        #print("video_flow_features after traj_extractor: ", video_flow_features.shape) #torch.Size([42, 4, 128, 40, 40])
 
         video_flow_features = video_flow_features * strength
 
-        log.info(f"video_flow shape: {video_flow.shape}")
+
         tora = {
             "video_flow_features" : video_flow_features,
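
Reviewer note on the scaling change (a minimal sketch, not part of the patch):
for CogVideoX 1.5 with a single input image, CogVideoImageEncode now leaves the
sampled latents unscaled (factor 1) instead of dividing by
vae.config.scaling_factor, while every other path keeps the usual diffusers
convention of multiplying by the scaling factor. The sketch below assumes the
standard diffusers AutoencoderKL API; the checkpoint id and tensor shape are
illustrative assumptions, not the repo's actual setup:

    import torch
    from diffusers import AutoencoderKL  # stand-in for the CogVideoX VAE

    # Assumed checkpoint for illustration; any AutoencoderKL exposes
    # config.scaling_factor and the encode().latent_dist interface used here.
    vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse")
    image = torch.randn(1, 3, 256, 256)  # dummy single-image batch

    with torch.no_grad():
        latents = vae.encode(image).latent_dist.sample()

    # Post-patch CogVideoX 1.5 single-image branch: latents pass through unscaled.
    vae_scaling_factor = 1  # was: 1 / vae.config.scaling_factor
    latents_cogvideox_15 = latents * vae_scaling_factor

    # Every other branch keeps the standard diffusers convention.
    latents_default = latents * vae.config.scaling_factor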
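
The CogVideoTextEncode hunk replaces the hardcoded 226 with max_tokens, so the
comparison can no longer disagree with the error message, which already
interpolates max_tokens. A short sketch of the corrected check in context,
reusing the ComfyUI CLIP wrapper call visible in the hunk (prompt and
max_tokens are assumed to come from the node's inputs):

    # Encode the prompt with ComfyUI's CLIP wrapper, as in the hunk above.
    tokens = clip.tokenize(prompt)
    embeds = clip.encode_from_tokens(tokens, return_pooled=False, return_dict=False)

    # embeds.shape[1] is the token-sequence length; validate it against the
    # configurable limit instead of the hardcoded CogVideoX default of 226.
    if embeds.shape[1] > max_tokens:
        raise ValueError(f"Prompt is too long, max tokens supported is {max_tokens} or less, got {embeds.shape[1]}")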