From 1e356fa905e5a529ad905e8deca2559fa4d2bd8b Mon Sep 17 00:00:00 2001
From: kijai <40791699+kijai@users.noreply.github.com>
Date: Tue, 27 Aug 2024 22:43:57 +0300
Subject: [PATCH] tweaks

---
 nodes.py              |  2 +-
 pipeline_cogvideox.py | 28 ++++++++++++----------------
 2 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/nodes.py b/nodes.py
index f38fee6..74360fd 100644
--- a/nodes.py
+++ b/nodes.py
@@ -59,7 +59,7 @@ class DownloadAndLoadCogVideoModel:
 
             snapshot_download(
                 repo_id=model,
-                ignore_patterns=["*text_encoder*"],
+                ignore_patterns=["*text_encoder*", "*tokenizer*"],
                 local_dir=base_path,
                 local_dir_use_symlinks=False,
             )
diff --git a/pipeline_cogvideox.py b/pipeline_cogvideox.py
index b96e2fe..fbd3927 100644
--- a/pipeline_cogvideox.py
+++ b/pipeline_cogvideox.py
@@ -285,16 +285,16 @@ class CogVideoXPipeline(DiffusionPipeline):
             temporal_size=num_frames,
             use_real=True,
         )
-
-        freqs_cos = freqs_cos.view(num_frames, grid_height * grid_width, -1)
-        freqs_sin = freqs_sin.view(num_frames, grid_height * grid_width, -1)
 
         if start_frame is not None:
+            freqs_cos = freqs_cos.view(num_frames, grid_height * grid_width, -1)
+            freqs_sin = freqs_sin.view(num_frames, grid_height * grid_width, -1)
+
             freqs_cos = freqs_cos[start_frame:end_frame]
             freqs_sin = freqs_sin[start_frame:end_frame]
 
-        freqs_cos = freqs_cos.view(-1, freqs_cos.shape[-1])
-        freqs_sin = freqs_sin.view(-1, freqs_sin.shape[-1])
+            freqs_cos = freqs_cos.view(-1, freqs_cos.shape[-1])
+            freqs_sin = freqs_sin.view(-1, freqs_sin.shape[-1])
 
         freqs_cos = freqs_cos.to(device=device)
         freqs_sin = freqs_sin.to(device=device)
@@ -444,12 +444,12 @@ class CogVideoXPipeline(DiffusionPipeline):
 
         print("latents.device", latents.device)
 
-        # # 6.5. Create rotary embeds if required
-        # image_rotary_emb = (
-        #     self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
-        #     if self.transformer.config.use_rotary_positional_embeddings
-        #     else None
-        # )
+        # 6.5. Create rotary embeds if required
+        image_rotary_emb = (
+            self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
+            if self.transformer.config.use_rotary_positional_embeddings
+            else None
+        )
 
         # 7. Denoising loop
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
@@ -541,11 +541,7 @@ class CogVideoXPipeline(DiffusionPipeline):
                     comfy_pbar.update(1)
             # ==========================================
         else:
-            image_rotary_emb = (
-                self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
-                if self.transformer.config.use_rotary_positional_embeddings
-                else None
-            )
+
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
             latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)