From 0870af8a1dfa4193a588e119c33e2137276e3d1f Mon Sep 17 00:00:00 2001
From: zishen-ucap
Date: Mon, 14 Apr 2025 10:58:46 +0800
Subject: [PATCH] Modified the coefficients of CogVideoX1.5

---
 TeaCache4CogVideoX1.5/README.md              |  8 +++----
 .../teacache_smaple_video.py                 | 22 +++++++++++--------
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/TeaCache4CogVideoX1.5/README.md b/TeaCache4CogVideoX1.5/README.md
index b0fb42e..91e0c75 100644
--- a/TeaCache4CogVideoX1.5/README.md
+++ b/TeaCache4CogVideoX1.5/README.md
@@ -3,19 +3,19 @@
 
 [TeaCache](https://github.com/LiewFeng/TeaCache) can speed up [CogVideoX1.5](https://github.com/THUDM/CogVideo) 1.8x without much visual quality degradation, in a training-free manner. The following video shows the results generated by TeaCache-CogVideoX1.5 with various `rel_l1_thresh` values: 0 (original), 0.1 (1.3x speedup), 0.2 (1.8x speedup), and 0.3 (2.1x speedup). Additionally, the image-to-video (i2v) results are demonstrated with the following speedups: 0.1 (1.5x speedup), 0.2 (2.2x speedup), and 0.3 (2.7x speedup).
 
-https://github.com/user-attachments/assets/c444b850-3252-4b37-ad4a-122d389218d9
+https://github.com/user-attachments/assets/21261b03-71c6-47bf-9769-2a81c8dc452f
 
-https://github.com/user-attachments/assets/5f181a57-d5e3-46db-b388-8591e50f98e2
+https://github.com/user-attachments/assets/5e98e646-4034-4ae7-9680-a65ecd88dac9
 
 ## 📈 Inference Latency Comparisons on a Single H100 GPU
 
 | CogVideoX1.5-t2v | TeaCache (0.1) | TeaCache (0.2) | TeaCache (0.3) |
 | :--------------: | :------------: | :------------: | :------------: |
-| ~465 s | ~372 s | ~261 s | ~223 s |
+| ~465 s | ~322 s | ~260 s | ~204 s |
 
 | CogVideoX1.5-i2v | TeaCache (0.1) | TeaCache (0.2) | TeaCache (0.3) |
 | :--------------: | :------------: | :------------: | :------------: |
-| ~475 s | ~323 s | ~218 s | ~171 s |
+| ~475 s | ~316 s | ~239 s | ~204 s |
 
 ## Installation
 
diff --git a/TeaCache4CogVideoX1.5/teacache_smaple_video.py b/TeaCache4CogVideoX1.5/teacache_smaple_video.py
index 6307c26..ed1a186 100644
--- a/TeaCache4CogVideoX1.5/teacache_smaple_video.py
+++ b/TeaCache4CogVideoX1.5/teacache_smaple_video.py
@@ -6,6 +6,14 @@ from diffusers.models.modeling_outputs import Transformer2DModelOutput
 from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, scale_lora_layers, unscale_lora_layers, export_to_video, load_image
 from diffusers import CogVideoXPipeline, CogVideoXImageToVideoPipeline
 
+coefficients_dict = {
+    "CogVideoX-2b": [-3.10658903e+01, 2.54732368e+01, -5.92380459e+00, 1.75769064e+00, -3.61568434e-03],
+    "CogVideoX-5b": [-1.53880483e+03, 8.43202495e+02, -1.34363087e+02, 7.97131516e+00, -5.23162339e-02],
+    "CogVideoX-5b-I2V": [-1.53880483e+03, 8.43202495e+02, -1.34363087e+02, 7.97131516e+00, -5.23162339e-02],
+    "CogVideoX1.5-5B": [2.50210439e+02, -1.65061612e+02, 3.57804877e+01, -7.81551492e-01, 3.58559703e-02],
+    "CogVideoX1.5-5B-I2V": [1.22842302e+02, -1.04088754e+02, 2.62981677e+01, -3.06009921e-01, 3.71213220e-02],
+}
+
 
 def teacache_forward(
     self,
@@ -64,13 +72,7 @@ def teacache_forward(
             should_calc = True
             self.accumulated_rel_l1_distance = 0
         else:
-            if not self.config.use_rotary_positional_embeddings:
-                # CogVideoX-2B
-                coefficients = [-3.10658903e+01, 2.54732368e+01, -5.92380459e+00, 1.75769064e+00, -3.61568434e-03]
-            else:
-                # CogVideoX-5B and CogvideoX1.5-5B
-                coefficients = [-1.53880483e+03, 8.43202495e+02, -1.34363087e+02, 7.97131516e+00, -5.23162339e-02]
-            rescale_func = np.poly1d(coefficients)
+            rescale_func = np.poly1d(self.coefficients)
             self.accumulated_rel_l1_distance += rescale_func(((emb-self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean()).cpu().item())
             if self.accumulated_rel_l1_distance < self.rel_l1_thresh:
                 should_calc = False
@@ -196,6 +198,7 @@ def main(args):
     guidance_scale = args.guidance_scale
     fps = args.fps
     image_path = args.image_path
+    mode = ckpts_path.split("/")[-1]
 
     if generate_type == "t2v":
         pipe = CogVideoXPipeline.from_pretrained(ckpts_path, torch_dtype=torch.bfloat16)
@@ -212,6 +215,7 @@ def main(args):
     pipe.transformer.__class__.previous_residual_encoder = None
     pipe.transformer.__class__.num_steps = num_inference_steps
     pipe.transformer.__class__.cnt = 0
+    pipe.transformer.__class__.coefficients = coefficients_dict[mode]
     pipe.transformer.__class__.forward = teacache_forward
     pipe.to("cuda")
 
@@ -243,7 +247,7 @@ def main(args):
         generator=torch.Generator("cuda").manual_seed(seed),  # Set the seed for reproducibility
     ).frames[0]
     words = prompt.split()[:5]
-    video_path = f"{output_path}/teacache_cogvideox1.5-5B_{words}.mp4"
+    video_path = f"{output_path}/teacache_cogvideox1.5-5B_{words}_{rel_l1_thresh}.mp4"
     export_to_video(video, video_path, fps=fps)
 
 
@@ -263,7 +267,7 @@ if __name__ == "__main__":
     parser.add_argument("--height", type=int, default=768, help="Number of steps for the inference process")
     parser.add_argument("--num_frames", type=int, default=81, help="Number of steps for the inference process")
     parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance")
-    parser.add_argument("--fps", type=int, default=16, help="Number of steps for the inference process")
+    parser.add_argument("--fps", type=int, default=16, help="Frame rate of the output video")
 
     args = parser.parse_args()
     main(args)
\ No newline at end of file
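
A quick way to sanity-check the lookup and rescaling path this patch introduces is to run it in isolation. The sketch below reuses the CogVideoX1.5-5B coefficients and the relative-L1 formula from the patch; the checkpoint path, the tensor shapes, and the 0.2 threshold are illustrative assumptions, and the accumulation across denoising steps is omitted for brevity.

import numpy as np
import torch

# Per-model polynomial coefficients, copied from coefficients_dict in the patch.
coefficients_dict = {
    "CogVideoX1.5-5B": [2.50210439e+02, -1.65061612e+02, 3.57804877e+01, -7.81551492e-01, 3.58559703e-02],
}

# main() derives the lookup key from the last component of the checkpoint
# path, so the directory must be named exactly after a key in the dict.
ckpts_path = "ckpts/CogVideoX1.5-5B"  # illustrative path
mode = ckpts_path.split("/")[-1]
rescale_func = np.poly1d(coefficients_dict[mode])

emb = torch.randn(2, 226, 3072)            # stand-in for the modulated input
prev = emb + 0.01 * torch.randn_like(emb)  # stand-in for the previous step's input

# Relative L1 distance between consecutive modulated inputs, rescaled by the
# model-specific polynomial (a single step; teacache_forward accumulates this).
rel_l1 = ((emb - prev).abs().mean() / prev.abs().mean()).cpu().item()
rescaled = rescale_func(rel_l1)

rel_l1_thresh = 0.2  # illustrative threshold
should_calc = rescaled >= rel_l1_thresh  # recompute the block, or reuse the cached residual
print(f"rel_l1={rel_l1:.5f} rescaled={rescaled:.5f} recompute={should_calc}")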