From 0870af8a1dfa4193a588e119c33e2137276e3d1f Mon Sep 17 00:00:00 2001
From: zishen-ucap
Date: Mon, 14 Apr 2025 10:58:46 +0800
Subject: [PATCH] Modified the coefficients of CogVideoX1.5

---
 TeaCache4CogVideoX1.5/README.md              |  8 +++----
 .../teacache_smaple_video.py                 | 22 +++++++++++--------
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/TeaCache4CogVideoX1.5/README.md b/TeaCache4CogVideoX1.5/README.md
index b0fb42e..91e0c75 100644
--- a/TeaCache4CogVideoX1.5/README.md
+++ b/TeaCache4CogVideoX1.5/README.md
@@ -3,19 +3,19 @@
 
 [TeaCache](https://github.com/LiewFeng/TeaCache) can speed up [CogVideoX1.5](https://github.com/THUDM/CogVideo) 1.8x without much visual quality degradation, in a training-free manner. The following video shows the results generated by TeaCache-CogVideoX1.5 with various `rel_l1_thresh` values: 0 (original), 0.1 (1.3x speedup), 0.2 (1.8x speedup), and 0.3 (2.1x speedup). Additionally, the image-to-video (i2v) results are demonstrated with the following speedups: 0.1 (1.5x speedup), 0.2 (2.2x speedup), and 0.3 (2.7x speedup).
 
-https://github.com/user-attachments/assets/c444b850-3252-4b37-ad4a-122d389218d9
+https://github.com/user-attachments/assets/21261b03-71c6-47bf-9769-2a81c8dc452f
 
-https://github.com/user-attachments/assets/5f181a57-d5e3-46db-b388-8591e50f98e2
+https://github.com/user-attachments/assets/5e98e646-4034-4ae7-9680-a65ecd88dac9
 
 ## 📈 Inference Latency Comparisons on a Single H100 GPU
 
 | CogVideoX1.5-t2v | TeaCache (0.1) | TeaCache (0.2) | TeaCache (0.3) |
 | :--------------: | :------------: | :------------: | :------------: |
-| ~465 s | ~372 s | ~261 s | ~223 s |
+| ~465 s | ~322 s | ~260 s | ~204 s |
 
 | CogVideoX1.5-i2v | TeaCache (0.1) | TeaCache (0.2) | TeaCache (0.3) |
 | :--------------: | :------------: | :------------: | :------------: |
-| ~475 s | ~323 s | ~218 s | ~171 s |
+| ~475 s | ~316 s | ~239 s | ~204 s |
 
 ## Installation
 
diff --git a/TeaCache4CogVideoX1.5/teacache_smaple_video.py b/TeaCache4CogVideoX1.5/teacache_smaple_video.py
index 6307c26..ed1a186 100644
--- a/TeaCache4CogVideoX1.5/teacache_smaple_video.py
+++ b/TeaCache4CogVideoX1.5/teacache_smaple_video.py
@@ -6,6 +6,14 @@ from diffusers.models.modeling_outputs import Transformer2DModelOutput
 from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, scale_lora_layers, unscale_lora_layers, export_to_video, load_image
 from diffusers import CogVideoXPipeline, CogVideoXImageToVideoPipeline
 
+coefficients_dict = {
+    "CogVideoX-2b": [-3.10658903e+01, 2.54732368e+01, -5.92380459e+00, 1.75769064e+00, -3.61568434e-03],
+    "CogVideoX-5b": [-1.53880483e+03, 8.43202495e+02, -1.34363087e+02, 7.97131516e+00, -5.23162339e-02],
+    "CogVideoX-5b-I2V": [-1.53880483e+03, 8.43202495e+02, -1.34363087e+02, 7.97131516e+00, -5.23162339e-02],
+    "CogVideoX1.5-5B": [2.50210439e+02, -1.65061612e+02, 3.57804877e+01, -7.81551492e-01, 3.58559703e-02],
+    "CogVideoX1.5-5B-I2V": [1.22842302e+02, -1.04088754e+02, 2.62981677e+01, -3.06009921e-01, 3.71213220e-02],
+}
+
 
 def teacache_forward(
     self,
@@ -64,13 +72,7 @@ def teacache_forward(
             should_calc = True
             self.accumulated_rel_l1_distance = 0
         else:
-            if not self.config.use_rotary_positional_embeddings:
-                # CogVideoX-2B
-                coefficients = [-3.10658903e+01, 2.54732368e+01, -5.92380459e+00, 1.75769064e+00, -3.61568434e-03]
-            else:
-                # CogVideoX-5B and CogvideoX1.5-5B
-                coefficients = [-1.53880483e+03, 8.43202495e+02, -1.34363087e+02, 7.97131516e+00, -5.23162339e-02]
-            rescale_func = np.poly1d(coefficients)
+            rescale_func = np.poly1d(self.coefficients)
             self.accumulated_rel_l1_distance += rescale_func(((emb-self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean()).cpu().item())
             if self.accumulated_rel_l1_distance < self.rel_l1_thresh:
                 should_calc = False
@@ -196,6 +198,7 @@ def main(args):
     guidance_scale = args.guidance_scale
     fps = args.fps
     image_path = args.image_path
+    mode = ckpts_path.split("/")[-1]
 
     if generate_type == "t2v":
         pipe = CogVideoXPipeline.from_pretrained(ckpts_path, torch_dtype=torch.bfloat16)
@@ -212,6 +215,7 @@ def main(args):
     pipe.transformer.__class__.previous_residual_encoder = None
     pipe.transformer.__class__.num_steps = num_inference_steps
     pipe.transformer.__class__.cnt = 0
+    pipe.transformer.__class__.coefficients = coefficients_dict[mode]
     pipe.transformer.__class__.forward = teacache_forward
     pipe.to("cuda")
 
@@ -243,7 +247,7 @@ def main(args):
         generator=torch.Generator("cuda").manual_seed(seed),  # Set the seed for reproducibility
     ).frames[0]
     words = prompt.split()[:5]
-    video_path = f"{output_path}/teacache_cogvideox1.5-5B_{words}.mp4"
+    video_path = f"{output_path}/teacache_cogvideox1.5-5B_{words}_{rel_l1_thresh}.mp4"
     export_to_video(video, video_path, fps=fps)
 
 
@@ -263,7 +267,7 @@ if __name__ == "__main__":
     parser.add_argument("--height", type=int, default=768, help="Number of steps for the inference process")
     parser.add_argument("--num_frames", type=int, default=81, help="Number of steps for the inference process")
     parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance")
-    parser.add_argument("--fps", type=int, default=16, help="Number of steps for the inference process")
+    parser.add_argument("--fps", type=int, default=16, help="Frame rate of the output video")
 
     args = parser.parse_args()
     main(args)
\ No newline at end of file
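
A quick way to sanity-check the lookup and rescaling path this patch introduces is to run it in isolation. The sketch below reuses the CogVideoX1.5-5B coefficients and the relative-L1 formula from the patch; the checkpoint path, the tensor shapes, and the 0.2 threshold are illustrative assumptions, and the accumulation across denoising steps is omitted for brevity.

import numpy as np
import torch

# Per-model polynomial coefficients, copied from coefficients_dict in the patch.
coefficients_dict = {
    "CogVideoX1.5-5B": [2.50210439e+02, -1.65061612e+02, 3.57804877e+01, -7.81551492e-01, 3.58559703e-02],
}

# main() derives the lookup key from the last component of the checkpoint
# path, so the directory must be named exactly after a key in the dict.
ckpts_path = "ckpts/CogVideoX1.5-5B"  # illustrative path
mode = ckpts_path.split("/")[-1]
rescale_func = np.poly1d(coefficients_dict[mode])

emb = torch.randn(2, 226, 3072)            # stand-in for the modulated input
prev = emb + 0.01 * torch.randn_like(emb)  # stand-in for the previous step's input

# Relative L1 distance between consecutive modulated inputs, rescaled by the
# model-specific polynomial (a single step; teacache_forward accumulates this).
rel_l1 = ((emb - prev).abs().mean() / prev.abs().mean()).cpu().item()
rescaled = rescale_func(rel_l1)

rel_l1_thresh = 0.2  # illustrative threshold
should_calc = rescaled >= rel_l1_thresh  # recompute the block, or reuse the cached residual
print(f"rel_l1={rel_l1:.5f} rescaled={rescaled:.5f} recompute={should_calc}")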