Merge pull request #59 from zishen-ucap/feature-update

Update coefficients of CogVideoX1.5
This commit is contained in:
Feng Liu 2025-04-14 11:27:42 +08:00 committed by GitHub
commit 36b6ed12c9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 17 additions and 13 deletions

View File

@ -3,19 +3,19 @@
[TeaCache](https://github.com/LiewFeng/TeaCache) can speed up [CogVideoX1.5](https://github.com/THUDM/CogVideo) 1.8x without much visual quality degradation, in a training-free manner. The following video shows the results generated by TeaCache-CogVideoX1.5 with various `rel_l1_thresh` values: 0 (original), 0.1 (1.3x speedup), 0.2 (1.8x speedup), and 0.3 (2.1x speedup). Additionally, the image-to-video (i2v) results are also demonstrated, with the following speedups: 0.1 (1.5x speedup), 0.2 (2.2x speedup), and 0.3 (2.7x speedup).
https://github.com/user-attachments/assets/c444b850-3252-4b37-ad4a-122d389218d9
https://github.com/user-attachments/assets/21261b03-71c6-47bf-9769-2a81c8dc452f
https://github.com/user-attachments/assets/5f181a57-d5e3-46db-b388-8591e50f98e2
https://github.com/user-attachments/assets/5e98e646-4034-4ae7-9680-a65ecd88dac9
## 📈 Inference Latency Comparisons on a Single H100 GPU
| CogVideoX1.5-t2v | TeaCache (0.1) | TeaCache (0.2) | TeaCache (0.3) |
| :--------------: | :------------: | :------------: | :------------: |
| ~465 s | ~372 s | ~261 s | ~223 s |
| ~465 s | ~322 s | ~260 s | ~204 s |
| CogVideoX1.5-i2v | TeaCache (0.1) | TeaCache (0.2) | TeaCache (0.3) |
| :--------------: | :------------: | :------------: | :------------: |
| ~475 s | ~323 s | ~218 s | ~171 s |
| ~475 s | ~316 s | ~239 s | ~204 s |
## Installation

View File

@ -6,6 +6,14 @@ from diffusers.models.modeling_outputs import Transformer2DModelOutput
from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, scale_lora_layers, unscale_lora_layers, export_to_video, load_image
from diffusers import CogVideoXPipeline, CogVideoXImageToVideoPipeline
# Polynomial rescale coefficients (highest degree first, consumed by
# np.poly1d) mapping the relative L1 modulation distance to an accumulated
# distance, fitted per CogVideoX model variant.
# NOTE(review): 5b and 5b-I2V share one fit — presumably intentional; the
# I2V variant was not re-fitted separately. Confirm with upstream.
_COEFFS_5B = [-1.53880483e+03, 8.43202495e+02, -1.34363087e+02, 7.97131516e+00, -5.23162339e-02]

coefficients_dict = {
    "CogVideoX-2b": [-3.10658903e+01, 2.54732368e+01, -5.92380459e+00, 1.75769064e+00, -3.61568434e-03],
    "CogVideoX-5b": list(_COEFFS_5B),
    "CogVideoX-5b-I2V": list(_COEFFS_5B),
    "CogVideoX1.5-5B": [2.50210439e+02, -1.65061612e+02, 3.57804877e+01, -7.81551492e-01, 3.58559703e-02],
    "CogVideoX1.5-5B-I2V": [1.22842302e+02, -1.04088754e+02, 2.62981677e+01, -3.06009921e-01, 3.71213220e-02],
}
def teacache_forward(
self,
@ -64,13 +72,7 @@ def teacache_forward(
should_calc = True
self.accumulated_rel_l1_distance = 0
else:
if not self.config.use_rotary_positional_embeddings:
# CogVideoX-2B
coefficients = [-3.10658903e+01, 2.54732368e+01, -5.92380459e+00, 1.75769064e+00, -3.61568434e-03]
else:
# CogVideoX-5B and CogvideoX1.5-5B
coefficients = [-1.53880483e+03, 8.43202495e+02, -1.34363087e+02, 7.97131516e+00, -5.23162339e-02]
rescale_func = np.poly1d(coefficients)
rescale_func = np.poly1d(self.coefficients)
self.accumulated_rel_l1_distance += rescale_func(((emb-self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean()).cpu().item())
if self.accumulated_rel_l1_distance < self.rel_l1_thresh:
should_calc = False
@ -196,6 +198,7 @@ def main(args):
guidance_scale = args.guidance_scale
fps = args.fps
image_path = args.image_path
mode = ckpts_path.split("/")[-1]
if generate_type == "t2v":
pipe = CogVideoXPipeline.from_pretrained(ckpts_path, torch_dtype=torch.bfloat16)
@ -212,6 +215,7 @@ def main(args):
pipe.transformer.__class__.previous_residual_encoder = None
pipe.transformer.__class__.num_steps = num_inference_steps
pipe.transformer.__class__.cnt = 0
pipe.transformer.__class__.coefficients = coefficients_dict[mode]
pipe.transformer.__class__.forward = teacache_forward
pipe.to("cuda")
@ -243,7 +247,7 @@ def main(args):
generator=torch.Generator("cuda").manual_seed(seed), # Set the seed for reproducibility
).frames[0]
words = prompt.split()[:5]
video_path = f"{output_path}/teacache_cogvideox1.5-5B_{words}.mp4"
video_path = f"{output_path}/teacache_cogvideox1.5-5B_{words}_{rel_l1_thresh}.mp4"
export_to_video(video, video_path, fps=fps)
@ -263,7 +267,7 @@ if __name__ == "__main__":
parser.add_argument("--height", type=int, default=768, help="Number of steps for the inference process")
parser.add_argument("--num_frames", type=int, default=81, help="Number of steps for the inference process")
parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance")
parser.add_argument("--fps", type=int, default=16, help="Number of steps for the inference process")
parser.add_argument("--fps", type=int, default=16, help="Frame rate of video")
args = parser.parse_args()
main(args)