From e7389af4bf025effd27ee86cffe3c5e5519a03fa Mon Sep 17 00:00:00 2001
From: zishen-ucap <huangzs@ucap.com.cn>
Date: Fri, 17 Jan 2025 15:32:40 +0800
Subject: [PATCH 1/6] Add TeaCache4CogVideoX1.5

---
 TeaCache4CogVideoX1.5/README.md               |  49 ++++
 .../teacache_smaple_video.py                  | 237 ++++++++++++++++++
 2 files changed, 286 insertions(+)
 create mode 100644 TeaCache4CogVideoX1.5/README.md
 create mode 100644 TeaCache4CogVideoX1.5/teacache_smaple_video.py

diff --git a/TeaCache4CogVideoX1.5/README.md b/TeaCache4CogVideoX1.5/README.md
new file mode 100644
index 0000000..10f4ef2
--- /dev/null
+++ b/TeaCache4CogVideoX1.5/README.md
@@ -0,0 +1,49 @@
+<!-- ## **TeaCache4CogVideoX1.5** -->
+# TeaCache4CogVideoX1.5
+
+[TeaCache](https://github.com/LiewFeng/TeaCache) can speedup [CogVideoX1.5](https://github.com/THUDM/CogVideo) 1.8x without much visual quality degradation, in a training-free manner. The following video shows the results generated by TeaCache-ConsisID with various `rel_l1_thresh` values: 0 (original), 0.1 (1.3x speedup), 0.2 (1.8x speedup), and 0.3(2.1x speedup).
+
+https://github.com/user-attachments/assets/c444b850-3252-4b37-ad4a-122d389218d9
+
+## 📈 Inference Latency Comparisons on a Single H100 GPU
+
+| CogVideoX1.5 | TeaCache (0.1) | TeaCache (0.2) | TeaCache (0.3) |
+| :----------: | :------------: | :------------: | :------------: |
+|    ~465 s    |     ~372 s     |     ~261 s     |     ~223 s     |
+
+
+## Usage
+
+Follow [CogVideoX](https://github.com/THUDM/CogVideo) to clone the repo and finish the installation, then you can modify the `rel_l1_thresh` to obtain your desired trade-off between latency and visul quality, and change the `ckpts_path`, `prompt`, `image` to customize your identity-preserving video.
+
+For single-gpu inference, you can use the following command:
+
+```bash
+cd TeaCache4CogVideoX1.5
+
+python3 teacache_sample_video.py \
+    --rel_l1_thresh 0.2 \
+    --ckpts_path THUDM/CogVideoX1.5-5B \
+    --prompt "A clear, turquoise river flows through a rocky canyon, cascading over a small waterfall and forming a pool of water at the bottom.The river is the main focus of the scene, with its clear water reflecting the surrounding trees and rocks. The canyon walls are steep and rocky, with some vegetation growing on them. The trees are mostly pine trees, with their green needles contrasting with the brown and gray rocks. The overall tone of the scene is one of peace and tranquility.", help='Description of the video for the model to generate." \
+    --seed 42 \
+    --num_inference_steps 50 \
+    --output_path ./teacache_results
+```
+
+## Citation
+
+If you find TeaCache is useful in your research or applications, please consider giving us a star 🌟 and citing it by the following BibTeX entry.
+
+```
+@article{liu2024timestep,
+  title={Timestep Embedding Tells: It's Time to Cache for Video Diffusion Model},
+  author={Liu, Feng and Zhang, Shiwei and Wang, Xiaofeng and Wei, Yujie and Qiu, Haonan and Zhao, Yuzhong and Zhang, Yingya and Ye, Qixiang and Wan, Fang},
+  journal={arXiv preprint arXiv:2411.19108},
+  year={2024}
+}
+```
+
+
+## Acknowledgements
+
+We would like to thank the contributors to the [CogVideoX](https://github.com/THUDM/CogVideo) and [Diffusers](https://github.com/huggingface/diffusers).
diff --git a/TeaCache4CogVideoX1.5/teacache_smaple_video.py b/TeaCache4CogVideoX1.5/teacache_smaple_video.py
new file mode 100644
index 0000000..ee2a006
--- /dev/null
+++ b/TeaCache4CogVideoX1.5/teacache_smaple_video.py
@@ -0,0 +1,237 @@
+import argparse
+import torch
+import numpy as np
+from typing import Any, Dict, Optional, Tuple,  Union
+from videosys.models.transformers.cogvideox_transformer_3d import Transformer2DModelOutput
+from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, scale_lora_layers, unscale_lora_layers, export_to_video
+from diffusers import CogVideoXPipeline
+
+
+def teacache_forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        timestep: Union[int, float, torch.LongTensor],
+        timestep_cond: Optional[torch.Tensor] = None,
+        ofs: Optional[Union[int, float, torch.LongTensor]] = None,
+        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = True,
+    ):
+        if attention_kwargs is not None:
+            attention_kwargs = attention_kwargs.copy()
+            lora_scale = attention_kwargs.pop("scale", 1.0)
+        else:
+            lora_scale = 1.0
+
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+        else:
+            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
+                logger.warning(
+                    "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
+                )
+
+        batch_size, num_frames, channels, height, width = hidden_states.shape
+
+        # 1. Time embedding
+        timesteps = timestep
+        t_emb = self.time_proj(timesteps)
+
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=hidden_states.dtype)
+        emb = self.time_embedding(t_emb, timestep_cond)
+
+        if self.ofs_embedding is not None:
+            ofs_emb = self.ofs_proj(ofs)
+            ofs_emb = ofs_emb.to(dtype=hidden_states.dtype)
+            ofs_emb = self.ofs_embedding(ofs_emb)
+            emb = emb + ofs_emb
+
+        # 2. Patch embedding
+        hidden_states = self.patch_embed(encoder_hidden_states, hidden_states)
+        hidden_states = self.embedding_dropout(hidden_states)
+
+        text_seq_length = encoder_hidden_states.shape[1]
+        encoder_hidden_states = hidden_states[:, :text_seq_length]
+        hidden_states = hidden_states[:, text_seq_length:]
+
+        if self.enable_teacache:
+            if self.cnt == 0 or self.cnt == self.num_steps-1:
+                should_calc = True
+                self.accumulated_rel_l1_distance = 0
+            else: 
+                if not self.config.use_rotary_positional_embeddings:
+                    # CogVideoX-2B
+                    coefficients = [-3.10658903e+01,  2.54732368e+01, -5.92380459e+00,  1.75769064e+00, -3.61568434e-03]   
+                else:
+                    # CogVideoX-5B and CogvideoX1.5-5B
+                    coefficients = [-1.53880483e+03,  8.43202495e+02, -1.34363087e+02,  7.97131516e+00, -5.23162339e-02]
+                rescale_func = np.poly1d(coefficients)
+                self.accumulated_rel_l1_distance += rescale_func(((emb-self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean()).cpu().item())
+                if self.accumulated_rel_l1_distance < self.rel_l1_thresh:
+                    should_calc = False
+                else:
+                    should_calc = True
+                    self.accumulated_rel_l1_distance = 0
+            self.previous_modulated_input = emb
+            self.cnt += 1
+            if self.cnt == self.num_steps:
+                self.cnt = 0            
+        
+        if self.enable_teacache:
+            if not should_calc:
+                hidden_states += self.previous_residual
+                encoder_hidden_states += self.previous_residual_encoder
+            else:
+                ori_hidden_states = hidden_states.clone()
+                ori_encoder_hidden_states = encoder_hidden_states.clone()
+                # 4. Transformer blocks
+                for i, block in enumerate(self.transformer_blocks):
+                    if torch.is_grad_enabled() and self.gradient_checkpointing:
+
+                        def create_custom_forward(module):
+                            def custom_forward(*inputs):
+                                return module(*inputs)
+
+                            return custom_forward
+
+                        ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                        hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
+                            create_custom_forward(block),
+                            hidden_states,
+                            encoder_hidden_states,
+                            emb,
+                            image_rotary_emb,
+                            **ckpt_kwargs,
+                        )
+                    else:
+                        hidden_states, encoder_hidden_states = block(
+                            hidden_states=hidden_states,
+                            encoder_hidden_states=encoder_hidden_states,
+                            temb=emb,
+                            image_rotary_emb=image_rotary_emb,
+                        )
+
+                self.previous_residual = hidden_states - ori_hidden_states
+                self.previous_residual_encoder = encoder_hidden_states - ori_encoder_hidden_states
+        else:
+            # 4. Transformer blocks
+            for i, block in enumerate(self.transformer_blocks):
+                if torch.is_grad_enabled() and self.gradient_checkpointing:
+
+                    def create_custom_forward(module):
+                        def custom_forward(*inputs):
+                            return module(*inputs)
+
+                        return custom_forward
+
+                    ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                    hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(block),
+                        hidden_states,
+                        encoder_hidden_states,
+                        emb,
+                        image_rotary_emb,
+                        **ckpt_kwargs,
+                    )
+                else:
+                    hidden_states, encoder_hidden_states = block(
+                        hidden_states=hidden_states,
+                        encoder_hidden_states=encoder_hidden_states,
+                        temb=emb,
+                        image_rotary_emb=image_rotary_emb,
+                    )
+
+        if not self.config.use_rotary_positional_embeddings:
+            # CogVideoX-2B
+            hidden_states = self.norm_final(hidden_states)
+        else:
+            # CogVideoX-5B and CogvideoX1.5-5B
+            hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+            hidden_states = self.norm_final(hidden_states)
+            hidden_states = hidden_states[:, text_seq_length:]
+
+        # 5. Final block
+        hidden_states = self.norm_out(hidden_states, temb=emb)
+        hidden_states = self.proj_out(hidden_states)
+
+        # 6. Unpatchify
+        p = self.config.patch_size
+        p_t = self.config.patch_size_t
+
+        if p_t is None:
+            output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
+            output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
+        else:
+            output = hidden_states.reshape(
+                batch_size, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p
+            )
+            output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2)
+
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+
+        if not return_dict:
+            return (output,)
+        return Transformer2DModelOutput(sample=output)
+
+
+def main(args):
+    prompt = args.prompt
+    negative_prompt = args.negative_prompt
+    seed = args.seed
+    ckpts_path = args.ckpts_path
+    output_path = args.output_path
+    num_inference_steps = args.num_inference_steps
+    rel_l1_thresh = args.rel_l1_thresh
+    pipe = CogVideoXPipeline.from_pretrained(ckpts_path, torch_dtype=torch.bfloat16)
+
+    # TeaCache
+    pipe.transformer.__class__.enable_teacache = True
+    pipe.transformer.__class__.rel_l1_thresh = rel_l1_thresh
+    pipe.transformer.__class__.accumulated_rel_l1_distance = 0
+    pipe.transformer.__class__.previous_modulated_input = None
+    pipe.transformer.__class__.previous_residual = None
+    pipe.transformer.__class__.previous_residual_encoder = None
+    pipe.transformer.__class__.num_steps = num_inference_steps
+    pipe.transformer.__class__.cnt = 0
+    pipe.transformer.__class__.forward = teacache_forward
+
+    pipe.to("cuda")
+    pipe.vae.enable_slicing()
+    pipe.vae.enable_tiling()
+
+    video = pipe(
+        prompt=prompt,
+        negative_prompt=negative_prompt,
+        width=1360,
+        height=768,
+        num_frames=81,
+        use_dynamic_cfg=True,
+        guidance_scale=6,
+        num_inference_steps=num_inference_steps,
+        generator=torch.Generator("cuda").manual_seed(seed)
+    ).frames[0]
+    words = prompt.split()[:5]
+    video_path = f"{output_path}/teacache_cogvideox1.5-5B_{words}.mp4"
+    export_to_video(video, video_path, fps=16)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run CogvideoX1.5-5B with given parameters")
+    
+    parser.add_argument('--seed', type=int, default=42, help='Random seed')
+    parser.add_argument('--num_inference_steps', type=int, default=50, help='Number of inference steps')
+    parser.add_argument("--output_path", type=str, default="./teacache_results", help="The path where the generated video will be saved")
+    parser.add_argument('--ckpts_path', type=str, default="/data-123/zishen/cvproject/CogVideo/THUDM/CogVideoX1.5-5B", help='Path to checkpoint')
+    parser.add_argument('--rel_l1_thresh', type=float, default=0.2, help='Higher speedup will cause to worse quality -- 0.1 for 1.3x speedup -- 0.2 for 1.8x speedup -- 0.3 for 2.1x speedup')
+    parser.add_argument('--prompt', type=str, default="A clear, turquoise river flows through a rocky canyon, cascading over a small waterfall and forming a pool of water at the bottom.The river is the main focus of the scene, with its clear water reflecting the surrounding trees and rocks. The canyon walls are steep and rocky, with some vegetation growing on them. The trees are mostly pine trees, with their green needles contrasting with the brown and gray rocks. The overall tone of the scene is one of peace and tranquility.", help='Description of the video for the model to generate')
+    parser.add_argument('--negative_prompt', type=str, default=None, help='Description of unwanted situations in model generated videos')
+    args = parser.parse_args()
+
+    main(args)
\ No newline at end of file

From f73b91d29d305798c227ae549d770be0cda10049 Mon Sep 17 00:00:00 2001
From: zishen-ucap <huangzs@ucap.com.cn>
Date: Fri, 17 Jan 2025 15:41:13 +0800
Subject: [PATCH 2/6] Add TeaCache4CogVideoX1.5

---
 TeaCache4CogVideoX1.5/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TeaCache4CogVideoX1.5/README.md b/TeaCache4CogVideoX1.5/README.md
index 10f4ef2..f8b059b 100644
--- a/TeaCache4CogVideoX1.5/README.md
+++ b/TeaCache4CogVideoX1.5/README.md
@@ -1,7 +1,7 @@
 <!-- ## **TeaCache4CogVideoX1.5** -->
 # TeaCache4CogVideoX1.5
 
-[TeaCache](https://github.com/LiewFeng/TeaCache) can speedup [CogVideoX1.5](https://github.com/THUDM/CogVideo) 1.8x without much visual quality degradation, in a training-free manner. The following video shows the results generated by TeaCache-ConsisID with various `rel_l1_thresh` values: 0 (original), 0.1 (1.3x speedup), 0.2 (1.8x speedup), and 0.3(2.1x speedup).
+[TeaCache](https://github.com/LiewFeng/TeaCache) can speedup [CogVideoX1.5](https://github.com/THUDM/CogVideo) 1.8x without much visual quality degradation, in a training-free manner. The following video shows the results generated by TeaCache-CogVideoX1.5 with various `rel_l1_thresh` values: 0 (original), 0.1 (1.3x speedup), 0.2 (1.8x speedup), and 0.3(2.1x speedup).
 
 https://github.com/user-attachments/assets/c444b850-3252-4b37-ad4a-122d389218d9
 

From 3a5b9c092a2f30aea6b71d7516886b2f9b71ccf6 Mon Sep 17 00:00:00 2001
From: zishen-ucap <huangzs@ucap.com.cn>
Date: Fri, 17 Jan 2025 18:43:59 +0800
Subject: [PATCH 3/6] Fixed issues based on PR review

---
 TeaCache4CogVideoX1.5/README.md               | 38 ++++++++--
 .../teacache_smaple_video.py                  | 72 +++++++++++++------
 2 files changed, 84 insertions(+), 26 deletions(-)

diff --git a/TeaCache4CogVideoX1.5/README.md b/TeaCache4CogVideoX1.5/README.md
index f8b059b..46a90dd 100644
--- a/TeaCache4CogVideoX1.5/README.md
+++ b/TeaCache4CogVideoX1.5/README.md
@@ -1,22 +1,33 @@
 <!-- ## **TeaCache4CogVideoX1.5** -->
 # TeaCache4CogVideoX1.5
 
-[TeaCache](https://github.com/LiewFeng/TeaCache) can speedup [CogVideoX1.5](https://github.com/THUDM/CogVideo) 1.8x without much visual quality degradation, in a training-free manner. The following video shows the results generated by TeaCache-CogVideoX1.5 with various `rel_l1_thresh` values: 0 (original), 0.1 (1.3x speedup), 0.2 (1.8x speedup), and 0.3(2.1x speedup).
+[TeaCache](https://github.com/LiewFeng/TeaCache) can speed up [CogVideoX1.5](https://github.com/THUDM/CogVideo) 1.8x without much visual quality degradation, in a training-free manner. The following videos show the results generated by TeaCache-CogVideoX1.5 with various `rel_l1_thresh` values: 0 (original), 0.1 (1.3x speedup), 0.2 (1.8x speedup), and 0.3 (2.1x speedup). Additionally, the image-to-video (i2v) results are also demonstrated, with the following speedups: 0.1 (1.5x speedup), 0.2 (2.2x speedup), and 0.3 (2.7x speedup).
 
 https://github.com/user-attachments/assets/c444b850-3252-4b37-ad4a-122d389218d9
 
+https://github.com/user-attachments/assets/5f181a57-d5e3-46db-b388-8591e50f98e2
+
 ## 📈 Inference Latency Comparisons on a Single H100 GPU
 
-| CogVideoX1.5 | TeaCache (0.1) | TeaCache (0.2) | TeaCache (0.3) |
-| :----------: | :------------: | :------------: | :------------: |
-|    ~465 s    |     ~372 s     |     ~261 s     |     ~223 s     |
+| CogVideoX1.5-t2v | TeaCache (0.1) | TeaCache (0.2) | TeaCache (0.3) |
+| :--------------: | :------------: | :------------: | :------------: |
+|      ~465 s      |     ~372 s     |     ~261 s     |     ~223 s     |
 
+| CogVideoX1.5-i2v | TeaCache (0.1) | TeaCache (0.2) | TeaCache (0.3) |
+| :--------------: | :------------: | :------------: | :------------: |
+|      ~475 s      |     ~323 s     |     ~218 s     |     ~171 s     |
+
+## Installation
+
+```shell
+pip install --upgrade diffusers[torch] transformers protobuf tokenizers sentencepiece imageio imageio-ffmpeg
+```
 
 ## Usage
 
-Follow [CogVideoX](https://github.com/THUDM/CogVideo) to clone the repo and finish the installation, then you can modify the `rel_l1_thresh` to obtain your desired trade-off between latency and visul quality, and change the `ckpts_path`, `prompt`, `image` to customize your identity-preserving video.
+You can modify the `rel_l1_thresh` to obtain your desired trade-off between latency and visul quality, and change the `ckpts_path`, `prompt`, `image_path` to customize your identity-preserving video.
 
-For single-gpu inference, you can use the following command:
+For T2v inference, you can use the following command:
 
 ```bash
 cd TeaCache4CogVideoX1.5
@@ -30,6 +41,21 @@ python3 teacache_sample_video.py \
     --output_path ./teacache_results
 ```
 
+For I2v inference, you can use the following command:
+
+```bash
+cd TeaCache4CogVideoX1.5
+
+python3 teacache_smaple_video.py \
+    --rel_l1_thresh 0.1 \
+    --ckpts_path THUDM/CogVideoX1.5-5B-I2V \
+    --prompt "A girl gazed at the camera and smiled, her hair drifting in the wind." \
+    --seed 42 \
+    --num_inference_steps 50 \
+    --output_path ./teacache_results \
+    --generate_type i2v --image_path ./image/path
+```
+
 ## Citation
 
 If you find TeaCache is useful in your research or applications, please consider giving us a star 🌟 and citing it by the following BibTeX entry.
diff --git a/TeaCache4CogVideoX1.5/teacache_smaple_video.py b/TeaCache4CogVideoX1.5/teacache_smaple_video.py
index ee2a006..267e7f8 100644
--- a/TeaCache4CogVideoX1.5/teacache_smaple_video.py
+++ b/TeaCache4CogVideoX1.5/teacache_smaple_video.py
@@ -2,9 +2,9 @@ import argparse
 import torch
 import numpy as np
 from typing import Any, Dict, Optional, Tuple,  Union
-from videosys.models.transformers.cogvideox_transformer_3d import Transformer2DModelOutput
-from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, scale_lora_layers, unscale_lora_layers, export_to_video
-from diffusers import CogVideoXPipeline
+from diffusers.models.modeling_outputs import Transformer2DModelOutput
+from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, scale_lora_layers, unscale_lora_layers, export_to_video, load_image
+from diffusers import CogVideoXPipeline, CogVideoXImageToVideoPipeline
 
 
 def teacache_forward(
@@ -182,14 +182,26 @@ def teacache_forward(
 
 
 def main(args):
-    prompt = args.prompt
-    negative_prompt = args.negative_prompt
     seed = args.seed
     ckpts_path = args.ckpts_path
     output_path = args.output_path
     num_inference_steps = args.num_inference_steps
     rel_l1_thresh = args.rel_l1_thresh
-    pipe = CogVideoXPipeline.from_pretrained(ckpts_path, torch_dtype=torch.bfloat16)
+    generate_type = args.generate_type
+    prompt = args.prompt
+    negative_prompt = args.negative_prompt
+    height = args.height
+    width = args.width
+    num_frames = args.num_frames
+    guidance_scale = args.guidance_scale
+    fps = args.fps
+    image_path = args.image_path
+
+    if generate_type == "t2v":
+        pipe = CogVideoXPipeline.from_pretrained(ckpts_path, torch_dtype=torch.bfloat16)
+    else:
+        pipe = CogVideoXImageToVideoPipeline.from_pretrained(ckpts_path, torch_dtype=torch.bfloat16)
+        image = load_image(image=image_path)
 
     # TeaCache
     pipe.transformer.__class__.enable_teacache = True
@@ -206,32 +218,52 @@ def main(args):
     pipe.vae.enable_slicing()
     pipe.vae.enable_tiling()
 
-    video = pipe(
-        prompt=prompt,
-        negative_prompt=negative_prompt,
-        width=1360,
-        height=768,
-        num_frames=81,
-        use_dynamic_cfg=True,
-        guidance_scale=6,
-        num_inference_steps=num_inference_steps,
-        generator=torch.Generator("cuda").manual_seed(seed)
-    ).frames[0]
+    if generate_type == "t2v":
+        video = pipe(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            width=width,
+            height=height,
+            num_frames=num_frames,
+            use_dynamic_cfg=True,
+            guidance_scale=guidance_scale,
+            num_inference_steps=num_inference_steps,
+            generator=torch.Generator("cuda").manual_seed(seed)
+        ).frames[0]
+    else:
+        video = pipe(
+            height=height,
+            width=width,
+            prompt=prompt,
+            image=image,
+            num_inference_steps=num_inference_steps,  # Number of inference steps
+            num_frames=num_frames,  # Number of frames to generate
+            use_dynamic_cfg=True,  # This is used for the DPM scheduler; for the DDIM scheduler it should be False
+            guidance_scale=guidance_scale,
+            generator=torch.Generator("cuda").manual_seed(seed),  # Set the seed for reproducibility
+        ).frames[0]
     words = prompt.split()[:5]
     video_path = f"{output_path}/teacache_cogvideox1.5-5B_{words}.mp4"
-    export_to_video(video, video_path, fps=16)
+    export_to_video(video, video_path, fps=fps)
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Run CogvideoX1.5-5B with given parameters")
+    parser = argparse.ArgumentParser(description="Run CogVideoX1.5-5B with given parameters")
     
     parser.add_argument('--seed', type=int, default=42, help='Random seed')
     parser.add_argument('--num_inference_steps', type=int, default=50, help='Number of inference steps')
     parser.add_argument("--output_path", type=str, default="./teacache_results", help="The path where the generated video will be saved")
-    parser.add_argument('--ckpts_path', type=str, default="/data-123/zishen/cvproject/CogVideo/THUDM/CogVideoX1.5-5B", help='Path to checkpoint')
+    parser.add_argument('--ckpts_path', type=str, default="/data-123/zishen/cvproject/CogVideo/THUDM/CogVideoX1.5-5B", help='Path to checkpoint, t2v->THUDM/CogVideoX1.5-5B, i2v->THUDM/CogVideoX1.5-5B-I2V')
     parser.add_argument('--rel_l1_thresh', type=float, default=0.2, help='Higher speedup will cause to worse quality -- 0.1 for 1.3x speedup -- 0.2 for 1.8x speedup -- 0.3 for 2.1x speedup')
     parser.add_argument('--prompt', type=str, default="A clear, turquoise river flows through a rocky canyon, cascading over a small waterfall and forming a pool of water at the bottom.The river is the main focus of the scene, with its clear water reflecting the surrounding trees and rocks. The canyon walls are steep and rocky, with some vegetation growing on them. The trees are mostly pine trees, with their green needles contrasting with the brown and gray rocks. The overall tone of the scene is one of peace and tranquility.", help='Description of the video for the model to generate')
     parser.add_argument('--negative_prompt', type=str, default=None, help='Description of unwanted situations in model generated videos')
+    parser.add_argument("--image_path",type=str,default=None,help="Path of the input image used to condition image-to-video generation")
+    parser.add_argument("--generate_type", type=str, default="t2v", help="The type of video generation (e.g., 't2v', 'i2v')")
+    parser.add_argument("--width", type=int, default=1360, help="Width of the generated video in pixels")
+    parser.add_argument("--height", type=int, default=768, help="Height of the generated video in pixels")
+    parser.add_argument("--num_frames", type=int, default=81, help="Number of frames to generate")
+    parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance")
+    parser.add_argument("--fps", type=int, default=16, help="Frames per second of the exported video")
     args = parser.parse_args()
 
     main(args)
\ No newline at end of file

From f7a4d67e355fc898c99517e3c16e08818431f887 Mon Sep 17 00:00:00 2001
From: zishen-ucap <huangzs@ucap.com.cn>
Date: Fri, 17 Jan 2025 18:48:17 +0800
Subject: [PATCH 4/6] Fixed issues based on PR review

---
 TeaCache4CogVideoX1.5/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/TeaCache4CogVideoX1.5/README.md b/TeaCache4CogVideoX1.5/README.md
index 46a90dd..49ff457 100644
--- a/TeaCache4CogVideoX1.5/README.md
+++ b/TeaCache4CogVideoX1.5/README.md
@@ -27,7 +27,7 @@ pip install --upgrade diffusers[torch] transformers protobuf tokenizers sentence
 
 You can modify the `rel_l1_thresh` to obtain your desired trade-off between latency and visul quality, and change the `ckpts_path`, `prompt`, `image_path` to customize your identity-preserving video.
 
-For T2v inference, you can use the following command:
+For T2V inference, you can use the following command:
 
 ```bash
 cd TeaCache4CogVideoX1.5
@@ -41,7 +41,7 @@ python3 teacache_sample_video.py \
     --output_path ./teacache_results
 ```
 
-For I2v inference, you can use the following command:
+For I2V inference, you can use the following command:
 
 ```bash
 cd TeaCache4CogVideoX1.5

From 105e7a085cec8be8ecbdc750d62559649bff517c Mon Sep 17 00:00:00 2001
From: zishen-ucap <huangzs@ucap.com.cn>
Date: Mon, 20 Jan 2025 09:10:35 +0800
Subject: [PATCH 5/6] Fixed issues based on PR review

---
 TeaCache4CogVideoX1.5/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TeaCache4CogVideoX1.5/README.md b/TeaCache4CogVideoX1.5/README.md
index 49ff457..955c633 100644
--- a/TeaCache4CogVideoX1.5/README.md
+++ b/TeaCache4CogVideoX1.5/README.md
@@ -35,7 +35,7 @@ cd TeaCache4CogVideoX1.5
 python3 teacache_sample_video.py \
     --rel_l1_thresh 0.2 \
     --ckpts_path THUDM/CogVideoX1.5-5B \
-    --prompt "A clear, turquoise river flows through a rocky canyon, cascading over a small waterfall and forming a pool of water at the bottom.The river is the main focus of the scene, with its clear water reflecting the surrounding trees and rocks. The canyon walls are steep and rocky, with some vegetation growing on them. The trees are mostly pine trees, with their green needles contrasting with the brown and gray rocks. The overall tone of the scene is one of peace and tranquility.", help='Description of the video for the model to generate." \
+    --prompt "A clear, turquoise river flows through a rocky canyon, cascading over a small waterfall and forming a pool of water at the bottom. The river is the main focus of the scene, with its clear water reflecting the surrounding trees and rocks. The canyon walls are steep and rocky, with some vegetation growing on them. The trees are mostly pine trees, with their green needles contrasting with the brown and gray rocks. The overall tone of the scene is one of peace and tranquility." \
     --seed 42 \
     --num_inference_steps 50 \
     --output_path ./teacache_results

From fd63cdfabe239473f6d31b69becfaf952ee1a356 Mon Sep 17 00:00:00 2001
From: zishen-ucap <huangzs@ucap.com.cn>
Date: Mon, 20 Jan 2025 09:29:50 +0800
Subject: [PATCH 6/6] Fixed issues based on PR review

---
 TeaCache4CogVideoX1.5/teacache_smaple_video.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TeaCache4CogVideoX1.5/teacache_smaple_video.py b/TeaCache4CogVideoX1.5/teacache_smaple_video.py
index 267e7f8..6307c26 100644
--- a/TeaCache4CogVideoX1.5/teacache_smaple_video.py
+++ b/TeaCache4CogVideoX1.5/teacache_smaple_video.py
@@ -253,7 +253,7 @@ if __name__ == "__main__":
     parser.add_argument('--seed', type=int, default=42, help='Random seed')
     parser.add_argument('--num_inference_steps', type=int, default=50, help='Number of inference steps')
     parser.add_argument("--output_path", type=str, default="./teacache_results", help="The path where the generated video will be saved")
-    parser.add_argument('--ckpts_path', type=str, default="/data-123/zishen/cvproject/CogVideo/THUDM/CogVideoX1.5-5B", help='Path to checkpoint, t2v->THUDM/CogVideoX1.5-5B, i2v->THUDM/CogVideoX1.5-5B-I2V')
+    parser.add_argument('--ckpts_path', type=str, default="THUDM/CogVideoX1.5-5B", help='Path to checkpoint, t2v->THUDM/CogVideoX1.5-5B, i2v->THUDM/CogVideoX1.5-5B-I2V')
     parser.add_argument('--rel_l1_thresh', type=float, default=0.2, help='Higher speedup will cause to worse quality -- 0.1 for 1.3x speedup -- 0.2 for 1.8x speedup -- 0.3 for 2.1x speedup')
     parser.add_argument('--prompt', type=str, default="A clear, turquoise river flows through a rocky canyon, cascading over a small waterfall and forming a pool of water at the bottom.The river is the main focus of the scene, with its clear water reflecting the surrounding trees and rocks. The canyon walls are steep and rocky, with some vegetation growing on them. The trees are mostly pine trees, with their green needles contrasting with the brown and gray rocks. The overall tone of the scene is one of peace and tranquility.", help='Description of the video for the model to generate')
     parser.add_argument('--negative_prompt', type=str, default=None, help='Description of unwanted situations in model generated videos')