Mirror of https://git.datalinker.icu/ali-vilab/TeaCache (synced 2025-12-08 20:34:24 +08:00)

support HunyuanVideo

commit 7c56245d3b (parent beb3a51d9d)
BIN  .vs/TeaCache/v17/.wsuo  Normal file
Binary file not shown.
139  .vs/TeaCache/v17/DocumentLayout.json  Normal file
@@ -0,0 +1,139 @@
{
  "Version": 1,
  "WorkspaceRootPath": "C:\\Users\\25142\\Documents\\TeaCache\\",
  "Documents": [
    {
      "AbsoluteMoniker": "D:0:0:{A2FE74E1-B743-11D0-AE1A-00A0C90FFFC3}|\u003CMiscFiles\u003E|C:\\Users\\25142\\Documents\\TeaCache\\README.md||{EFC0BB08-EA7D-40C6-A696-C870411A895B}",
      "RelativeMoniker": "D:0:0:{A2FE74E1-B743-11D0-AE1A-00A0C90FFFC3}|\u003CMiscFiles\u003E|solutionrelative:README.md||{EFC0BB08-EA7D-40C6-A696-C870411A895B}"
    },
    {
      "AbsoluteMoniker": "D:0:0:{A2FE74E1-B743-11D0-AE1A-00A0C90FFFC3}|\u003CMiscFiles\u003E|C:\\Users\\25142\\Documents\\TeaCache\\eval\\teacache\\experiments\\opensora_plan.py||{3B902123-F8A7-4915-9F01-361F908088D0}",
      "RelativeMoniker": "D:0:0:{A2FE74E1-B743-11D0-AE1A-00A0C90FFFC3}|\u003CMiscFiles\u003E|solutionrelative:eval\\teacache\\experiments\\opensora_plan.py||{3B902123-F8A7-4915-9F01-361F908088D0}"
    },
    {
      "AbsoluteMoniker": "D:0:0:{A2FE74E1-B743-11D0-AE1A-00A0C90FFFC3}|\u003CMiscFiles\u003E|C:\\Users\\25142\\Documents\\TeaCache\\eval\\teacache\\experiments\\opensora.py||{3B902123-F8A7-4915-9F01-361F908088D0}",
      "RelativeMoniker": "D:0:0:{A2FE74E1-B743-11D0-AE1A-00A0C90FFFC3}|\u003CMiscFiles\u003E|solutionrelative:eval\\teacache\\experiments\\opensora.py||{3B902123-F8A7-4915-9F01-361F908088D0}"
    },
    {
      "AbsoluteMoniker": "D:0:0:{A2FE74E1-B743-11D0-AE1A-00A0C90FFFC3}|\u003CMiscFiles\u003E|C:\\Users\\25142\\Documents\\TeaCache\\eval\\teacache\\experiments\\latte.py||{3B902123-F8A7-4915-9F01-361F908088D0}",
      "RelativeMoniker": "D:0:0:{A2FE74E1-B743-11D0-AE1A-00A0C90FFFC3}|\u003CMiscFiles\u003E|solutionrelative:eval\\teacache\\experiments\\latte.py||{3B902123-F8A7-4915-9F01-361F908088D0}"
    },
    {
      "AbsoluteMoniker": "D:0:0:{A2FE74E1-B743-11D0-AE1A-00A0C90FFFC3}|\u003CMiscFiles\u003E|C:\\Users\\25142\\Documents\\TeaCache\\eval\\teacache\\experiments\\cogvideox.py||{3B902123-F8A7-4915-9F01-361F908088D0}",
      "RelativeMoniker": "D:0:0:{A2FE74E1-B743-11D0-AE1A-00A0C90FFFC3}|\u003CMiscFiles\u003E|solutionrelative:eval\\teacache\\experiments\\cogvideox.py||{3B902123-F8A7-4915-9F01-361F908088D0}"
    },
    {
      "AbsoluteMoniker": "D:0:0:{A2FE74E1-B743-11D0-AE1A-00A0C90FFFC3}|\u003CMiscFiles\u003E|C:\\Users\\25142\\Documents\\TeaCache\\TeaCache4HunyuanVideo\\teacache_sample_video.py||{3B902123-F8A7-4915-9F01-361F908088D0}",
      "RelativeMoniker": "D:0:0:{A2FE74E1-B743-11D0-AE1A-00A0C90FFFC3}|\u003CMiscFiles\u003E|solutionrelative:TeaCache4HunyuanVideo\\teacache_sample_video.py||{3B902123-F8A7-4915-9F01-361F908088D0}"
    },
    {
      "AbsoluteMoniker": "D:0:0:{A2FE74E1-B743-11D0-AE1A-00A0C90FFFC3}|\u003CMiscFiles\u003E|C:\\Users\\25142\\Documents\\TeaCache\\TeaCache4HunyuanVideo\\README.md||{EFC0BB08-EA7D-40C6-A696-C870411A895B}",
      "RelativeMoniker": "D:0:0:{A2FE74E1-B743-11D0-AE1A-00A0C90FFFC3}|\u003CMiscFiles\u003E|solutionrelative:TeaCache4HunyuanVideo\\README.md||{EFC0BB08-EA7D-40C6-A696-C870411A895B}"
    }
  ],
  "DocumentGroupContainers": [
    {
      "Orientation": 0,
      "VerticalTabListWidth": 256,
      "DocumentGroups": [
        {
          "DockedWidth": 200,
          "SelectedChildIndex": 5,
          "Children": [
            {
              "$type": "Document",
              "DocumentIndex": 1,
              "Title": "opensora_plan.py",
              "DocumentMoniker": "C:\\Users\\25142\\Documents\\TeaCache\\eval\\teacache\\experiments\\opensora_plan.py",
              "RelativeDocumentMoniker": "eval\\teacache\\experiments\\opensora_plan.py",
              "ToolTip": "C:\\Users\\25142\\Documents\\TeaCache\\eval\\teacache\\experiments\\opensora_plan.py",
              "RelativeToolTip": "eval\\teacache\\experiments\\opensora_plan.py",
              "ViewState": "AgIAAAAAAAAAAAAAAAAAADQCAABOAAAAAAAAAA==",
              "Icon": "ae27a6b0-e345-4288-96df-5eaf394ee369.001001|",
              "WhenOpened": "2024-12-24T12:45:11.23Z",
              "EditorCaption": ""
            },
            {
              "$type": "Document",
              "DocumentIndex": 2,
              "Title": "opensora.py",
              "DocumentMoniker": "C:\\Users\\25142\\Documents\\TeaCache\\eval\\teacache\\experiments\\opensora.py",
              "RelativeDocumentMoniker": "eval\\teacache\\experiments\\opensora.py",
              "ToolTip": "C:\\Users\\25142\\Documents\\TeaCache\\eval\\teacache\\experiments\\opensora.py",
              "RelativeToolTip": "eval\\teacache\\experiments\\opensora.py",
              "ViewState": "AgIAAAAAAAAAAAAAAAAAANEAAAAAAAAAAAAAAA==",
              "Icon": "ae27a6b0-e345-4288-96df-5eaf394ee369.001001|",
              "WhenOpened": "2024-12-24T12:44:45.479Z",
              "EditorCaption": ""
            },
            {
              "$type": "Document",
              "DocumentIndex": 3,
              "Title": "latte.py",
              "DocumentMoniker": "C:\\Users\\25142\\Documents\\TeaCache\\eval\\teacache\\experiments\\latte.py",
              "RelativeDocumentMoniker": "eval\\teacache\\experiments\\latte.py",
              "ToolTip": "C:\\Users\\25142\\Documents\\TeaCache\\eval\\teacache\\experiments\\latte.py",
              "RelativeToolTip": "eval\\teacache\\experiments\\latte.py",
              "ViewState": "AgIAAAAAAAAAAAAAAAAAAPYBAABOAAAAAAAAAA==",
              "Icon": "ae27a6b0-e345-4288-96df-5eaf394ee369.001001|",
              "WhenOpened": "2024-12-24T12:44:13.292Z",
              "EditorCaption": ""
            },
            {
              "$type": "Document",
              "DocumentIndex": 5,
              "Title": "teacache_sample_video.py",
              "DocumentMoniker": "C:\\Users\\25142\\Documents\\TeaCache\\TeaCache4HunyuanVideo\\teacache_sample_video.py",
              "RelativeDocumentMoniker": "TeaCache4HunyuanVideo\\teacache_sample_video.py",
              "ToolTip": "C:\\Users\\25142\\Documents\\TeaCache\\TeaCache4HunyuanVideo\\teacache_sample_video.py",
              "RelativeToolTip": "TeaCache4HunyuanVideo\\teacache_sample_video.py",
              "ViewState": "AgIAAMIAAAAAAAAAAAASwNgAAAA4AAAAAAAAAA==",
              "Icon": "ae27a6b0-e345-4288-96df-5eaf394ee369.001001|",
              "WhenOpened": "2024-12-24T12:43:02.085Z",
              "EditorCaption": ""
            },
            {
              "$type": "Document",
              "DocumentIndex": 4,
              "Title": "cogvideox.py",
              "DocumentMoniker": "C:\\Users\\25142\\Documents\\TeaCache\\eval\\teacache\\experiments\\cogvideox.py",
              "RelativeDocumentMoniker": "eval\\teacache\\experiments\\cogvideox.py",
              "ToolTip": "C:\\Users\\25142\\Documents\\TeaCache\\eval\\teacache\\experiments\\cogvideox.py",
              "RelativeToolTip": "eval\\teacache\\experiments\\cogvideox.py",
              "ViewState": "AgIAAAkAAAAAAAAAAAAAAM4AAAAeAAAAAAAAAA==",
              "Icon": "ae27a6b0-e345-4288-96df-5eaf394ee369.001001|",
              "WhenOpened": "2024-12-24T12:42:50.703Z",
              "EditorCaption": ""
            },
            {
              "$type": "Document",
              "DocumentIndex": 0,
              "Title": "README.md",
              "DocumentMoniker": "C:\\Users\\25142\\Documents\\TeaCache\\README.md",
              "RelativeDocumentMoniker": "README.md",
              "ToolTip": "C:\\Users\\25142\\Documents\\TeaCache\\README.md",
              "RelativeToolTip": "README.md",
              "ViewState": "AgIAACcAAAAAAAAAAAAAADQAAAAAAAAAAAAAAA==",
              "Icon": "ae27a6b0-e345-4288-96df-5eaf394ee369.001818|",
              "WhenOpened": "2024-12-24T12:36:09.99Z",
              "EditorCaption": ""
            },
            {
              "$type": "Document",
              "DocumentIndex": 6,
              "Title": "README.md",
              "DocumentMoniker": "C:\\Users\\25142\\Documents\\TeaCache\\TeaCache4HunyuanVideo\\README.md",
              "RelativeDocumentMoniker": "TeaCache4HunyuanVideo\\README.md",
              "ToolTip": "C:\\Users\\25142\\Documents\\TeaCache\\TeaCache4HunyuanVideo\\README.md",
              "RelativeToolTip": "TeaCache4HunyuanVideo\\README.md",
              "ViewState": "AgIAAAAAAAAAAAAAAAAAABAAAAAHAAAAAAAAAA==",
              "Icon": "ae27a6b0-e345-4288-96df-5eaf394ee369.001818|",
              "WhenOpened": "2024-12-24T12:35:11.041Z",
              "EditorCaption": ""
            }
          ]
        }
      ]
    }
  ]
}
11  .vs/VSWorkspaceState.json  Normal file
@@ -0,0 +1,11 @@
{
  "ExpandedNodes": [
    "",
    "\\eval",
    "\\eval\\teacache",
    "\\eval\\teacache\\experiments",
    "\\TeaCache4HunyuanVideo"
  ],
  "SelectedNode": "\\eval\\teacache\\experiments\\opensora_plan.py",
  "PreviewInSolutionExplorer": false
}
BIN  .vs/slnx.sqlite  Normal file
Binary file not shown.
README.md

@@ -54,13 +54,17 @@
 
 
 ## Latest News 🔥
+- [2024/12/24] 🔥 Support [HunyuanVideo](https://github.com/Tencent/HunyuanVideo).
 - [2024/12/19] 🔥 Support [CogVideoX](https://github.com/THUDM/CogVideo).
-- [2024/12/06] 🎉 Release the [code](https://github.com/LiewFeng/TeaCache) TeaCache. Support [Open-Sora](https://github.com/hpcaitech/Open-Sora), [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan) and [Latte](https://github.com/Vchitect/Latte).
+- [2024/12/06] 🎉 Release the [code](https://github.com/LiewFeng/TeaCache) of TeaCache. Support [Open-Sora](https://github.com/hpcaitech/Open-Sora), [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan) and [Latte](https://github.com/Vchitect/Latte).
 - [2024/11/28] 🎉 Release the [paper](https://arxiv.org/abs/2411.19108) of TeaCache.
 
 ## Introduction
 We introduce Timestep Embedding Aware Cache (TeaCache), a training-free caching approach that estimates and leverages the fluctuating differences among model outputs across timesteps. For more details and visual results, please visit our [project page](https://github.com/LiewFeng/TeaCache).
 
+## TeaCache for HunyuanVideo
+Please refer to [TeaCache4HunyuanVideo](./TeaCache4HunyuanVideo/README.md).
+
 ## Installation
 
 Prerequisites:

@@ -135,4 +139,4 @@ If you find TeaCache is useful in your research or applications, please consider
 
 ## Acknowledgement
 
-This repository is built based on [VideoSys](https://github.com/NUS-HPC-AI-Lab/VideoSys), [Open-Sora](https://github.com/hpcaitech/Open-Sora), [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan), [Latte](https://github.com/Vchitect/Latte) and [CogVideoX](https://github.com/THUDM/CogVideo). Thanks for their contributions!
+This repository is built based on [VideoSys](https://github.com/NUS-HPC-AI-Lab/VideoSys), [Open-Sora](https://github.com/hpcaitech/Open-Sora), [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan), [Latte](https://github.com/Vchitect/Latte), [CogVideoX](https://github.com/THUDM/CogVideo) and [HunyuanVideo](https://github.com/Tencent/HunyuanVideo). Thanks for their contributions!
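In code, the idea in the introduction reduces to a per-timestep decision rule. The sketch below is distilled from the `teacache_forward` this commit adds for HunyuanVideo further down the diff: the polynomial coefficients and the threshold default come from that script, while `should_recompute` and its `state` dict are hypothetical stand-ins for the attributes the script keeps on the transformer class.

```python
import numpy as np
import torch

# Rescaling polynomial fitted for HunyuanVideo (coefficients taken from
# teacache_sample_video.py below).
rescale = np.poly1d([7.33226126e+02, -4.01131952e+02, 6.75869174e+01,
                     -3.14987800e+00, 9.61237896e-02])

def should_recompute(modulated: torch.Tensor, state: dict,
                     rel_l1_thresh: float = 0.15,
                     boundary_step: bool = False) -> bool:
    """Accumulate the rescaled relative-L1 drift of the timestep-modulated
    input; run the full transformer pass only when it crosses the threshold."""
    prev = state.get("prev_modulated")
    state["prev_modulated"] = modulated
    if boundary_step or prev is None:   # first/last steps are always computed
        state["acc"] = 0.0
        return True
    rel_l1 = ((modulated - prev).abs().mean() / prev.abs().mean()).item()
    state["acc"] = state.get("acc", 0.0) + float(rescale(rel_l1))
    if state["acc"] < rel_l1_thresh:
        return False                    # reuse the cached residual this step
    state["acc"] = 0.0
    return True                         # recompute and refresh the cache
```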
55  TeaCache4HunyuanVideo/README.md  Normal file
@@ -0,0 +1,55 @@
<!-- ## **TeaCache4HunyuanVideo** -->
# TeaCache4HunyuanVideo

[TeaCache](https://github.com/LiewFeng/TeaCache) can speed up [HunyuanVideo](https://github.com/Tencent/HunyuanVideo) 2x without much visual quality degradation, in a training-free manner.

## 📈 Inference Latency Comparisons on a Single A800 GPU

| Resolution | HunyuanVideo | TeaCache (0.1) | TeaCache (0.15) |
|:----------:|:------------:|:--------------:|:---------------:|
|    540p    |   ~18 min    |    ~11 min     |     ~8 min      |
|    720p    |   ~50 min    |    ~30 min     |     ~23 min     |

## Usage

Follow [HunyuanVideo](https://github.com/Tencent/HunyuanVideo) to clone the repo and finish the installation, then copy `teacache_sample_video.py` from this repo into the HunyuanVideo repo.

For single-GPU inference, you can use the following command:

```bash
cd HunyuanVideo

python3 teacache_sample_video.py \
    --video-size 720 1280 \
    --video-length 129 \
    --infer-steps 50 \
    --prompt "A cat walks on the grass, realistic style." \
    --flow-reverse \
    --use-cpu-offload \
    --save-path ./teacache_results
```

To generate a video with 8 GPUs, you can use the following command:

```bash
cd HunyuanVideo

torchrun --nproc_per_node=8 teacache_sample_video.py \
    --video-size 1280 720 \
    --video-length 129 \
    --infer-steps 50 \
    --prompt "A cat walks on the grass, realistic style." \
    --flow-reverse \
    --seed 42 \
    --ulysses-degree 8 \
    --ring-degree 1 \
    --save-path ./teacache_results
```

You can modify the threshold `rel_l1_thresh` on line 220 of the script to obtain your desired trade-off between latency and visual quality.

## Acknowledgements

We would like to thank the contributors to [HunyuanVideo](https://github.com/Tencent/HunyuanVideo).
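For a concrete picture of that knob, here is a sketch of where it is set, mirroring `main()` in the script below; all names come from that script and HunyuanVideo's own API, and the speedup figures are the script's inline comment, not an independent measurement.

```python
from pathlib import Path
from hyvideo.config import parse_args
from hyvideo.inference import HunyuanVideoSampler

# Load the sampler exactly as the script does (model path comes from CLI args).
args = parse_args()
sampler = HunyuanVideoSampler.from_pretrained(Path(args.model_base), args=args)

# The latency/quality trade-off knob: a lower threshold skips fewer steps.
# Per the script's comment: 0.1 -> ~1.6x speedup, 0.15 -> ~2.1x speedup.
sampler.pipeline.transformer.__class__.rel_l1_thresh = 0.15
```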
254  TeaCache4HunyuanVideo/teacache_sample_video.py  Normal file
@@ -0,0 +1,254 @@
import os
import time
from pathlib import Path
from loguru import logger
from datetime import datetime

from hyvideo.utils.file_utils import save_videos_grid
from hyvideo.config import parse_args
from hyvideo.inference import HunyuanVideoSampler

from hyvideo.modules.modulate_layers import modulate
from hyvideo.modules.attenion import attention, parallel_attention, get_cu_seqlens  # "attenion" is the module's actual filename in the HunyuanVideo repo
from typing import Any, List, Tuple, Optional, Union, Dict
import torch
import json
import numpy as np


def teacache_forward(
    self,
    x: torch.Tensor,
    t: torch.Tensor,  # Should be in range(0, 1000).
    text_states: torch.Tensor = None,
    text_mask: torch.Tensor = None,  # Now we don't use it.
    text_states_2: Optional[torch.Tensor] = None,  # Text embedding for modulation.
    freqs_cos: Optional[torch.Tensor] = None,
    freqs_sin: Optional[torch.Tensor] = None,
    guidance: torch.Tensor = None,  # Guidance for modulation, should be cfg_scale x 1000.
    return_dict: bool = True,
) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
    out = {}
    img = x
    txt = text_states
    _, _, ot, oh, ow = x.shape
    tt, th, tw = (
        ot // self.patch_size[0],
        oh // self.patch_size[1],
        ow // self.patch_size[2],
    )

    # Prepare modulation vectors.
    vec = self.time_in(t)

    # text modulation
    vec = vec + self.vector_in(text_states_2)

    # guidance modulation
    if self.guidance_embed:
        if guidance is None:
            raise ValueError(
                "Didn't get guidance strength for guidance distilled model."
            )

        # our timestep_embedding is merged into guidance_in(TimestepEmbedder)
        vec = vec + self.guidance_in(guidance)

    # Embed image and text.
    img = self.img_in(img)
    if self.text_projection == "linear":
        txt = self.txt_in(txt)
    elif self.text_projection == "single_refiner":
        txt = self.txt_in(txt, t, text_mask if self.use_attention_mask else None)
    else:
        raise NotImplementedError(
            f"Unsupported text_projection: {self.text_projection}"
        )

    txt_seq_len = txt.shape[1]
    img_seq_len = img.shape[1]

    # Compute cu_seqlens and max_seqlen for flash attention
    cu_seqlens_q = get_cu_seqlens(text_mask, img_seq_len)
    cu_seqlens_kv = cu_seqlens_q
    max_seqlen_q = img_seq_len + txt_seq_len
    max_seqlen_kv = max_seqlen_q

    freqs_cis = (freqs_cos, freqs_sin) if freqs_cos is not None else None

    if self.enable_teacache:
        # Decide whether this step's transformer blocks can be skipped, based on
        # how much the timestep-modulated input has drifted since the last full pass.
        inp = img.clone()
        vec_ = vec.clone()
        txt_ = txt.clone()
        (
            img_mod1_shift,
            img_mod1_scale,
            img_mod1_gate,
            img_mod2_shift,
            img_mod2_scale,
            img_mod2_gate,
        ) = self.double_blocks[0].img_mod(vec_).chunk(6, dim=-1)
        normed_inp = self.double_blocks[0].img_norm1(inp)
        modulated_inp = modulate(
            normed_inp, shift=img_mod1_shift, scale=img_mod1_scale
        )
        if self.cnt == 0 or self.cnt == self.num_steps - 1:
            # Always compute the first and last steps.
            should_calc = True
            self.accumulated_rel_l1_distance = 0
        else:
            # Rescale the raw relative L1 distance with a polynomial fitted for
            # HunyuanVideo, then accumulate it until it crosses the threshold.
            coefficients = [7.33226126e+02, -4.01131952e+02, 6.75869174e+01, -3.14987800e+00, 9.61237896e-02]
            rescale_func = np.poly1d(coefficients)
            self.accumulated_rel_l1_distance += rescale_func(((modulated_inp - self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean()).cpu().item())
            if self.accumulated_rel_l1_distance < self.rel_l1_thresh:
                should_calc = False
            else:
                should_calc = True
                self.accumulated_rel_l1_distance = 0
        self.previous_modulated_input = modulated_inp
        self.cnt = 0 if self.cnt == self.num_steps - 1 else self.cnt + 1

    if self.enable_teacache:
        if not should_calc:
            # Skip the blocks: reuse the residual cached at the last full pass.
            img += self.previous_residual
        else:
            ori_img = img.clone()
            # --------------------- Pass through DiT blocks ------------------------
            for _, block in enumerate(self.double_blocks):
                double_block_args = [
                    img,
                    txt,
                    vec,
                    cu_seqlens_q,
                    cu_seqlens_kv,
                    max_seqlen_q,
                    max_seqlen_kv,
                    freqs_cis,
                ]

                img, txt = block(*double_block_args)

            # Merge txt and img to pass through single stream blocks.
            x = torch.cat((img, txt), 1)
            if len(self.single_blocks) > 0:
                for _, block in enumerate(self.single_blocks):
                    single_block_args = [
                        x,
                        vec,
                        txt_seq_len,
                        cu_seqlens_q,
                        cu_seqlens_kv,
                        max_seqlen_q,
                        max_seqlen_kv,
                        (freqs_cos, freqs_sin),
                    ]

                    x = block(*single_block_args)

            img = x[:, :img_seq_len, ...]
            # Cache the residual so subsequent skipped steps can reuse it.
            self.previous_residual = img - ori_img
    else:
        # --------------------- Pass through DiT blocks ------------------------
        for _, block in enumerate(self.double_blocks):
            double_block_args = [
                img,
                txt,
                vec,
                cu_seqlens_q,
                cu_seqlens_kv,
                max_seqlen_q,
                max_seqlen_kv,
                freqs_cis,
            ]

            img, txt = block(*double_block_args)

        # Merge txt and img to pass through single stream blocks.
        x = torch.cat((img, txt), 1)
        if len(self.single_blocks) > 0:
            for _, block in enumerate(self.single_blocks):
                single_block_args = [
                    x,
                    vec,
                    txt_seq_len,
                    cu_seqlens_q,
                    cu_seqlens_kv,
                    max_seqlen_q,
                    max_seqlen_kv,
                    (freqs_cos, freqs_sin),
                ]

                x = block(*single_block_args)

        img = x[:, :img_seq_len, ...]

    # ---------------------------- Final layer ------------------------------
    img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)

    img = self.unpatchify(img, tt, th, tw)
    if return_dict:
        out["x"] = img
        return out
    return img


def main():
    args = parse_args()
    print(args)
    models_root_path = Path(args.model_base)
    if not models_root_path.exists():
        raise ValueError(f"`models_root` does not exist: {models_root_path}")

    # Create save folder to save the samples
    save_path = args.save_path if args.save_path_suffix == "" else f'{args.save_path}_{args.save_path_suffix}'
    if not os.path.exists(save_path):
        os.makedirs(save_path, exist_ok=True)

    # Load models
    hunyuan_video_sampler = HunyuanVideoSampler.from_pretrained(models_root_path, args=args)

    # Get the updated args
    args = hunyuan_video_sampler.args

    # TeaCache: patch the transformer class with the cache state and the forward above.
    hunyuan_video_sampler.pipeline.transformer.__class__.enable_teacache = True
    hunyuan_video_sampler.pipeline.transformer.__class__.cnt = 0
    hunyuan_video_sampler.pipeline.transformer.__class__.num_steps = args.infer_steps - 1
    hunyuan_video_sampler.pipeline.transformer.__class__.rel_l1_thresh = 0.15  # 0.1 for 1.6x speedup, 0.15 for 2.1x speedup
    hunyuan_video_sampler.pipeline.transformer.__class__.accumulated_rel_l1_distance = 0
    hunyuan_video_sampler.pipeline.transformer.__class__.previous_modulated_input = None
    hunyuan_video_sampler.pipeline.transformer.__class__.previous_residual = None
    hunyuan_video_sampler.pipeline.transformer.__class__.forward = teacache_forward

    # Start sampling
    # TODO: batch inference check
    outputs = hunyuan_video_sampler.predict(
        prompt=args.prompt,
        height=args.video_size[0],
        width=args.video_size[1],
        video_length=args.video_length,
        seed=args.seed,
        negative_prompt=args.neg_prompt,
        infer_steps=args.infer_steps,
        guidance_scale=args.cfg_scale,
        num_videos_per_prompt=args.num_videos,
        flow_shift=args.flow_shift,
        batch_size=args.batch_size,
        embedded_guidance_scale=args.embedded_cfg_scale
    )
    samples = outputs['samples']

    # Save samples
    if 'LOCAL_RANK' not in os.environ or int(os.environ['LOCAL_RANK']) == 0:
        for i, sample in enumerate(samples):
            sample = samples[i].unsqueeze(0)
            time_flag = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d-%H:%M:%S")
            cur_save_path = f"{save_path}/{time_flag}_seed{outputs['seeds'][i]}_{outputs['prompts'][i][:100].replace('/','')}.mp4"
            save_videos_grid(sample, cur_save_path, fps=24)
            logger.info(f'Sample saved to: {cur_save_path}')


if __name__ == "__main__":
    main()
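Note the enabling mechanism above: the script patches the transformer class itself, assigning the TeaCache state and the replacement `forward` on `__class__`, so a single assignment affects every instance sharing that class. This is also the change the eval-script hunks below make. A toy sketch of the pattern, with a hypothetical `DiT` class standing in for the real transformer:

```python
class DiT:                                   # stand-in for the real transformer class
    def forward(self, x):
        return x                             # original forward

def teacache_forward(self, x):
    # A cached variant would consult self.rel_l1_thresh, self.cnt, etc. here.
    return x

model = DiT()
model.__class__.enable_teacache = True       # class-level state, shared by all instances
model.__class__.rel_l1_thresh = 0.15
model.__class__.forward = teacache_forward   # method lookup now resolves to the patch
assert model.forward(42) == 42
```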
eval/teacache/experiments/cogvideox.py

@@ -194,25 +194,25 @@ def teacache_forward(
 
 def eval_teacache_slow(prompt_list):
     config = CogVideoXConfig()
     engine = VideoSysEngine(config)
-    engine.driver_worker.transformer.enable_teacache = True
-    engine.driver_worker.transformer.rel_l1_thresh = 0.1
-    engine.driver_worker.transformer.accumulated_rel_l1_distance = 0
-    engine.driver_worker.transformer.previous_modulated_input = None
-    engine.driver_worker.transformer.previous_residual = None
-    engine.driver_worker.transformer.previous_residual_encoder = None
-    engine.driver_worker.transformer.__class__.forward = teacache_forward
+    engine.driver_worker.transformer.__class__.enable_teacache = True
+    engine.driver_worker.transformer.__class__.rel_l1_thresh = 0.1
+    engine.driver_worker.transformer.__class__.accumulated_rel_l1_distance = 0
+    engine.driver_worker.transformer.__class__.previous_modulated_input = None
+    engine.driver_worker.transformer.__class__.previous_residual = None
+    engine.driver_worker.transformer.__class__.previous_residual_encoder = None
+    engine.driver_worker.transformer.__class__.__class__.forward = teacache_forward
     generate_func(engine, prompt_list, "./samples/cogvideox_teacache_slow", loop=5)
 
 
 def eval_teacache_fast(prompt_list):
     config = CogVideoXConfig()
     engine = VideoSysEngine(config)
-    engine.driver_worker.transformer.enable_teacache = True
-    engine.driver_worker.transformer.rel_l1_thresh = 0.2
-    engine.driver_worker.transformer.accumulated_rel_l1_distance = 0
-    engine.driver_worker.transformer.previous_modulated_input = None
-    engine.driver_worker.transformer.previous_residual = None
-    engine.driver_worker.transformer.previous_residual_encoder = None
-    engine.driver_worker.transformer.__class__.forward = teacache_forward
+    engine.driver_worker.transformer.__class__.enable_teacache = True
+    engine.driver_worker.transformer.__class__.rel_l1_thresh = 0.2
+    engine.driver_worker.transformer.__class__.accumulated_rel_l1_distance = 0
+    engine.driver_worker.transformer.__class__.previous_modulated_input = None
+    engine.driver_worker.transformer.__class__.previous_residual = None
+    engine.driver_worker.transformer.__class__.previous_residual_encoder = None
+    engine.driver_worker.transformer.__class__.__class__.forward = teacache_forward
     generate_func(engine, prompt_list, "./samples/cogvideox_teacache_fast", loop=5)
eval/teacache/experiments/latte.py

@@ -497,23 +497,23 @@ def teacache_forward(
 
 def eval_teacache_slow(prompt_list):
     config = LatteConfig()
     engine = VideoSysEngine(config)
-    engine.driver_worker.transformer.enable_teacache = True
-    engine.driver_worker.transformer.rel_l1_thresh = 0.1
-    engine.driver_worker.transformer.accumulated_rel_l1_distance = 0
-    engine.driver_worker.transformer.previous_modulated_input = None
-    engine.driver_worker.transformer.previous_residual = None
-    engine.driver_worker.transformer.__class__.forward = teacache_forward
+    engine.driver_worker.transformer.__class__.enable_teacache = True
+    engine.driver_worker.transformer.__class__.rel_l1_thresh = 0.1
+    engine.driver_worker.transformer.__class__.accumulated_rel_l1_distance = 0
+    engine.driver_worker.transformer.__class__.previous_modulated_input = None
+    engine.driver_worker.transformer.__class__.previous_residual = None
+    engine.driver_worker.transformer.__class__.__class__.forward = teacache_forward
     generate_func(engine, prompt_list, "./samples/latte_teacache_slow", loop=5)
 
 
 def eval_teacache_fast(prompt_list):
     config = LatteConfig()
     engine = VideoSysEngine(config)
-    engine.driver_worker.transformer.enable_teacache = True
-    engine.driver_worker.transformer.rel_l1_thresh = 0.2
-    engine.driver_worker.transformer.accumulated_rel_l1_distance = 0
-    engine.driver_worker.transformer.previous_modulated_input = None
-    engine.driver_worker.transformer.previous_residual = None
-    engine.driver_worker.transformer.__class__.forward = teacache_forward
+    engine.driver_worker.transformer.__class__.enable_teacache = True
+    engine.driver_worker.transformer.__class__.rel_l1_thresh = 0.2
+    engine.driver_worker.transformer.__class__.accumulated_rel_l1_distance = 0
+    engine.driver_worker.transformer.__class__.previous_modulated_input = None
+    engine.driver_worker.transformer.__class__.previous_residual = None
+    engine.driver_worker.transformer.__class__.__class__.forward = teacache_forward
     generate_func(engine, prompt_list, "./samples/latte_teacache_fast", loop=5)
eval/teacache/experiments/opensora.py

@@ -211,23 +211,23 @@ def eval_base(prompt_list):
 
 def eval_teacache_slow(prompt_list):
     config = OpenSoraConfig()
     engine = VideoSysEngine(config)
-    engine.driver_worker.transformer.enable_teacache = True
-    engine.driver_worker.transformer.rel_l1_thresh = 0.1
-    engine.driver_worker.transformer.accumulated_rel_l1_distance = 0
-    engine.driver_worker.transformer.previous_modulated_input = None
-    engine.driver_worker.transformer.previous_residual = None
-    engine.driver_worker.transformer.__class__.forward = teacache_forward
+    engine.driver_worker.transformer.__class__.enable_teacache = True
+    engine.driver_worker.transformer.__class__.rel_l1_thresh = 0.1
+    engine.driver_worker.transformer.__class__.accumulated_rel_l1_distance = 0
+    engine.driver_worker.transformer.__class__.previous_modulated_input = None
+    engine.driver_worker.transformer.__class__.previous_residual = None
+    engine.driver_worker.transformer.__class__.__class__.forward = teacache_forward
     generate_func(engine, prompt_list, "./samples/opensora_teacache_slow", loop=5)
 
 
 def eval_teacache_fast(prompt_list):
     config = OpenSoraConfig()
     engine = VideoSysEngine(config)
-    engine.driver_worker.transformer.enable_teacache = True
-    engine.driver_worker.transformer.rel_l1_thresh = 0.2
-    engine.driver_worker.transformer.accumulated_rel_l1_distance = 0
-    engine.driver_worker.transformer.previous_modulated_input = None
-    engine.driver_worker.transformer.previous_residual = None
-    engine.driver_worker.transformer.__class__.forward = teacache_forward
+    engine.driver_worker.transformer.__class__.enable_teacache = True
+    engine.driver_worker.transformer.__class__.rel_l1_thresh = 0.2
+    engine.driver_worker.transformer.__class__.accumulated_rel_l1_distance = 0
+    engine.driver_worker.transformer.__class__.previous_modulated_input = None
+    engine.driver_worker.transformer.__class__.previous_residual = None
+    engine.driver_worker.transformer.__class__.__class__.forward = teacache_forward
     generate_func(engine, prompt_list, "./samples/opensora_teacache_fast", loop=5)
eval/teacache/experiments/opensora_plan.py

@@ -560,23 +560,23 @@ def teacache_forward(
 
 def eval_teacache_slow(prompt_list):
     config = OpenSoraPlanConfig(version="v110", transformer_type="65x512x512")
     engine = VideoSysEngine(config)
-    engine.driver_worker.transformer.enable_teacache = True
-    engine.driver_worker.transformer.rel_l1_thresh = 0.1
-    engine.driver_worker.transformer.accumulated_rel_l1_distance = 0
-    engine.driver_worker.transformer.previous_modulated_input = None
-    engine.driver_worker.transformer.previous_residual = None
-    engine.driver_worker.transformer.__class__.forward = teacache_forward
+    engine.driver_worker.transformer.__class__.enable_teacache = True
+    engine.driver_worker.transformer.__class__.rel_l1_thresh = 0.1
+    engine.driver_worker.transformer.__class__.accumulated_rel_l1_distance = 0
+    engine.driver_worker.transformer.__class__.previous_modulated_input = None
+    engine.driver_worker.transformer.__class__.previous_residual = None
+    engine.driver_worker.transformer.__class__.__class__.forward = teacache_forward
     generate_func(engine, prompt_list, "./samples/opensoraplan_teacache_slow", loop=5)
 
 
 def eval_teacache_fast(prompt_list):
     config = OpenSoraPlanConfig(version="v110", transformer_type="65x512x512")
     engine = VideoSysEngine(config)
-    engine.driver_worker.transformer.enable_teacache = True
-    engine.driver_worker.transformer.rel_l1_thresh = 0.2
-    engine.driver_worker.transformer.accumulated_rel_l1_distance = 0
-    engine.driver_worker.transformer.previous_modulated_input = None
-    engine.driver_worker.transformer.previous_residual = None
-    engine.driver_worker.transformer.__class__.forward = teacache_forward
+    engine.driver_worker.transformer.__class__.enable_teacache = True
+    engine.driver_worker.transformer.__class__.rel_l1_thresh = 0.2
+    engine.driver_worker.transformer.__class__.accumulated_rel_l1_distance = 0
+    engine.driver_worker.transformer.__class__.previous_modulated_input = None
+    engine.driver_worker.transformer.__class__.previous_residual = None
+    engine.driver_worker.transformer.__class__.__class__.forward = teacache_forward
     generate_func(engine, prompt_list, "./samples/opensoraplan_teacache_fast", loop=5)