Merge branch 'main' into add-5b

BrettMeirhofer 2024-08-27 09:25:52 -05:00 committed by GitHub
commit dc9de70259
4 changed files with 600 additions and 82 deletions

View File

@@ -0,0 +1,415 @@
{
"last_node_id": 33,
"last_link_id": 59,
"nodes": [
{
"id": 20,
"type": "CLIPLoader",
"pos": [
-59,
397
],
"size": {
"0": 451.30548095703125,
"1": 82
},
"flags": {},
"order": 0,
"mode": 0,
"outputs": [
{
"name": "CLIP",
"type": "CLIP",
"links": [
54,
56
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CLIPLoader"
},
"widgets_values": [
"t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
"sd3"
]
},
{
"id": 31,
"type": "CogVideoTextEncode",
"pos": [
503,
521
],
"size": {
"0": 463.01251220703125,
"1": 98.10446166992188
},
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 56
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
57
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
""
]
},
{
"id": 11,
"type": "CogVideoDecode",
"pos": [
1140,
783
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 37
},
{
"name": "samples",
"type": "LATENT",
"link": 38
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
59
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{
"id": 1,
"type": "DownloadAndLoadCogVideoModel",
"pos": [
649,
182
],
"size": {
"0": 315,
"1": 82
},
"flags": {},
"order": 1,
"mode": 0,
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
36
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoModel"
},
"widgets_values": [
"THUDM/CogVideoX-5b",
"bf16"
]
},
{
"id": 22,
"type": "CogVideoSampler",
"pos": [
1041,
342
],
"size": {
"0": 315,
"1": 382
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 36
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 55,
"slot_index": 1
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 57
},
{
"name": "samples",
"type": "LATENT",
"link": null
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
37
],
"shape": 3
},
{
"name": "samples",
"type": "LATENT",
"links": [
38
],
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoSampler"
},
"widgets_values": [
480,
720,
49,
8,
50,
7,
806286757407563,
"fixed",
"DPM",
49,
8,
1
]
},
{
"id": 33,
"type": "VHS_VideoCombine",
"pos": [
1533,
136
],
"size": [
778.7022705078125,
853.801513671875
],
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 59
},
{
"name": "audio",
"type": "AUDIO",
"link": null
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
},
{
"name": "vae",
"type": "VAE",
"link": null
}
],
"outputs": [
{
"name": "Filenames",
"type": "VHS_FILENAMES",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_VideoCombine"
},
"widgets_values": {
"frame_rate": 8,
"loop_count": 0,
"filename_prefix": "CogVideoX5B",
"format": "video/nvenc_h264-mp4",
"pix_fmt": "yuv420p",
"bitrate": 10,
"megabit": true,
"save_metadata": true,
"pingpong": false,
"save_output": false,
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "CogVideoX5B.mp4",
"subfolder": "",
"type": "temp",
"format": "video/nvenc_h264-mp4",
"frame_rate": 8
},
"muted": false
}
}
},
{
"id": 30,
"type": "CogVideoTextEncode",
"pos": [
500,
308
],
"size": {
"0": 474.8450012207031,
"1": 164.7423553466797
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 54
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
55
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"The camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a steep mountain slope, dust kicks up from its tires, the sunlight shines on the SUV as it speeds along the dirt road, casting a warm glow over the scene. The dirt road curves gently into the distance, with no other cars or vehicles in sight. The trees on either side of the road are redwoods, with patches of greenery scattered throughout. The car is seen from the rear following the curve with ease, making it seem as if it is on a rugged drive through the rugged terrain. The dirt road itself is surrounded by steep hills and mountains, with a clear blue sky above with wispy clouds.\n"
]
}
],
"links": [
[
36,
1,
0,
22,
0,
"COGVIDEOPIPE"
],
[
37,
22,
0,
11,
0,
"COGVIDEOPIPE"
],
[
38,
22,
1,
11,
1,
"LATENT"
],
[
54,
20,
0,
30,
0,
"CLIP"
],
[
55,
30,
0,
22,
1,
"CONDITIONING"
],
[
56,
20,
0,
31,
0,
"CLIP"
],
[
57,
31,
0,
22,
2,
"CONDITIONING"
],
[
59,
11,
0,
33,
0,
"IMAGE"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.7513148009015782,
"offset": [
106.37225000664994,
78.14886929032406
]
}
},
"version": 0.4
}
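The workflow above is stored in ComfyUI's graph format (a list of nodes plus flat link records). For reference, a minimal sketch of inspecting it offline with plain Python; the file name used here is an assumption for illustration, not part of this commit:

import json

# Hypothetical path; point it at wherever the workflow JSON above is saved.
with open("cogvideox_5b_example.json", "r", encoding="utf-8") as f:
    workflow = json.load(f)

# Each node carries its type and the widget values shown above.
for node in workflow["nodes"]:
    print(node["id"], node["type"], node.get("widgets_values"))

# Each link is [link_id, from_node, from_slot, to_node, to_slot, type];
# resolving the ids gives a readable picture of the graph wiring.
node_types = {node["id"]: node["type"] for node in workflow["nodes"]}
for link_id, src, src_slot, dst, dst_slot, link_type in workflow["links"]:
    print(f"{node_types[src]}[{src_slot}] --{link_type}--> {node_types[dst]}[{dst_slot}]")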

View File

@@ -48,7 +48,10 @@ class DownloadAndLoadCogVideoModel:
         dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision]
-        base_path = os.path.join(folder_paths.models_dir, "CogVideo", "CogVideo2B")
+        if "2b" in model:
+            base_path = os.path.join(folder_paths.models_dir, "CogVideo", "CogVideo2B")
+        elif "5b" in model:
+            base_path = os.path.join(folder_paths.models_dir, "CogVideo", "CogVideoX-5b")
         if not os.path.exists(base_path):
             log.info(f"Downloading model to: {base_path}")
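The download branch itself sits outside this hunk. A minimal sketch of the selection-plus-download flow, assuming huggingface_hub's snapshot_download is the underlying mechanism (huggingface_hub is listed in requirements.txt); the function and variable names are illustrative, not the wrapper's exact code:

import os
from huggingface_hub import snapshot_download

def resolve_cogvideo_path(model: str, models_dir: str) -> str:
    # Mirrors the branching added above: one local folder per model variant.
    if "2b" in model:
        base_path = os.path.join(models_dir, "CogVideo", "CogVideo2B")
    elif "5b" in model:
        base_path = os.path.join(models_dir, "CogVideo", "CogVideoX-5b")
    else:
        raise ValueError(f"Unknown CogVideo variant: {model}")
    # Assumes the node's model string ("THUDM/CogVideoX-5b") doubles as the Hugging Face repo id.
    if not os.path.exists(base_path):
        snapshot_download(repo_id=model, local_dir=base_path)
    return base_path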
@@ -205,14 +208,14 @@ class CogVideoSampler:
                 "negative": ("CONDITIONING", ),
                 "height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
                 "width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
-                "num_frames": ("INT", {"default": 48, "min": 8, "max": 1024, "step": 8}),
+                "num_frames": ("INT", {"default": 48, "min": 8, "max": 1024, "step": 1}),
                 "fps": ("INT", {"default": 8, "min": 1, "max": 100, "step": 1}),
                 "steps": ("INT", {"default": 25, "min": 1}),
                 "cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
                 "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
                 "scheduler": (["DDIM", "DPM"],),
-                "t_tile_length": ("INT", {"default": 16, "min": 16, "max": 128, "step": 4}),
-                "t_tile_overlap": ("INT", {"default": 8, "min": 8, "max": 128, "step": 2}),
+                "t_tile_length": ("INT", {"default": 16, "min": 2, "max": 128, "step": 1}),
+                "t_tile_overlap": ("INT", {"default": 8, "min": 2, "max": 128, "step": 1}),
             },
             "optional": {
                 "samples": ("LATENT", ),
@@ -282,10 +285,10 @@ class CogVideoDecode:
     RETURN_TYPES = ("IMAGE",)
     RETURN_NAMES = ("images",)
-    FUNCTION = "process"
+    FUNCTION = "decode"
     CATEGORY = "CogVideoWrapper"

-    def process(self, pipeline, samples):
+    def decode(self, pipeline, samples):
         device = mm.get_torch_device()
         offload_device = mm.unet_offload_device()
         latents = samples["samples"]
@@ -305,19 +308,20 @@ class CogVideoDecode:
         frames = []
         pbar = ProgressBar(num_seconds)
-        for i in range(num_seconds):
-            start_frame, end_frame = (0, 3) if i == 0 else (2 * i + 1, 2 * i + 3)
-            current_frames = vae.decode(latents[:, :, start_frame:end_frame]).sample
-            frames.append(current_frames)
-            pbar.update(1)
-        vae.clear_fake_context_parallel_cache()
+        # for i in range(num_seconds):
+        #     start_frame, end_frame = (0, 3) if i == 0 else (2 * i + 1, 2 * i + 3)
+        #     current_frames = vae.decode(latents[:, :, start_frame:end_frame]).sample
+        #     frames.append(current_frames)
+        #     pbar.update(1)
+        frames = vae.decode(latents).sample
         vae.to(offload_device)
         mm.soft_empty_cache()
-        frames = torch.cat(frames, dim=2)
+        #frames = torch.cat(frames, dim=2)
         video = pipeline["pipe"].video_processor.postprocess_video(video=frames, output_type="pt")
         video = video[0].permute(0, 2, 3, 1).cpu().float()
+        print(video.min(), video.max())
         return (video,)
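The rewrite above drops the per-second chunked decode and runs a single vae.decode over all latent frames; the trailing permute converts the post-processed video into ComfyUI's IMAGE layout. A standalone sketch of that last reshaping step, using a dummy tensor whose shape is an assumption for illustration:

import torch

# Assume postprocess_video(..., output_type="pt") yields [batch, frames, channels, height, width].
video = torch.rand(1, 49, 3, 480, 720)

# Same operation as the node: take the first batch element and move channels last,
# giving [frames, height, width, channels], the layout ComfyUI IMAGE batches use.
images = video[0].permute(0, 2, 3, 1).cpu().float()
print(images.shape)  # torch.Size([49, 480, 720, 3])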

View File

@@ -17,6 +17,7 @@ import inspect
 from typing import Callable, Dict, List, Optional, Tuple, Union

 import torch
+import math

 from diffusers.models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
@@ -24,11 +25,29 @@ from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
 from diffusers.utils import logging
 from diffusers.utils.torch_utils import randn_tensor
 from diffusers.video_processor import VideoProcessor
+from diffusers.models.embeddings import get_3d_rotary_pos_embed

 from comfy.utils import ProgressBar

 logger = logging.get_logger(__name__) # pylint: disable=invalid-name

+def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
+    tw = tgt_width
+    th = tgt_height
+    h, w = src
+    r = h / w
+    if r > (th / tw):
+        resize_height = th
+        resize_width = int(round(th / h * w))
+    else:
+        resize_width = tw
+        resize_height = int(round(tw / w * h))
+
+    crop_top = int(round((th - resize_height) / 2.0))
+    crop_left = int(round((tw - resize_width) / 2.0))
+
+    return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
+
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
 def retrieve_timesteps(
     scheduler,
@@ -229,6 +248,46 @@ class CogVideoXPipeline(DiffusionPipeline):
         weights = weights.unsqueeze(0).unsqueeze(2).unsqueeze(3).unsqueeze(4).repeat(1, t_batch_size,1, 1, 1)
         return weights

+    def fuse_qkv_projections(self) -> None:
+        r"""Enables fused QKV projections."""
+        self.fusing_transformer = True
+        self.transformer.fuse_qkv_projections()
+
+    def unfuse_qkv_projections(self) -> None:
+        r"""Disable QKV projection fusion if enabled."""
+        if not self.fusing_transformer:
+            logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.")
+        else:
+            self.transformer.unfuse_qkv_projections()
+            self.fusing_transformer = False
+
+    def _prepare_rotary_positional_embeddings(
+        self,
+        height: int,
+        width: int,
+        num_frames: int,
+        device: torch.device,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+        grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+        base_size_width = 720 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+        base_size_height = 480 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+
+        grid_crops_coords = get_resize_crop_region_for_grid(
+            (grid_height, grid_width), base_size_width, base_size_height
+        )
+        freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
+            embed_dim=self.transformer.config.attention_head_dim,
+            crops_coords=grid_crops_coords,
+            grid_size=(grid_height, grid_width),
+            temporal_size=num_frames,
+            use_real=True,
+        )
+
+        freqs_cos = freqs_cos.to(device=device)
+        freqs_sin = freqs_sin.to(device=device)
+        return freqs_cos, freqs_sin
+
     @property
     def guidance_scale(self):
         return self._guidance_scale
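For intuition, a worked example of the grid math used by _prepare_rotary_positional_embeddings above, assuming CogVideoX's usual spatial VAE scale factor of 8 and transformer patch size of 2 (neither value appears in this diff):

# 480x720 output, the base resolution the rotary embeddings are defined against.
vae_scale_factor_spatial, patch_size = 8, 2  # assumed defaults, not taken from this diff
grid_height = 480 // (vae_scale_factor_spatial * patch_size)       # 30
grid_width = 720 // (vae_scale_factor_spatial * patch_size)        # 45
base_size_height = 480 // (vae_scale_factor_spatial * patch_size)  # 30
base_size_width = 720 // (vae_scale_factor_spatial * patch_size)   # 45

# get_resize_crop_region_for_grid((30, 45), 45, 30) returns ((0, 0), (30, 45)):
# at the native 480x720 resolution the crop window covers the whole grid, so no
# positional cropping happens; other aspect ratios yield a centered crop region.
print(grid_height, grid_width)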
@@ -374,6 +433,15 @@ class CogVideoXPipeline(DiffusionPipeline):
         t_tile_weights = self._gaussian_weights(t_tile_length=t_tile_length, t_batch_size=1).to(latents.device).to(latents.dtype)
         print("latents.shape", latents.shape)
         print("latents.device", latents.device)

+        # 6.5. Create rotary embeds if required
+        image_rotary_emb = (
+            self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
+            if self.transformer.config.use_rotary_positional_embeddings
+            else None
+        )
+
         # 7. Denoising loop
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
         comfy_pbar = ProgressBar(num_inference_steps)
@@ -383,94 +451,125 @@ class CogVideoXPipeline(DiffusionPipeline):
             for i, t in enumerate(timesteps):
                 if self.interrupt:
                     continue

-                #temporal tiling code based on https://github.com/mayuelala/FollowYourEmoji/blob/main/models/video_pipeline.py
-                # =====================================================
-                grid_ts = 0
-                cur_t = 0
-                while cur_t < latents.shape[1]:
-                    cur_t = max(grid_ts * t_tile_length - t_tile_overlap * grid_ts, 0) + t_tile_length
-                    grid_ts += 1
-
-                all_t = latents.shape[1]
-                latents_all_list = []
-                # =====================================================
-
-                for t_i in range(grid_ts):
-                    if t_i < grid_ts - 1:
-                        ofs_t = max(t_i * t_tile_length - t_tile_overlap * t_i, 0)
-                    if t_i == grid_ts - 1:
-                        ofs_t = all_t - t_tile_length
-
-                    input_start_t = ofs_t
-                    input_end_t = ofs_t + t_tile_length
-
-                    #latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
-                    #latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-                    latents_tile = latents[:, input_start_t:input_end_t,:, :, :]
-                    latent_model_input_tile = torch.cat([latents_tile] * 2) if do_classifier_free_guidance else latents_tile
-                    latent_model_input_tile = self.scheduler.scale_model_input(latent_model_input_tile, t)
-
-                    #t_input = t[None].to(device)
-                    t_input = t.expand(latent_model_input_tile.shape[0]) # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-
-                    # predict noise model_output
-                    noise_pred = self.transformer(
-                        hidden_states=latent_model_input_tile,
-                        encoder_hidden_states=prompt_embeds,
-                        timestep=t_input,
-                        return_dict=False,
-                    )[0]
-                    noise_pred = noise_pred.float()
-
-                    if self.do_classifier_free_guidance:
-                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                        noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-                    # compute the previous noisy sample x_t -> x_t-1
-                    if not isinstance(self.scheduler, CogVideoXDPMScheduler):
-                        latents_tile = self.scheduler.step(noise_pred, t, latents_tile, **extra_step_kwargs, return_dict=False)[0]
-                    else:
-                        raise NotImplementedError("DPM is not supported with temporal tiling")
-                    # else:
-                    #     latents_tile, old_pred_original_sample = self.scheduler.step(
-                    #         noise_pred,
-                    #         old_pred_original_sample,
-                    #         t,
-                    #         t_input[t_i - 1] if t_i > 0 else None,
-                    #         latents_tile,
-                    #         **extra_step_kwargs,
-                    #         return_dict=False,
-                    #     )
-                    latents_all_list.append(latents_tile)
-
-                # ==========================================
-                latents_all = torch.zeros(latents.shape, device=latents.device, dtype=latents.dtype)
-                contributors = torch.zeros(latents.shape, device=latents.device, dtype=latents.dtype)
-                # Add each tile contribution to overall latents
-                for t_i in range(grid_ts):
-                    if t_i < grid_ts - 1:
-                        ofs_t = max(t_i * t_tile_length - t_tile_overlap * t_i, 0)
-                    if t_i == grid_ts - 1:
-                        ofs_t = all_t - t_tile_length
-
-                    input_start_t = ofs_t
-                    input_end_t = ofs_t + t_tile_length
-
-                    latents_all[:, input_start_t:input_end_t,:, :, :] += latents_all_list[t_i] * t_tile_weights
-                    contributors[:, input_start_t:input_end_t,:, :, :] += t_tile_weights
-
-                latents_all /= contributors
-                latents = latents_all
-                # ==========================================
-
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-                    comfy_pbar.update(1)
+                if not isinstance(self.scheduler, CogVideoXDPMScheduler):
+                    #temporal tiling code based on https://github.com/mayuelala/FollowYourEmoji/blob/main/models/video_pipeline.py
+                    # =====================================================
+                    grid_ts = 0
+                    cur_t = 0
+                    while cur_t < latents.shape[1]:
+                        cur_t = max(grid_ts * t_tile_length - t_tile_overlap * grid_ts, 0) + t_tile_length
+                        grid_ts += 1
+
+                    all_t = latents.shape[1]
+                    latents_all_list = []
+                    # =====================================================
+
+                    for t_i in range(grid_ts):
+                        if t_i < grid_ts - 1:
+                            ofs_t = max(t_i * t_tile_length - t_tile_overlap * t_i, 0)
+                        if t_i == grid_ts - 1:
+                            ofs_t = all_t - t_tile_length
+
+                        input_start_t = ofs_t
+                        input_end_t = ofs_t + t_tile_length
+
+                        #latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                        #latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                        latents_tile = latents[:, input_start_t:input_end_t,:, :, :]
+                        latent_model_input_tile = torch.cat([latents_tile] * 2) if do_classifier_free_guidance else latents_tile
+                        latent_model_input_tile = self.scheduler.scale_model_input(latent_model_input_tile, t)
+
+                        #t_input = t[None].to(device)
+                        t_input = t.expand(latent_model_input_tile.shape[0]) # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+
+                        # predict noise model_output
+                        noise_pred = self.transformer(
+                            hidden_states=latent_model_input_tile,
+                            encoder_hidden_states=prompt_embeds,
+                            timestep=t_input,
+                            image_rotary_emb=image_rotary_emb,
+                            return_dict=False,
+                        )[0]
+                        noise_pred = noise_pred.float()
+
+                        if self.do_classifier_free_guidance:
+                            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                            noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                        # compute the previous noisy sample x_t -> x_t-1
+                        latents_tile = self.scheduler.step(noise_pred, t, latents_tile, **extra_step_kwargs, return_dict=False)[0]
+                        latents_all_list.append(latents_tile)
+
+                    # ==========================================
+                    latents_all = torch.zeros(latents.shape, device=latents.device, dtype=latents.dtype)
+                    contributors = torch.zeros(latents.shape, device=latents.device, dtype=latents.dtype)
+                    # Add each tile contribution to overall latents
+                    for t_i in range(grid_ts):
+                        if t_i < grid_ts - 1:
+                            ofs_t = max(t_i * t_tile_length - t_tile_overlap * t_i, 0)
+                        if t_i == grid_ts - 1:
+                            ofs_t = all_t - t_tile_length
+
+                        input_start_t = ofs_t
+                        input_end_t = ofs_t + t_tile_length
+
+                        latents_all[:, input_start_t:input_end_t,:, :, :] += latents_all_list[t_i] * t_tile_weights
+                        contributors[:, input_start_t:input_end_t,:, :, :] += t_tile_weights
+
+                    latents_all /= contributors
+                    latents = latents_all
+
+                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                        progress_bar.update()
+                        comfy_pbar.update(1)
+                    # ==========================================
+                else:
+                    latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                    # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                    timestep = t.expand(latent_model_input.shape[0])
+
+                    # predict noise model_output
+                    noise_pred = self.transformer(
+                        hidden_states=latent_model_input,
+                        encoder_hidden_states=prompt_embeds,
+                        timestep=timestep,
+                        image_rotary_emb=image_rotary_emb,
+                        return_dict=False,
+                    )[0]
+                    noise_pred = noise_pred.float()
+
+                    self._guidance_scale = 1 + guidance_scale * (
+                        (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
+                    )
+                    if do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                    if not isinstance(self.scheduler, CogVideoXDPMScheduler):
+                        latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                    else:
+                        latents, old_pred_original_sample = self.scheduler.step(
+                            noise_pred,
+                            old_pred_original_sample,
+                            t,
+                            timesteps[i - 1] if i > 0 else None,
+                            latents,
+                            **extra_step_kwargs,
+                            return_dict=False,
+                        )
+                    latents = latents.to(prompt_embeds.dtype)
+
+                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                        progress_bar.update()
+                        comfy_pbar.update(1)

         # Offload all models
         self.maybe_free_model_hooks()
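The DDIM branch above splits the latent sequence into overlapping temporal tiles, denoises each tile, and blends them back using Gaussian weights from self._gaussian_weights (defined earlier in the file and unchanged here). A standalone sketch of the count-and-blend scheme on dummy tensors, with a simple window standing in for the pipeline's Gaussian weights:

import torch

def count_temporal_tiles(num_latent_frames: int, t_tile_length: int, t_tile_overlap: int) -> int:
    # Same loop as the diff: tiles advance by (t_tile_length - t_tile_overlap) frames.
    grid_ts, cur_t = 0, 0
    while cur_t < num_latent_frames:
        cur_t = max(grid_ts * t_tile_length - t_tile_overlap * grid_ts, 0) + t_tile_length
        grid_ts += 1
    return grid_ts

def blend_tiles(latents: torch.Tensor, tiles: list, t_tile_length: int, t_tile_overlap: int) -> torch.Tensor:
    # latents: [B, T, C, H, W]; tiles[i]: [B, t_tile_length, C, H, W] after denoising.
    all_t = latents.shape[1]
    # Stand-in for _gaussian_weights: any positive per-frame weighting works for the demo.
    weights = torch.hann_window(t_tile_length, periodic=False).clamp(min=1e-2).view(1, -1, 1, 1, 1)
    out = torch.zeros_like(latents)
    contributors = torch.zeros_like(latents)
    for t_i, tile in enumerate(tiles):
        ofs_t = max(t_i * t_tile_length - t_tile_overlap * t_i, 0)
        if t_i == len(tiles) - 1:  # last tile is anchored to the end, as in the diff
            ofs_t = all_t - t_tile_length
        out[:, ofs_t:ofs_t + t_tile_length] += tile * weights
        contributors[:, ofs_t:ofs_t + t_tile_length] += weights
    return out / contributors

# Example: 24 latent frames with tiles of length 16 and overlap 8 give 2 tiles;
# frames 8-15 are covered by both tiles and end up weight-averaged.
latents = torch.rand(1, 24, 16, 60, 90)
n = count_temporal_tiles(24, 16, 8)
tiles = [torch.rand(1, 16, 16, 60, 90) for _ in range(n)]
print(n, blend_tiles(latents, tiles, 16, 8).shape)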

View File

@@ -1,2 +1,2 @@
 huggingface_hub
-diffusers>=0.30.0
+diffusers>=0.30.1