Mirror of https://git.datalinker.icu/comfyanonymous/ComfyUI (synced 2025-12-10 06:24:26 +08:00)
* Looking into a @wrap_attn decorator to look for 'optimized_attention_override' entry in transformer_options
* Created logging code for this branch so that it can be used to track down all the code paths where transformer_options would need to be added
* Fix memory usage issue with inspect
* Made WAN attention receive transformer_options, test node added to wan to test out attention override later
* Added **kwargs to all attention functions so transformer_options could potentially be passed through
* Make sure wrap_attn doesn't make itself recurse infinitely, attempt to load SageAttention and FlashAttention if not enabled so that they can be marked as available or not, create registry for available attention
* Turn off attention logging for now, make AttentionOverrideTestNode have a dropdown with available attention (this is a test node only)
* Make flux work with optimized_attention_override
* Add logs to verify optimized_attention_override is passed all the way into attention function
* Make Qwen work with optimized_attention_override
* Made hidream work with optimized_attention_override
* Made wan patches_replace work with optimized_attention_override
* Made SD3 work with optimized_attention_override
* Made HunyuanVideo work with optimized_attention_override
* Made Mochi work with optimized_attention_override
* Made LTX work with optimized_attention_override
* Made StableAudio work with optimized_attention_override
* Made optimized_attention_override work with ACE Step
* Made Hunyuan3D work with optimized_attention_override
* Make CosmosPredict2 work with optimized_attention_override
* Made CosmosVideo work with optimized_attention_override
* Made Omnigen 2 work with optimized_attention_override
* Made StableCascade work with optimized_attention_override
* Made AuraFlow work with optimized_attention_override
* Made Lumina work with optimized_attention_override
* Made Chroma work with optimized_attention_override
* Made SVD work with optimized_attention_override
* Fix WanI2VCrossAttention so that it expects to receive transformer_options
* Fixed Wan2.1 Fun Camera transformer_options passthrough
* Fixed WAN 2.1 VACE transformer_options passthrough
* Add optimized to get_attention_function
* Disable attention logs for now
* Remove attention logging code
* Remove _register_core_attention_functions, as we wouldn't want someone to call that, just in case
* Satisfy ruff
* Remove AttentionOverrideTest node, that's something to cook up for later
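The list above describes plumbing transformer_options into every attention call so that an 'optimized_attention_override' entry can intercept it. As a hedged illustration only (the exact call contract of the override is not shown in this commit and is an assumption here), an override that simply delegates back to the default attention might be supplied like this:

# Minimal sketch, not part of the commit: the signature assumed here (the wrapped
# attention callable followed by its original arguments) should be verified against
# the wrap_attn decorator in comfy/ldm/modules/attention.py.
def my_attention_override(optimized_attention, *args, **kwargs):
    # optimized_attention is assumed to be the attention function being wrapped;
    # *args/**kwargs are whatever the model passed (q, k, v, heads, mask, ...).
    return optimized_attention(*args, **kwargs)

# The override is assumed to travel with the sampling call through transformer_options:
transformer_options = {"optimized_attention_override": my_attention_override}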
149 lines · 6.3 KiB · Python
import torch
from torch import nn
from comfy.ldm.flux.layers import (
    DoubleStreamBlock,
    LastLayer,
    MLPEmbedder,
    SingleStreamBlock,
    timestep_embedding,
)
import comfy.patcher_extension


class Hunyuan3Dv2(nn.Module):
    def __init__(
        self,
        in_channels=64,
        context_in_dim=1536,
        hidden_size=1024,
        mlp_ratio=4.0,
        num_heads=16,
        depth=16,
        depth_single_blocks=32,
        qkv_bias=True,
        guidance_embed=False,
        image_model=None,
        dtype=None,
        device=None,
        operations=None
    ):
        super().__init__()
        self.dtype = dtype

        if hidden_size % num_heads != 0:
            raise ValueError(
                f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}"
            )

        self.max_period = 1000  # While reimplementing the model I noticed that they messed up. This 1000 value was meant to be the time_factor but they set the max_period instead
        self.latent_in = operations.Linear(in_channels, hidden_size, bias=True, dtype=dtype, device=device)
        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=hidden_size, dtype=dtype, device=device, operations=operations)
        self.guidance_in = (
            MLPEmbedder(in_dim=256, hidden_dim=hidden_size, dtype=dtype, device=device, operations=operations) if guidance_embed else None
        )
        self.cond_in = operations.Linear(context_in_dim, hidden_size, dtype=dtype, device=device)
        self.double_blocks = nn.ModuleList(
            [
                DoubleStreamBlock(
                    hidden_size,
                    num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    dtype=dtype, device=device, operations=operations
                )
                for _ in range(depth)
            ]
        )
        self.single_blocks = nn.ModuleList(
            [
                SingleStreamBlock(
                    hidden_size,
                    num_heads,
                    mlp_ratio=mlp_ratio,
                    dtype=dtype, device=device, operations=operations
                )
                for _ in range(depth_single_blocks)
            ]
        )
        self.final_layer = LastLayer(hidden_size, 1, in_channels, dtype=dtype, device=device, operations=operations)

    def forward(self, x, timestep, context, guidance=None, transformer_options={}, **kwargs):
        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
            self._forward,
            self,
            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
        ).execute(x, timestep, context, guidance, transformer_options, **kwargs)

    def _forward(self, x, timestep, context, guidance=None, transformer_options={}, **kwargs):
        x = x.movedim(-1, -2)
        timestep = 1.0 - timestep
        txt = context
        img = self.latent_in(x)

        vec = self.time_in(timestep_embedding(timestep, 256, self.max_period).to(dtype=img.dtype))
        if self.guidance_in is not None:
            if guidance is not None:
                vec = vec + self.guidance_in(timestep_embedding(guidance, 256, self.max_period).to(img.dtype))

        txt = self.cond_in(txt)
        pe = None
        attn_mask = None

        patches_replace = transformer_options.get("patches_replace", {})
        blocks_replace = patches_replace.get("dit", {})
        for i, block in enumerate(self.double_blocks):
            if ("double_block", i) in blocks_replace:
                def block_wrap(args):
                    out = {}
                    out["img"], out["txt"] = block(img=args["img"],
                                                   txt=args["txt"],
                                                   vec=args["vec"],
                                                   pe=args["pe"],
                                                   attn_mask=args.get("attn_mask"),
                                                   transformer_options=args["transformer_options"])
                    return out

                out = blocks_replace[("double_block", i)]({"img": img,
                                                           "txt": txt,
                                                           "vec": vec,
                                                           "pe": pe,
                                                           "attn_mask": attn_mask,
                                                           "transformer_options": transformer_options},
                                                          {"original_block": block_wrap})
                txt = out["txt"]
                img = out["img"]
            else:
                img, txt = block(img=img,
                                 txt=txt,
                                 vec=vec,
                                 pe=pe,
                                 attn_mask=attn_mask,
                                 transformer_options=transformer_options)

        img = torch.cat((txt, img), 1)

        for i, block in enumerate(self.single_blocks):
            if ("single_block", i) in blocks_replace:
                def block_wrap(args):
                    out = {}
                    out["img"] = block(args["img"],
                                       vec=args["vec"],
                                       pe=args["pe"],
                                       attn_mask=args.get("attn_mask"),
                                       transformer_options=args["transformer_options"])
                    return out

                out = blocks_replace[("single_block", i)]({"img": img,
                                                           "vec": vec,
                                                           "pe": pe,
                                                           "attn_mask": attn_mask,
                                                           "transformer_options": transformer_options},
                                                          {"original_block": block_wrap})
                img = out["img"]
            else:
                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask, transformer_options=transformer_options)

        img = img[:, txt.shape[1]:, ...]
        img = self.final_layer(img, vec)
        return img.movedim(-2, -1) * (-1.0)
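For reference, a hedged sketch of what a patches_replace entry consumed by the loops in _forward above could look like. Only the call shape visible in this file is assumed (a callable that receives the block inputs as a dict plus {"original_block": ...} and returns the same keys block_wrap produces); the registration path through a ModelPatcher is omitted, and my_double_block_patch is a hypothetical name.

# Minimal sketch, assuming the calling convention shown in _forward above.
def my_double_block_patch(args, extra):
    out = extra["original_block"](args)  # run the unmodified DoubleStreamBlock
    out["img"] = out["img"] * 1.0        # placeholder for custom post-processing
    return out

# The loops above look patches up under transformer_options["patches_replace"]["dit"],
# keyed by ("double_block", block_index) or ("single_block", block_index):
transformer_options = {
    "patches_replace": {
        "dit": {("double_block", 0): my_double_block_patch},
    },
}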