Mirror of https://git.datalinker.icu/kijai/ComfyUI-KJNodes.git
Synced 2025-12-08 20:34:35 +08:00

Compare commits: 92c0b351ab ... 1150f54dad (5 commits)
Commits:
- 1150f54dad
- acdd16a973
- 8643d75a6b
- e6ee59b4c2
- cedea47902
@@ -210,6 +210,7 @@ NODE_CONFIG = {
     "WanVideoNAG": {"class": WanVideoNAG, "name": "WanVideoNAG"},
     "GGUFLoaderKJ": {"class": GGUFLoaderKJ, "name": "GGUF Loader KJ"},
     "LatentInpaintTTM": {"class": LatentInpaintTTM, "name": "Latent Inpaint TTM"},
+    "NABLA_AttentionKJ": {"class": NABLA_AttentionKJ, "name": "NABLA Attention KJ"},
 
     #instance diffusion
     "CreateInstanceDiffusionTracking": {"class": CreateInstanceDiffusionTracking},
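Note: for readers unfamiliar with this registry, the sketch below shows how an entry of this shape is typically expanded into ComfyUI's registration dicts. It is illustrative only; the class is a stand-in and the repo's own mapping helper may differ.

# Minimal sketch, assuming a dict-comprehension style expansion of NODE_CONFIG.
class NABLA_AttentionKJ:  # stand-in for the real node class registered above
    pass

NODE_CONFIG = {
    "NABLA_AttentionKJ": {"class": NABLA_AttentionKJ, "name": "NABLA Attention KJ"},
}

# ComfyUI discovers nodes through these two dicts; "name" falls back to the key.
NODE_CLASS_MAPPINGS = {key: info["class"] for key, info in NODE_CONFIG.items()}
NODE_DISPLAY_NAME_MAPPINGS = {key: info.get("name", key) for key, info in NODE_CONFIG.items()}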
@@ -1,6 +1,7 @@
 import torch
 import comfy.model_management
 import comfy.utils
+import comfy.lora
 import folder_paths
 import os
 import logging
@@ -11,6 +12,50 @@ device = comfy.model_management.get_torch_device()
 
 CLAMP_QUANTILE = 0.99
 
+
+def _resolve_weight_from_patches(patches, key):
+    base_weight, convert_func = patches[0]
+    weight_tensor = comfy.model_management.cast_to_device(
+        base_weight, torch.device("cpu"), torch.float32, copy=True
+    )
+    try:
+        weight_tensor = convert_func(weight_tensor, inplace=True)
+    except TypeError:
+        weight_tensor = convert_func(weight_tensor)
+
+    if len(patches) > 1:
+        weight_tensor = comfy.lora.calculate_weight(
+            patches[1:],
+            weight_tensor,
+            key,
+            intermediate_dtype=torch.float32,
+            original_weights={key: patches},
+        )
+
+    return weight_tensor
+
+
+def _build_scaled_fp8_diff(finetuned_model, original_model, prefix, bias_diff):
+    finetuned_patches = finetuned_model.get_key_patches(prefix)
+    original_patches = original_model.get_key_patches(prefix)
+
+    common_keys = set(finetuned_patches.keys()).intersection(original_patches.keys())
+    diff_sd = {}
+
+    for key in common_keys:
+        is_weight = key.endswith(".weight")
+        is_bias = key.endswith(".bias")
+
+        if not is_weight and not (bias_diff and is_bias):
+            continue
+
+        ft_tensor = _resolve_weight_from_patches(finetuned_patches[key], key)
+        orig_tensor = _resolve_weight_from_patches(original_patches[key], key)
+
+        diff_sd[key] = ft_tensor.sub(orig_tensor)
+
+    return diff_sd
+
+
 def extract_lora(diff, key, rank, algorithm, lora_type, lowrank_iters=7, adaptive_param=1.0, clamp_quantile=True):
     """
     Extracts LoRA weights from a weight difference tensor using SVD.
@@ -99,7 +144,8 @@ def extract_lora(diff, key, rank, algorithm, lora_type, lowrank_iters=7, adaptiv
     return (U, Vh)
 
 
-def calc_lora_model(model_diff, rank, prefix_model, prefix_lora, output_sd, lora_type, algorithm, lowrank_iters, out_dtype, bias_diff=False, adaptive_param=1.0, clamp_quantile=True):
-    comfy.model_management.load_models_gpu([model_diff], force_patch_weights=True)
-    model_diff.model.diffusion_model.cpu()
-    sd = model_diff.model_state_dict(filter_prefix=prefix_model)
+def calc_lora_model(model_diff, rank, prefix_model, prefix_lora, output_sd, lora_type, algorithm, lowrank_iters, out_dtype, bias_diff=False, adaptive_param=1.0, clamp_quantile=True, sd_override=None):
+    if sd_override is None:
+        comfy.model_management.load_models_gpu([model_diff], force_patch_weights=True)
+        model_diff.model.diffusion_model.cpu()
+        sd = model_diff.model_state_dict(filter_prefix=prefix_model)
@@ -108,6 +154,8 @@ def calc_lora_model(model_diff, rank, prefix_model, prefix_lora, output_sd, lora
-    for k, v in sd.items():
-        if isinstance(v, torch.Tensor):
-            sd[k] = v.cpu()
+        for k, v in sd.items():
+            if isinstance(v, torch.Tensor):
+                sd[k] = v.cpu()
+    else:
+        sd = sd_override
 
     # Get total number of keys to process for progress bar
     total_keys = len([k for k in sd if k.endswith(".weight") or (bias_diff and k.endswith(".bias"))])
@@ -183,6 +231,26 @@ class LoraExtractKJ:
             raise ValueError("svd_lowrank algorithm is only supported for standard LoRA extraction.")
 
         dtype = {"fp8_e4m3fn": torch.float8_e4m3fn, "bf16": torch.bfloat16, "fp16": torch.float16, "fp16_fast": torch.float16, "fp32": torch.float32}[output_dtype]
-        m = finetuned_model.clone()
-        kp = original_model.get_key_patches("diffusion_model.")
-        for k in kp:
+
+        model_diff = None
+        sd_override = None
+
+        scaled_fp8_ft = getattr(getattr(finetuned_model.model, "model_config", None), "scaled_fp8", None)
+        scaled_fp8_orig = getattr(getattr(original_model.model, "model_config", None), "scaled_fp8", None)
+        scaled_fp8_present = scaled_fp8_ft is not None or scaled_fp8_orig is not None
+
+        if scaled_fp8_present:
+            comfy.model_management.load_models_gpu([finetuned_model, original_model], force_patch_weights=True)
+            logging.info(
+                "LoraExtractKJ: detected scaled fp8 weights (finetuned=%s, original=%s); using high-precision diff path.",
+                scaled_fp8_ft is not None,
+                scaled_fp8_orig is not None,
+            )
+            sd_override = _build_scaled_fp8_diff(
+                finetuned_model, original_model, "diffusion_model.", bias_diff
+            )
+            comfy.model_management.soft_empty_cache()
+        else:
+            m = finetuned_model.clone()
+            kp = original_model.get_key_patches("diffusion_model.")
+            for k in kp:
@@ -194,6 +262,8 @@ class LoraExtractKJ:
         output_sd = {}
         if model_diff is not None:
             output_sd = calc_lora_model(model_diff, rank, "diffusion_model.", "diffusion_model.", output_sd, lora_type, algorithm, lowrank_iters, dtype, bias_diff=bias_diff, adaptive_param=adaptive_param, clamp_quantile=clamp_quantile)
+        elif sd_override is not None:
+            output_sd = calc_lora_model(None, rank, "diffusion_model.", "diffusion_model.", output_sd, lora_type, algorithm, lowrank_iters, dtype, bias_diff=bias_diff, adaptive_param=adaptive_param, clamp_quantile=clamp_quantile, sd_override=sd_override)
         if "adaptive" in lora_type:
            rank_str = f"{lora_type}_{adaptive_param:.2f}"
         else:
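Note: the new sd_override path exists because scaled fp8 checkpoints store weights as fp8 codes plus a scale, so diffing the raw stored tensors is dominated by quantization artifacts. Below is a toy sketch of that idea, assuming a hypothetical per-tensor scale; the repo's actual handling goes through convert_func and comfy.lora.calculate_weight as in the hunks above.

import torch

def to_scaled_fp8(w):
    # hypothetical per-tensor scaling, for illustration only
    scale = w.abs().max() / torch.finfo(torch.float8_e4m3fn).max
    return (w / scale).to(torch.float8_e4m3fn), scale

def from_scaled_fp8(codes, scale):
    # dequantize back to float32 before doing any arithmetic
    return codes.to(torch.float32) * scale

w_orig = torch.randn(64, 64)
w_ft = w_orig + 0.01 * torch.randn(64, 64)  # small finetune delta

codes_orig, s_orig = to_scaled_fp8(w_orig)
codes_ft, s_ft = to_scaled_fp8(w_ft)

# The two tensors carry different scales, so their raw fp8 codes are not directly
# comparable; dequantize each side to float32 first, then take the difference that
# feeds the SVD-based LoRA extraction.
diff = from_scaled_fp8(codes_ft, s_ft) - from_scaled_fp8(codes_orig, s_orig)
print(diff.abs().mean())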
@@ -3,15 +3,17 @@ from comfy.ldm.modules import attention as comfy_attention
 import logging
 import torch
 import importlib
+import math
+
 import folder_paths
 import comfy.model_management as mm
 from comfy.cli_args import args
-from comfy.ldm.modules.attention import wrap_attn
+from comfy.ldm.modules.attention import wrap_attn, optimized_attention
 import comfy.model_patcher
 import comfy.utils
 import comfy.sd
 
 
 try:
     from comfy_api.latest import io
     v3_available = True
@@ -675,6 +677,7 @@ class TorchCompileModelFluxAdvancedV2:
         try:
             if double_blocks:
                 for i, block in enumerate(diffusion_model.double_blocks):
+                    print("Adding double block to compile list", i)
                     compile_key_list.append(f"diffusion_model.double_blocks.{i}")
             if single_blocks:
                 for i, block in enumerate(diffusion_model.single_blocks):
@@ -718,7 +721,7 @@ class TorchCompileModelHyVideo:
         }
     RETURN_TYPES = ("MODEL",)
     FUNCTION = "patch"
+    DEPRECATED = True
     CATEGORY = "KJNodes/torchcompile"
     EXPERIMENTAL = True
 
@@ -2005,3 +2008,126 @@ else:
         FUNCTION = ""
         CATEGORY = ""
         DESCRIPTION = "This node requires newer ComfyUI"
+
+
+try:
+    from torch.nn.attention.flex_attention import flex_attention, BlockMask
+except:
+    flex_attention = None
+    BlockMask = None
+
+
+class NABLA_AttentionKJ():
+    @classmethod
+    def INPUT_TYPES(s):
+        return {"required": {
+            "model": ("MODEL",),
+            "latent": ("LATENT", {"tooltip": "Only used to get the latent shape"}),
+            "window_time": ("INT", {"default": 11, "min": 1, "tooltip": "Temporal attention window size"}),
+            "window_width": ("INT", {"default": 3, "min": 1, "tooltip": "Spatial attention window size"}),
+            "window_height": ("INT", {"default": 3, "min": 1, "tooltip": "Spatial attention window size"}),
+            "sparsity": ("FLOAT", {"default": 0.9, "min": 0.0, "max": 1.0, "step": 0.01}),
+            "torch_compile": ("BOOLEAN", {"default": True, "tooltip": "Most likely required for reasonable memory usage"})
+            },
+        }
+
+    RETURN_TYPES = ("MODEL", )
+    FUNCTION = "patch"
+    DESCRIPTION = "Experimental node for patching attention mode to use NABLA sparse attention for video models, currently only works with Kadinsky5"
+    CATEGORY = "KJNodes/experimental"
+
+    def patch(self, model, latent, window_time, window_width, window_height, sparsity, torch_compile):
+        if flex_attention is None or BlockMask is None:
+            raise RuntimeError("can't import flex_attention from torch.nn.attention, requires newer pytorch version")
+
+        model_clone = model.clone()
+        samples = latent["samples"]
+
+        sparse_params = get_sparse_params(samples, window_time, window_height, window_width, sparsity)
+        nabla_attention = NABLA_Attention(sparse_params)
+
+        def attention_override_nabla(func, *args, **kwargs):
+            return nabla_attention(*args, **kwargs)
+
+        if torch_compile:
+            attention_override_nabla = torch.compile(attention_override_nabla, mode="max-autotune-no-cudagraphs", dynamic=True)
+
+        # attention override
+        model_clone.model_options["transformer_options"]["optimized_attention_override"] = attention_override_nabla
+
+        return model_clone,
+
+
+class NABLA_Attention():
+    def __init__(self, sparse_params):
+        self.sparse_params = sparse_params
+
+    def __call__(self, q, k, v, heads, **kwargs):
+        if q.shape[-2] < 3000 or k.shape[-2] < 3000:
+            return optimized_attention(q, k, v, heads, **kwargs)
+        block_mask = self.nablaT_v2(q, k, self.sparse_params["sta_mask"], thr=self.sparse_params["P"])
+        out = flex_attention(q, k, v, block_mask=block_mask).transpose(1, 2).contiguous().flatten(-2, -1)
+        return out
+
+    def nablaT_v2(self, q, k, sta, thr=0.9):
+        # Map estimation
+        BLOCK_SIZE = 64
+        B, h, S, D = q.shape
+        s1 = S // BLOCK_SIZE
+        qa = q.reshape(B, h, s1, BLOCK_SIZE, D).mean(-2)
+        ka = k.reshape(B, h, s1, BLOCK_SIZE, D).mean(-2).transpose(-2, -1)
+        map = qa @ ka
+
+        map = torch.softmax(map / math.sqrt(D), dim=-1)
+        # Map binarization
+        vals, inds = map.sort(-1)
+        cvals = vals.cumsum_(-1)
+        mask = (cvals >= 1 - thr).int()
+        mask = mask.gather(-1, inds.argsort(-1))
+
+        mask = torch.logical_or(mask, sta)
+
+        # BlockMask creation
+        kv_nb = mask.sum(-1).to(torch.int32)
+        kv_inds = mask.argsort(dim=-1, descending=True).to(torch.int32)
+        return BlockMask.from_kv_blocks(torch.zeros_like(kv_nb), kv_inds, kv_nb, kv_inds, BLOCK_SIZE=BLOCK_SIZE, mask_mod=None)
+
+
+def fast_sta_nabla(T, H, W, wT=3, wH=3, wW=3):
+    l = torch.Tensor([T, H, W]).amax()
+    r = torch.arange(0, l, 1, dtype=torch.int16, device=mm.get_torch_device())
+    mat = (r.unsqueeze(1) - r.unsqueeze(0)).abs()
+    sta_t, sta_h, sta_w = (
+        mat[:T, :T].flatten(),
+        mat[:H, :H].flatten(),
+        mat[:W, :W].flatten(),
+    )
+    sta_t = sta_t <= wT // 2
+    sta_h = sta_h <= wH // 2
+    sta_w = sta_w <= wW // 2
+    sta_hw = (sta_h.unsqueeze(1) * sta_w.unsqueeze(0)).reshape(H, H, W, W).transpose(1, 2).flatten()
+    sta = (sta_t.unsqueeze(1) * sta_hw.unsqueeze(0)).reshape(T, T, H * W, H * W).transpose(1, 2)
+    return sta.reshape(T * H * W, T * H * W)
+
+
+def get_sparse_params(x, wT, wH, wW, sparsity=0.9):
+    B, C, T, H, W = x.shape
+    print("x shape:", x.shape)
+    patch_size = (1, 2, 2)
+    T, H, W = (
+        T // patch_size[0],
+        H // patch_size[1],
+        W // patch_size[2],
+    )
+    sta_mask = fast_sta_nabla(T, H // 8, W // 8, wT, wH, wW)
+    sparse_params = {
+        "sta_mask": sta_mask.unsqueeze_(0).unsqueeze_(0),
+        "to_fractal": True,
+        "P": sparsity,
+        "wT": wT,
+        "wH": wH,
+        "wW": wW,
+        "add_sta": True,
+        "visual_shape": (T, H, W),
+        "method": "topcdf",
+    }
+
+    return sparse_params
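Note: a toy illustration of the "top-CDF" block selection used in nablaT_v2 above. Per query block, it keeps the smallest set of key blocks whose softmax mass covers roughly thr of the total and masks the rest; the scores below are made up for the example.

import torch

thr = 0.9
row = torch.tensor([0.50, 0.25, 0.15, 0.06, 0.03, 0.01])  # softmaxed block scores, sums to 1

vals, inds = row.sort(-1)                        # ascending
keep = (vals.cumsum(-1) >= 1 - thr)              # drop the low-mass tail
mask = keep.int().gather(-1, inds.argsort(-1))   # scatter back to original block order

print(mask)  # tensor([1, 1, 1, 1, 0, 0], dtype=torch.int32) -> keeps ~96% of the attention mass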