Merge 8643d75a6b98dfd1f39eb97ea53e1c927314200a into acdd16a973460b5be5d92133a9217787f0e085c6

Add NABLA_AttentionKJ
Only tested with Kandinsky5
2025-12-08 20:34:35 +08:00 · 2025-11-27 10:20:55 +08:00 · 2025-11-26 23:40:12 +02:00 · 2025-10-28 22:40:05 -04:00 · 2025-10-28 22:30:26 -04:00 · 2025-10-28 22:28:43 -04:00
3 changed files with 213 additions and 16 deletions
--- a/init.py
+++ b/init.py
@ -210,6 +210,7 @@ NODE_CONFIG = {
    "WanVideoNAG": {"class": WanVideoNAG, "name": "WanVideoNAG"},
    "GGUFLoaderKJ": {"class": GGUFLoaderKJ, "name": "GGUF Loader KJ"},
    "LatentInpaintTTM": {"class": LatentInpaintTTM, "name": "Latent Inpaint TTM"},
    "NABLA_AttentionKJ": {"class": NABLA_AttentionKJ, "name": "NABLA Attention KJ"},
    #instance diffusion
    "CreateInstanceDiffusionTracking": {"class": CreateInstanceDiffusionTracking},
--- a/nodes/lora_nodes.py
+++ b/nodes/lora_nodes.py
@ -1,6 +1,7 @@
 import torch
 import comfy.model_management
 import comfy.utils
 import comfy.lora
 import folder_paths
 import os
 import logging
@ -11,6 +12,50 @@ device = comfy.model_management.get_torch_device()
 CLAMP_QUANTILE = 0.99
 def _resolve_weight_from_patches(patches, key):
    """Resolve the patch list stored for ``key`` into one float32 CPU tensor.

    ``patches[0]`` is unpacked as a ``(base_weight, convert_func)`` pair; the
    base weight is copied to the CPU as float32 and passed through
    ``convert_func``. Any remaining entries are applied on top of that result
    with ``comfy.lora.calculate_weight``.
    """
    source_weight, to_weight = patches[0]
    cpu = torch.device("cpu")
    resolved = comfy.model_management.cast_to_device(source_weight, cpu, torch.float32, copy=True)
    try:
        resolved = to_weight(resolved, inplace=True)
    except TypeError:
        # Presumably a converter that does not accept ``inplace``; retry without it.
        resolved = to_weight(resolved)
    if len(patches) <= 1:
        return resolved
    return comfy.lora.calculate_weight(
        patches[1:],
        resolved,
        key,
        intermediate_dtype=torch.float32,
        original_weights={key: patches},
    )
 def _build_scaled_fp8_diff(finetuned_model, original_model, prefix, bias_diff):
    finetuned_patches = finetuned_model.get_key_patches(prefix)
    original_patches = original_model.get_key_patches(prefix)
    common_keys = set(finetuned_patches.keys()).intersection(original_patches.keys())
    diff_sd = {}
    for key in common_keys:
        is_weight = key.endswith(".weight")
        is_bias = key.endswith(".bias")
        if not is_weight and not (bias_diff and is_bias):
            continue
        ft_tensor = _resolve_weight_from_patches(finetuned_patches[key], key)
        orig_tensor = _resolve_weight_from_patches(original_patches[key], key)
        diff_sd[key] = ft_tensor.sub(orig_tensor)
    return diff_sd
 def extract_lora(diff, key, rank, algorithm, lora_type, lowrank_iters=7, adaptive_param=1.0, clamp_quantile=True):
    """
    Extracts LoRA weights from a weight difference tensor using SVD.
@ -99,15 +144,18 @@ def extract_lora(diff, key, rank, algorithm, lora_type, lowrank_iters=7, adaptiv
    return (U, Vh)
-def calc_lora_model(model_diff, rank, prefix_model, prefix_lora, output_sd, lora_type, algorithm, lowrank_iters, out_dtype, bias_diff=False, adaptive_param=1.0, clamp_quantile=True):
+def calc_lora_model(model_diff, rank, prefix_model, prefix_lora, output_sd, lora_type, algorithm, lowrank_iters, out_dtype, bias_diff=False, adaptive_param=1.0, clamp_quantile=True, sd_override=None):
-    comfy.model_management.load_models_gpu([model_diff], force_patch_weights=True)
+    if sd_override is None:
-    model_diff.model.diffusion_model.cpu()
+        comfy.model_management.load_models_gpu([model_diff], force_patch_weights=True)
-    sd = model_diff.model_state_dict(filter_prefix=prefix_model)
+        model_diff.model.diffusion_model.cpu()
-    del model_diff
+        sd = model_diff.model_state_dict(filter_prefix=prefix_model)
-    comfy.model_management.soft_empty_cache()
+        del model_diff
-    for k, v in sd.items():
+        comfy.model_management.soft_empty_cache()
-        if isinstance(v, torch.Tensor):
+        for k, v in sd.items():
-            sd[k] = v.cpu()
+            if isinstance(v, torch.Tensor):
                sd[k] = v.cpu()
    else:
        sd = sd_override
    # Get total number of keys to process for progress bar
    total_keys = len([k for k in sd if k.endswith(".weight") or (bias_diff and k.endswith(".bias"))])
@ -183,17 +231,39 @@ class LoraExtractKJ:
            raise ValueError("svd_lowrank algorithm is only supported for standard LoRA extraction.")
        dtype = {"fp8_e4m3fn": torch.float8_e4m3fn, "bf16": torch.bfloat16, "fp16": torch.float16, "fp16_fast": torch.float16, "fp32": torch.float32}[output_dtype]
-        m = finetuned_model.clone()
+
-        kp = original_model.get_key_patches("diffusion_model.")
+        model_diff = None
-        for k in kp:
+        sd_override = None
-            m.add_patches({k: kp[k]}, - 1.0, 1.0)
+
-        model_diff = m
+        scaled_fp8_ft = getattr(getattr(finetuned_model.model, "model_config", None), "scaled_fp8", None)
        scaled_fp8_orig = getattr(getattr(original_model.model, "model_config", None), "scaled_fp8", None)
        scaled_fp8_present = scaled_fp8_ft is not None or scaled_fp8_orig is not None
        if scaled_fp8_present:
            comfy.model_management.load_models_gpu([finetuned_model, original_model], force_patch_weights=True)
            logging.info(
                "LoraExtractKJ: detected scaled fp8 weights (finetuned=%s, original=%s); using high-precision diff path.",
                scaled_fp8_ft is not None,
                scaled_fp8_orig is not None,
            )
            sd_override = _build_scaled_fp8_diff(
                finetuned_model, original_model, "diffusion_model.", bias_diff
            )
            comfy.model_management.soft_empty_cache()
        else:
            m = finetuned_model.clone()
            kp = original_model.get_key_patches("diffusion_model.")
            for k in kp:
                m.add_patches({k: kp[k]}, - 1.0, 1.0)
            model_diff = m
        full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, self.output_dir)
        output_sd = {}
        if model_diff is not None:
            output_sd = calc_lora_model(model_diff, rank, "diffusion_model.", "diffusion_model.", output_sd, lora_type, algorithm, lowrank_iters, dtype, bias_diff=bias_diff, adaptive_param=adaptive_param, clamp_quantile=clamp_quantile)
        elif sd_override is not None:
            output_sd = calc_lora_model(None, rank, "diffusion_model.", "diffusion_model.", output_sd, lora_type, algorithm, lowrank_iters, dtype, bias_diff=bias_diff, adaptive_param=adaptive_param, clamp_quantile=clamp_quantile, sd_override=sd_override)
        if "adaptive" in lora_type:
            rank_str = f"{lora_type}_{adaptive_param:.2f}"
        else:
--- a/nodes/model_optimization_nodes.py
+++ b/nodes/model_optimization_nodes.py
@ -3,15 +3,17 @@ from comfy.ldm.modules import attention as comfy_attention
 import logging
 import torch
 import importlib
 import math
 import folder_paths
 import comfy.model_management as mm
 from comfy.cli_args import args
-from comfy.ldm.modules.attention import wrap_attn
+from comfy.ldm.modules.attention import wrap_attn, optimized_attention
 import comfy.model_patcher
 import comfy.utils
 import comfy.sd
 try:
    from comfy_api.latest import io
    v3_available = True
@ -675,6 +677,7 @@ class TorchCompileModelFluxAdvancedV2:
        try:
            if double_blocks:
                for i, block in enumerate(diffusion_model.double_blocks):
                    print("Adding double block to compile list", i)
                    compile_key_list.append(f"diffusion_model.double_blocks.{i}")
            if single_blocks:
                for i, block in enumerate(diffusion_model.single_blocks):
@ -718,7 +721,7 @@ class TorchCompileModelHyVideo:
        }
    RETURN_TYPES = ("MODEL",)
    FUNCTION = "patch"
-
+    DEPRECATED = True
    CATEGORY = "KJNodes/torchcompile"
    EXPERIMENTAL = True
@ -2005,3 +2008,126 @@ else:
        FUNCTION = ""
        CATEGORY = ""
        DESCRIPTION = "This node requires newer ComfyUI"
 # flex_attention is optional: NABLA_AttentionKJ.patch raises a RuntimeError
 # when these names are None.
 try:
    from torch.nn.attention.flex_attention import flex_attention, BlockMask
 except Exception:
    # Keep the import best-effort (any import-time failure disables the
    # feature) without swallowing KeyboardInterrupt/SystemExit like a bare
    # ``except:`` would.
    flex_attention = None
    BlockMask = None
 class NABLA_AttentionKJ():
    """Node that installs a NABLA sparse-attention override on a cloned model."""

    RETURN_TYPES = ("MODEL", )
    FUNCTION = "patch"
    DESCRIPTION = "Experimental node for patching attention mode to use NABLA sparse attention for video models, currently only works with Kadinsky5"
    CATEGORY = "KJNodes/experimental"

    @classmethod
    def INPUT_TYPES(s):
        """Describe the node inputs; key order is kept as declared."""
        required = {
            "model": ("MODEL",),
            "latent": ("LATENT", {"tooltip": "Only used to get the latent shape"}),
            "window_time": ("INT", {"default": 11, "min": 1, "tooltip": "Temporal attention window size"}),
            "window_width": ("INT", {"default": 3, "min": 1, "tooltip": "Spatial attention window size"}),
            "window_height": ("INT", {"default": 3, "min": 1, "tooltip": "Spatial attention window size"}),
            "sparsity": ("FLOAT", {"default": 0.9, "min": 0.0, "max": 1.0, "step": 0.01}),
            "torch_compile": ("BOOLEAN", {"default": True, "tooltip": "Most likely required for reasonable memory usage"}),
        }
        return {"required": required}

    def patch(self, model, latent, window_time, window_width, window_height, sparsity, torch_compile):
        """Clone ``model`` and register the NABLA attention override on the clone.

        The sparse parameters are derived from ``latent["samples"]`` and the
        window sizes. Raises ``RuntimeError`` when flex_attention could not be
        imported. Returns a one-element tuple holding the patched clone.
        """
        if any(dep is None for dep in (flex_attention, BlockMask)):
            raise RuntimeError("can't import flex_attention from torch.nn.attention, requires newer pytorch version")

        patched_model = model.clone()
        # get_sparse_params takes height before width, unlike this method's signature.
        nabla_attention = NABLA_Attention(
            get_sparse_params(latent["samples"], window_time, window_height, window_width, sparsity)
        )

        def attention_override_nabla(func, *args, **kwargs):
            # ``func`` (the attention function being overridden) is ignored on purpose.
            return nabla_attention(*args, **kwargs)

        override = attention_override_nabla
        if torch_compile:
            override = torch.compile(attention_override_nabla, mode="max-autotune-no-cudagraphs", dynamic=True)

        # attention override
        transformer_options = patched_model.model_options["transformer_options"]
        transformer_options["optimized_attention_override"] = override
        return (patched_model,)
 class NABLA_Attention():
    """Callable attention replacement that builds a block-sparse mask per call."""

    def __init__(self, sparse_params):
        # Mapping read in __call__ for its "sta_mask" and "P" entries.
        self.sparse_params = sparse_params

    def __call__(self, q, k, v, heads, **kwargs):
        """Run sparse flex attention, or the regular attention for short sequences."""
        q_len = q.shape[-2]
        kv_len = k.shape[-2]
        if q_len < 3000 or kv_len < 3000:
            # Short inputs go through the standard attention path unchanged.
            return optimized_attention(q, k, v, heads, **kwargs)
        params = self.sparse_params
        block_mask = self.nablaT_v2(q, k, params["sta_mask"], thr=params["P"])
        attended = flex_attention(q, k, v, block_mask=block_mask)
        # Swap dims 1 and 2, then merge the last two dims into one.
        return attended.transpose(1, 2).contiguous().flatten(-2, -1)

    def nablaT_v2(self, q, k, sta, thr=0.9):
        """Build a ``BlockMask`` from block-averaged attention scores.

        ``q`` is unpacked as a 4-D tensor; ``k`` is reshaped with the same
        dimensions (assumes matching shapes — TODO confirm against callers).
        Blocks selected by the cumulative-softmax threshold are OR-ed with
        ``sta``.
        """
        # Map estimation
        BLOCK_SIZE = 64
        batch, n_heads, seq_len, head_dim = q.shape
        n_blocks = seq_len // BLOCK_SIZE
        pooled_shape = (batch, n_heads, n_blocks, BLOCK_SIZE, head_dim)
        q_pooled = q.reshape(pooled_shape).mean(-2)
        k_pooled_t = k.reshape(pooled_shape).mean(-2).transpose(-2, -1)
        scores = q_pooled @ k_pooled_t
        scores = torch.softmax(scores / math.sqrt(head_dim), dim=-1)
        # Map binarization
        sorted_vals, sort_idx = scores.sort(-1)
        # Ascending cumulative sum, computed in place on the sorted copy.
        cdf = sorted_vals.cumsum_(-1)
        keep_sorted = (cdf >= 1 - thr).int()
        # Undo the sort so the flags line up with the original block order.
        keep = keep_sorted.gather(-1, sort_idx.argsort(-1))
        keep = torch.logical_or(keep, sta)
        # BlockMask creation
        kv_nb = keep.sum(-1).to(torch.int32)
        kv_inds = keep.argsort(dim=-1, descending=True).to(torch.int32)
        return BlockMask.from_kv_blocks(torch.zeros_like(kv_nb), kv_inds, kv_nb, kv_inds, BLOCK_SIZE=BLOCK_SIZE, mask_mod=None)
 def fast_sta_nabla(T, H, W, wT=3, wH=3, wW=3):
    """Build a boolean window mask of shape ``(T*H*W, T*H*W)``.

    An entry is true when the two positions are at most ``wT // 2``,
    ``wH // 2`` and ``wW // 2`` apart along their respective axes. The mask is
    created on the device returned by ``mm.get_torch_device()``.
    """
    longest = torch.Tensor([T, H, W]).amax()
    positions = torch.arange(0, longest, 1, dtype=torch.int16, device=mm.get_torch_device())
    # Pairwise absolute distance table, shared by all three axes.
    distance = (positions.unsqueeze(1) - positions.unsqueeze(0)).abs()

    def _within(extent, window):
        # Flattened extent x extent table of "inside the half-window" flags.
        return distance[:extent, :extent].flatten() <= window // 2

    sta_t = _within(T, wT)
    sta_h = _within(H, wH)
    sta_w = _within(W, wW)

    sta_hw = sta_h.unsqueeze(1) * sta_w.unsqueeze(0)
    sta_hw = sta_hw.reshape(H, H, W, W).transpose(1, 2).flatten()
    sta = sta_t.unsqueeze(1) * sta_hw.unsqueeze(0)
    sta = sta.reshape(T, T, H * W, H * W).transpose(1, 2)
    side = T * H * W
    return sta.reshape(side, side)
 def get_sparse_params(x, wT, wH, wW, sparsity=0.9):
    """Assemble the parameter dict consumed by ``NABLA_Attention``.

    Args:
        x: Tensor whose shape is unpacked as ``(B, C, T, H, W)``; only the
            shape is read.
        wT, wH, wW: Window sizes forwarded to ``fast_sta_nabla``.
        sparsity: Stored under ``"P"``.

    Returns:
        A dict holding the window mask (with two leading singleton dims added)
        under ``"sta_mask"`` plus the settings above.

    Raises:
        ValueError: If ``x`` is not 5-dimensional.
    """
    if len(x.shape) != 5:
        # Same exception type as the failed tuple-unpack, with a usable message.
        raise ValueError(
            f"get_sparse_params expects a 5D latent (B, C, T, H, W), got shape {tuple(x.shape)}"
        )
    _, _, T, H, W = x.shape
    # Was a bare debug print; routed through logging so it can be silenced.
    logging.debug("NABLA latent shape: %s", tuple(x.shape))
    patch_size = (1, 2, 2)
    T, H, W = (
        T // patch_size[0],
        H // patch_size[1],
        W // patch_size[2],
    )
    # The mask is built on an 8x coarser spatial grid than the patched latent.
    sta_mask = fast_sta_nabla(T, H // 8, W // 8, wT, wH, wW)
    return {
        "sta_mask": sta_mask.unsqueeze_(0).unsqueeze_(0),
        "to_fractal": True,
        "P": sparsity,
        "wT": wT,
        "wH": wH,
        "wW": wW,
        "add_sta": True,
        "visual_shape": (T, H, W),
        "method": "topcdf",
    }
Author	SHA1	Message	Date
Dango233	1150f54dad	Merge 8643d75a6b98dfd1f39eb97ea53e1c927314200a into acdd16a973460b5be5d92133a9217787f0e085c6	2025-11-27 10:20:55 +08:00
kijai	acdd16a973	Add NABLA_AttentionKJ Only tested with Kadinsky5	2025-11-26 23:40:12 +02:00
Dango233	8643d75a6b	Extend fp8 diff path when either model is scaled	2025-10-28 22:40:05 -04:00
Dango233	e6ee59b4c2	Log when scaled fp8 diff path is used	2025-10-28 22:30:26 -04:00
Dango233	cedea47902	Fix LoRA extraction for scaled fp8 models	2025-10-28 22:28:43 -04:00