mirror of https://git.datalinker.icu/kijai/ComfyUI-KJNodes.git
synced 2025-12-09 12:54:40 +08:00

Compare commits
6 commits: 21dd6170d8 ... 3621a452d0
| Author | SHA1 | Date |
|---|---|---|
| | 3621a452d0 | |
| | 50e7dd34d3 | |
| | 37206374ef | |
| | 8643d75a6b | |
| | e6ee59b4c2 | |
| | cedea47902 | |

@@ -212,6 +212,9 @@ NODE_CONFIG = {
     "LatentInpaintTTM": {"class": LatentInpaintTTM, "name": "Latent Inpaint TTM"},
     "NABLA_AttentionKJ": {"class": NABLA_AttentionKJ, "name": "NABLA Attention KJ"},
     "TorchCompileModelAdvanced": {"class": TorchCompileModelAdvanced, "name": "TorchCompileModelAdvanced"},
+    "StartRecordCUDAMemoryHistory": {"class": StartRecordCUDAMemoryHistory, "name": "Start Recording CUDAMemory History"},
+    "EndRecordCUDAMemoryHistory": {"class": EndRecordCUDAMemoryHistory, "name": "End Recording CUDAMemory History"},
+    "VisualizeCUDAMemoryHistory": {"class": VisualizeCUDAMemoryHistory, "name": "Visualize CUDAMemory History"},

     #instance diffusion
     "CreateInstanceDiffusionTracking": {"class": CreateInstanceDiffusionTracking},
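The three new CUDA-memory nodes are registered by adding entries to `NODE_CONFIG`. For readers unfamiliar with the pattern: ComfyUI itself consumes `NODE_CLASS_MAPPINGS` and `NODE_DISPLAY_NAME_MAPPINGS`, and a config dict like this is typically flattened into those two mappings at import time. A minimal sketch of that flattening, with a made-up helper and example classes (not this repo's actual code), looks like:

```python
# Hypothetical illustration of flattening a NODE_CONFIG-style dict into the
# two mappings ComfyUI consumes; build_mappings and ExampleNode are assumptions.
class ExampleNode:
    pass

NODE_CONFIG = {
    "ExampleNode": {"class": ExampleNode, "name": "Example Node"},
    "ExampleNodeNoName": {"class": ExampleNode},  # entries may omit "name"
}

def build_mappings(node_config):
    class_mappings = {node_id: info["class"] for node_id, info in node_config.items()}
    # Fall back to the node id when no display name is given
    display_names = {node_id: info.get("name", node_id) for node_id, info in node_config.items()}
    return class_mappings, display_names

NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS = build_mappings(NODE_CONFIG)
print(NODE_DISPLAY_NAME_MAPPINGS)
# {'ExampleNode': 'Example Node', 'ExampleNodeNoName': 'ExampleNodeNoName'}
```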

@@ -1,6 +1,7 @@
 import torch
 import comfy.model_management
 import comfy.utils
+import comfy.lora
 import folder_paths
 import os
 import logging

@@ -11,6 +12,50 @@ device = comfy.model_management.get_torch_device()

 CLAMP_QUANTILE = 0.99


+def _resolve_weight_from_patches(patches, key):
+    base_weight, convert_func = patches[0]
+    weight_tensor = comfy.model_management.cast_to_device(
+        base_weight, torch.device("cpu"), torch.float32, copy=True
+    )
+    try:
+        weight_tensor = convert_func(weight_tensor, inplace=True)
+    except TypeError:
+        weight_tensor = convert_func(weight_tensor)
+
+    if len(patches) > 1:
+        weight_tensor = comfy.lora.calculate_weight(
+            patches[1:],
+            weight_tensor,
+            key,
+            intermediate_dtype=torch.float32,
+            original_weights={key: patches},
+        )
+
+    return weight_tensor
+
+
+def _build_scaled_fp8_diff(finetuned_model, original_model, prefix, bias_diff):
+    finetuned_patches = finetuned_model.get_key_patches(prefix)
+    original_patches = original_model.get_key_patches(prefix)
+
+    common_keys = set(finetuned_patches.keys()).intersection(original_patches.keys())
+    diff_sd = {}
+
+    for key in common_keys:
+        is_weight = key.endswith(".weight")
+        is_bias = key.endswith(".bias")
+
+        if not is_weight and not (bias_diff and is_bias):
+            continue
+
+        ft_tensor = _resolve_weight_from_patches(finetuned_patches[key], key)
+        orig_tensor = _resolve_weight_from_patches(original_patches[key], key)
+
+        diff_sd[key] = ft_tensor.sub(orig_tensor)
+
+    return diff_sd
+
+
 def extract_lora(diff, key, rank, algorithm, lora_type, lowrank_iters=7, adaptive_param=1.0, clamp_quantile=True):
     """
     Extracts LoRA weights from a weight difference tensor using SVD.
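The `extract_lora` helper factorizes each weight difference into a low-rank pair via SVD. As a rough standalone illustration of the underlying idea only (the node additionally handles rank clamping, adaptive variants and dtype conversion), an SVD-based split of a 2D diff can be sketched as follows; `lora_pair_from_diff` is a name made up for this example:

```python
import torch

def lora_pair_from_diff(diff: torch.Tensor, rank: int):
    # Randomized low-rank SVD of the weight difference: diff ~= U @ Vh
    U, S, V = torch.svd_lowrank(diff.float(), q=rank)
    U = U @ torch.diag(S)   # analogous to the "lora_up" factor, (out, rank)
    Vh = V.t()              # analogous to the "lora_down" factor, (rank, in)
    return U, Vh

diff = torch.randn(1280, 640)             # toy weight difference
up, down = lora_pair_from_diff(diff, 16)
print(up.shape, down.shape)               # torch.Size([1280, 16]) torch.Size([16, 640])
```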

@@ -99,15 +144,18 @@ def extract_lora(diff, key, rank, algorithm, lora_type, lowrank_iters=7, adaptive_param=1.0, clamp_quantile=True):
     return (U, Vh)


-def calc_lora_model(model_diff, rank, prefix_model, prefix_lora, output_sd, lora_type, algorithm, lowrank_iters, out_dtype, bias_diff=False, adaptive_param=1.0, clamp_quantile=True):
-    comfy.model_management.load_models_gpu([model_diff], force_patch_weights=True)
-    model_diff.model.diffusion_model.cpu()
-    sd = model_diff.model_state_dict(filter_prefix=prefix_model)
-    del model_diff
-    comfy.model_management.soft_empty_cache()
-    for k, v in sd.items():
-        if isinstance(v, torch.Tensor):
-            sd[k] = v.cpu()
+def calc_lora_model(model_diff, rank, prefix_model, prefix_lora, output_sd, lora_type, algorithm, lowrank_iters, out_dtype, bias_diff=False, adaptive_param=1.0, clamp_quantile=True, sd_override=None):
+    if sd_override is None:
+        comfy.model_management.load_models_gpu([model_diff], force_patch_weights=True)
+        model_diff.model.diffusion_model.cpu()
+        sd = model_diff.model_state_dict(filter_prefix=prefix_model)
+        del model_diff
+        comfy.model_management.soft_empty_cache()
+        for k, v in sd.items():
+            if isinstance(v, torch.Tensor):
+                sd[k] = v.cpu()
+    else:
+        sd = sd_override

     # Get total number of keys to process for progress bar
     total_keys = len([k for k in sd if k.endswith(".weight") or (bias_diff and k.endswith(".bias"))])

@@ -183,17 +231,39 @@ class LoraExtractKJ:
             raise ValueError("svd_lowrank algorithm is only supported for standard LoRA extraction.")

         dtype = {"fp8_e4m3fn": torch.float8_e4m3fn, "bf16": torch.bfloat16, "fp16": torch.float16, "fp16_fast": torch.float16, "fp32": torch.float32}[output_dtype]
-        m = finetuned_model.clone()
-        kp = original_model.get_key_patches("diffusion_model.")
-        for k in kp:
-            m.add_patches({k: kp[k]}, - 1.0, 1.0)
-        model_diff = m
+
+        model_diff = None
+        sd_override = None
+
+        scaled_fp8_ft = getattr(getattr(finetuned_model.model, "model_config", None), "scaled_fp8", None)
+        scaled_fp8_orig = getattr(getattr(original_model.model, "model_config", None), "scaled_fp8", None)
+        scaled_fp8_present = scaled_fp8_ft is not None or scaled_fp8_orig is not None
+
+        if scaled_fp8_present:
+            comfy.model_management.load_models_gpu([finetuned_model, original_model], force_patch_weights=True)
+            logging.info(
+                "LoraExtractKJ: detected scaled fp8 weights (finetuned=%s, original=%s); using high-precision diff path.",
+                scaled_fp8_ft is not None,
+                scaled_fp8_orig is not None,
+            )
+            sd_override = _build_scaled_fp8_diff(
+                finetuned_model, original_model, "diffusion_model.", bias_diff
+            )
+            comfy.model_management.soft_empty_cache()
+        else:
+            m = finetuned_model.clone()
+            kp = original_model.get_key_patches("diffusion_model.")
+            for k in kp:
+                m.add_patches({k: kp[k]}, - 1.0, 1.0)
+            model_diff = m

         full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, self.output_dir)

         output_sd = {}
+        if model_diff is not None:
+            output_sd = calc_lora_model(model_diff, rank, "diffusion_model.", "diffusion_model.", output_sd, lora_type, algorithm, lowrank_iters, dtype, bias_diff=bias_diff, adaptive_param=adaptive_param, clamp_quantile=clamp_quantile)
+        elif sd_override is not None:
+            output_sd = calc_lora_model(None, rank, "diffusion_model.", "diffusion_model.", output_sd, lora_type, algorithm, lowrank_iters, dtype, bias_diff=bias_diff, adaptive_param=adaptive_param, clamp_quantile=clamp_quantile, sd_override=sd_override)
         if "adaptive" in lora_type:
             rank_str = f"{lora_type}_{adaptive_param:.2f}"
         else:
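The branch above routes scaled-fp8 checkpoints through `_build_scaled_fp8_diff`, which resolves both models' weights to float32 before subtracting, rather than building the diff with negative-strength patches. A plausible motivation, shown here with a standalone toy example (plain tensors, not ComfyUI models or this repo's code): differences between two nearly identical weights are largely destroyed if the comparison happens at fp8 resolution.

```python
import torch

a = torch.randn(256, 256)                 # "original" weight
b = a + 1e-3 * torch.randn(256, 256)      # "finetuned" weight, tiny change

diff_fp32 = a - b                          # high-precision difference, ~1e-3 magnitude
# Quantize both sides to float8_e4m3fn first, then upcast to do the subtraction
diff_fp8 = a.to(torch.float8_e4m3fn).float() - b.to(torch.float8_e4m3fn).float()

print(diff_fp32.abs().mean())                  # roughly 1e-3, the true magnitude
print((diff_fp8 - diff_fp32).abs().max())      # per-element error on the order of the fp8 step, >> 1e-3
print((diff_fp8 == 0).float().mean())          # most true differences collapse to exactly zero
```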

@@ -4,6 +4,7 @@ import logging
 import torch
 import importlib
 import math
+import datetime

 import folder_paths
 import comfy.model_management as mm

@@ -2103,7 +2104,7 @@ class NABLA_AttentionKJ():

         def attention_override_nabla(func, *args, **kwargs):
             return nabla_attention(*args, **kwargs)


         if torch_compile:
             attention_override_nabla = torch.compile(attention_override_nabla, mode="max-autotune-no-cudagraphs", dynamic=True)

@@ -2146,7 +2147,7 @@ class NABLA_Attention():
         kv_nb = mask.sum(-1).to(torch.int32)
         kv_inds = mask.argsort(dim=-1, descending=True).to(torch.int32)
         return BlockMask.from_kv_blocks(torch.zeros_like(kv_nb), kv_inds, kv_nb, kv_inds, BLOCK_SIZE=BLOCK_SIZE, mask_mod=None)


 def fast_sta_nabla(T, H, W, wT=3, wH=3, wW=3):
     l = torch.Tensor([T, H, W]).amax()
     r = torch.arange(0, l, 1, dtype=torch.int16, device=mm.get_torch_device())

@@ -2166,7 +2167,7 @@ def fast_sta_nabla(T, H, W, wT=3, wH=3, wW=3):

 def get_sparse_params(x, wT, wH, wW, sparsity=0.9):
     B, C, T, H, W = x.shape
-    print("x shape:", x.shape)
+    #print("x shape:", x.shape)
     patch_size = (1, 2, 2)
     T, H, W = (
         T // patch_size[0],

@@ -2186,4 +2187,119 @@ def get_sparse_params(x, wT, wH, wW, sparsity=0.9):
         "method": "topcdf",
     }

-    return sparse_params
+    return sparse_params
+
+
+from comfy.comfy_types.node_typing import IO
+class StartRecordCUDAMemoryHistory():
+    # @classmethod
+    # def IS_CHANGED(s):
+    #     return True
+
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "input": (IO.ANY,),
+                "enabled": (["all", "state", "None"], {"default": "all", "tooltip": "None: disable, 'state': keep info for allocated memory, 'all': keep history of all alloc/free calls"}),
+                "context": (["all", "state", "alloc", "None"], {"default": "all", "tooltip": "None: no tracebacks, 'state': tracebacks for allocated memory, 'alloc': for alloc calls, 'all': for free calls"}),
+                "stacks": (["python", "all"], {"default": "all", "tooltip": "'python': Python/TorchScript/inductor frames, 'all': also C++ frames"}),
+                "max_entries": ("INT", {"default": 100000, "min": 1000, "max": 10000000, "tooltip": "Maximum number of entries to record"}),
+            },
+        }
+
+    RETURN_TYPES = (IO.ANY, )
+    RETURN_NAMES = ("input", "output_path",)
+    FUNCTION = "start"
+    CATEGORY = "KJNodes/experimental"
+    DESCRIPTION = "THIS NODE ALWAYS RUNS. Starts recording CUDA memory allocation history, can be ended and saved with EndRecordCUDAMemoryHistory."
+
+    def start(self, input, enabled, context, stacks, max_entries):
+        mm.soft_empty_cache()
+        torch.cuda.reset_peak_memory_stats(mm.get_torch_device())
+        torch.cuda.memory._record_memory_history(
+            max_entries=max_entries,
+            enabled=enabled if enabled != "None" else None,
+            context=context if context != "None" else None,
+            stacks=stacks
+        )
+        return input,
+
+class EndRecordCUDAMemoryHistory():
+    @classmethod
+    def INPUT_TYPES(s):
+        return {"required": {
+            "input": (IO.ANY,),
+            "output_path": ("STRING", {"default": "comfy_cuda_memory_history"}, "Base path for saving the CUDA memory history file, timestamp and .pt extension will be added"),
+            },
+        }
+
+    RETURN_TYPES = (IO.ANY, "STRING",)
+    RETURN_NAMES = ("input", "output_path",)
+    FUNCTION = "end"
+    CATEGORY = "KJNodes/experimental"
+    DESCRIPTION = "Records CUDA memory allocation history between start and end, saves to a file that can be analyzed here: https://docs.pytorch.org/memory_viz or with VisualizeCUDAMemoryHistory node"
+
+    def end(self, input, output_path):
+        mm.soft_empty_cache()
+        time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        output_path = f"{output_path}{time}.pt"
+        torch.cuda.memory._dump_snapshot(output_path)
+        torch.cuda.memory._record_memory_history(enabled=None)
+        return input, output_path
+
+
+try:
+    from server import PromptServer
+except:
+    PromptServer = None
+
+class VisualizeCUDAMemoryHistory():
+    @classmethod
+    def INPUT_TYPES(s):
+        return {"required": {
+            "snapshot_path": ("STRING", ),
+            },
+            "hidden": {
+                "unique_id": "UNIQUE_ID",
+            },
+        }
+
+    RETURN_TYPES = ("STRING",)
+    RETURN_NAMES = ("output_path",)
+    FUNCTION = "visualize"
+    CATEGORY = "KJNodes/experimental"
+    DESCRIPTION = "Visualizes a CUDA memory allocation history file, opens in browser"
+    OUTPUT_NODE = True
+
+    def visualize(self, snapshot_path, unique_id):
+        import pickle
+        from torch.cuda import _memory_viz
+        import uuid
+
+        from folder_paths import get_output_directory
+        output_dir = get_output_directory()
+
+        with open(snapshot_path, "rb") as f:
+            snapshot = pickle.load(f)
+
+        html = _memory_viz.trace_plot(snapshot)
+        html_filename = f"cuda_memory_history_{uuid.uuid4().hex}.html"
+        output_path = os.path.join(output_dir, "memory_history", html_filename)
+        os.makedirs(os.path.dirname(output_path), exist_ok=True)
+
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write(html)
+
+        api_url = f"http://localhost:8188/api/view?type=output&filename={html_filename}&subfolder=memory_history"
+
+        # Progress UI
+        if unique_id and PromptServer is not None:
+            try:
+                PromptServer.instance.send_progress_text(
+                    api_url,
+                    unique_id
+                )
+            except:
+                pass
+
+        return api_url,
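The three nodes above wrap PyTorch's CUDA memory snapshot tooling. Outside ComfyUI, the same calls used in the diff can be driven directly; a minimal standalone sketch follows (requires a CUDA device, and note that `_record_memory_history`, `_dump_snapshot` and `_memory_viz` are private PyTorch APIs that may change between releases):

```python
import pickle
import torch
from torch.cuda import _memory_viz

def profile_allocations(snapshot_path="cuda_memory_history.pt"):
    # Start recording allocation history with stack traces (same knobs the Start node exposes)
    torch.cuda.memory._record_memory_history(max_entries=100000, enabled="all", context="all", stacks="all")

    # ...workload to profile; a throwaway matmul stands in for a model here
    x = torch.randn(1024, 1024, device="cuda")
    y = x @ x
    del x, y
    torch.cuda.synchronize()

    # Dump the snapshot and stop recording, as EndRecordCUDAMemoryHistory does
    torch.cuda.memory._dump_snapshot(snapshot_path)
    torch.cuda.memory._record_memory_history(enabled=None)

    # Render the same HTML view the Visualize node writes, or load the snapshot
    # file at https://docs.pytorch.org/memory_viz
    with open(snapshot_path, "rb") as f:
        snapshot = pickle.load(f)
    with open("cuda_memory_history.html", "w", encoding="utf-8") as f:
        f.write(_memory_viz.trace_plot(snapshot))

if __name__ == "__main__":
    profile_allocations()
```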