make compatible with comfy cliptextencode

kijai 2024-10-28 12:23:14 +02:00
parent ce903c0384
commit 3dce06b28b
3 changed files with 909 additions and 284 deletions

View File

@@ -0,0 +1,557 @@
{
"last_node_id": 17,
"last_link_id": 25,
"nodes": [
{
"id": 1,
"type": "MochiTextEncode",
"pos": {
"0": 483,
"1": 281
},
"size": [
381.8630768000736,
227.23898384078808
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 1
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
17
],
"slot_index": 0
},
{
"name": "clip",
"type": "CLIP",
"links": [
20
],
"slot_index": 1
}
],
"properties": {
"Node name for S&R": "MochiTextEncode"
},
"widgets_values": [
"nature video of a red panda eating bamboo in front of a waterfall",
1,
false
]
},
{
"id": 2,
"type": "CLIPLoader",
"pos": {
"0": -41,
"1": 457
},
"size": [
479.5359523201174,
82
],
"flags": {},
"order": 0,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "CLIP",
"type": "CLIP",
"links": [
1
]
}
],
"properties": {
"Node name for S&R": "CLIPLoader"
},
"widgets_values": [
"t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
"sd3"
]
},
{
"id": 8,
"type": "MochiTextEncode",
"pos": {
"0": 487,
"1": 563
},
"size": [
378.8630768000736,
183.64429832064002
],
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 20
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
18,
21
],
"slot_index": 0
},
{
"name": "clip",
"type": "CLIP",
"links": null
}
],
"properties": {
"Node name for S&R": "MochiTextEncode"
},
"widgets_values": [
"",
1,
true
]
},
{
"id": 9,
"type": "VHS_VideoCombine",
"pos": {
"0": 1785,
"1": 227
},
"size": [
1261.0787353515625,
310
],
"flags": {},
"order": 9,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 24
},
{
"name": "audio",
"type": "AUDIO",
"link": null,
"shape": 7
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null,
"shape": 7
},
{
"name": "vae",
"type": "VAE",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "Filenames",
"type": "VHS_FILENAMES",
"links": null
}
],
"properties": {
"Node name for S&R": "VHS_VideoCombine"
},
"widgets_values": {
"frame_rate": 24,
"loop_count": 0,
"filename_prefix": "Mochi_preview",
"format": "video/h264-mp4",
"pix_fmt": "yuv420p",
"crf": 19,
"save_metadata": true,
"pingpong": false,
"save_output": false,
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "Mochi_preview_00021.mp4",
"subfolder": "",
"type": "temp",
"format": "video/h264-mp4",
"frame_rate": 24
},
"muted": false
}
}
},
{
"id": 4,
"type": "DownloadAndLoadMochiModel",
"pos": {
"0": 465,
"1": 20
},
"size": {
"0": 437.7432556152344,
"1": 174
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "trigger",
"type": "CONDITIONING",
"link": 21,
"shape": 7
},
{
"name": "compile_args",
"type": "MOCHICOMPILEARGS",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "mochi_model",
"type": "MOCHIMODEL",
"links": [
16
],
"slot_index": 0
},
{
"name": "mochi_vae",
"type": "MOCHIVAE",
"links": [
23
],
"slot_index": 1
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadMochiModel"
},
"widgets_values": [
"mochi_preview_dit_GGUF_Q8_0.safetensors",
"mochi_preview_vae_bf16.safetensors",
"fp8_e4m3fn",
"sdpa",
false
]
},
{
"id": 14,
"type": "MochiSampler",
"pos": {
"0": 960,
"1": 243
},
"size": [
315,
286
],
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "model",
"type": "MOCHIMODEL",
"link": 16
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 17
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 18
},
{
"name": "opt_sigmas",
"type": "SIGMAS",
"link": null,
"shape": 7
},
{
"name": "cfg_schedule",
"type": "FLOAT",
"link": null,
"widget": {
"name": "cfg_schedule"
},
"shape": 7
}
],
"outputs": [
{
"name": "samples",
"type": "LATENT",
"links": [
22,
25
],
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "MochiSampler"
},
"widgets_values": [
848,
480,
49,
30,
4.5,
0,
"fixed",
0
]
},
{
"id": 15,
"type": "MochiDecodeSpatialTiling",
"pos": {
"0": 1340,
"1": 226
},
"size": {
"0": 390.5999755859375,
"1": 198
},
"flags": {},
"order": 7,
"mode": 0,
"inputs": [
{
"name": "vae",
"type": "MOCHIVAE",
"link": 23
},
{
"name": "samples",
"type": "LATENT",
"link": 22
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
24
],
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "MochiDecodeSpatialTiling"
},
"widgets_values": [
true,
4,
4,
16,
1,
6
]
},
{
"id": 12,
"type": "Note",
"pos": {
"0": 1349,
"1": -156
},
"size": {
"0": 365.5867919921875,
"1": 208.3488311767578
},
"flags": {},
"order": 1,
"mode": 0,
"inputs": [],
"outputs": [],
"title": "Note: WIP!",
"properties": {},
"widgets_values": [
"VAE decoding is extremely heavy so tiling is necessary, I have not found best settings for it yet so testing help is appreciated, you can keep decoding after sampling as the latents are still in memory to see what works.\n\nYou can also save the latents to disk and decode separately.\n\nIncrease the number of tiles until it fits your VRAM, and/or reduce per_batch to split the decoding time wise, this WILL cause frame skipping!\n"
],
"color": "#432",
"bgcolor": "#653"
},
{
"id": 17,
"type": "LoadLatent",
"pos": {
"0": 1775,
"1": -51
},
"size": {
"0": 315,
"1": 58
},
"flags": {},
"order": 2,
"mode": 2,
"inputs": [],
"outputs": [
{
"name": "LATENT",
"type": "LATENT",
"links": null
}
],
"properties": {
"Node name for S&R": "LoadLatent"
},
"widgets_values": [
"mochi_00001_.latent"
]
},
{
"id": 16,
"type": "SaveLatent",
"pos": {
"0": 1772,
"1": -168
},
"size": {
"0": 315,
"1": 58
},
"flags": {},
"order": 8,
"mode": 2,
"inputs": [
{
"name": "samples",
"type": "LATENT",
"link": 25
}
],
"outputs": [],
"properties": {
"Node name for S&R": "SaveLatent"
},
"widgets_values": [
"latents/mochi_latent"
]
}
],
"links": [
[
1,
2,
0,
1,
0,
"CLIP"
],
[
16,
4,
0,
14,
0,
"MOCHIMODEL"
],
[
17,
1,
0,
14,
1,
"CONDITIONING"
],
[
18,
8,
0,
14,
2,
"CONDITIONING"
],
[
20,
1,
1,
8,
0,
"CLIP"
],
[
21,
8,
0,
4,
0,
"CONDITIONING"
],
[
22,
14,
0,
15,
1,
"LATENT"
],
[
23,
4,
1,
15,
0,
"MOCHIVAE"
],
[
24,
15,
0,
9,
0,
"IMAGE"
],
[
25,
14,
0,
16,
0,
"LATENT"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.693433494944238,
"offset": [
64.81666033991527,
428.7032954894722
]
}
},
"version": 0.4
}
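The file above is a standard ComfyUI (litegraph) workflow JSON: `nodes` holds the node instances with their `widgets_values`, and each entry in `links` is a 6-tuple `[link_id, from_node, from_slot, to_node, to_slot, type]`. As a quick sanity check of the graph, here is a minimal sketch that prints every edge; the filename `mochi_example.json` is a placeholder, not part of the commit:

```python
import json

# Load a ComfyUI workflow and print its edges.
# "mochi_example.json" is a placeholder for wherever this JSON is saved.
with open("mochi_example.json") as f:
    wf = json.load(f)

node_types = {node["id"]: node["type"] for node in wf["nodes"]}

# Each link is [link_id, from_node, from_slot, to_node, to_slot, type].
for _, src, src_slot, dst, dst_slot, link_type in wf["links"]:
    print(f"[{link_type}] {node_types[src]}#{src}:{src_slot} -> {node_types[dst]}#{dst}:{dst_slot}")
```

For this workflow it prints, among others, the chain the commit depends on: link 20 routes the first MochiTextEncode's `clip` passthrough into the second (negative-prompt) encoder, and link 21 feeds that encoder's conditioning into DownloadAndLoadMochiModel's `trigger` input, which exists only to force execution order.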

View File

@@ -1,216 +1,7 @@
{
"last_node_id": 12,
"last_link_id": 15,
"last_node_id": 14,
"last_link_id": 21,
"nodes": [
{
"id": 4,
"type": "DownloadAndLoadMochiModel",
"pos": {
"0": 393,
"1": 59
},
"size": {
"0": 437.7432556152344,
"1": 126
},
"flags": {},
"order": 0,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "mochi_model",
"type": "MOCHIMODEL",
"links": [
3
],
"slot_index": 0
},
{
"name": "mochi_vae",
"type": "MOCHIVAE",
"links": [
11
],
"slot_index": 1
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadMochiModel"
},
"widgets_values": [
"mochi_preview_dit_fp8_e4m3fn.safetensors",
"mochi_preview_vae_bf16.safetensors",
"fp8_e4m3fn"
]
},
{
"id": 1,
"type": "MochiTextEncode",
"pos": {
"0": 484,
"1": 258
},
"size": {
"0": 413.45361328125,
"1": 268.5947265625
},
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 1
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
7
],
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "MochiTextEncode"
},
"widgets_values": [
"nature video of a red panda eating bamboo in front of a waterfall",
1,
true
]
},
{
"id": 8,
"type": "MochiTextEncode",
"pos": {
"0": 481,
"1": 577
},
"size": {
"0": 400,
"1": 200
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 8
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
9
],
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "MochiTextEncode"
},
"widgets_values": [
"",
1,
true
]
},
{
"id": 2,
"type": "CLIPLoader",
"pos": {
"0": -3,
"1": 462
},
"size": {
"0": 429.837646484375,
"1": 82
},
"flags": {},
"order": 1,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "CLIP",
"type": "CLIP",
"links": [
1,
8
]
}
],
"properties": {
"Node name for S&R": "CLIPLoader"
},
"widgets_values": [
"t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
"sd3"
]
},
{
"id": 5,
"type": "MochiSampler",
"pos": {
"0": 960,
"1": 243
},
"size": {
"0": 315,
"1": 242
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "model",
"type": "MOCHIMODEL",
"link": 3
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 7
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 9
}
],
"outputs": [
{
"name": "model",
"type": "LATENT",
"links": [
12
],
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "MochiSampler"
},
"widgets_values": [
848,
480,
163,
50,
4.5,
0,
"fixed"
]
},
{
"id": 10,
"type": "MochiDecode",
@@ -234,7 +25,7 @@
{
"name": "samples",
"type": "LATENT",
"link": 12
"link": 19
}
],
"outputs": [
@@ -291,17 +82,17 @@
"slot_index": 0
},
{
"name": "854 width",
"name": "width",
"type": "INT",
"links": null
},
{
"name": "480 height",
"name": "height",
"type": "INT",
"links": null
},
{
"name": "158 count",
"name": "count",
"type": "INT",
"links": null
}
@@ -320,7 +111,7 @@
},
"size": [
1261.0787353515625,
1019.9320011317172
310
],
"flags": {},
"order": 8,
@@ -391,12 +182,12 @@
"0": 1271,
"1": -119
},
"size": [
365.586792085973,
208.34883369101206
],
"size": {
"0": 365.5867919921875,
"1": 208.3488311767578
},
"flags": {},
"order": 2,
"order": 0,
"mode": 0,
"inputs": [],
"outputs": [],
@@ -407,6 +198,258 @@
],
"color": "#432",
"bgcolor": "#653"
},
{
"id": 14,
"type": "MochiSampler",
"pos": {
"0": 960,
"1": 243
},
"size": [
315,
286
],
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "model",
"type": "MOCHIMODEL",
"link": 16
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 17
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 18
},
{
"name": "opt_sigmas",
"type": "SIGMAS",
"link": null,
"shape": 7
},
{
"name": "cfg_schedule",
"type": "FLOAT",
"link": null,
"widget": {
"name": "cfg_schedule"
},
"shape": 7
}
],
"outputs": [
{
"name": "samples",
"type": "LATENT",
"links": [
19
]
}
],
"properties": {
"Node name for S&R": "MochiSampler"
},
"widgets_values": [
848,
480,
163,
50,
4.5,
0,
"fixed",
0
]
},
{
"id": 4,
"type": "DownloadAndLoadMochiModel",
"pos": {
"0": 452,
"1": -20
},
"size": {
"0": 437.7432556152344,
"1": 174
},
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "trigger",
"type": "CONDITIONING",
"link": 21,
"shape": 7
},
{
"name": "compile_args",
"type": "MOCHICOMPILEARGS",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "mochi_model",
"type": "MOCHIMODEL",
"links": [
16
],
"slot_index": 0
},
{
"name": "mochi_vae",
"type": "MOCHIVAE",
"links": [
11
],
"slot_index": 1
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadMochiModel"
},
"widgets_values": [
"mochi_preview_dit_GGUF_Q8_0.safetensors",
"mochi_preview_vae_bf16.safetensors",
"fp8_e4m3fn",
"sdpa",
false
]
},
{
"id": 1,
"type": "MochiTextEncode",
"pos": {
"0": 483,
"1": 281
},
"size": [
381.8630768000736,
227.23898384078808
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 1
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
17
],
"slot_index": 0
},
{
"name": "clip",
"type": "CLIP",
"links": [
20
],
"slot_index": 1
}
],
"properties": {
"Node name for S&R": "MochiTextEncode"
},
"widgets_values": [
"nature video of a red panda eating bamboo in front of a waterfall",
1,
false
]
},
{
"id": 2,
"type": "CLIPLoader",
"pos": {
"0": -41,
"1": 457
},
"size": [
479.5359523201174,
82
],
"flags": {},
"order": 1,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "CLIP",
"type": "CLIP",
"links": [
1
]
}
],
"properties": {
"Node name for S&R": "CLIPLoader"
},
"widgets_values": [
"t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
"sd3"
]
},
{
"id": 8,
"type": "MochiTextEncode",
"pos": {
"0": 487,
"1": 563
},
"size": [
378.8630768000736,
183.64429832064002
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 20
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
18,
21
],
"slot_index": 0
},
{
"name": "clip",
"type": "CLIP",
"links": null
}
],
"properties": {
"Node name for S&R": "MochiTextEncode"
},
"widgets_values": [
"",
1,
true
]
}
],
"links": [
@@ -418,38 +461,6 @@
0,
"CLIP"
],
[
3,
4,
0,
5,
0,
"MOCHIMODEL"
],
[
7,
1,
0,
5,
1,
"CONDITIONING"
],
[
8,
2,
0,
8,
0,
"CLIP"
],
[
9,
8,
0,
5,
2,
"CONDITIONING"
],
[
11,
4,
@@ -458,14 +469,6 @@
0,
"MOCHIVAE"
],
[
12,
5,
0,
10,
1,
"LATENT"
],
[
14,
10,
@@ -481,16 +484,64 @@
9,
0,
"IMAGE"
],
[
16,
4,
0,
14,
0,
"MOCHIMODEL"
],
[
17,
1,
0,
14,
1,
"CONDITIONING"
],
[
18,
8,
0,
14,
2,
"CONDITIONING"
],
[
19,
14,
0,
10,
1,
"LATENT"
],
[
20,
1,
1,
8,
0,
"CLIP"
],
[
21,
8,
0,
4,
0,
"CONDITIONING"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.6934334949442466,
"scale": 0.8390545288825276,
"offset": [
-193.29818918510955,
307.42265737796134
74.08380372279714,
307.44392783781285
]
}
},

View File

@@ -65,14 +65,14 @@ class DownloadAndLoadMochiModel:
{"tooltip": "Downloads from 'https://huggingface.co/Kijai/Mochi_preview_comfy' to 'models/vae/mochi'", },
),
"precision": (["fp8_e4m3fn","fp8_e4m3fn_fast","fp16", "fp32", "bf16"],
{"default": "fp8_e4m3fn", }),
{"default": "fp8_e4m3fn", "tooltip": "The precision to use for the model weights. Has no effect with GGUF models"},),
"attention_mode": (["sdpa","flash_attn","sage_attn", "comfy"],
),
},
"optional": {
"trigger": ("CONDITIONING", {"tooltip": "Dummy input for forcing execution order",}),
"compile_args": ("MOCHICOMPILEARGS", {"tooltip": "Optional torch.compile arguments",}),
"cublas_ops": ("BOOLEAN", {"tooltip": "tested on 4090, unsure of gpu requirements, enables faster linear ops from'https://github.com/aredden/torch-cublas-hgemm'",}),
"cublas_ops": ("BOOLEAN", {"tooltip": "tested on 4090, unsure of gpu requirements, enables faster linear ops for the GGUF models, for more info:'https://github.com/aredden/torch-cublas-hgemm'",}),
},
}
@@ -169,7 +169,7 @@ class MochiModelLoader:
"optional": {
"trigger": ("CONDITIONING", {"tooltip": "Dummy input for forcing execution order",}),
"compile_args": ("MOCHICOMPILEARGS", {"tooltip": "Optional torch.compile arguments",}),
"cublas_ops": ("BOOLEAN", {"tooltip": "tested on 4090, unsure of gpu requirements, enables faster linear ops from'https://github.com/aredden/torch-cublas-hgemm'",}),
"cublas_ops": ("BOOLEAN", {"tooltip": "tested on 4090, unsure of gpu requirements, enables faster linear ops for the GGUF models, for more info:'https://github.com/aredden/torch-cublas-hgemm'",}),
},
}
@@ -315,18 +315,23 @@ class MochiTextEncode:
load_device = mm.text_encoder_device()
offload_device = mm.text_encoder_offload_device()
clip.tokenizer.t5xxl.pad_to_max_length = True
clip.tokenizer.t5xxl.max_length = max_tokens
clip.cond_stage_model.t5xxl.return_attention_masks = True
clip.cond_stage_model.t5xxl.enable_attention_masks = True
clip.cond_stage_model.t5_attention_mask = True
clip.cond_stage_model.to(load_device)
tokens = clip.tokenizer.t5xxl.tokenize_with_weights(prompt, return_word_ids=True)
try:
embeds, _, attention_mask = clip.cond_stage_model.t5xxl.encode_token_weights(tokens)
clip.tokenizer.t5xxl.pad_to_max_length = True
clip.tokenizer.t5xxl.max_length = max_tokens
clip.cond_stage_model.t5xxl.return_attention_masks = True
clip.cond_stage_model.t5xxl.enable_attention_masks = True
clip.cond_stage_model.t5_attention_mask = True
clip.cond_stage_model.to(load_device)
tokens = clip.tokenizer.t5xxl.tokenize_with_weights(prompt, return_word_ids=True)
try:
embeds, _, attention_mask = clip.cond_stage_model.t5xxl.encode_token_weights(tokens)
except:
NotImplementedError("Failed to get attention mask from T5, is your ComfyUI up to date?")
except:
NotImplementedError("Failed to get attention mask from T5, is your ComfyUI up to date?")
clip.cond_stage_model.to(load_device)
tokens = clip.tokenizer.tokenize_with_weights(prompt, return_word_ids=True)
embeds, _, attention_mask = clip.cond_stage_model.encode_token_weights(tokens)
if embeds.shape[1] > 256:
raise ValueError(f"Prompt is too long, max tokens supported is {max_tokens} or less, got {embeds.shape[1]}")
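The net effect of this hunk is to wrap the kijai-specific T5 setup in an outer `try`, so a CLIP object that lacks the `t5xxl` attributes (i.e. one meant for the stock ComfyUI text-encode path) falls back to the generic `encode_token_weights` API. A de-interleaved sketch of the resulting control flow, simplified rather than a drop-in copy of the method:

```python
def encode_prompt(clip, prompt, max_tokens, load_device):
    # Sketch reconstructed from the hunk above; error handling simplified.
    try:
        # Kijai path: force T5-XXL padding and attention-mask return.
        clip.tokenizer.t5xxl.pad_to_max_length = True
        clip.tokenizer.t5xxl.max_length = max_tokens
        clip.cond_stage_model.t5xxl.return_attention_masks = True
        clip.cond_stage_model.t5xxl.enable_attention_masks = True
        clip.cond_stage_model.t5_attention_mask = True
        clip.cond_stage_model.to(load_device)
        tokens = clip.tokenizer.t5xxl.tokenize_with_weights(prompt, return_word_ids=True)
        embeds, _, attention_mask = clip.cond_stage_model.t5xxl.encode_token_weights(tokens)
    except Exception:
        # Fallback: a plain ComfyUI CLIP object without t5xxl attributes.
        clip.cond_stage_model.to(load_device)
        tokens = clip.tokenizer.tokenize_with_weights(prompt, return_word_ids=True)
        embeds, _, attention_mask = clip.cond_stage_model.encode_token_weights(tokens)
    return embeds, attention_mask
```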
@@ -358,8 +363,8 @@ class MochiSampler:
#"batch_cfg": ("BOOLEAN", {"default": False, "tooltip": "Enable batched cfg"}),
},
"optional": {
"cfg_schedule": ("FLOAT", {"forceInput": True,}),
"opt_sigmas": ("SIGMAS",),
"cfg_schedule": ("FLOAT", {"forceInput": True, "tooltip": "Override cfg schedule with a list of ints"}),
"opt_sigmas": ("SIGMAS", {"tooltip": "Override sigma schedule and steps"}),
}
}
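Both optional inputs override per-step behaviour: `opt_sigmas` replaces the built-in `linear_quadratic_schedule` and fixes the step count to its own length (the node appends the trailing 0.0 itself, as the next hunk shows), while `cfg_schedule` supplies one guidance value per step. A hedged example of values a caller might wire into these inputs; the ramp itself is made up for illustration:

```python
import torch

steps = 30

# Hypothetical cfg schedule: one float per step, stronger guidance early on.
cfg_schedule = [4.5] * 10 + [3.0] * (steps - 10)

# Hypothetical sigma override: do not include the trailing 0.0;
# the sampler appends it and derives `steps` from len(opt_sigmas).
opt_sigmas = torch.linspace(1.0, 0.05, steps)
```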
@@ -373,16 +378,28 @@ class MochiSampler:
if opt_sigmas is not None:
sigma_schedule = opt_sigmas.tolist()
steps = len(sigma_schedule)
steps = int(len(sigma_schedule))
sigma_schedule.extend([0.0])
logging.info(f"Using sigma_schedule: {sigma_schedule}")
else:
sigma_schedule = linear_quadratic_schedule(steps, 0.025)
logging.info(f"Using sigma_schedule: {sigma_schedule}")
cfg_schedule = cfg_schedule or [cfg] * steps
logging.info(f"Using cfg schedule: {cfg_schedule}")
if cfg_schedule is None:
cfg_schedule = [cfg] * steps
else:
logging.info(f"Using cfg schedule: {cfg_schedule}")
# For compatibility with Comfy CLIPTextEncode
if not isinstance(positive, dict):
positive = {
"embeds": positive[0][0],
"attention_mask": positive[0][1]["attention_mask"].bool(),
}
if not isinstance(negative, dict):
negative = {
"embeds": negative[0][0],
"attention_mask": negative[0][1]["attention_mask"].bool(),
}
args = {
"height": height,