From 3dce06b28b05c1446c3eaa40fdfe249b83dd2d99 Mon Sep 17 00:00:00 2001 From: kijai <40791699+kijai@users.noreply.github.com> Date: Mon, 28 Oct 2024 12:23:14 +0200 Subject: [PATCH] make compatible with comfy cliptextencode --- examples/mochi_example_49_frames_16GB.json | 557 ++++++++++++++++++++ examples/mochi_test_163_frames_01.json | 579 +++++++++++---------- nodes.py | 57 +- 3 files changed, 909 insertions(+), 284 deletions(-) create mode 100644 examples/mochi_example_49_frames_16GB.json diff --git a/examples/mochi_example_49_frames_16GB.json b/examples/mochi_example_49_frames_16GB.json new file mode 100644 index 0000000..31c8a07 --- /dev/null +++ b/examples/mochi_example_49_frames_16GB.json @@ -0,0 +1,557 @@ +{ + "last_node_id": 17, + "last_link_id": 25, + "nodes": [ + { + "id": 1, + "type": "MochiTextEncode", + "pos": { + "0": 483, + "1": 281 + }, + "size": [ + 381.8630768000736, + 227.23898384078808 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 1 + } + ], + "outputs": [ + { + "name": "conditioning", + "type": "CONDITIONING", + "links": [ + 17 + ], + "slot_index": 0 + }, + { + "name": "clip", + "type": "CLIP", + "links": [ + 20 + ], + "slot_index": 1 + } + ], + "properties": { + "Node name for S&R": "MochiTextEncode" + }, + "widgets_values": [ + "nature video of a red panda eating bamboo in front of a waterfall", + 1, + false + ] + }, + { + "id": 2, + "type": "CLIPLoader", + "pos": { + "0": -41, + "1": 457 + }, + "size": [ + 479.5359523201174, + 82 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "CLIP", + "type": "CLIP", + "links": [ + 1 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPLoader" + }, + "widgets_values": [ + "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", + "sd3" + ] + }, + { + "id": 8, + "type": "MochiTextEncode", + "pos": { + "0": 487, + "1": 563 + }, + "size": [ + 378.8630768000736, + 183.64429832064002 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 20 + } + ], + "outputs": [ + { + "name": "conditioning", + "type": "CONDITIONING", + "links": [ + 18, + 21 + ], + "slot_index": 0 + }, + { + "name": "clip", + "type": "CLIP", + "links": null + } + ], + "properties": { + "Node name for S&R": "MochiTextEncode" + }, + "widgets_values": [ + "", + 1, + true + ] + }, + { + "id": 9, + "type": "VHS_VideoCombine", + "pos": { + "0": 1785, + "1": 227 + }, + "size": [ + 1261.0787353515625, + 310 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 24 + }, + { + "name": "audio", + "type": "AUDIO", + "link": null, + "shape": 7 + }, + { + "name": "meta_batch", + "type": "VHS_BatchManager", + "link": null, + "shape": 7 + }, + { + "name": "vae", + "type": "VAE", + "link": null, + "shape": 7 + } + ], + "outputs": [ + { + "name": "Filenames", + "type": "VHS_FILENAMES", + "links": null + } + ], + "properties": { + "Node name for S&R": "VHS_VideoCombine" + }, + "widgets_values": { + "frame_rate": 24, + "loop_count": 0, + "filename_prefix": "Mochi_preview", + "format": "video/h264-mp4", + "pix_fmt": "yuv420p", + "crf": 19, + "save_metadata": true, + "pingpong": false, + "save_output": false, + "videopreview": { + "hidden": false, + "paused": false, + "params": { + "filename": "Mochi_preview_00021.mp4", + "subfolder": "", + "type": "temp", + "format": "video/h264-mp4", + "frame_rate": 24 + }, + "muted": false + } + } + }, + { + "id": 4, + "type": "DownloadAndLoadMochiModel", + "pos": { + "0": 465, + "1": 20 + }, + "size": { + "0": 437.7432556152344, + "1": 174 + }, + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "trigger", + "type": "CONDITIONING", + "link": 21, + "shape": 7 + }, + { + "name": "compile_args", + "type": "MOCHICOMPILEARGS", + "link": null, + "shape": 7 + } + ], + "outputs": [ + { + "name": "mochi_model", + "type": "MOCHIMODEL", + "links": [ + 16 + ], + "slot_index": 0 + }, + { + "name": "mochi_vae", + "type": "MOCHIVAE", + "links": [ + 23 + ], + "slot_index": 1 + } + ], + "properties": { + "Node name for S&R": "DownloadAndLoadMochiModel" + }, + "widgets_values": [ + "mochi_preview_dit_GGUF_Q8_0.safetensors", + "mochi_preview_vae_bf16.safetensors", + "fp8_e4m3fn", + "sdpa", + false + ] + }, + { + "id": 14, + "type": "MochiSampler", + "pos": { + "0": 960, + "1": 243 + }, + "size": [ + 315, + 286 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "model", + "type": "MOCHIMODEL", + "link": 16 + }, + { + "name": "positive", + "type": "CONDITIONING", + "link": 17 + }, + { + "name": "negative", + "type": "CONDITIONING", + "link": 18 + }, + { + "name": "opt_sigmas", + "type": "SIGMAS", + "link": null, + "shape": 7 + }, + { + "name": "cfg_schedule", + "type": "FLOAT", + "link": null, + "widget": { + "name": "cfg_schedule" + }, + "shape": 7 + } + ], + "outputs": [ + { + "name": "samples", + "type": "LATENT", + "links": [ + 22, + 25 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "MochiSampler" + }, + "widgets_values": [ + 848, + 480, + 49, + 30, + 4.5, + 0, + "fixed", + 0 + ] + }, + { + "id": 15, + "type": "MochiDecodeSpatialTiling", + "pos": { + "0": 1340, + "1": 226 + }, + "size": { + "0": 390.5999755859375, + "1": 198 + }, + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "vae", + "type": "MOCHIVAE", + "link": 23 + }, + { + "name": "samples", + "type": "LATENT", + "link": 22 + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 24 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "MochiDecodeSpatialTiling" + }, + "widgets_values": [ + true, + 4, + 4, + 16, + 1, + 6 + ] + }, + { + "id": 12, + "type": "Note", + "pos": { + "0": 1349, + "1": -156 + }, + "size": { + "0": 365.5867919921875, + "1": 208.3488311767578 + }, + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [], + "title": "Note: WIP!", + "properties": {}, + "widgets_values": [ + "VAE decoding is extremely heavy so tiling is necessary, I have not found best settings for it yet so testing help is appreciated, you can keep decoding after sampling as the latents are still in memory to see what works.\n\nYou can also save the latents to disk and decode separately.\n\nIncrease the number of tiles until it fits your VRAM, and/or reduce per_batch to split the decoding time wise, this WILL cause frame skipping!\n" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 17, + "type": "LoadLatent", + "pos": { + "0": 1775, + "1": -51 + }, + "size": { + "0": 315, + "1": 58 + }, + "flags": {}, + "order": 2, + "mode": 2, + "inputs": [], + "outputs": [ + { + "name": "LATENT", + "type": "LATENT", + "links": null + } + ], + "properties": { + "Node name for S&R": "LoadLatent" + }, + "widgets_values": [ + "mochi_00001_.latent" + ] + }, + { + "id": 16, + "type": "SaveLatent", + "pos": { + "0": 1772, + "1": -168 + }, + "size": { + "0": 315, + "1": 58 + }, + "flags": {}, + "order": 8, + "mode": 2, + "inputs": [ + { + "name": "samples", + "type": "LATENT", + "link": 25 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "SaveLatent" + }, + "widgets_values": [ + "latents/mochi_latent" + ] + } + ], + "links": [ + [ + 1, + 2, + 0, + 1, + 0, + "CLIP" + ], + [ + 16, + 4, + 0, + 14, + 0, + "MOCHIMODEL" + ], + [ + 17, + 1, + 0, + 14, + 1, + "CONDITIONING" + ], + [ + 18, + 8, + 0, + 14, + 2, + "CONDITIONING" + ], + [ + 20, + 1, + 1, + 8, + 0, + "CLIP" + ], + [ + 21, + 8, + 0, + 4, + 0, + "CONDITIONING" + ], + [ + 22, + 14, + 0, + 15, + 1, + "LATENT" + ], + [ + 23, + 4, + 1, + 15, + 0, + "MOCHIVAE" + ], + [ + 24, + 15, + 0, + 9, + 0, + "IMAGE" + ], + [ + 25, + 14, + 0, + 16, + 0, + "LATENT" + ] + ], + "groups": [], + "config": {}, + "extra": { + "ds": { + "scale": 0.693433494944238, + "offset": [ + 64.81666033991527, + 428.7032954894722 + ] + } + }, + "version": 0.4 +} \ No newline at end of file diff --git a/examples/mochi_test_163_frames_01.json b/examples/mochi_test_163_frames_01.json index 481ecee..3643e8a 100644 --- a/examples/mochi_test_163_frames_01.json +++ b/examples/mochi_test_163_frames_01.json @@ -1,216 +1,7 @@ { - "last_node_id": 12, - "last_link_id": 15, + "last_node_id": 14, + "last_link_id": 21, "nodes": [ - { - "id": 4, - "type": "DownloadAndLoadMochiModel", - "pos": { - "0": 393, - "1": 59 - }, - "size": { - "0": 437.7432556152344, - "1": 126 - }, - "flags": {}, - "order": 0, - "mode": 0, - "inputs": [], - "outputs": [ - { - "name": "mochi_model", - "type": "MOCHIMODEL", - "links": [ - 3 - ], - "slot_index": 0 - }, - { - "name": "mochi_vae", - "type": "MOCHIVAE", - "links": [ - 11 - ], - "slot_index": 1 - } - ], - "properties": { - "Node name for S&R": "DownloadAndLoadMochiModel" - }, - "widgets_values": [ - "mochi_preview_dit_fp8_e4m3fn.safetensors", - "mochi_preview_vae_bf16.safetensors", - "fp8_e4m3fn" - ] - }, - { - "id": 1, - "type": "MochiTextEncode", - "pos": { - "0": 484, - "1": 258 - }, - "size": { - "0": 413.45361328125, - "1": 268.5947265625 - }, - "flags": {}, - "order": 3, - "mode": 0, - "inputs": [ - { - "name": "clip", - "type": "CLIP", - "link": 1 - } - ], - "outputs": [ - { - "name": "conditioning", - "type": "CONDITIONING", - "links": [ - 7 - ], - "slot_index": 0 - } - ], - "properties": { - "Node name for S&R": "MochiTextEncode" - }, - "widgets_values": [ - "nature video of a red panda eating bamboo in front of a waterfall", - 1, - true - ] - }, - { - "id": 8, - "type": "MochiTextEncode", - "pos": { - "0": 481, - "1": 577 - }, - "size": { - "0": 400, - "1": 200 - }, - "flags": {}, - "order": 4, - "mode": 0, - "inputs": [ - { - "name": "clip", - "type": "CLIP", - "link": 8 - } - ], - "outputs": [ - { - "name": "conditioning", - "type": "CONDITIONING", - "links": [ - 9 - ], - "slot_index": 0 - } - ], - "properties": { - "Node name for S&R": "MochiTextEncode" - }, - "widgets_values": [ - "", - 1, - true - ] - }, - { - "id": 2, - "type": "CLIPLoader", - "pos": { - "0": -3, - "1": 462 - }, - "size": { - "0": 429.837646484375, - "1": 82 - }, - "flags": {}, - "order": 1, - "mode": 0, - "inputs": [], - "outputs": [ - { - "name": "CLIP", - "type": "CLIP", - "links": [ - 1, - 8 - ] - } - ], - "properties": { - "Node name for S&R": "CLIPLoader" - }, - "widgets_values": [ - "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", - "sd3" - ] - }, - { - "id": 5, - "type": "MochiSampler", - "pos": { - "0": 960, - "1": 243 - }, - "size": { - "0": 315, - "1": 242 - }, - "flags": {}, - "order": 5, - "mode": 0, - "inputs": [ - { - "name": "model", - "type": "MOCHIMODEL", - "link": 3 - }, - { - "name": "positive", - "type": "CONDITIONING", - "link": 7 - }, - { - "name": "negative", - "type": "CONDITIONING", - "link": 9 - } - ], - "outputs": [ - { - "name": "model", - "type": "LATENT", - "links": [ - 12 - ], - "slot_index": 0 - } - ], - "properties": { - "Node name for S&R": "MochiSampler" - }, - "widgets_values": [ - 848, - 480, - 163, - 50, - 4.5, - 0, - "fixed" - ] - }, { "id": 10, "type": "MochiDecode", @@ -234,7 +25,7 @@ { "name": "samples", "type": "LATENT", - "link": 12 + "link": 19 } ], "outputs": [ @@ -291,17 +82,17 @@ "slot_index": 0 }, { - "name": "854 width", + "name": "width", "type": "INT", "links": null }, { - "name": "480 height", + "name": "height", "type": "INT", "links": null }, { - "name": "158 count", + "name": "count", "type": "INT", "links": null } @@ -320,7 +111,7 @@ }, "size": [ 1261.0787353515625, - 1019.9320011317172 + 310 ], "flags": {}, "order": 8, @@ -391,12 +182,12 @@ "0": 1271, "1": -119 }, - "size": [ - 365.586792085973, - 208.34883369101206 - ], + "size": { + "0": 365.5867919921875, + "1": 208.3488311767578 + }, "flags": {}, - "order": 2, + "order": 0, "mode": 0, "inputs": [], "outputs": [], @@ -407,6 +198,258 @@ ], "color": "#432", "bgcolor": "#653" + }, + { + "id": 14, + "type": "MochiSampler", + "pos": { + "0": 960, + "1": 243 + }, + "size": [ + 315, + 286 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "model", + "type": "MOCHIMODEL", + "link": 16 + }, + { + "name": "positive", + "type": "CONDITIONING", + "link": 17 + }, + { + "name": "negative", + "type": "CONDITIONING", + "link": 18 + }, + { + "name": "opt_sigmas", + "type": "SIGMAS", + "link": null, + "shape": 7 + }, + { + "name": "cfg_schedule", + "type": "FLOAT", + "link": null, + "widget": { + "name": "cfg_schedule" + }, + "shape": 7 + } + ], + "outputs": [ + { + "name": "samples", + "type": "LATENT", + "links": [ + 19 + ] + } + ], + "properties": { + "Node name for S&R": "MochiSampler" + }, + "widgets_values": [ + 848, + 480, + 163, + 50, + 4.5, + 0, + "fixed", + 0 + ] + }, + { + "id": 4, + "type": "DownloadAndLoadMochiModel", + "pos": { + "0": 452, + "1": -20 + }, + "size": { + "0": 437.7432556152344, + "1": 174 + }, + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "name": "trigger", + "type": "CONDITIONING", + "link": 21, + "shape": 7 + }, + { + "name": "compile_args", + "type": "MOCHICOMPILEARGS", + "link": null, + "shape": 7 + } + ], + "outputs": [ + { + "name": "mochi_model", + "type": "MOCHIMODEL", + "links": [ + 16 + ], + "slot_index": 0 + }, + { + "name": "mochi_vae", + "type": "MOCHIVAE", + "links": [ + 11 + ], + "slot_index": 1 + } + ], + "properties": { + "Node name for S&R": "DownloadAndLoadMochiModel" + }, + "widgets_values": [ + "mochi_preview_dit_GGUF_Q8_0.safetensors", + "mochi_preview_vae_bf16.safetensors", + "fp8_e4m3fn", + "sdpa", + false + ] + }, + { + "id": 1, + "type": "MochiTextEncode", + "pos": { + "0": 483, + "1": 281 + }, + "size": [ + 381.8630768000736, + 227.23898384078808 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 1 + } + ], + "outputs": [ + { + "name": "conditioning", + "type": "CONDITIONING", + "links": [ + 17 + ], + "slot_index": 0 + }, + { + "name": "clip", + "type": "CLIP", + "links": [ + 20 + ], + "slot_index": 1 + } + ], + "properties": { + "Node name for S&R": "MochiTextEncode" + }, + "widgets_values": [ + "nature video of a red panda eating bamboo in front of a waterfall", + 1, + false + ] + }, + { + "id": 2, + "type": "CLIPLoader", + "pos": { + "0": -41, + "1": 457 + }, + "size": [ + 479.5359523201174, + 82 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "CLIP", + "type": "CLIP", + "links": [ + 1 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPLoader" + }, + "widgets_values": [ + "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors", + "sd3" + ] + }, + { + "id": 8, + "type": "MochiTextEncode", + "pos": { + "0": 487, + "1": 563 + }, + "size": [ + 378.8630768000736, + 183.64429832064002 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 20 + } + ], + "outputs": [ + { + "name": "conditioning", + "type": "CONDITIONING", + "links": [ + 18, + 21 + ], + "slot_index": 0 + }, + { + "name": "clip", + "type": "CLIP", + "links": null + } + ], + "properties": { + "Node name for S&R": "MochiTextEncode" + }, + "widgets_values": [ + "", + 1, + true + ] } ], "links": [ @@ -418,38 +461,6 @@ 0, "CLIP" ], - [ - 3, - 4, - 0, - 5, - 0, - "MOCHIMODEL" - ], - [ - 7, - 1, - 0, - 5, - 1, - "CONDITIONING" - ], - [ - 8, - 2, - 0, - 8, - 0, - "CLIP" - ], - [ - 9, - 8, - 0, - 5, - 2, - "CONDITIONING" - ], [ 11, 4, @@ -458,14 +469,6 @@ 0, "MOCHIVAE" ], - [ - 12, - 5, - 0, - 10, - 1, - "LATENT" - ], [ 14, 10, @@ -481,16 +484,64 @@ 9, 0, "IMAGE" + ], + [ + 16, + 4, + 0, + 14, + 0, + "MOCHIMODEL" + ], + [ + 17, + 1, + 0, + 14, + 1, + "CONDITIONING" + ], + [ + 18, + 8, + 0, + 14, + 2, + "CONDITIONING" + ], + [ + 19, + 14, + 0, + 10, + 1, + "LATENT" + ], + [ + 20, + 1, + 1, + 8, + 0, + "CLIP" + ], + [ + 21, + 8, + 0, + 4, + 0, + "CONDITIONING" ] ], "groups": [], "config": {}, "extra": { "ds": { - "scale": 0.6934334949442466, + "scale": 0.8390545288825276, "offset": [ - -193.29818918510955, - 307.42265737796134 + 74.08380372279714, + 307.44392783781285 ] } }, diff --git a/nodes.py b/nodes.py index c875c18..fa567dc 100644 --- a/nodes.py +++ b/nodes.py @@ -65,14 +65,14 @@ class DownloadAndLoadMochiModel: {"tooltip": "Downloads from 'https://huggingface.co/Kijai/Mochi_preview_comfy' to 'models/vae/mochi'", }, ), "precision": (["fp8_e4m3fn","fp8_e4m3fn_fast","fp16", "fp32", "bf16"], - {"default": "fp8_e4m3fn", }), + {"default": "fp8_e4m3fn", "tooltip": "The precision to use for the model weights. Has no effect with GGUF models"},), "attention_mode": (["sdpa","flash_attn","sage_attn", "comfy"], ), }, "optional": { "trigger": ("CONDITIONING", {"tooltip": "Dummy input for forcing execution order",}), "compile_args": ("MOCHICOMPILEARGS", {"tooltip": "Optional torch.compile arguments",}), - "cublas_ops": ("BOOLEAN", {"tooltip": "tested on 4090, unsure of gpu requirements, enables faster linear ops from'https://github.com/aredden/torch-cublas-hgemm'",}), + "cublas_ops": ("BOOLEAN", {"tooltip": "tested on 4090, unsure of gpu requirements, enables faster linear ops for the GGUF models, for more info:'https://github.com/aredden/torch-cublas-hgemm'",}), }, } @@ -169,7 +169,7 @@ class MochiModelLoader: "optional": { "trigger": ("CONDITIONING", {"tooltip": "Dummy input for forcing execution order",}), "compile_args": ("MOCHICOMPILEARGS", {"tooltip": "Optional torch.compile arguments",}), - "cublas_ops": ("BOOLEAN", {"tooltip": "tested on 4090, unsure of gpu requirements, enables faster linear ops from'https://github.com/aredden/torch-cublas-hgemm'",}), + "cublas_ops": ("BOOLEAN", {"tooltip": "tested on 4090, unsure of gpu requirements, enables faster linear ops for the GGUF models, for more info:'https://github.com/aredden/torch-cublas-hgemm'",}), }, } @@ -315,18 +315,23 @@ class MochiTextEncode: load_device = mm.text_encoder_device() offload_device = mm.text_encoder_offload_device() - clip.tokenizer.t5xxl.pad_to_max_length = True - clip.tokenizer.t5xxl.max_length = max_tokens - clip.cond_stage_model.t5xxl.return_attention_masks = True - clip.cond_stage_model.t5xxl.enable_attention_masks = True - clip.cond_stage_model.t5_attention_mask = True - clip.cond_stage_model.to(load_device) - tokens = clip.tokenizer.t5xxl.tokenize_with_weights(prompt, return_word_ids=True) - try: - embeds, _, attention_mask = clip.cond_stage_model.t5xxl.encode_token_weights(tokens) + clip.tokenizer.t5xxl.pad_to_max_length = True + clip.tokenizer.t5xxl.max_length = max_tokens + clip.cond_stage_model.t5xxl.return_attention_masks = True + clip.cond_stage_model.t5xxl.enable_attention_masks = True + clip.cond_stage_model.t5_attention_mask = True + clip.cond_stage_model.to(load_device) + tokens = clip.tokenizer.t5xxl.tokenize_with_weights(prompt, return_word_ids=True) + try: + embeds, _, attention_mask = clip.cond_stage_model.t5xxl.encode_token_weights(tokens) + except: + NotImplementedError("Failed to get attention mask from T5, is your ComfyUI up to date?") except: - NotImplementedError("Failed to get attention mask from T5, is your ComfyUI up to date?") + clip.cond_stage_model.to(load_device) + tokens = clip.tokenizer.tokenize_with_weights(prompt, return_word_ids=True) + embeds, _, attention_mask = clip.cond_stage_model.encode_token_weights(tokens) + if embeds.shape[1] > 256: raise ValueError(f"Prompt is too long, max tokens supported is {max_tokens} or less, got {embeds.shape[1]}") @@ -358,8 +363,8 @@ class MochiSampler: #"batch_cfg": ("BOOLEAN", {"default": False, "tooltip": "Enable batched cfg"}), }, "optional": { - "cfg_schedule": ("FLOAT", {"forceInput": True,}), - "opt_sigmas": ("SIGMAS",), + "cfg_schedule": ("FLOAT", {"forceInput": True, "tooltip": "Override cfg schedule with a list of ints"}), + "opt_sigmas": ("SIGMAS", {"tooltip": "Override sigma schedule and steps"}), } } @@ -373,16 +378,28 @@ class MochiSampler: if opt_sigmas is not None: sigma_schedule = opt_sigmas.tolist() - steps = len(sigma_schedule) + steps = int(len(sigma_schedule)) sigma_schedule.extend([0.0]) - logging.info(f"Using sigma_schedule: {sigma_schedule}") else: sigma_schedule = linear_quadratic_schedule(steps, 0.025) - logging.info(f"Using sigma_schedule: {sigma_schedule}") - cfg_schedule = cfg_schedule or [cfg] * steps - logging.info(f"Using cfg schedule: {cfg_schedule}") + if cfg_schedule is None: + cfg_schedule = [cfg] * steps + else: + logging.info(f"Using cfg schedule: {cfg_schedule}") + + #For compatibility with Comfy CLIPTextEncode + if not isinstance(positive, dict): + positive = { + "embeds": positive[0][0], + "attention_mask": positive[0][1]["attention_mask"].bool(), + } + if not isinstance(negative, dict): + negative = { + "embeds": negative[0][0], + "attention_mask": negative[0][1]["attention_mask"].bool(), + } args = { "height": height,